From 9a56493f6942c0e2df1579986128721da96e00d8 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Mon, 3 Aug 2020 13:16:21 +0300 Subject: uts: Use generic ns_common::count Switch over uts namespaces to use the newly introduced common lifetime counter. Currently every namespace type has its own lifetime counter which is stored in the specific namespace struct. The lifetime counters are used identically for all namespaces types. Namespaces may of course have additional unrelated counters and these are not altered. This introduces a common lifetime counter into struct ns_common. The ns_common struct encompasses information that all namespaces share. That should include the lifetime counter since its common for all of them. It also allows us to unify the type of the counters across all namespaces. Most of them use refcount_t but one uses atomic_t and at least one uses kref. Especially the last one doesn't make much sense since it's just a wrapper around refcount_t since 2016 and actually complicates cleanup operations by having to use container_of() to cast the correct namespace struct out of struct ns_common. Having the lifetime counter for the namespaces in one place reduces maintenance cost. Not just because after switching all namespaces over we will have removed more code than we added but also because the logic is more easily understandable and we indicate to the user that the basic lifetime requirements for all namespaces are currently identical. Signed-off-by: Kirill Tkhai Reviewed-by: Kees Cook Acked-by: Christian Brauner Link: https://lore.kernel.org/r/159644978167.604812.1773586504374412107.stgit@localhost.localdomain Signed-off-by: Christian Brauner --- kernel/utsname.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/utsname.c b/kernel/utsname.c index e488d0e2ab45..b1ac3ca870f2 100644 --- a/kernel/utsname.c +++ b/kernel/utsname.c @@ -33,7 +33,7 @@ static struct uts_namespace *create_uts_ns(void) uts_ns = kmem_cache_alloc(uts_ns_cache, GFP_KERNEL); if (uts_ns) - kref_init(&uts_ns->kref); + refcount_set(&uts_ns->ns.count, 1); return uts_ns; } @@ -103,11 +103,8 @@ struct uts_namespace *copy_utsname(unsigned long flags, return new_ns; } -void free_uts_ns(struct kref *kref) +void free_uts_ns(struct uts_namespace *ns) { - struct uts_namespace *ns; - - ns = container_of(kref, struct uts_namespace, kref); dec_uts_namespaces(ns->ucounts); put_user_ns(ns->user_ns); ns_free_inum(&ns->ns); -- cgit v1.2.3 From 8eb71d95f34a009cc22084e05e78eb9686f7ea28 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Mon, 3 Aug 2020 13:16:32 +0300 Subject: pid: Use generic ns_common::count Switch over pid namespaces to use the newly introduced common lifetime counter. Currently every namespace type has its own lifetime counter which is stored in the specific namespace struct. The lifetime counters are used identically for all namespaces types. Namespaces may of course have additional unrelated counters and these are not altered. This introduces a common lifetime counter into struct ns_common. The ns_common struct encompasses information that all namespaces share. That should include the lifetime counter since its common for all of them. It also allows us to unify the type of the counters across all namespaces. Most of them use refcount_t but one uses atomic_t and at least one uses kref. 
Especially the last one doesn't make much sense since it's just a wrapper around refcount_t since 2016 and actually complicates cleanup operations by having to use container_of() to cast the correct namespace struct out of struct ns_common. Having the lifetime counter for the namespaces in one place reduces maintenance cost. Not just because after switching all namespaces over we will have removed more code than we added but also because the logic is more easily understandable and we indicate to the user that the basic lifetime requirements for all namespaces are currently identical. Signed-off-by: Kirill Tkhai Reviewed-by: Kees Cook Acked-by: Christian Brauner Link: https://lore.kernel.org/r/159644979226.604812.7512601754841882036.stgit@localhost.localdomain Signed-off-by: Christian Brauner --- include/linux/pid_namespace.h | 4 +--- kernel/pid.c | 2 +- kernel/pid_namespace.c | 13 +++---------- 3 files changed, 5 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h index 5a5cb45ac57e..7c7e627503d2 100644 --- a/include/linux/pid_namespace.h +++ b/include/linux/pid_namespace.h @@ -8,7 +8,6 @@ #include #include #include -#include #include #include @@ -18,7 +17,6 @@ struct fs_pin; struct pid_namespace { - struct kref kref; struct idr idr; struct rcu_head rcu; unsigned int pid_allocated; @@ -43,7 +41,7 @@ extern struct pid_namespace init_pid_ns; static inline struct pid_namespace *get_pid_ns(struct pid_namespace *ns) { if (ns != &init_pid_ns) - kref_get(&ns->kref); + refcount_inc(&ns->ns.count); return ns; } diff --git a/kernel/pid.c b/kernel/pid.c index b2562a7ce525..2b97bedc1d9f 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -72,7 +72,7 @@ int pid_max_max = PID_MAX_LIMIT; * the scheme scales to up to 4 million PIDs, runtime. */ struct pid_namespace init_pid_ns = { - .kref = KREF_INIT(2), + .ns.count = REFCOUNT_INIT(2), .idr = IDR_INIT(init_pid_ns.idr), .pid_allocated = PIDNS_ADDING, .level = 0, diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index ac135bd600eb..166a91cdd387 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -102,7 +102,7 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns goto out_free_idr; ns->ns.ops = &pidns_operations; - kref_init(&ns->kref); + refcount_set(&ns->ns.count, 1); ns->level = level; ns->parent = get_pid_ns(parent_pid_ns); ns->user_ns = get_user_ns(user_ns); @@ -148,22 +148,15 @@ struct pid_namespace *copy_pid_ns(unsigned long flags, return create_pid_namespace(user_ns, old_ns); } -static void free_pid_ns(struct kref *kref) -{ - struct pid_namespace *ns; - - ns = container_of(kref, struct pid_namespace, kref); - destroy_pid_namespace(ns); -} - void put_pid_ns(struct pid_namespace *ns) { struct pid_namespace *parent; while (ns != &init_pid_ns) { parent = ns->parent; - if (!kref_put(&ns->kref, free_pid_ns)) + if (!refcount_dec_and_test(&ns->ns.count)) break; + destroy_pid_namespace(ns); ns = parent; } } -- cgit v1.2.3 From 265cbd62e034cb09a9da7cbff9072c8082f8df65 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Mon, 3 Aug 2020 13:16:37 +0300 Subject: user: Use generic ns_common::count Switch over user namespaces to use the newly introduced common lifetime counter. Currently every namespace type has its own lifetime counter which is stored in the specific namespace struct. The lifetime counters are used identically for all namespaces types. Namespaces may of course have additional unrelated counters and these are not altered. 
This introduces a common lifetime counter into struct ns_common. The ns_common struct encompasses information that all namespaces share. That should include the lifetime counter since its common for all of them. It also allows us to unify the type of the counters across all namespaces. Most of them use refcount_t but one uses atomic_t and at least one uses kref. Especially the last one doesn't make much sense since it's just a wrapper around refcount_t since 2016 and actually complicates cleanup operations by having to use container_of() to cast the correct namespace struct out of struct ns_common. Having the lifetime counter for the namespaces in one place reduces maintenance cost. Not just because after switching all namespaces over we will have removed more code than we added but also because the logic is more easily understandable and we indicate to the user that the basic lifetime requirements for all namespaces are currently identical. Signed-off-by: Kirill Tkhai Reviewed-by: Kees Cook Acked-by: Christian Brauner Link: https://lore.kernel.org/r/159644979754.604812.601625186726406922.stgit@localhost.localdomain Signed-off-by: Christian Brauner --- include/linux/user_namespace.h | 5 ++--- kernel/user.c | 2 +- kernel/user_namespace.c | 4 ++-- 3 files changed, 5 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h index 6ef1c7109fc4..64cf8ebdc4ec 100644 --- a/include/linux/user_namespace.h +++ b/include/linux/user_namespace.h @@ -57,7 +57,6 @@ struct user_namespace { struct uid_gid_map uid_map; struct uid_gid_map gid_map; struct uid_gid_map projid_map; - atomic_t count; struct user_namespace *parent; int level; kuid_t owner; @@ -109,7 +108,7 @@ void dec_ucount(struct ucounts *ucounts, enum ucount_type type); static inline struct user_namespace *get_user_ns(struct user_namespace *ns) { if (ns) - atomic_inc(&ns->count); + refcount_inc(&ns->ns.count); return ns; } @@ -119,7 +118,7 @@ extern void __put_user_ns(struct user_namespace *ns); static inline void put_user_ns(struct user_namespace *ns) { - if (ns && atomic_dec_and_test(&ns->count)) + if (ns && refcount_dec_and_test(&ns->ns.count)) __put_user_ns(ns); } diff --git a/kernel/user.c b/kernel/user.c index b1635d94a1f2..a2478cddf536 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -55,7 +55,7 @@ struct user_namespace init_user_ns = { }, }, }, - .count = ATOMIC_INIT(3), + .ns.count = REFCOUNT_INIT(3), .owner = GLOBAL_ROOT_UID, .group = GLOBAL_ROOT_GID, .ns.inum = PROC_USER_INIT_INO, diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 87804e0371fe..7c2bbe8f3e45 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -111,7 +111,7 @@ int create_user_ns(struct cred *new) goto fail_free; ns->ns.ops = &userns_operations; - atomic_set(&ns->count, 1); + refcount_set(&ns->ns.count, 1); /* Leave the new->user_ns reference with the new user namespace. 
*/ ns->parent = parent_ns; ns->level = parent_ns->level + 1; @@ -197,7 +197,7 @@ static void free_user_ns(struct work_struct *work) kmem_cache_free(user_ns_cachep, ns); dec_user_namespaces(ucounts); ns = parent; - } while (atomic_dec_and_test(&parent->count)); + } while (refcount_dec_and_test(&parent->ns.count)); } void __put_user_ns(struct user_namespace *ns) -- cgit v1.2.3 From f387882d8d3eda7c7b13e330c73907035569ce4a Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Mon, 3 Aug 2020 13:16:50 +0300 Subject: cgroup: Use generic ns_common::count Switch over cgroup namespaces to use the newly introduced common lifetime counter. Currently every namespace type has its own lifetime counter which is stored in the specific namespace struct. The lifetime counters are used identically for all namespaces types. Namespaces may of course have additional unrelated counters and these are not altered. This introduces a common lifetime counter into struct ns_common. The ns_common struct encompasses information that all namespaces share. That should include the lifetime counter since its common for all of them. It also allows us to unify the type of the counters across all namespaces. Most of them use refcount_t but one uses atomic_t and at least one uses kref. Especially the last one doesn't make much sense since it's just a wrapper around refcount_t since 2016 and actually complicates cleanup operations by having to use container_of() to cast the correct namespace struct out of struct ns_common. Having the lifetime counter for the namespaces in one place reduces maintenance cost. Not just because after switching all namespaces over we will have removed more code than we added but also because the logic is more easily understandable and we indicate to the user that the basic lifetime requirements for all namespaces are currently identical. 
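For readers tracking the conversion pattern, here is a minimal before/after sketch of the release path in schematic kernel-style C. It is not taken from the patches; "foo_namespace" and its helpers are stand-in names, and only the shape of the code matters.

    /* Before: the counter is a struct kref embedded in the namespace, so the
     * release callback has to recover the namespace with container_of(). */
    struct foo_namespace {
            struct kref kref;
            struct ns_common ns;
    };

    static void free_foo_ns(struct kref *kref)
    {
            struct foo_namespace *foo = container_of(kref, struct foo_namespace, kref);

            kfree(foo);
    }

    static inline void put_foo_ns(struct foo_namespace *foo)
    {
            kref_put(&foo->kref, free_foo_ns);
    }

    /* After: the counter lives in ns_common, the put path is open coded and
     * the free helper takes the namespace type directly. */
    struct foo_namespace {
            struct ns_common ns;    /* ns.count is the lifetime counter */
    };

    static void free_foo_ns(struct foo_namespace *foo)
    {
            kfree(foo);
    }

    static inline void put_foo_ns(struct foo_namespace *foo)
    {
            if (refcount_dec_and_test(&foo->ns.count))
                    free_foo_ns(foo);
    }

The uts and time namespace conversions in this series follow this before/after shape most closely; cgroup already used refcount_t and only moves the counter into ns_common, while the pid and user conversions differ mainly in that their put paths walk the parent chain.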
Signed-off-by: Kirill Tkhai Reviewed-by: Kees Cook Acked-by: Christian Brauner Link: https://lore.kernel.org/r/159644980994.604812.383801057081594972.stgit@localhost.localdomain Signed-off-by: Christian Brauner --- include/linux/cgroup.h | 5 ++--- kernel/cgroup/cgroup.c | 2 +- kernel/cgroup/namespace.c | 2 +- 3 files changed, 4 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 618838c48313..451c2d26a5db 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -854,7 +854,6 @@ static inline void cgroup_sk_free(struct sock_cgroup_data *skcd) {} #endif /* CONFIG_CGROUP_DATA */ struct cgroup_namespace { - refcount_t count; struct ns_common ns; struct user_namespace *user_ns; struct ucounts *ucounts; @@ -889,12 +888,12 @@ copy_cgroup_ns(unsigned long flags, struct user_namespace *user_ns, static inline void get_cgroup_ns(struct cgroup_namespace *ns) { if (ns) - refcount_inc(&ns->count); + refcount_inc(&ns->ns.count); } static inline void put_cgroup_ns(struct cgroup_namespace *ns) { - if (ns && refcount_dec_and_test(&ns->count)) + if (ns && refcount_dec_and_test(&ns->ns.count)) free_cgroup_ns(ns); } diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index dd247747ec14..22e466926853 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -199,7 +199,7 @@ static u16 have_canfork_callback __read_mostly; /* cgroup namespace for init task */ struct cgroup_namespace init_cgroup_ns = { - .count = REFCOUNT_INIT(2), + .ns.count = REFCOUNT_INIT(2), .user_ns = &init_user_ns, .ns.ops = &cgroupns_operations, .ns.inum = PROC_CGROUP_INIT_INO, diff --git a/kernel/cgroup/namespace.c b/kernel/cgroup/namespace.c index 812a61afd538..f5e8828c109c 100644 --- a/kernel/cgroup/namespace.c +++ b/kernel/cgroup/namespace.c @@ -32,7 +32,7 @@ static struct cgroup_namespace *alloc_cgroup_ns(void) kfree(new_ns); return ERR_PTR(ret); } - refcount_set(&new_ns->count, 1); + refcount_set(&new_ns->ns.count, 1); new_ns->ns.ops = &cgroupns_operations; return new_ns; } -- cgit v1.2.3 From 28c41efd08bf97fc64f75304035ee3943995b68e Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Mon, 3 Aug 2020 13:17:00 +0300 Subject: time: Use generic ns_common::count Switch over time namespaces to use the newly introduced common lifetime counter. Currently every namespace type has its own lifetime counter which is stored in the specific namespace struct. The lifetime counters are used identically for all namespaces types. Namespaces may of course have additional unrelated counters and these are not altered. This introduces a common lifetime counter into struct ns_common. The ns_common struct encompasses information that all namespaces share. That should include the lifetime counter since its common for all of them. It also allows us to unify the type of the counters across all namespaces. Most of them use refcount_t but one uses atomic_t and at least one uses kref. Especially the last one doesn't make much sense since it's just a wrapper around refcount_t since 2016 and actually complicates cleanup operations by having to use container_of() to cast the correct namespace struct out of struct ns_common. Having the lifetime counter for the namespaces in one place reduces maintenance cost. Not just because after switching all namespaces over we will have removed more code than we added but also because the logic is more easily understandable and we indicate to the user that the basic lifetime requirements for all namespaces are currently identical. 
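One practical consequence of keeping the counter in ns_common is that generic helpers become possible. The following is a hypothetical sketch only, not something introduced by this series; ns_common_get() and ns_common_put() are made-up names.

    static inline void ns_common_get(struct ns_common *ns)
    {
            refcount_inc(&ns->count);
    }

    /* Returns true when the caller should free the enclosing namespace. */
    static inline bool ns_common_put(struct ns_common *ns)
    {
            return refcount_dec_and_test(&ns->count);
    }

With helpers like these, a per-type put such as put_cgroup_ns() could reduce to a NULL check plus "if (ns_common_put(&ns->ns)) free_cgroup_ns(ns);", leaving only the type-specific free routine to differ per namespace.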
Signed-off-by: Kirill Tkhai Reviewed-by: Kees Cook Acked-by: Christian Brauner Link: https://lore.kernel.org/r/159644982033.604812.9406853013011123238.stgit@localhost.localdomain Signed-off-by: Christian Brauner --- include/linux/time_namespace.h | 9 ++++----- kernel/time/namespace.c | 9 +++------ 2 files changed, 7 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/include/linux/time_namespace.h b/include/linux/time_namespace.h index 5b6031385db0..a51ffc089219 100644 --- a/include/linux/time_namespace.h +++ b/include/linux/time_namespace.h @@ -4,7 +4,6 @@ #include -#include #include #include #include @@ -18,7 +17,6 @@ struct timens_offsets { }; struct time_namespace { - struct kref kref; struct user_namespace *user_ns; struct ucounts *ucounts; struct ns_common ns; @@ -37,20 +35,21 @@ extern void timens_commit(struct task_struct *tsk, struct time_namespace *ns); static inline struct time_namespace *get_time_ns(struct time_namespace *ns) { - kref_get(&ns->kref); + refcount_inc(&ns->ns.count); return ns; } struct time_namespace *copy_time_ns(unsigned long flags, struct user_namespace *user_ns, struct time_namespace *old_ns); -void free_time_ns(struct kref *kref); +void free_time_ns(struct time_namespace *ns); int timens_on_fork(struct nsproxy *nsproxy, struct task_struct *tsk); struct vdso_data *arch_get_vdso_data(void *vvar_page); static inline void put_time_ns(struct time_namespace *ns) { - kref_put(&ns->kref, free_time_ns); + if (refcount_dec_and_test(&ns->ns.count)) + free_time_ns(ns); } void proc_timens_show_offsets(struct task_struct *p, struct seq_file *m); diff --git a/kernel/time/namespace.c b/kernel/time/namespace.c index afc65e6be33e..c4c829eb3511 100644 --- a/kernel/time/namespace.c +++ b/kernel/time/namespace.c @@ -92,7 +92,7 @@ static struct time_namespace *clone_time_ns(struct user_namespace *user_ns, if (!ns) goto fail_dec; - kref_init(&ns->kref); + refcount_set(&ns->ns.count, 1); ns->vvar_page = alloc_page(GFP_KERNEL | __GFP_ZERO); if (!ns->vvar_page) @@ -226,11 +226,8 @@ out: mutex_unlock(&offset_lock); } -void free_time_ns(struct kref *kref) +void free_time_ns(struct time_namespace *ns) { - struct time_namespace *ns; - - ns = container_of(kref, struct time_namespace, kref); dec_time_namespaces(ns->ucounts); put_user_ns(ns->user_ns); ns_free_inum(&ns->ns); @@ -464,7 +461,7 @@ const struct proc_ns_operations timens_for_children_operations = { }; struct time_namespace init_time_ns = { - .kref = KREF_INIT(3), + .ns.count = REFCOUNT_INIT(3), .user_ns = &init_user_ns, .ns.inum = PROC_TIME_INIT_INO, .ns.ops = &timens_operations, -- cgit v1.2.3 From 76df441ade97556816e8fc522ae4c3b50a255fb4 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Fri, 14 Aug 2020 04:39:32 -0400 Subject: signal: Convert to the new fallthrough notation Switch from using the /* fall through */ comment style notation to the new, preferred notation as outlined in our docs. 
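For context, a minimal sketch of the two notations; CMD_*, setup() and run() are placeholders. The fallthrough pseudo-keyword comes from the kernel's compiler attribute headers and expands to the compiler's fallthrough attribute where available, so the intent is expressed in a form the compiler can check rather than in a comment.

    switch (cmd) {
    case CMD_PREPARE:
            setup();
            /* fall through */      /* old style: conveys intent by convention */
    case CMD_RUN:
            run();
            break;
    }

    switch (cmd) {
    case CMD_PREPARE:
            setup();
            fallthrough;            /* new style: expands to a compiler attribute */
    case CMD_RUN:
            run();
            break;
    }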
Signed-off-by: Miaohe Lin Acked-by: Christian Brauner [christian.brauner@ubuntu.com: rewrite commit message] Link: https://lore.kernel.org/r/20200814083932.4975-1-linmiaohe@huawei.com Signed-off-by: Christian Brauner --- kernel/signal.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 42b67d2cea37..a38b3edc6851 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -851,7 +851,7 @@ static int check_kill_permission(int sig, struct kernel_siginfo *info, */ if (!sid || sid == task_session(current)) break; - /* fall through */ + fallthrough; default: return -EPERM; } -- cgit v1.2.3 From bda4c60d02e9ceeee726f73250b808fad0663dd6 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Fri, 14 Aug 2020 04:57:18 -0400 Subject: sys: Convert to the new fallthrough notation Switch from using the /* fall through */ comment style notation to the new, preferred notation as outlined in our docs. Signed-off-by: Miaohe Lin Acked-by: Christian Brauner [christian.brauner@ubuntu.com: rewrite commit message] Link: https://lore.kernel.org/r/20200814085718.40326-1-linmiaohe@huawei.com Signed-off-by: Christian Brauner --- kernel/sys.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index ca11af9d815d..ab6c409b1159 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1753,7 +1753,7 @@ void getrusage(struct task_struct *p, int who, struct rusage *r) if (who == RUSAGE_CHILDREN) break; - /* fall through */ + fallthrough; case RUSAGE_SELF: thread_group_cputime_adjusted(p, &tgutime, &tgstime); -- cgit v1.2.3 From d741bf41d7c7db4898bacfcb020353cddc032fd8 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 29 Aug 2020 22:03:24 +0900 Subject: kprobes: Remove kretprobe hash The kretprobe hash is mostly superfluous, replace it with a per-task variable. This gets rid of the task hash and it's related locking. Note that this may change the kprobes module-exported API for kretprobe handlers. If any out-of-tree kretprobe user uses ri->rp, use get_kretprobe(ri) instead. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Masami Hiramatsu Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/159870620431.1229682.16325792502413731312.stgit@devnote2 --- include/linux/kprobes.h | 19 +++- include/linux/sched.h | 4 + kernel/fork.c | 4 + kernel/kprobes.c | 236 +++++++++++++------------------------------- kernel/trace/trace_kprobe.c | 3 +- 5 files changed, 97 insertions(+), 169 deletions(-) (limited to 'kernel') diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h index 5c8c271fa1e9..00cf4421efd5 100644 --- a/include/linux/kprobes.h +++ b/include/linux/kprobes.h @@ -27,6 +27,7 @@ #include #include #include +#include #include #ifdef CONFIG_KPROBES @@ -144,6 +145,11 @@ static inline int kprobe_ftrace(struct kprobe *p) * ignored, due to maxactive being too low. 
* */ +struct kretprobe_holder { + struct kretprobe *rp; + refcount_t ref; +}; + struct kretprobe { struct kprobe kp; kretprobe_handler_t handler; @@ -152,17 +158,18 @@ struct kretprobe { int nmissed; size_t data_size; struct hlist_head free_instances; + struct kretprobe_holder *rph; raw_spinlock_t lock; }; struct kretprobe_instance { union { + struct llist_node llist; struct hlist_node hlist; struct rcu_head rcu; }; - struct kretprobe *rp; + struct kretprobe_holder *rph; kprobe_opcode_t *ret_addr; - struct task_struct *task; void *fp; char data[]; }; @@ -221,6 +228,14 @@ unsigned long kretprobe_trampoline_handler(struct pt_regs *regs, return ret; } +static nokprobe_inline struct kretprobe *get_kretprobe(struct kretprobe_instance *ri) +{ + RCU_LOCKDEP_WARN(!rcu_read_lock_any_held(), + "Kretprobe is accessed from instance under preemptive context"); + + return READ_ONCE(ri->rph->rp); +} + #else /* CONFIG_KRETPROBES */ static inline void arch_prepare_kretprobe(struct kretprobe *rp, struct pt_regs *regs) diff --git a/include/linux/sched.h b/include/linux/sched.h index afe01e232935..5911805cafde 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1315,6 +1315,10 @@ struct task_struct { struct callback_head mce_kill_me; #endif +#ifdef CONFIG_KRETPROBES + struct llist_head kretprobe_instances; +#endif + /* * New fields for task_struct should be added above here, so that * they are included in the randomized portion of task_struct. diff --git a/kernel/fork.c b/kernel/fork.c index 49677d668de4..53a1f508a097 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2161,6 +2161,10 @@ static __latent_entropy struct task_struct *copy_process( INIT_LIST_HEAD(&p->thread_group); p->task_works = NULL; +#ifdef CONFIG_KRETPROBES + p->kretprobe_instances.first = NULL; +#endif + /* * Ensure that the cgroup subsystem policies allow the new process to be * forked. 
It should be noted the the new process's css_set can be changed diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 3b61ae8ff5da..850ee36a4051 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -53,7 +53,6 @@ static int kprobes_initialized; * - RCU hlist traversal under disabling preempt (breakpoint handlers) */ static struct hlist_head kprobe_table[KPROBE_TABLE_SIZE]; -static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE]; /* NOTE: change this value only with kprobe_mutex held */ static bool kprobes_all_disarmed; @@ -61,9 +60,6 @@ static bool kprobes_all_disarmed; /* This protects kprobe_table and optimizing_list */ static DEFINE_MUTEX(kprobe_mutex); static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL; -static struct { - raw_spinlock_t lock ____cacheline_aligned_in_smp; -} kretprobe_table_locks[KPROBE_TABLE_SIZE]; kprobe_opcode_t * __weak kprobe_lookup_name(const char *name, unsigned int __unused) @@ -71,11 +67,6 @@ kprobe_opcode_t * __weak kprobe_lookup_name(const char *name, return ((kprobe_opcode_t *)(kallsyms_lookup_name(name))); } -static raw_spinlock_t *kretprobe_table_lock_ptr(unsigned long hash) -{ - return &(kretprobe_table_locks[hash].lock); -} - /* Blacklist -- list of struct kprobe_blacklist_entry */ static LIST_HEAD(kprobe_blacklist); @@ -1223,65 +1214,30 @@ void kprobes_inc_nmissed_count(struct kprobe *p) } NOKPROBE_SYMBOL(kprobes_inc_nmissed_count); +static void free_rp_inst_rcu(struct rcu_head *head) +{ + struct kretprobe_instance *ri = container_of(head, struct kretprobe_instance, rcu); + + if (refcount_dec_and_test(&ri->rph->ref)) + kfree(ri->rph); + kfree(ri); +} +NOKPROBE_SYMBOL(free_rp_inst_rcu); + static void recycle_rp_inst(struct kretprobe_instance *ri) { - struct kretprobe *rp = ri->rp; + struct kretprobe *rp = get_kretprobe(ri); - /* remove rp inst off the rprobe_inst_table */ - hlist_del(&ri->hlist); INIT_HLIST_NODE(&ri->hlist); if (likely(rp)) { raw_spin_lock(&rp->lock); hlist_add_head(&ri->hlist, &rp->free_instances); raw_spin_unlock(&rp->lock); } else - kfree_rcu(ri, rcu); + call_rcu(&ri->rcu, free_rp_inst_rcu); } NOKPROBE_SYMBOL(recycle_rp_inst); -static void kretprobe_hash_lock(struct task_struct *tsk, - struct hlist_head **head, unsigned long *flags) -__acquires(hlist_lock) -{ - unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS); - raw_spinlock_t *hlist_lock; - - *head = &kretprobe_inst_table[hash]; - hlist_lock = kretprobe_table_lock_ptr(hash); - raw_spin_lock_irqsave(hlist_lock, *flags); -} -NOKPROBE_SYMBOL(kretprobe_hash_lock); - -static void kretprobe_table_lock(unsigned long hash, - unsigned long *flags) -__acquires(hlist_lock) -{ - raw_spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash); - raw_spin_lock_irqsave(hlist_lock, *flags); -} -NOKPROBE_SYMBOL(kretprobe_table_lock); - -static void kretprobe_hash_unlock(struct task_struct *tsk, - unsigned long *flags) -__releases(hlist_lock) -{ - unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS); - raw_spinlock_t *hlist_lock; - - hlist_lock = kretprobe_table_lock_ptr(hash); - raw_spin_unlock_irqrestore(hlist_lock, *flags); -} -NOKPROBE_SYMBOL(kretprobe_hash_unlock); - -static void kretprobe_table_unlock(unsigned long hash, - unsigned long *flags) -__releases(hlist_lock) -{ - raw_spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash); - raw_spin_unlock_irqrestore(hlist_lock, *flags); -} -NOKPROBE_SYMBOL(kretprobe_table_unlock); - static struct kprobe kprobe_busy = { .addr = (void *) get_kprobe, }; @@ -1311,24 +1267,21 @@ void kprobe_busy_end(void) void 
kprobe_flush_task(struct task_struct *tk) { struct kretprobe_instance *ri; - struct hlist_head *head; - struct hlist_node *tmp; - unsigned long hash, flags = 0; + struct llist_node *node; + /* Early boot, not yet initialized. */ if (unlikely(!kprobes_initialized)) - /* Early boot. kretprobe_table_locks not yet initialized. */ return; kprobe_busy_begin(); - hash = hash_ptr(tk, KPROBE_HASH_BITS); - head = &kretprobe_inst_table[hash]; - kretprobe_table_lock(hash, &flags); - hlist_for_each_entry_safe(ri, tmp, head, hlist) { - if (ri->task == tk) - recycle_rp_inst(ri); + node = __llist_del_all(&tk->kretprobe_instances); + while (node) { + ri = container_of(node, struct kretprobe_instance, llist); + node = node->next; + + recycle_rp_inst(ri); } - kretprobe_table_unlock(hash, &flags); kprobe_busy_end(); } @@ -1338,36 +1291,19 @@ static inline void free_rp_inst(struct kretprobe *rp) { struct kretprobe_instance *ri; struct hlist_node *next; + int count = 0; hlist_for_each_entry_safe(ri, next, &rp->free_instances, hlist) { hlist_del(&ri->hlist); kfree(ri); + count++; } -} - -static void cleanup_rp_inst(struct kretprobe *rp) -{ - unsigned long flags, hash; - struct kretprobe_instance *ri; - struct hlist_node *next; - struct hlist_head *head; - /* To avoid recursive kretprobe by NMI, set kprobe busy here */ - kprobe_busy_begin(); - for (hash = 0; hash < KPROBE_TABLE_SIZE; hash++) { - kretprobe_table_lock(hash, &flags); - head = &kretprobe_inst_table[hash]; - hlist_for_each_entry_safe(ri, next, head, hlist) { - if (ri->rp == rp) - ri->rp = NULL; - } - kretprobe_table_unlock(hash, &flags); + if (refcount_sub_and_test(count, &rp->rph->ref)) { + kfree(rp->rph); + rp->rph = NULL; } - kprobe_busy_end(); - - free_rp_inst(rp); } -NOKPROBE_SYMBOL(cleanup_rp_inst); /* Add the new probe to ap->list */ static int add_new_kprobe(struct kprobe *ap, struct kprobe *p) @@ -1928,88 +1864,56 @@ unsigned long __kretprobe_trampoline_handler(struct pt_regs *regs, void *trampoline_address, void *frame_pointer) { - struct kretprobe_instance *ri = NULL, *last = NULL; - struct hlist_head *head; - struct hlist_node *tmp; - unsigned long flags; kprobe_opcode_t *correct_ret_addr = NULL; - bool skipped = false; + struct kretprobe_instance *ri = NULL; + struct llist_node *first, *node; + struct kretprobe *rp; - kretprobe_hash_lock(current, &head, &flags); + /* Find all nodes for this frame. */ + first = node = current->kretprobe_instances.first; + while (node) { + ri = container_of(node, struct kretprobe_instance, llist); - /* - * It is possible to have multiple instances associated with a given - * task either because multiple functions in the call path have - * return probes installed on them, and/or more than one - * return probe was registered for a target function. - * - * We can handle this because: - * - instances are always pushed into the head of the list - * - when multiple return probes are registered for the same - * function, the (chronologically) first instance's ret_addr - * will be the real return address, and all the rest will - * point to kretprobe_trampoline. - */ - hlist_for_each_entry(ri, head, hlist) { - if (ri->task != current) - /* another task is sharing our hash bucket */ - continue; - /* - * Return probes must be pushed on this hash list correct - * order (same as return order) so that it can be popped - * correctly. 
However, if we find it is pushed it incorrect - * order, this means we find a function which should not be - * probed, because the wrong order entry is pushed on the - * path of processing other kretprobe itself. - */ - if (ri->fp != frame_pointer) { - if (!skipped) - pr_warn("kretprobe is stacked incorrectly. Trying to fixup.\n"); - skipped = true; - continue; - } + BUG_ON(ri->fp != frame_pointer); - correct_ret_addr = ri->ret_addr; - if (skipped) - pr_warn("%ps must be blacklisted because of incorrect kretprobe order\n", - ri->rp->kp.addr); - - if (correct_ret_addr != trampoline_address) + if (ri->ret_addr != trampoline_address) { + correct_ret_addr = ri->ret_addr; /* * This is the real return address. Any other * instances associated with this task are for * other calls deeper on the call stack */ - break; + goto found; + } + + node = node->next; } + pr_err("Oops! Kretprobe fails to find correct return address.\n"); + BUG_ON(1); - BUG_ON(!correct_ret_addr || (correct_ret_addr == trampoline_address)); - last = ri; +found: + /* Unlink all nodes for this frame. */ + current->kretprobe_instances.first = node->next; + node->next = NULL; - hlist_for_each_entry_safe(ri, tmp, head, hlist) { - if (ri->task != current) - /* another task is sharing our hash bucket */ - continue; - if (ri->fp != frame_pointer) - continue; + /* Run them.. */ + while (first) { + ri = container_of(first, struct kretprobe_instance, llist); + first = first->next; - if (ri->rp && ri->rp->handler) { + rp = get_kretprobe(ri); + if (rp && rp->handler) { struct kprobe *prev = kprobe_running(); - __this_cpu_write(current_kprobe, &ri->rp->kp); + __this_cpu_write(current_kprobe, &rp->kp); ri->ret_addr = correct_ret_addr; - ri->rp->handler(ri, regs); + rp->handler(ri, regs); __this_cpu_write(current_kprobe, prev); } recycle_rp_inst(ri); - - if (ri == last) - break; } - kretprobe_hash_unlock(current, &flags); - return (unsigned long)correct_ret_addr; } NOKPROBE_SYMBOL(__kretprobe_trampoline_handler) @@ -2021,11 +1925,10 @@ NOKPROBE_SYMBOL(__kretprobe_trampoline_handler) static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs) { struct kretprobe *rp = container_of(p, struct kretprobe, kp); - unsigned long hash, flags = 0; + unsigned long flags = 0; struct kretprobe_instance *ri; /* TODO: consider to only swap the RA after the last pre_handler fired */ - hash = hash_ptr(current, KPROBE_HASH_BITS); raw_spin_lock_irqsave(&rp->lock, flags); if (!hlist_empty(&rp->free_instances)) { ri = hlist_entry(rp->free_instances.first, @@ -2033,9 +1936,6 @@ static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs) hlist_del(&ri->hlist); raw_spin_unlock_irqrestore(&rp->lock, flags); - ri->rp = rp; - ri->task = current; - if (rp->entry_handler && rp->entry_handler(ri, regs)) { raw_spin_lock_irqsave(&rp->lock, flags); hlist_add_head(&ri->hlist, &rp->free_instances); @@ -2045,11 +1945,8 @@ static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs) arch_prepare_kretprobe(ri, regs); - /* XXX(hch): why is there no hlist_move_head? 
*/ - INIT_HLIST_NODE(&ri->hlist); - kretprobe_table_lock(hash, &flags); - hlist_add_head(&ri->hlist, &kretprobe_inst_table[hash]); - kretprobe_table_unlock(hash, &flags); + __llist_add(&ri->llist, ¤t->kretprobe_instances); + } else { rp->nmissed++; raw_spin_unlock_irqrestore(&rp->lock, flags); @@ -2112,16 +2009,24 @@ int register_kretprobe(struct kretprobe *rp) } raw_spin_lock_init(&rp->lock); INIT_HLIST_HEAD(&rp->free_instances); + rp->rph = kzalloc(sizeof(struct kretprobe_holder), GFP_KERNEL); + if (!rp->rph) + return -ENOMEM; + + rp->rph->rp = rp; for (i = 0; i < rp->maxactive; i++) { - inst = kmalloc(sizeof(struct kretprobe_instance) + + inst = kzalloc(sizeof(struct kretprobe_instance) + rp->data_size, GFP_KERNEL); if (inst == NULL) { + refcount_set(&rp->rph->ref, i); free_rp_inst(rp); return -ENOMEM; } + inst->rph = rp->rph; INIT_HLIST_NODE(&inst->hlist); hlist_add_head(&inst->hlist, &rp->free_instances); } + refcount_set(&rp->rph->ref, i); rp->nmissed = 0; /* Establish function entry probe point */ @@ -2163,16 +2068,18 @@ void unregister_kretprobes(struct kretprobe **rps, int num) if (num <= 0) return; mutex_lock(&kprobe_mutex); - for (i = 0; i < num; i++) + for (i = 0; i < num; i++) { if (__unregister_kprobe_top(&rps[i]->kp) < 0) rps[i]->kp.addr = NULL; + rps[i]->rph->rp = NULL; + } mutex_unlock(&kprobe_mutex); synchronize_rcu(); for (i = 0; i < num; i++) { if (rps[i]->kp.addr) { __unregister_kprobe_bottom(&rps[i]->kp); - cleanup_rp_inst(rps[i]); + free_rp_inst(rps[i]); } } } @@ -2535,11 +2442,8 @@ static int __init init_kprobes(void) /* FIXME allocate the probe table, currently defined statically */ /* initialize all list heads */ - for (i = 0; i < KPROBE_TABLE_SIZE; i++) { + for (i = 0; i < KPROBE_TABLE_SIZE; i++) INIT_HLIST_HEAD(&kprobe_table[i]); - INIT_HLIST_HEAD(&kretprobe_inst_table[i]); - raw_spin_lock_init(&(kretprobe_table_locks[i].lock)); - } err = populate_kprobe_blacklist(__start_kprobe_blacklist, __stop_kprobe_blacklist); diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index aefb6065b508..07baf6f6cecc 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1714,7 +1714,8 @@ NOKPROBE_SYMBOL(kprobe_dispatcher); static int kretprobe_dispatcher(struct kretprobe_instance *ri, struct pt_regs *regs) { - struct trace_kprobe *tk = container_of(ri->rp, struct trace_kprobe, rp); + struct kretprobe *rp = get_kretprobe(ri); + struct trace_kprobe *tk = container_of(rp, struct trace_kprobe, rp); raw_cpu_inc(*tk->nhit); -- cgit v1.2.3 From 6e426e0fcd20ce144bb93e00b70df51e9f2e08c3 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 29 Aug 2020 22:03:56 +0900 Subject: kprobes: Replace rp->free_instance with freelist Gets rid of rp->lock, and as a result kretprobes are now fully lockless. 
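The kretprobe hash removal earlier in this series notes that out-of-tree handlers which dereference ri->rp must switch to get_kretprobe(ri). A hedged sketch of that migration in a module's return handler; my_rp, my_ret_handler and the probed symbol are hypothetical, and only the accessor change is the point.

    static int my_ret_handler(struct kretprobe_instance *ri, struct pt_regs *regs)
    {
            /* Old code did: struct kretprobe *rp = ri->rp; */
            struct kretprobe *rp = get_kretprobe(ri);

            /* get_kretprobe() can return NULL if the kretprobe is in the
             * middle of being unregistered. */
            if (rp)
                    pr_info("return probe hit on %ps\n", rp->kp.addr);

            return 0;
    }

    static struct kretprobe my_rp = {
            .handler        = my_ret_handler,
            .kp.symbol_name = "some_probed_function",       /* hypothetical target */
    };

register_kretprobe(&my_rp) and unregister_kretprobe(&my_rp) are unchanged; only the way a handler gets back to its struct kretprobe moves behind the accessor.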
Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Masami Hiramatsu Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/159870623583.1229682.17472357584134058687.stgit@devnote2 --- include/linux/kprobes.h | 8 +++---- kernel/kprobes.c | 56 +++++++++++++++++++++---------------------------- 2 files changed, 28 insertions(+), 36 deletions(-) (limited to 'kernel') diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h index 00cf4421efd5..b7824e3f1ef5 100644 --- a/include/linux/kprobes.h +++ b/include/linux/kprobes.h @@ -28,6 +28,7 @@ #include #include #include +#include #include #ifdef CONFIG_KPROBES @@ -157,17 +158,16 @@ struct kretprobe { int maxactive; int nmissed; size_t data_size; - struct hlist_head free_instances; + struct freelist_head freelist; struct kretprobe_holder *rph; - raw_spinlock_t lock; }; struct kretprobe_instance { union { - struct llist_node llist; - struct hlist_node hlist; + struct freelist_node freelist; struct rcu_head rcu; }; + struct llist_node llist; struct kretprobe_holder *rph; kprobe_opcode_t *ret_addr; void *fp; diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 850ee36a4051..30b8fe7d571d 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -1228,11 +1228,8 @@ static void recycle_rp_inst(struct kretprobe_instance *ri) { struct kretprobe *rp = get_kretprobe(ri); - INIT_HLIST_NODE(&ri->hlist); if (likely(rp)) { - raw_spin_lock(&rp->lock); - hlist_add_head(&ri->hlist, &rp->free_instances); - raw_spin_unlock(&rp->lock); + freelist_add(&ri->freelist, &rp->freelist); } else call_rcu(&ri->rcu, free_rp_inst_rcu); } @@ -1290,11 +1287,14 @@ NOKPROBE_SYMBOL(kprobe_flush_task); static inline void free_rp_inst(struct kretprobe *rp) { struct kretprobe_instance *ri; - struct hlist_node *next; + struct freelist_node *node; int count = 0; - hlist_for_each_entry_safe(ri, next, &rp->free_instances, hlist) { - hlist_del(&ri->hlist); + node = rp->freelist.head; + while (node) { + ri = container_of(node, struct kretprobe_instance, freelist); + node = node->next; + kfree(ri); count++; } @@ -1925,32 +1925,26 @@ NOKPROBE_SYMBOL(__kretprobe_trampoline_handler) static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs) { struct kretprobe *rp = container_of(p, struct kretprobe, kp); - unsigned long flags = 0; struct kretprobe_instance *ri; + struct freelist_node *fn; - /* TODO: consider to only swap the RA after the last pre_handler fired */ - raw_spin_lock_irqsave(&rp->lock, flags); - if (!hlist_empty(&rp->free_instances)) { - ri = hlist_entry(rp->free_instances.first, - struct kretprobe_instance, hlist); - hlist_del(&ri->hlist); - raw_spin_unlock_irqrestore(&rp->lock, flags); - - if (rp->entry_handler && rp->entry_handler(ri, regs)) { - raw_spin_lock_irqsave(&rp->lock, flags); - hlist_add_head(&ri->hlist, &rp->free_instances); - raw_spin_unlock_irqrestore(&rp->lock, flags); - return 0; - } - - arch_prepare_kretprobe(ri, regs); + fn = freelist_try_get(&rp->freelist); + if (!fn) { + rp->nmissed++; + return 0; + } - __llist_add(&ri->llist, ¤t->kretprobe_instances); + ri = container_of(fn, struct kretprobe_instance, freelist); - } else { - rp->nmissed++; - raw_spin_unlock_irqrestore(&rp->lock, flags); + if (rp->entry_handler && rp->entry_handler(ri, regs)) { + freelist_add(&ri->freelist, &rp->freelist); + return 0; } + + arch_prepare_kretprobe(ri, regs); + + __llist_add(&ri->llist, ¤t->kretprobe_instances); + return 0; } NOKPROBE_SYMBOL(pre_handler_kretprobe); @@ -2007,8 +2001,7 @@ int register_kretprobe(struct kretprobe *rp) rp->maxactive = 
num_possible_cpus(); #endif } - raw_spin_lock_init(&rp->lock); - INIT_HLIST_HEAD(&rp->free_instances); + rp->freelist.head = NULL; rp->rph = kzalloc(sizeof(struct kretprobe_holder), GFP_KERNEL); if (!rp->rph) return -ENOMEM; @@ -2023,8 +2016,7 @@ int register_kretprobe(struct kretprobe *rp) return -ENOMEM; } inst->rph = rp->rph; - INIT_HLIST_NODE(&inst->hlist); - hlist_add_head(&inst->hlist, &rp->free_instances); + freelist_add(&inst->freelist, &rp->freelist); } refcount_set(&rp->rph->ref, i); -- cgit v1.2.3 From 151a535171be6ff824a0a3875553ea38570f4c05 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Thu, 15 Oct 2020 21:41:44 +0100 Subject: genirq: Let GENERIC_IRQ_IPI select IRQ_DOMAIN_HIERARCHY kernel/irq/ipi.c otherwise fails to compile if nothing else selects it. Fixes: 379b656446a3 ("genirq: Add GENERIC_IRQ_IPI Kconfig symbol") Reported-by: Pavel Machek Tested-by: Pavel Machek Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20201015101222.GA32747@amd --- kernel/irq/Kconfig | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index 10a5aff4eecc..164a031cfdb6 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig @@ -82,6 +82,7 @@ config IRQ_FASTEOI_HIERARCHY_HANDLERS # Generic IRQ IPI support config GENERIC_IRQ_IPI bool + select IRQ_DOMAIN_HIERARCHY # Generic MSI interrupt support config GENERIC_MSI_IRQ -- cgit v1.2.3 From f8e48a3dca060e80f672d398d181db1298fbc86c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 22 Oct 2020 12:23:02 +0200 Subject: lockdep: Fix preemption WARN for spurious IRQ-enable It is valid (albeit uncommon) to call local_irq_enable() without first having called local_irq_disable(). In this case we enter lockdep_hardirqs_on*() with IRQs enabled and trip a preemption warning for using __this_cpu_read(). Use this_cpu_read() instead to avoid the warning. Fixes: 4d004099a6 ("lockdep: Fix lockdep recursion") Reported-by: syzbot+53f8ce8bbc07924b6417@syzkaller.appspotmail.com Reported-by: kernel test robot Signed-off-by: Peter Zijlstra (Intel) --- kernel/locking/lockdep.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 3e99dfef8408..fc206aefa970 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -4057,7 +4057,7 @@ void lockdep_hardirqs_on_prepare(unsigned long ip) if (unlikely(in_nmi())) return; - if (unlikely(__this_cpu_read(lockdep_recursion))) + if (unlikely(this_cpu_read(lockdep_recursion))) return; if (unlikely(lockdep_hardirqs_enabled())) { @@ -4126,7 +4126,7 @@ void noinstr lockdep_hardirqs_on(unsigned long ip) goto skip_checks; } - if (unlikely(__this_cpu_read(lockdep_recursion))) + if (unlikely(this_cpu_read(lockdep_recursion))) return; if (lockdep_hardirqs_enabled()) { -- cgit v1.2.3 From 1a2b85f1e2a93a3f84243e654d225e4088735336 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Wed, 21 Oct 2020 12:07:49 -0700 Subject: timekeeping: Convert jiffies_seq to seqcount_raw_spinlock_t Use the new api and associate the seqcounter to the jiffies_lock enabling lockdep support - although for this particular case the write-side locking and non-preemptibility are quite obvious. 
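As background on what the typed seqcount buys, here is a schematic reader/writer pair over jiffies_64. The function names are made up (the real code paths are get_jiffies_64() and the tick code) and interrupt handling is omitted; the point is that with seqcount_raw_spinlock_t, lockdep can verify that the writer really holds jiffies_lock.

    /* Reader: lockless, retries if it raced with an update. */
    static u64 read_jiffies64(void)
    {
            unsigned int seq;
            u64 val;

            do {
                    seq = read_seqcount_begin(&jiffies_seq);
                    val = jiffies_64;
            } while (read_seqcount_retry(&jiffies_seq, seq));

            return val;
    }

    /* Writer: the seqcount is associated with jiffies_lock, so writing
     * without holding the lock now trips a lockdep warning. */
    static void advance_jiffies64(u64 ticks)
    {
            raw_spin_lock(&jiffies_lock);
            write_seqcount_begin(&jiffies_seq);
            jiffies_64 += ticks;
            write_seqcount_end(&jiffies_seq);
            raw_spin_unlock(&jiffies_lock);
    }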
Signed-off-by: Davidlohr Bueso Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20201021190749.19363-1-dave@stgolabs.net --- kernel/time/jiffies.c | 3 ++- kernel/time/timekeeping.h | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c index eddcf4970444..a5cffe2a1770 100644 --- a/kernel/time/jiffies.c +++ b/kernel/time/jiffies.c @@ -59,7 +59,8 @@ static struct clocksource clocksource_jiffies = { }; __cacheline_aligned_in_smp DEFINE_RAW_SPINLOCK(jiffies_lock); -__cacheline_aligned_in_smp seqcount_t jiffies_seq; +__cacheline_aligned_in_smp seqcount_raw_spinlock_t jiffies_seq = + SEQCNT_RAW_SPINLOCK_ZERO(jiffies_seq, &jiffies_lock); #if (BITS_PER_LONG < 64) u64 get_jiffies_64(void) diff --git a/kernel/time/timekeeping.h b/kernel/time/timekeeping.h index 099737f6f10c..6c2cbd9ef999 100644 --- a/kernel/time/timekeeping.h +++ b/kernel/time/timekeeping.h @@ -26,7 +26,7 @@ extern void do_timer(unsigned long ticks); extern void update_wall_time(void); extern raw_spinlock_t jiffies_lock; -extern seqcount_t jiffies_seq; +extern seqcount_raw_spinlock_t jiffies_seq; #define CS_NAME_LEN 32 -- cgit v1.2.3 From 4cd2bb12981165f865d2b8ed92b446b52310ef74 Mon Sep 17 00:00:00 2001 From: Quanyang Wang Date: Tue, 29 Sep 2020 16:20:27 +0800 Subject: time/sched_clock: Mark sched_clock_read_begin/retry() as notrace Since sched_clock_read_begin() and sched_clock_read_retry() are called by notrace function sched_clock(), they shouldn't be traceable either, or else ftrace_graph_caller will run into a dead loop on the path as below (arm for instance): ftrace_graph_caller() prepare_ftrace_return() function_graph_enter() ftrace_push_return_trace() trace_clock_local() sched_clock() sched_clock_read_begin/retry() Fixes: 1b86abc1c645 ("sched_clock: Expose struct clock_read_data") Signed-off-by: Quanyang Wang Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20200929082027.16787-1-quanyang.wang@windriver.com --- kernel/time/sched_clock.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c index 0642013dace4..b1b9b12899f5 100644 --- a/kernel/time/sched_clock.c +++ b/kernel/time/sched_clock.c @@ -68,13 +68,13 @@ static inline u64 notrace cyc_to_ns(u64 cyc, u32 mult, u32 shift) return (cyc * mult) >> shift; } -struct clock_read_data *sched_clock_read_begin(unsigned int *seq) +notrace struct clock_read_data *sched_clock_read_begin(unsigned int *seq) { *seq = raw_read_seqcount_latch(&cd.seq); return cd.read_data + (*seq & 1); } -int sched_clock_read_retry(unsigned int seq) +notrace int sched_clock_read_retry(unsigned int seq) { return read_seqcount_latch_retry(&cd.seq, seq); } -- cgit v1.2.3 From 5254cb87c0423f73c8036235795788a132e8956e Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Wed, 9 Sep 2020 21:48:50 +0800 Subject: hrtimer: Remove unused inline function debug_hrtimer_free() There is no caller in tree, remove it. 
Signed-off-by: YueHaibing Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20200909134850.21940-1-yuehaibing@huawei.com --- kernel/time/hrtimer.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'kernel') diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 3624b9b5835d..387b4bef7dd1 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -425,11 +425,6 @@ static inline void debug_hrtimer_deactivate(struct hrtimer *timer) debug_object_deactivate(timer, &hrtimer_debug_descr); } -static inline void debug_hrtimer_free(struct hrtimer *timer) -{ - debug_object_free(timer, &hrtimer_debug_descr); -} - static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id, enum hrtimer_mode mode); -- cgit v1.2.3 From 9010e3876e1c3f7b1c3769bee519d6a871589aca Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Wed, 9 Sep 2020 21:47:49 +0800 Subject: timers: Remove unused inline funtion debug_timer_free() There is no caller in tree, remove it. Signed-off-by: YueHaibing Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20200909134749.32300-1-yuehaibing@huawei.com --- kernel/time/timer.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timer.c b/kernel/time/timer.c index de37e33a868d..c3ad64fb9d8b 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -732,11 +732,6 @@ static inline void debug_timer_deactivate(struct timer_list *timer) debug_object_deactivate(timer, &timer_debug_descr); } -static inline void debug_timer_free(struct timer_list *timer) -{ - debug_object_free(timer, &timer_debug_descr); -} - static inline void debug_timer_assert_init(struct timer_list *timer) { debug_object_assert_init(timer, &timer_debug_descr); -- cgit v1.2.3 From cb47755725da7b90fecbb2aa82ac3b24a7adb89b Mon Sep 17 00:00:00 2001 From: Zeng Tao Date: Tue, 1 Sep 2020 17:30:13 +0800 Subject: time: Prevent undefined behaviour in timespec64_to_ns() UBSAN reports: Undefined behaviour in ./include/linux/time64.h:127:27 signed integer overflow: 17179869187 * 1000000000 cannot be represented in type 'long long int' Call Trace: timespec64_to_ns include/linux/time64.h:127 [inline] set_cpu_itimer+0x65c/0x880 kernel/time/itimer.c:180 do_setitimer+0x8e/0x740 kernel/time/itimer.c:245 __x64_sys_setitimer+0x14c/0x2c0 kernel/time/itimer.c:336 do_syscall_64+0xa1/0x540 arch/x86/entry/common.c:295 Commit bd40a175769d ("y2038: itimer: change implementation to timespec64") replaced the original conversion which handled time clamping correctly with timespec64_to_ns() which has no overflow protection. Fix it in timespec64_to_ns() as this is not necessarily limited to the usage in itimers. 
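To make the overflow concrete, a quick worked example with the value from the UBSAN report (KTIME_MAX is 2^63 - 1 nanoseconds, so KTIME_SEC_MAX = KTIME_MAX / NSEC_PER_SEC is roughly 9.2e9 seconds):

    tv_sec from the report:  17179869187   (~1.7e10 s)
    KTIME_SEC_MAX:            9223372036   (~9.2e9 s)
    tv_sec * NSEC_PER_SEC:   ~1.72e19 ns  >  KTIME_MAX (~9.22e18 ns)

so the multiplication cannot be represented in an s64. With the added check, any tv_sec at or above KTIME_SEC_MAX saturates to KTIME_MAX instead of being multiplied, restoring the clamping behaviour the itimer path had before it switched to timespec64_to_ns().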
[ tglx: Added comment and adjusted the fixes tag ] Fixes: 361a3bf00582 ("time64: Add time64.h header and define struct timespec64") Signed-off-by: Zeng Tao Signed-off-by: Thomas Gleixner Reviewed-by: Arnd Bergmann Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/1598952616-6416-1-git-send-email-prime.zeng@hisilicon.com --- include/linux/time64.h | 4 ++++ kernel/time/itimer.c | 4 ---- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/include/linux/time64.h b/include/linux/time64.h index c9dcb3e5781f..5117cb5b5656 100644 --- a/include/linux/time64.h +++ b/include/linux/time64.h @@ -124,6 +124,10 @@ static inline bool timespec64_valid_settod(const struct timespec64 *ts) */ static inline s64 timespec64_to_ns(const struct timespec64 *ts) { + /* Prevent multiplication overflow */ + if ((unsigned long long)ts->tv_sec >= KTIME_SEC_MAX) + return KTIME_MAX; + return ((s64) ts->tv_sec * NSEC_PER_SEC) + ts->tv_nsec; } diff --git a/kernel/time/itimer.c b/kernel/time/itimer.c index ca4e6d57d68b..00629e658ca1 100644 --- a/kernel/time/itimer.c +++ b/kernel/time/itimer.c @@ -172,10 +172,6 @@ static void set_cpu_itimer(struct task_struct *tsk, unsigned int clock_id, u64 oval, nval, ointerval, ninterval; struct cpu_itimer *it = &tsk->signal->it[clock_id]; - /* - * Use the to_ktime conversion because that clamps the maximum - * value to KTIME_MAX and avoid multiplication overflows. - */ nval = timespec64_to_ns(&value->it_value); ninterval = timespec64_to_ns(&value->it_interval); -- cgit v1.2.3 From 4230e2deaa484b385aa01d598b2aea8e7f2660a6 Mon Sep 17 00:00:00 2001 From: Zong Li Date: Wed, 21 Oct 2020 15:38:39 +0800 Subject: stop_machine, rcu: Mark functions as notrace Some architectures assume that the stopped CPUs don't make function calls to traceable functions when they are in the stopped state. See also commit cb9d7fd51d9f ("watchdog: Mark watchdog touch functions as notrace"). Violating this assumption causes kernel crashes when switching tracer on RISC-V. Mark rcu_momentary_dyntick_idle() and stop_machine_yield() notrace to prevent this. Fixes: 4ecf0a43e729 ("processor: get rid of cpu_relax_yield") Fixes: 366237e7b083 ("stop_machine: Provide RCU quiescent state in multi_cpu_stop()") Signed-off-by: Zong Li Signed-off-by: Thomas Gleixner Tested-by: Atish Patra Tested-by: Colin Ian King Acked-by: Steven Rostedt (VMware) Acked-by: Paul E. McKenney Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20201021073839.43935-1-zong.li@sifive.com --- kernel/rcu/tree.c | 2 +- kernel/stop_machine.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 06895ef85d69..2a52f42f64b6 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -409,7 +409,7 @@ bool rcu_eqs_special_set(int cpu) * * The caller must have disabled interrupts and must not be idle. 
*/ -void rcu_momentary_dyntick_idle(void) +notrace void rcu_momentary_dyntick_idle(void) { int special; diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 865bb0228ab6..890b79cf0e7c 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -178,7 +178,7 @@ static void ack_state(struct multi_stop_data *msdata) set_state(msdata, msdata->state + 1); } -void __weak stop_machine_yield(const struct cpumask *cpumask) +notrace void __weak stop_machine_yield(const struct cpumask *cpumask) { cpu_relax(); } -- cgit v1.2.3 From 4d4ce8053bfac9a72b9094c6879119938efaa05d Mon Sep 17 00:00:00 2001 From: Jackie Zamow Date: Tue, 27 Oct 2020 07:43:19 -0500 Subject: PM: sleep: fix typo in kernel/power/process.c Fix a typo in a comment in freeze_processes(). Signed-off-by: Jackie Zamow [ rjw: Subject and changelog edits ] Signed-off-by: Rafael J. Wysocki --- kernel/power/process.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/process.c b/kernel/power/process.c index 4b6a54da7e65..45b054b7b5ec 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -146,7 +146,7 @@ int freeze_processes(void) BUG_ON(in_atomic()); /* - * Now that the whole userspace is frozen we need to disbale + * Now that the whole userspace is frozen we need to disable * the OOM killer to disallow any further interference with * killable tasks. There is no guarantee oom victims will * ever reach a point they go away we have to wait with a timeout. -- cgit v1.2.3 From cbb5262192d9a367d89d24e54388f54069ffd2b8 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Fri, 23 Oct 2020 18:33:38 +0200 Subject: audit: fix a kernel-doc markup typo: kauditd_print_skb -> kauditd_printk_skb Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Paul Moore --- kernel/audit.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 68cee3bc8cfe..0be42cac086b 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -523,7 +523,7 @@ static int auditd_set(struct pid *pid, u32 portid, struct net *net) } /** - * kauditd_print_skb - Print the audit record to the ring buffer + * kauditd_printk_skb - Print the audit record to the ring buffer * @skb: audit record * * Whatever the reason, this packet may not make it to the auditd connection -- cgit v1.2.3 From 6d915476e67d99b73a57bceb83cff1cf153d8bf6 Mon Sep 17 00:00:00 2001 From: Richard Guy Briggs Date: Tue, 22 Sep 2020 08:44:50 -0400 Subject: audit: trigger accompanying records when no rules present When there are no audit rules registered, mandatory records (config, etc.) are missing their accompanying records (syscall, proctitle, etc.). This is due to audit context dummy set on syscall entry based on absence of rules that signals that no other records are to be printed. Clear the dummy bit if any record is generated, open coding this in audit_log_start(). The proctitle context and dummy checks are pointless since the proctitle record will not be printed if no syscall records are printed. The fds array is reset to -1 after the first syscall to indicate it isn't valid any more, but was never set to -1 when the context was allocated to indicate it wasn't yet valid. Check ctx->pwd in audit_log_name(). 
The audit_inode* functions can be called without going through getname_flags() or getname_kernel() that sets audit_names and cwd, so set the cwd in audit_alloc_name() if it has not already been done so due to audit_names being valid and purge all other audit_getcwd() calls. Revert the LSM dump_common_audit_data() LSM_AUDIT_DATA_* cases from the ghak96 patch since they are no longer necessary due to cwd coverage in audit_alloc_name(). Thanks to bauen1 for reporting LSM situations in which context->cwd is not valid, inadvertantly fixed by the ghak96 patch. Please see upstream github issue https://github.com/linux-audit/audit-kernel/issues/120 This is also related to upstream github issue https://github.com/linux-audit/audit-kernel/issues/96 Signed-off-by: Richard Guy Briggs Signed-off-by: Paul Moore --- include/linux/audit.h | 8 -------- kernel/audit.c | 3 +++ kernel/auditsc.c | 27 +++++++-------------------- security/lsm_audit.c | 5 ----- 4 files changed, 10 insertions(+), 33 deletions(-) (limited to 'kernel') diff --git a/include/linux/audit.h b/include/linux/audit.h index b3d859831a31..82b7c1116a85 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -292,7 +292,6 @@ extern void __audit_syscall_entry(int major, unsigned long a0, unsigned long a1, extern void __audit_syscall_exit(int ret_success, long ret_value); extern struct filename *__audit_reusename(const __user char *uptr); extern void __audit_getname(struct filename *name); -extern void __audit_getcwd(void); extern void __audit_inode(struct filename *name, const struct dentry *dentry, unsigned int flags); extern void __audit_file(const struct file *); @@ -351,11 +350,6 @@ static inline void audit_getname(struct filename *name) if (unlikely(!audit_dummy_context())) __audit_getname(name); } -static inline void audit_getcwd(void) -{ - if (unlikely(audit_context())) - __audit_getcwd(); -} static inline void audit_inode(struct filename *name, const struct dentry *dentry, unsigned int aflags) { @@ -584,8 +578,6 @@ static inline struct filename *audit_reusename(const __user char *name) } static inline void audit_getname(struct filename *name) { } -static inline void audit_getcwd(void) -{ } static inline void audit_inode(struct filename *name, const struct dentry *dentry, unsigned int aflags) diff --git a/kernel/audit.c b/kernel/audit.c index 0be42cac086b..ac0aeaa99937 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -1865,6 +1865,9 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask, } audit_get_stamp(ab->ctx, &t, &serial); + /* cancel dummy context to enable supporting records */ + if (ctx) + ctx->dummy = 0; audit_log_format(ab, "audit(%llu.%03lu:%u): ", (unsigned long long)t.tv_sec, t.tv_nsec/1000000, serial); diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 8dba8f0983b5..183d79cc2e12 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -929,6 +929,7 @@ static inline struct audit_context *audit_alloc_context(enum audit_state state) context->prio = state == AUDIT_RECORD_CONTEXT ? 
~0ULL : 0; INIT_LIST_HEAD(&context->killed_trees); INIT_LIST_HEAD(&context->names_list); + context->fds[0] = -1; return context; } @@ -1367,7 +1368,10 @@ static void audit_log_name(struct audit_context *context, struct audit_names *n, /* name was specified as a relative path and the * directory component is the cwd */ - audit_log_d_path(ab, " name=", &context->pwd); + if (context->pwd.dentry && context->pwd.mnt) + audit_log_d_path(ab, " name=", &context->pwd); + else + audit_log_format(ab, " name=(null)"); break; default: /* log the name's directory component */ @@ -1435,9 +1439,6 @@ static void audit_log_proctitle(void) struct audit_context *context = audit_context(); struct audit_buffer *ab; - if (!context || context->dummy) - return; - ab = audit_log_start(context, GFP_KERNEL, AUDIT_PROCTITLE); if (!ab) return; /* audit_panic or being filtered */ @@ -1866,6 +1867,8 @@ static struct audit_names *audit_alloc_name(struct audit_context *context, list_add_tail(&aname->list, &context->names_list); context->name_count++; + if (!context->pwd.dentry) + get_fs_pwd(current->fs, &context->pwd); return aname; } @@ -1894,20 +1897,6 @@ __audit_reusename(const __user char *uptr) return NULL; } -inline void _audit_getcwd(struct audit_context *context) -{ - if (!context->pwd.dentry) - get_fs_pwd(current->fs, &context->pwd); -} - -void __audit_getcwd(void) -{ - struct audit_context *context = audit_context(); - - if (context->in_syscall) - _audit_getcwd(context); -} - /** * __audit_getname - add a name to the list * @name: name to add @@ -1931,8 +1920,6 @@ void __audit_getname(struct filename *name) n->name_len = AUDIT_NAME_FULL; name->aname = n; name->refcnt++; - - _audit_getcwd(context); } static inline int audit_copy_fcaps(struct audit_names *name, diff --git a/security/lsm_audit.c b/security/lsm_audit.c index 53d0d183db8f..221370794d14 100644 --- a/security/lsm_audit.c +++ b/security/lsm_audit.c @@ -241,7 +241,6 @@ static void dump_common_audit_data(struct audit_buffer *ab, audit_log_untrustedstring(ab, inode->i_sb->s_id); audit_log_format(ab, " ino=%lu", inode->i_ino); } - audit_getcwd(); break; } case LSM_AUDIT_DATA_FILE: { @@ -255,7 +254,6 @@ static void dump_common_audit_data(struct audit_buffer *ab, audit_log_untrustedstring(ab, inode->i_sb->s_id); audit_log_format(ab, " ino=%lu", inode->i_ino); } - audit_getcwd(); break; } case LSM_AUDIT_DATA_IOCTL_OP: { @@ -271,7 +269,6 @@ static void dump_common_audit_data(struct audit_buffer *ab, } audit_log_format(ab, " ioctlcmd=0x%hx", a->u.op->cmd); - audit_getcwd(); break; } case LSM_AUDIT_DATA_DENTRY: { @@ -286,7 +283,6 @@ static void dump_common_audit_data(struct audit_buffer *ab, audit_log_untrustedstring(ab, inode->i_sb->s_id); audit_log_format(ab, " ino=%lu", inode->i_ino); } - audit_getcwd(); break; } case LSM_AUDIT_DATA_INODE: { @@ -304,7 +300,6 @@ static void dump_common_audit_data(struct audit_buffer *ab, audit_log_format(ab, " dev="); audit_log_untrustedstring(ab, inode->i_sb->s_id); audit_log_format(ab, " ino=%lu", inode->i_ino); - audit_getcwd(); break; } case LSM_AUDIT_DATA_TASK: { -- cgit v1.2.3 From 921c7ebd1337d1a46783d7e15a850e12aed2eaa0 Mon Sep 17 00:00:00 2001 From: Mateusz Nosek Date: Sun, 27 Sep 2020 02:08:58 +0200 Subject: futex: Fix incorrect should_fail_futex() handling If should_futex_fail() returns true in futex_wake_pi(), then the 'ret' variable is set to -EFAULT and then immediately overwritten. So the failure injection is non-functional. Fix it by actually leaving the function and returning -EFAULT. 
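The control-flow problem is easier to see outside the futex code. A hedged, self-contained sketch follows (should_fail() and do_locked_op() are invented stand-ins, not kernel functions): in the broken variant the injected error code is overwritten on the very next line, so the injection never reaches the caller; the fixed variant jumps past the operation instead.

#include <errno.h>

static int should_fail(void)  { return 1; }     /* pretend the injection fired */
static int do_locked_op(void) { return 0; }     /* stand-in for the real operation */

static int op_broken(void)
{
        int ret = 0;

        if (should_fail())
                ret = -EFAULT;
        ret = do_locked_op();   /* clobbers -EFAULT: injection is a no-op */
        return ret;
}

static int op_fixed(void)
{
        int ret = 0;

        if (should_fail()) {
                ret = -EFAULT;
                goto out;       /* actually report the injected failure */
        }
        ret = do_locked_op();
out:
        return ret;
}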
The Fixes tag is kinda blury because the initial commit which introduced failure injection was already sloppy, but the below mentioned commit broke it completely. [ tglx: Massaged changelog ] Fixes: 6b4f4bc9cb22 ("locking/futex: Allow low-level atomic operations to return -EAGAIN") Signed-off-by: Mateusz Nosek Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20200927000858.24219-1-mateusznosek0@gmail.com --- kernel/futex.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index a5876694a60e..39681bf8b06c 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1502,8 +1502,10 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_pi_state *pi_ */ newval = FUTEX_WAITERS | task_pid_vnr(new_owner); - if (unlikely(should_fail_futex(true))) + if (unlikely(should_fail_futex(true))) { ret = -EFAULT; + goto out_unlock; + } ret = cmpxchg_futex_value_locked(&curval, uaddr, uval, newval); if (!ret && (curval != uval)) { -- cgit v1.2.3 From 2cbd5a45e5296b28d64224ffbbd33d427704ba1b Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Sat, 24 Oct 2020 22:35:22 +0100 Subject: genirq/irqdomain: Implement get_name() method on irqchip fwnodes Prerequisite to make x86 more irqdomain compliant. Signed-off-by: David Woodhouse Signed-off-by: Thomas Gleixner Acked-by: Marc Zyngier Link: https://lore.kernel.org/r/20201024213535.443185-23-dwmw2@infradead.org --- kernel/irq/irqdomain.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index cf8b374b892d..673fa64c1c44 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -42,7 +42,16 @@ static inline void debugfs_add_domain_dir(struct irq_domain *d) { } static inline void debugfs_remove_domain_dir(struct irq_domain *d) { } #endif -const struct fwnode_operations irqchip_fwnode_ops; +static const char *irqchip_fwnode_get_name(const struct fwnode_handle *fwnode) +{ + struct irqchip_fwid *fwid = container_of(fwnode, struct irqchip_fwid, fwnode); + + return fwid->name; +} + +const struct fwnode_operations irqchip_fwnode_ops = { + .get_name = irqchip_fwnode_get_name, +}; EXPORT_SYMBOL_GPL(irqchip_fwnode_ops); /** -- cgit v1.2.3 From 77f6ab8b7768cf5e6bdd0e72499270a0671506ee Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 28 Oct 2020 16:39:49 -0400 Subject: don't dump the threads that had been already exiting when zapped. Coredump logics needs to report not only the registers of the dumping thread, but (since 2.5.43) those of other threads getting killed. Doing that might require extra state saved on the stack in asm glue at kernel entry; signal delivery logics does that (we need to be able to save sigcontext there, at the very least) and so does seccomp. That covers all callers of do_coredump(). Secondary threads get hit with SIGKILL and caught as soon as they reach exit_mm(), which normally happens in signal delivery, so those are also fine most of the time. Unfortunately, it is possible to end up with secondary zapped when it has already entered exit(2) (or, worse yet, is oopsing). In those cases we reach exit_mm() when mm->core_state is already set, but the stack contents is not what we would have in signal delivery. 
At least on two architectures (alpha and m68k) it leads to infoleaks - we end up with a chunk of kernel stack written into coredump, with the contents consisting of normal C stack frames of the call chain leading to exit_mm() instead of the expected copy of userland registers. In case of alpha we leak 312 bytes of stack. Other architectures (including the regset-using ones) might have similar problems - the normal user of regsets is ptrace and the state of tracee at the time of such calls is special in the same way signal delivery is. Note that had the zapper gotten to the exiting thread slightly later, it wouldn't have been included into coredump anyway - we skip the threads that have already cleared their ->mm. So let's pretend that zapper always loses the race. IOW, have exit_mm() only insert into the dumper list if we'd gotten there from handling a fatal signal[*] As the result, the callers of do_exit() that have *not* gone through get_signal() are not seen by coredump logics as secondary threads. Which excludes voluntary exit()/oopsen/traps/etc. The dumper thread itself is unaffected by that, so seccomp is fine. [*] originally I intended to add a new flag in tsk->flags, but ebiederman pointed out that PF_SIGNALED is already doing just what we need. Cc: stable@vger.kernel.org Fixes: d89f3847def4 ("[PATCH] thread-aware coredumps, 2.5.43-C3") History-tree: https://git.kernel.org/pub/scm/linux/kernel/git/tglx/history.git Acked-by: "Eric W. Biederman" Signed-off-by: Al Viro --- kernel/exit.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 87a2d515de0d..1f236ed375f8 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -454,7 +454,10 @@ static void exit_mm(void) mmap_read_unlock(mm); self.task = current; - self.next = xchg(&core_state->dumper.next, &self); + if (self.task->flags & PF_SIGNALED) + self.next = xchg(&core_state->dumper.next, &self); + else + self.task = NULL; /* * Implies mb(), the result of xchg() must be visible * to core_state->dumper. -- cgit v1.2.3 From cf83b2d2e2b64920bd6999b199dfa271d7e94cf8 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 27 Oct 2020 23:10:54 -0700 Subject: bpf: Permit cond_resched for some iterators Commit e679654a704e ("bpf: Fix a rcu_sched stall issue with bpf task/task_file iterator") tries to fix rcu stalls warning which is caused by bpf task_file iterator when running "bpftool prog". rcu: INFO: rcu_sched self-detected stall on CPU rcu: \x097-....: (20999 ticks this GP) idle=302/1/0x4000000000000000 softirq=1508852/1508852 fqs=4913 \x09(t=21031 jiffies g=2534773 q=179750) NMI backtrace for cpu 7 CPU: 7 PID: 184195 Comm: bpftool Kdump: loaded Tainted: G W 5.8.0-00004-g68bfc7f8c1b4 #6 Hardware name: Quanta Twin Lakes MP/Twin Lakes Passive MP, BIOS F09_3A17 05/03/2019 Call Trace: dump_stack+0x57/0x70 nmi_cpu_backtrace.cold+0x14/0x53 ? lapic_can_unplug_cpu.cold+0x39/0x39 nmi_trigger_cpumask_backtrace+0xb7/0xc7 rcu_dump_cpu_stacks+0xa2/0xd0 rcu_sched_clock_irq.cold+0x1ff/0x3d9 ? tick_nohz_handler+0x100/0x100 update_process_times+0x5b/0x90 tick_sched_timer+0x5e/0xf0 __hrtimer_run_queues+0x12a/0x2a0 hrtimer_interrupt+0x10e/0x280 __sysvec_apic_timer_interrupt+0x51/0xe0 asm_call_on_stack+0xf/0x20 sysvec_apic_timer_interrupt+0x6f/0x80 ... task_file_seq_next+0x52/0xa0 bpf_seq_read+0xb9/0x320 vfs_read+0x9d/0x180 ksys_read+0x5f/0xe0 do_syscall_64+0x38/0x60 entry_SYSCALL_64_after_hwframe+0x44/0xa9 The fix is to limit the number of bpf program runs to be one million. 
This fixed the program in most cases. But we also found under heavy load, which can increase the wallclock time for bpf_seq_read(), the warning may still be possible. For example, calling bpf_delay() in the "while" loop of bpf_seq_read(), which will introduce artificial delay, the warning will show up in my qemu run. static unsigned q; volatile unsigned *p = &q; volatile unsigned long long ll; static void bpf_delay(void) { int i, j; for (i = 0; i < 10000; i++) for (j = 0; j < 10000; j++) ll += *p; } There are two ways to fix this issue. One is to reduce the above one million threshold to say 100,000 and hopefully rcu warning will not show up any more. Another is to introduce a target feature which enables bpf_seq_read() calling cond_resched(). This patch took second approach as the first approach may cause more -EAGAIN failures for read() syscalls. Note that not all bpf_iter targets can permit cond_resched() in bpf_seq_read() as some, e.g., netlink seq iterator, rcu read lock critical section spans through seq_ops->next() -> seq_ops->show() -> seq_ops->next(). For the kernel code with the above hack, "bpftool p" roughly takes 38 seconds to finish on my VM with 184 bpf program runs. Using the following command, I am able to collect the number of context switches: perf stat -e context-switches -- ./bpftool p >& log Without this patch, 69 context-switches With this patch, 75 context-switches This patch added additional 6 context switches, roughly every 6 seconds to reschedule, to avoid lengthy no-rescheduling which may cause the above RCU warnings. Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20201028061054.1411116-1-yhs@fb.com --- include/linux/bpf.h | 5 +++++ kernel/bpf/bpf_iter.c | 14 ++++++++++++++ kernel/bpf/task_iter.c | 2 ++ 3 files changed, 21 insertions(+) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 2b16bf48aab6..2fffd30e13ac 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1294,6 +1294,10 @@ typedef void (*bpf_iter_show_fdinfo_t) (const struct bpf_iter_aux_info *aux, typedef int (*bpf_iter_fill_link_info_t)(const struct bpf_iter_aux_info *aux, struct bpf_link_info *info); +enum bpf_iter_feature { + BPF_ITER_RESCHED = BIT(0), +}; + #define BPF_ITER_CTX_ARG_MAX 2 struct bpf_iter_reg { const char *target; @@ -1302,6 +1306,7 @@ struct bpf_iter_reg { bpf_iter_show_fdinfo_t show_fdinfo; bpf_iter_fill_link_info_t fill_link_info; u32 ctx_arg_info_size; + u32 feature; struct bpf_ctx_arg_aux ctx_arg_info[BPF_ITER_CTX_ARG_MAX]; const struct bpf_iter_seq_info *seq_info; }; diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c index 8f10e30ea0b0..5454161407f1 100644 --- a/kernel/bpf/bpf_iter.c +++ b/kernel/bpf/bpf_iter.c @@ -67,6 +67,15 @@ static void bpf_iter_done_stop(struct seq_file *seq) iter_priv->done_stop = true; } +static bool bpf_iter_support_resched(struct seq_file *seq) +{ + struct bpf_iter_priv_data *iter_priv; + + iter_priv = container_of(seq->private, struct bpf_iter_priv_data, + target_private); + return iter_priv->tinfo->reg_info->feature & BPF_ITER_RESCHED; +} + /* maximum visited objects before bailing out */ #define MAX_ITER_OBJECTS 1000000 @@ -83,6 +92,7 @@ static ssize_t bpf_seq_read(struct file *file, char __user *buf, size_t size, struct seq_file *seq = file->private_data; size_t n, offs, copied = 0; int err = 0, num_objs = 0; + bool can_resched; void *p; mutex_lock(&seq->lock); @@ -135,6 +145,7 @@ static ssize_t bpf_seq_read(struct file 
*file, char __user *buf, size_t size, goto done; } + can_resched = bpf_iter_support_resched(seq); while (1) { loff_t pos = seq->index; @@ -180,6 +191,9 @@ static ssize_t bpf_seq_read(struct file *file, char __user *buf, size_t size, } break; } + + if (can_resched) + cond_resched(); } stop: offs = seq->count; diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c index 5b6af30bfbcd..1fdb2fc196cd 100644 --- a/kernel/bpf/task_iter.c +++ b/kernel/bpf/task_iter.c @@ -337,6 +337,7 @@ static const struct bpf_iter_seq_info task_seq_info = { static struct bpf_iter_reg task_reg_info = { .target = "task", + .feature = BPF_ITER_RESCHED, .ctx_arg_info_size = 1, .ctx_arg_info = { { offsetof(struct bpf_iter__task, task), @@ -354,6 +355,7 @@ static const struct bpf_iter_seq_info task_file_seq_info = { static struct bpf_iter_reg task_file_reg_info = { .target = "task_file", + .feature = BPF_ITER_RESCHED, .ctx_arg_info_size = 2, .ctx_arg_info = { { offsetof(struct bpf_iter__task_file, task), -- cgit v1.2.3 From 5c251e9dc0e127bac6fc5b8e6696363d2e35f515 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 26 Oct 2020 14:32:27 -0600 Subject: signal: Add task_sigpending() helper This is in preparation for maintaining signal_pending() as the decider of whether or not a schedule() loop should be broken, or continue sleeping. This is different than the core signal use cases, which really need to know whether an actual signal is pending or not. task_sigpending() returns non-zero if TIF_SIGPENDING is set. Only core kernel use cases should care about the distinction between the two, make sure those use the task_sigpending() helper. Signed-off-by: Jens Axboe Signed-off-by: Thomas Gleixner Reviewed-by: Thomas Gleixner Reviewed-by: Oleg Nesterov Link: https://lore.kernel.org/r/20201026203230.386348-2-axboe@kernel.dk --- include/linux/sched/signal.h | 9 +++++++-- kernel/events/uprobes.c | 2 +- kernel/signal.c | 8 ++++---- 3 files changed, 12 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h index 1bad18a1d8ba..404145dc536e 100644 --- a/include/linux/sched/signal.h +++ b/include/linux/sched/signal.h @@ -353,11 +353,16 @@ static inline int restart_syscall(void) return -ERESTARTNOINTR; } -static inline int signal_pending(struct task_struct *p) +static inline int task_sigpending(struct task_struct *p) { return unlikely(test_tsk_thread_flag(p,TIF_SIGPENDING)); } +static inline int signal_pending(struct task_struct *p) +{ + return task_sigpending(p); +} + static inline int __fatal_signal_pending(struct task_struct *p) { return unlikely(sigismember(&p->pending.signal, SIGKILL)); @@ -365,7 +370,7 @@ static inline int __fatal_signal_pending(struct task_struct *p) static inline int fatal_signal_pending(struct task_struct *p) { - return signal_pending(p) && __fatal_signal_pending(p); + return task_sigpending(p) && __fatal_signal_pending(p); } static inline int signal_pending_state(long state, struct task_struct *p) diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 649fd53dc9ad..edd0c985a939 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1973,7 +1973,7 @@ bool uprobe_deny_signal(void) WARN_ON_ONCE(utask->state != UTASK_SSTEP); - if (signal_pending(t)) { + if (task_sigpending(t)) { spin_lock_irq(&t->sighand->siglock); clear_tsk_thread_flag(t, TIF_SIGPENDING); spin_unlock_irq(&t->sighand->siglock); diff --git a/kernel/signal.c b/kernel/signal.c index 42b67d2cea37..b179eccc86d0 100644 --- a/kernel/signal.c +++ 
b/kernel/signal.c @@ -983,7 +983,7 @@ static inline bool wants_signal(int sig, struct task_struct *p) if (task_is_stopped_or_traced(p)) return false; - return task_curr(p) || !signal_pending(p); + return task_curr(p) || !task_sigpending(p); } static void complete_signal(int sig, struct task_struct *p, enum pid_type type) @@ -2822,7 +2822,7 @@ static void retarget_shared_pending(struct task_struct *tsk, sigset_t *which) /* Remove the signals this thread can handle. */ sigandsets(&retarget, &retarget, &t->blocked); - if (!signal_pending(t)) + if (!task_sigpending(t)) signal_wake_up(t, 0); if (sigisemptyset(&retarget)) @@ -2856,7 +2856,7 @@ void exit_signals(struct task_struct *tsk) cgroup_threadgroup_change_end(tsk); - if (!signal_pending(tsk)) + if (!task_sigpending(tsk)) goto out; unblocked = tsk->blocked; @@ -2900,7 +2900,7 @@ long do_no_restart_syscall(struct restart_block *param) static void __set_task_blocked(struct task_struct *tsk, const sigset_t *newset) { - if (signal_pending(tsk) && !thread_group_empty(tsk)) { + if (task_sigpending(tsk) && !thread_group_empty(tsk)) { sigset_t newblocked; /* A set of now blocked but previously unblocked signals. */ sigandnsets(&newblocked, newset, &current->blocked); -- cgit v1.2.3 From 12db8b690010ccfadf9d0b49a1e1798e47dbbe1a Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 26 Oct 2020 14:32:28 -0600 Subject: entry: Add support for TIF_NOTIFY_SIGNAL Add TIF_NOTIFY_SIGNAL handling in the generic entry code, which if set, will return true if signal_pending() is used in a wait loop. That causes an exit of the loop so that notify_signal tracehooks can be run. If the wait loop is currently inside a system call, the system call is restarted once task_work has been processed. In preparation for only having arch_do_signal() handle syscall restarts if _TIF_SIGPENDING isn't set, rename it to arch_do_signal_or_restart(). Pass in a boolean that tells the architecture specific signal handler if it should attempt to get a signal, or just process a potential syscall restart. For !CONFIG_GENERIC_ENTRY archs, add the TIF_NOTIFY_SIGNAL handling to get_signal(). This is done to minimize the needed architecture changes to support this feature. Signed-off-by: Jens Axboe Signed-off-by: Thomas Gleixner Reviewed-by: Oleg Nesterov Link: https://lore.kernel.org/r/20201026203230.386348-3-axboe@kernel.dk --- arch/x86/kernel/signal.c | 4 ++-- include/linux/entry-common.h | 11 ++++++++--- include/linux/entry-kvm.h | 4 ++-- include/linux/sched/signal.h | 11 ++++++++++- include/linux/tracehook.h | 27 +++++++++++++++++++++++++++ kernel/entry/common.c | 14 +++++++++++--- kernel/entry/kvm.c | 3 +++ kernel/signal.c | 14 ++++++++++++++ 8 files changed, 77 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index d5fa494c2304..ec3b9c6e5a4c 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -804,11 +804,11 @@ static inline unsigned long get_nr_restart_syscall(const struct pt_regs *regs) * want to handle. Thus you cannot kill init even with a SIGKILL even by * mistake. */ -void arch_do_signal(struct pt_regs *regs) +void arch_do_signal_or_restart(struct pt_regs *regs, bool has_signal) { struct ksignal ksig; - if (get_signal(&ksig)) { + if (has_signal && get_signal(&ksig)) { /* Whee! Actually deliver the signal.
*/ handle_signal(&ksig, regs); return; diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h index efebbffcd5cc..c7bfac45f951 100644 --- a/include/linux/entry-common.h +++ b/include/linux/entry-common.h @@ -37,6 +37,10 @@ # define _TIF_UPROBE (0) #endif +#ifndef _TIF_NOTIFY_SIGNAL +# define _TIF_NOTIFY_SIGNAL (0) +#endif + /* * TIF flags handled in syscall_enter_from_usermode() */ @@ -69,7 +73,7 @@ #define EXIT_TO_USER_MODE_WORK \ (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_UPROBE | \ - _TIF_NEED_RESCHED | _TIF_PATCH_PENDING | \ + _TIF_NEED_RESCHED | _TIF_PATCH_PENDING | _TIF_NOTIFY_SIGNAL | \ ARCH_EXIT_TO_USER_MODE_WORK) /** @@ -226,12 +230,13 @@ static __always_inline void arch_exit_to_user_mode(void) { } #endif /** - * arch_do_signal - Architecture specific signal delivery function + * arch_do_signal_or_restart - Architecture specific signal delivery function * @regs: Pointer to currents pt_regs + * @has_signal: actual signal to handle * * Invoked from exit_to_user_mode_loop(). */ -void arch_do_signal(struct pt_regs *regs); +void arch_do_signal_or_restart(struct pt_regs *regs, bool has_signal); /** * arch_syscall_exit_tracehook - Wrapper around tracehook_report_syscall_exit() diff --git a/include/linux/entry-kvm.h b/include/linux/entry-kvm.h index 0cef17afb41a..9b93f8584ff7 100644 --- a/include/linux/entry-kvm.h +++ b/include/linux/entry-kvm.h @@ -11,8 +11,8 @@ # define ARCH_XFER_TO_GUEST_MODE_WORK (0) #endif -#define XFER_TO_GUEST_MODE_WORK \ - (_TIF_NEED_RESCHED | _TIF_SIGPENDING | \ +#define XFER_TO_GUEST_MODE_WORK \ + (_TIF_NEED_RESCHED | _TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL | \ _TIF_NOTIFY_RESUME | ARCH_XFER_TO_GUEST_MODE_WORK) struct kvm_vcpu; diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h index 404145dc536e..bd5afa076189 100644 --- a/include/linux/sched/signal.h +++ b/include/linux/sched/signal.h @@ -360,6 +360,15 @@ static inline int task_sigpending(struct task_struct *p) static inline int signal_pending(struct task_struct *p) { +#if defined(TIF_NOTIFY_SIGNAL) + /* + * TIF_NOTIFY_SIGNAL isn't really a signal, but it requires the same + * behavior in terms of ensuring that we break out of wait loops + * so that notify signal callbacks can be processed. + */ + if (unlikely(test_tsk_thread_flag(p, TIF_NOTIFY_SIGNAL))) + return 1; +#endif return task_sigpending(p); } @@ -507,7 +516,7 @@ extern int set_user_sigmask(const sigset_t __user *umask, size_t sigsetsize); static inline void restore_saved_sigmask_unless(bool interrupted) { if (interrupted) - WARN_ON(!test_thread_flag(TIF_SIGPENDING)); + WARN_ON(!signal_pending(current)); else restore_saved_sigmask(); } diff --git a/include/linux/tracehook.h b/include/linux/tracehook.h index 36fb3bbed6b2..1e8caca92e1f 100644 --- a/include/linux/tracehook.h +++ b/include/linux/tracehook.h @@ -198,4 +198,31 @@ static inline void tracehook_notify_resume(struct pt_regs *regs) blkcg_maybe_throttle_current(); } +/* + * called by exit_to_user_mode_loop() if ti_work & _TIF_NOTIFY_SIGNAL. This + * is currently used by TWA_SIGNAL based task_work, which requires breaking + * wait loops to ensure that task_work is noticed and run. 
+ */ +static inline void tracehook_notify_signal(void) +{ +#if defined(TIF_NOTIFY_SIGNAL) + clear_thread_flag(TIF_NOTIFY_SIGNAL); + smp_mb__after_atomic(); + if (current->task_works) + task_work_run(); +#endif +} + +/* + * Called when we have work to process from exit_to_user_mode_loop() + */ +static inline void set_notify_signal(struct task_struct *task) +{ +#if defined(TIF_NOTIFY_SIGNAL) + if (!test_and_set_tsk_thread_flag(task, TIF_NOTIFY_SIGNAL) && + !wake_up_state(task, TASK_INTERRUPTIBLE)) + kick_process(task); +#endif +} + #endif /* */ diff --git a/kernel/entry/common.c b/kernel/entry/common.c index 9852e0d62d95..42eff115c426 100644 --- a/kernel/entry/common.c +++ b/kernel/entry/common.c @@ -109,7 +109,15 @@ static __always_inline void exit_to_user_mode(void) } /* Workaround to allow gradual conversion of architecture code */ -void __weak arch_do_signal(struct pt_regs *regs) { } +void __weak arch_do_signal_or_restart(struct pt_regs *regs, bool has_signal) { } + +static void handle_signal_work(struct pt_regs *regs, unsigned long ti_work) +{ + if (ti_work & _TIF_NOTIFY_SIGNAL) + tracehook_notify_signal(); + + arch_do_signal_or_restart(regs, ti_work & _TIF_SIGPENDING); +} static unsigned long exit_to_user_mode_loop(struct pt_regs *regs, unsigned long ti_work) @@ -131,8 +139,8 @@ static unsigned long exit_to_user_mode_loop(struct pt_regs *regs, if (ti_work & _TIF_PATCH_PENDING) klp_update_patch_state(current); - if (ti_work & _TIF_SIGPENDING) - arch_do_signal(regs); + if (ti_work & (_TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL)) + handle_signal_work(regs, ti_work); if (ti_work & _TIF_NOTIFY_RESUME) { clear_thread_flag(TIF_NOTIFY_RESUME); diff --git a/kernel/entry/kvm.c b/kernel/entry/kvm.c index eb1a8a4c867c..b828a3ddebf1 100644 --- a/kernel/entry/kvm.c +++ b/kernel/entry/kvm.c @@ -8,6 +8,9 @@ static int xfer_to_guest_mode_work(struct kvm_vcpu *vcpu, unsigned long ti_work) do { int ret; + if (ti_work & _TIF_NOTIFY_SIGNAL) + tracehook_notify_signal(); + if (ti_work & _TIF_SIGPENDING) { kvm_handle_signal_exit(vcpu); return -EINTR; diff --git a/kernel/signal.c b/kernel/signal.c index b179eccc86d0..61b377e65c46 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2529,6 +2529,20 @@ bool get_signal(struct ksignal *ksig) struct signal_struct *signal = current->signal; int signr; + /* + * For non-generic architectures, check for TIF_NOTIFY_SIGNAL so + * that the arch handlers don't all have to do it. If we get here + * without TIF_SIGPENDING, just exit after running signal work. + */ +#ifdef TIF_NOTIFY_SIGNAL + if (!IS_ENABLED(CONFIG_GENERIC_ENTRY)) { + if (test_thread_flag(TIF_NOTIFY_SIGNAL)) + tracehook_notify_signal(); + if (!task_sigpending(current)) + return false; + } +#endif + if (unlikely(uprobe_deny_signal())) return false; -- cgit v1.2.3 From 114518eb6430b832d2f9f5a008043b913ccf0e24 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 26 Oct 2020 14:32:30 -0600 Subject: task_work: Use TIF_NOTIFY_SIGNAL if available If the arch supports TIF_NOTIFY_SIGNAL, then use that for TWA_SIGNAL as it's more efficient than using the signal delivery method. This is especially true on threaded applications, where ->sighand is shared across threads, but it's also lighter weight on non-shared cases. io_uring is a heavy consumer of TWA_SIGNAL based task_work. A test with threads shows a nice improvement running an io_uring based echo server. 
stock kernel: 0.01% <= 0.1 milliseconds 95.86% <= 0.2 milliseconds 98.27% <= 0.3 milliseconds 99.71% <= 0.4 milliseconds 100.00% <= 0.5 milliseconds 100.00% <= 0.6 milliseconds 100.00% <= 0.7 milliseconds 100.00% <= 0.8 milliseconds 100.00% <= 0.9 milliseconds 100.00% <= 1.0 milliseconds 100.00% <= 1.1 milliseconds 100.00% <= 2 milliseconds 100.00% <= 3 milliseconds 100.00% <= 3 milliseconds 1378930.00 requests per second ~1600% CPU 1.38M requests/second, and all 16 CPUs are maxed out. patched kernel: 0.01% <= 0.1 milliseconds 98.24% <= 0.2 milliseconds 99.47% <= 0.3 milliseconds 99.99% <= 0.4 milliseconds 100.00% <= 0.5 milliseconds 100.00% <= 0.6 milliseconds 100.00% <= 0.7 milliseconds 100.00% <= 0.8 milliseconds 100.00% <= 0.9 milliseconds 100.00% <= 1.2 milliseconds 1666111.38 requests per second ~1450% CPU 1.67M requests/second, and we're no longer just hammering on the sighand lock. The original reporter states: "For 5.7.15 my benchmark achieves 1.6M qps and system cpu is at ~80%. for 5.7.16 or later it achieves only 1M qps and the system cpu is is at ~100%" with the only difference there being that TWA_SIGNAL is used unconditionally in 5.7.16, since it's required to be able to handle the inability to run task_work if the application is waiting in the kernel already on an event that needs task_work run to be satisfied. Also see commit 0ba9c9edcd15. Reported-by: Roman Gershman Signed-off-by: Jens Axboe Signed-off-by: Thomas Gleixner Reviewed-by: Oleg Nesterov Link: https://lore.kernel.org/r/20201026203230.386348-5-axboe@kernel.dk --- kernel/task_work.c | 41 +++++++++++++++++++++++++++++------------ 1 file changed, 29 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/task_work.c b/kernel/task_work.c index 613b2d634af8..ae058893913c 100644 --- a/kernel/task_work.c +++ b/kernel/task_work.c @@ -5,6 +5,34 @@ static struct callback_head work_exited; /* all we need is ->next == NULL */ +/* + * TWA_SIGNAL signaling - use TIF_NOTIFY_SIGNAL, if available, as it's faster + * than TIF_SIGPENDING as there's no dependency on ->sighand. The latter is + * shared for threads, and can cause contention on sighand->lock. Even for + * the non-threaded case TIF_NOTIFY_SIGNAL is more efficient, as no locking + * or IRQ disabling is involved for notification (or running) purposes. + */ +static void task_work_notify_signal(struct task_struct *task) +{ +#if defined(TIF_NOTIFY_SIGNAL) + set_notify_signal(task); +#else + unsigned long flags; + + /* + * Only grab the sighand lock if we don't already have some + * task_work pending. This pairs with the smp_store_mb() + * in get_signal(), see comment there. + */ + if (!(READ_ONCE(task->jobctl) & JOBCTL_TASK_WORK) && + lock_task_sighand(task, &flags)) { + task->jobctl |= JOBCTL_TASK_WORK; + signal_wake_up(task, 0); + unlock_task_sighand(task, &flags); + } +#endif +} + /** * task_work_add - ask the @task to execute @work->func() * @task: the task which should run the callback @@ -28,7 +56,6 @@ int task_work_add(struct task_struct *task, struct callback_head *work, int notify) { struct callback_head *head; - unsigned long flags; do { head = READ_ONCE(task->task_works); @@ -42,17 +69,7 @@ task_work_add(struct task_struct *task, struct callback_head *work, int notify) set_notify_resume(task); break; case TWA_SIGNAL: - /* - * Only grab the sighand lock if we don't already have some - * task_work pending. This pairs with the smp_store_mb() - * in get_signal(), see comment there. 
- */ - if (!(READ_ONCE(task->jobctl) & JOBCTL_TASK_WORK) && - lock_task_sighand(task, &flags)) { - task->jobctl |= JOBCTL_TASK_WORK; - signal_wake_up(task, 0); - unlock_task_sighand(task, &flags); - } + task_work_notify_signal(task); break; } -- cgit v1.2.3 From b9c88f752268383beff0d56e50d52b8ae62a02f8 Mon Sep 17 00:00:00 2001 From: jun qian Date: Thu, 15 Oct 2020 14:48:46 +0800 Subject: sched/fair: Improve the accuracy of sched_stat_wait statistics When the sched_schedstat changes from 0 to 1, some sched se maybe already in the runqueue, the se->statistics.wait_start will be 0. So it will let the (rq_of(cfs_rq)) - se->statistics.wait_start) wrong. We need to avoid this scenario. Signed-off-by: jun qian Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Yafang Shao Link: https://lkml.kernel.org/r/20201015064846.19809-1-qianjun.kernel@gmail.com --- kernel/sched/fair.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 290f9e38378c..b9368d123451 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -906,6 +906,15 @@ update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) if (!schedstat_enabled()) return; + /* + * When the sched_schedstat changes from 0 to 1, some sched se + * maybe already in the runqueue, the se->statistics.wait_start + * will be 0.So it will let the delta wrong. We need to avoid this + * scenario. + */ + if (unlikely(!schedstat_val(se->statistics.wait_start))) + return; + delta = rq_clock(rq_of(cfs_rq)) - schedstat_val(se->statistics.wait_start); if (entity_is_task(se)) { -- cgit v1.2.3 From 26762423a2664692de2bcccc9de684a5ac105e23 Mon Sep 17 00:00:00 2001 From: Peng Liu Date: Thu, 8 Oct 2020 23:48:46 +0800 Subject: sched/deadline: Optimize sched_dl_global_validate() Under CONFIG_SMP, dl_bw is per root domain, but not per CPU. When checking or updating dl_bw, currently iterating every CPU is overdoing, just need iterate each root domain once. 
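The commit message is terse, so here is a hedged, self-contained illustration of the "visit each shared object once per pass" idea using a toy topology (struct domain, cpu_domain() and NR_CPUS are invented for this sketch, not the kernel's types): a generation counter is bumped once per pass, and any domain whose recorded generation already matches the current pass is skipped.

#include <stdbool.h>
#include <stdint.h>

#define NR_CPUS 8

struct domain { uint64_t visit_gen; };

/* Toy topology: CPUs 0-3 share domain 0, CPUs 4-7 share domain 1. */
static struct domain domains[2];
static struct domain *cpu_domain(int cpu) { return &domains[cpu / 4]; }

static bool domain_visited(struct domain *d, uint64_t gen)
{
        if (d->visit_gen == gen)
                return true;    /* already handled in this pass */
        d->visit_gen = gen;
        return false;
}

static void check_all_domains_once(void)
{
        static uint64_t generation;
        uint64_t gen = ++generation;    /* one bump per pass */
        int cpu;

        for (cpu = 0; cpu < NR_CPUS; cpu++) {
                if (domain_visited(cpu_domain(cpu), gen))
                        continue;
                /* ... validate or update this domain exactly once ... */
        }
}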
Suggested-by: Peter Zijlstra Signed-off-by: Peng Liu Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Daniel Bristot de Oliveira Acked-by: Juri Lelli Link: https://lkml.kernel.org/r/78d21ee792cc48ff79e8cd62a5f26208463684d6.1602171061.git.iwtbavbm@gmail.com --- kernel/sched/deadline.c | 39 ++++++++++++++++++++++++++++++++------- kernel/sched/sched.h | 9 +++++++++ kernel/sched/topology.c | 1 + 3 files changed, 42 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index f232305dcefe..98d96d40e202 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -97,6 +97,17 @@ static inline unsigned long dl_bw_capacity(int i) return __dl_bw_capacity(i); } } + +static inline bool dl_bw_visited(int cpu, u64 gen) +{ + struct root_domain *rd = cpu_rq(cpu)->rd; + + if (rd->visit_gen == gen) + return true; + + rd->visit_gen = gen; + return false; +} #else static inline struct dl_bw *dl_bw_of(int i) { @@ -112,6 +123,11 @@ static inline unsigned long dl_bw_capacity(int i) { return SCHED_CAPACITY_SCALE; } + +static inline bool dl_bw_visited(int cpu, u64 gen) +{ + return false; +} #endif static inline @@ -2535,11 +2551,15 @@ const struct sched_class dl_sched_class .update_curr = update_curr_dl, }; +/* Used for dl_bw check and update, used under sched_rt_handler()::mutex */ +static u64 dl_generation; + int sched_dl_global_validate(void) { u64 runtime = global_rt_runtime(); u64 period = global_rt_period(); u64 new_bw = to_ratio(period, runtime); + u64 gen = ++dl_generation; struct dl_bw *dl_b; int cpu, ret = 0; unsigned long flags; @@ -2548,13 +2568,13 @@ int sched_dl_global_validate(void) * Here we want to check the bandwidth not being set to some * value smaller than the currently allocated bandwidth in * any of the root_domains. - * - * FIXME: Cycling on all the CPUs is overdoing, but simpler than - * cycling on root_domains... Discussion on different/better - * solutions is welcome! */ for_each_possible_cpu(cpu) { rcu_read_lock_sched(); + + if (dl_bw_visited(cpu, gen)) + goto next; + dl_b = dl_bw_of(cpu); raw_spin_lock_irqsave(&dl_b->lock, flags); @@ -2562,6 +2582,7 @@ int sched_dl_global_validate(void) ret = -EBUSY; raw_spin_unlock_irqrestore(&dl_b->lock, flags); +next: rcu_read_unlock_sched(); if (ret) @@ -2587,6 +2608,7 @@ static void init_dl_rq_bw_ratio(struct dl_rq *dl_rq) void sched_dl_do_global(void) { u64 new_bw = -1; + u64 gen = ++dl_generation; struct dl_bw *dl_b; int cpu; unsigned long flags; @@ -2597,11 +2619,14 @@ void sched_dl_do_global(void) if (global_rt_runtime() != RUNTIME_INF) new_bw = to_ratio(global_rt_period(), global_rt_runtime()); - /* - * FIXME: As above... - */ for_each_possible_cpu(cpu) { rcu_read_lock_sched(); + + if (dl_bw_visited(cpu, gen)) { + rcu_read_unlock_sched(); + continue; + } + dl_b = dl_bw_of(cpu); raw_spin_lock_irqsave(&dl_b->lock, flags); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index df80bfcea92e..49a2daea618b 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -801,6 +801,15 @@ struct root_domain { struct dl_bw dl_bw; struct cpudl cpudl; + /* + * Indicate whether a root_domain's dl_bw has been checked or + * updated. It's monotonously increasing value. + * + * Also, some corner cases, like 'wrap around' is dangerous, but given + * that u64 is 'big enough'. So that shouldn't be a concern. + */ + u64 visit_gen; + #ifdef HAVE_RT_PUSH_IPI /* * For IPI pull requests, loop across the rto_mask. 
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index dd7770226086..90f3e5558fa2 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -516,6 +516,7 @@ static int init_rootdomain(struct root_domain *rd) init_irq_work(&rd->rto_push_work, rto_push_irq_work_func); #endif + rd->visit_gen = 0; init_dl_bw(&rd->dl_bw); if (cpudl_init(&rd->cpudl) != 0) goto free_rto_mask; -- cgit v1.2.3 From a57415f5d1e43c3a5c5d412cd85e2792d7ed9b11 Mon Sep 17 00:00:00 2001 From: Peng Liu Date: Thu, 8 Oct 2020 23:49:42 +0800 Subject: sched/deadline: Fix sched_dl_global_validate() When change sched_rt_{runtime, period}_us, we validate that the new settings should at least accommodate the currently allocated -dl bandwidth: sched_rt_handler() --> sched_dl_bandwidth_validate() { new_bw = global_rt_runtime()/global_rt_period(); for_each_possible_cpu(cpu) { dl_b = dl_bw_of(cpu); if (new_bw < dl_b->total_bw) <------- ret = -EBUSY; } } But under CONFIG_SMP, dl_bw is per root domain , but not per CPU, dl_b->total_bw is the allocated bandwidth of the whole root domain. Instead, we should compare dl_b->total_bw against "cpus*new_bw", where 'cpus' is the number of CPUs of the root domain. Also, below annotation(in kernel/sched/sched.h) implied implementation only appeared in SCHED_DEADLINE v2[1], then deadline scheduler kept evolving till got merged(v9), but the annotation remains unchanged, meaningless and misleading, update it. * With respect to SMP, the bandwidth is given on a per-CPU basis, * meaning that: * - dl_bw (< 100%) is the bandwidth of the system (group) on each CPU; * - dl_total_bw array contains, in the i-eth element, the currently * allocated bandwidth on the i-eth CPU. [1]: https://lore.kernel.org/lkml/1267385230.13676.101.camel@Palantir/ Fixes: 332ac17ef5bf ("sched/deadline: Add bandwidth management for SCHED_DEADLINE tasks") Signed-off-by: Peng Liu Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Daniel Bristot de Oliveira Acked-by: Juri Lelli Link: https://lkml.kernel.org/r/db6bbda316048cda7a1bbc9571defde193a8d67e.1602171061.git.iwtbavbm@gmail.com --- kernel/sched/deadline.c | 5 +++-- kernel/sched/sched.h | 42 ++++++++++++++++++------------------------ 2 files changed, 21 insertions(+), 26 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 98d96d40e202..0f75e95ae024 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -2561,7 +2561,7 @@ int sched_dl_global_validate(void) u64 new_bw = to_ratio(period, runtime); u64 gen = ++dl_generation; struct dl_bw *dl_b; - int cpu, ret = 0; + int cpu, cpus, ret = 0; unsigned long flags; /* @@ -2576,9 +2576,10 @@ int sched_dl_global_validate(void) goto next; dl_b = dl_bw_of(cpu); + cpus = dl_bw_cpus(cpu); raw_spin_lock_irqsave(&dl_b->lock, flags); - if (new_bw < dl_b->total_bw) + if (new_bw * cpus < dl_b->total_bw) ret = -EBUSY; raw_spin_unlock_irqrestore(&dl_b->lock, flags); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 49a2daea618b..965b2968c13a 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -257,30 +257,6 @@ struct rt_bandwidth { void __dl_clear_params(struct task_struct *p); -/* - * To keep the bandwidth of -deadline tasks and groups under control - * we need some place where: - * - store the maximum -deadline bandwidth of the system (the group); - * - cache the fraction of that bandwidth that is currently allocated. - * - * This is all done in the data structure below. 
It is similar to the - * one used for RT-throttling (rt_bandwidth), with the main difference - * that, since here we are only interested in admission control, we - * do not decrease any runtime while the group "executes", neither we - * need a timer to replenish it. - * - * With respect to SMP, the bandwidth is given on a per-CPU basis, - * meaning that: - * - dl_bw (< 100%) is the bandwidth of the system (group) on each CPU; - * - dl_total_bw array contains, in the i-eth element, the currently - * allocated bandwidth on the i-eth CPU. - * Moreover, groups consume bandwidth on each CPU, while tasks only - * consume bandwidth on the CPU they're running on. - * Finally, dl_total_bw_cpu is used to cache the index of dl_total_bw - * that will be shown the next time the proc or cgroup controls will - * be red. It on its turn can be changed by writing on its own - * control. - */ struct dl_bandwidth { raw_spinlock_t dl_runtime_lock; u64 dl_runtime; @@ -292,6 +268,24 @@ static inline int dl_bandwidth_enabled(void) return sysctl_sched_rt_runtime >= 0; } +/* + * To keep the bandwidth of -deadline tasks under control + * we need some place where: + * - store the maximum -deadline bandwidth of each cpu; + * - cache the fraction of bandwidth that is currently allocated in + * each root domain; + * + * This is all done in the data structure below. It is similar to the + * one used for RT-throttling (rt_bandwidth), with the main difference + * that, since here we are only interested in admission control, we + * do not decrease any runtime while the group "executes", neither we + * need a timer to replenish it. + * + * With respect to SMP, bandwidth is given on a per root domain basis, + * meaning that: + * - bw (< 100%) is the deadline bandwidth of each CPU; + * - total_bw is the currently allocated bandwidth in each root domain; + */ struct dl_bw { raw_spinlock_t lock; u64 bw; -- cgit v1.2.3 From 5e054bca44fe92323de5e9b71478d1904b8bb1b7 Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Tue, 22 Sep 2020 10:39:33 +0200 Subject: sched/cpupri: Remove pri_to_cpu[CPUPRI_IDLE] pri_to_cpu[CPUPRI_IDLE=0] isn't used since cpupri_set(..., newpri) is never called with newpri = MAX_PRIO (140). Current mapping: p->rt_priority p->prio newpri cpupri -1 -1 (CPUPRI_INVALID) 140 0 (CPUPRI_IDLE) 100 1 (CPUPRI_NORMAL) 1 98 98 3 ... 49 50 50 51 50 49 49 52 ... 99 0 0 101 Even when cpupri was introduced with commit 6e0534f27819 ("sched: use a 2-d bitmap for searching lowest-pri CPU") in v2.6.27, only (1) CPUPRI_INVALID (-1), (2) MAX_RT_PRIO (100), (3) an RT prio (RT1..RT99) were used as newprio in cpupri_set(..., newpri) -> convert_prio(newpri). MAX_RT_PRIO is used only in dec_rt_tasks() -> dec_rt_prio() -> dec_rt_prio_smp() -> cpupri_set() in case of !rt_rq->rt_nr_running. I.e. it stands for a non-rt task, including the IDLE task. Commit 57785df5ac53 ("sched: Fix task priority bug") removed code in v2.6.33 which did set the priority of the IDLE task to MAX_PRIO. Although this happened after the introduction of cpupri, it didn't have an effect on the values used for cpupri_set(..., newpri). Remove CPUPRI_IDLE and adapt the cpupri implementation accordingly. This will save a useless for loop with an atomic_read in cpupri_find_fitness() calling __cpupri_find(). New mapping: p->rt_priority p->prio newpri cpupri -1 -1 (CPUPRI_INVALID) 100 0 (CPUPRI_NORMAL) 1 98 98 2 ... 49 50 50 50 50 49 49 51 ... 
99 0 0 100 Signed-off-by: Dietmar Eggemann Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200922083934.19275-2-dietmar.eggemann@arm.com --- kernel/sched/cpupri.c | 10 ++++------ kernel/sched/cpupri.h | 7 +++---- 2 files changed, 7 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c index 0033731a0797..a5d14ed485f4 100644 --- a/kernel/sched/cpupri.c +++ b/kernel/sched/cpupri.c @@ -11,7 +11,7 @@ * This code tracks the priority of each CPU so that global migration * decisions are easy to calculate. Each CPU can be in a state as follows: * - * (INVALID), IDLE, NORMAL, RT1, ... RT99 + * (INVALID), NORMAL, RT1, ... RT99 * * going from the lowest priority to the highest. CPUs in the INVALID state * are not eligible for routing. The system maintains this state with @@ -19,24 +19,22 @@ * in that class). Therefore a typical application without affinity * restrictions can find a suitable CPU with O(1) complexity (e.g. two bit * searches). For tasks with affinity restrictions, the algorithm has a - * worst case complexity of O(min(102, nr_domcpus)), though the scenario that + * worst case complexity of O(min(101, nr_domcpus)), though the scenario that * yields the worst case search is fairly contrived. */ #include "sched.h" -/* Convert between a 140 based task->prio, and our 102 based cpupri */ +/* Convert between a 140 based task->prio, and our 101 based cpupri */ static int convert_prio(int prio) { int cpupri; if (prio == CPUPRI_INVALID) cpupri = CPUPRI_INVALID; - else if (prio == MAX_PRIO) - cpupri = CPUPRI_IDLE; else if (prio >= MAX_RT_PRIO) cpupri = CPUPRI_NORMAL; else - cpupri = MAX_RT_PRIO - prio + 1; + cpupri = MAX_RT_PRIO - prio; return cpupri; } diff --git a/kernel/sched/cpupri.h b/kernel/sched/cpupri.h index efbb492bb94c..1a162369b8d4 100644 --- a/kernel/sched/cpupri.h +++ b/kernel/sched/cpupri.h @@ -1,11 +1,10 @@ /* SPDX-License-Identifier: GPL-2.0 */ -#define CPUPRI_NR_PRIORITIES (MAX_RT_PRIO + 2) +#define CPUPRI_NR_PRIORITIES (MAX_RT_PRIO + 1) #define CPUPRI_INVALID -1 -#define CPUPRI_IDLE 0 -#define CPUPRI_NORMAL 1 -/* values 2-101 are RT priorities 0-99 */ +#define CPUPRI_NORMAL 0 +/* values 2-100 are RT priorities 0-99 */ struct cpupri_vec { atomic_t count; -- cgit v1.2.3 From 1b08782ce31f612d98e11ccccf3e3df9a147a67d Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Tue, 22 Sep 2020 10:39:34 +0200 Subject: sched/cpupri: Remove pri_to_cpu[1] pri_to_cpu[1] isn't used since cpupri_set(..., newpri) is never called with newpri = 99. The valid RT priorities RT1..RT99 (p->rt_priority = [1..99]) map into cpupri (idx of pri_to_cpu[]) = [2..100] Current mapping: p->rt_priority p->prio newpri cpupri -1 -1 (CPUPRI_INVALID) 100 0 (CPUPRI_NORMAL) 1 98 98 2 ... 49 50 50 50 50 49 49 51 ... 99 0 0 100 So cpupri = 1 isn't used. Reduce the size of pri_to_cpu[] by 1 and adapt the cpupri implementation accordingly. This will save a useless for loop with an atomic_read in cpupri_find_fitness() calling __cpupri_find(). New mapping: p->rt_priority p->prio newpri cpupri -1 -1 (CPUPRI_INVALID) 100 0 (CPUPRI_NORMAL) 1 98 98 1 ... 49 50 50 49 50 49 49 50 ... 
99 0 0 99 Signed-off-by: Dietmar Eggemann Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200922083934.19275-3-dietmar.eggemann@arm.com --- kernel/sched/cpupri.c | 6 +++--- kernel/sched/cpupri.h | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c index a5d14ed485f4..8d9952a51664 100644 --- a/kernel/sched/cpupri.c +++ b/kernel/sched/cpupri.c @@ -19,12 +19,12 @@ * in that class). Therefore a typical application without affinity * restrictions can find a suitable CPU with O(1) complexity (e.g. two bit * searches). For tasks with affinity restrictions, the algorithm has a - * worst case complexity of O(min(101, nr_domcpus)), though the scenario that + * worst case complexity of O(min(100, nr_domcpus)), though the scenario that * yields the worst case search is fairly contrived. */ #include "sched.h" -/* Convert between a 140 based task->prio, and our 101 based cpupri */ +/* Convert between a 140 based task->prio, and our 100 based cpupri */ static int convert_prio(int prio) { int cpupri; @@ -34,7 +34,7 @@ static int convert_prio(int prio) else if (prio >= MAX_RT_PRIO) cpupri = CPUPRI_NORMAL; else - cpupri = MAX_RT_PRIO - prio; + cpupri = MAX_RT_PRIO - prio - 1; return cpupri; } diff --git a/kernel/sched/cpupri.h b/kernel/sched/cpupri.h index 1a162369b8d4..e28e1ed12e3d 100644 --- a/kernel/sched/cpupri.h +++ b/kernel/sched/cpupri.h @@ -1,10 +1,10 @@ /* SPDX-License-Identifier: GPL-2.0 */ -#define CPUPRI_NR_PRIORITIES (MAX_RT_PRIO + 1) +#define CPUPRI_NR_PRIORITIES MAX_RT_PRIO #define CPUPRI_INVALID -1 #define CPUPRI_NORMAL 0 -/* values 2-100 are RT priorities 0-99 */ +/* values 1-99 are for RT1-RT99 priorities */ struct cpupri_vec { atomic_t count; -- cgit v1.2.3 From 934fc3314b39e16a89fc4d5d0d5cbfe71dcbe7b1 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 14 Oct 2020 21:06:49 +0200 Subject: sched/cpupri: Remap CPUPRI_NORMAL to MAX_RT_PRIO-1 This makes the mapping continuous and frees up 100 for other usage. Prev mapping: p->rt_priority p->prio newpri cpupri -1 -1 (CPUPRI_INVALID) 100 0 (CPUPRI_NORMAL) 1 98 98 1 ... 49 50 50 49 50 49 49 50 ... 99 0 0 99 New mapping: p->rt_priority p->prio newpri cpupri -1 -1 (CPUPRI_INVALID) 99 0 (CPUPRI_NORMAL) 1 98 98 1 ... 49 50 50 49 50 49 49 50 ... 99 0 0 99 Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dietmar Eggemann --- kernel/sched/cpupri.c | 34 +++++++++++++++++++++++++++------- kernel/sched/rt.c | 16 +++++++++------- 2 files changed, 36 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c index 8d9952a51664..e43491039226 100644 --- a/kernel/sched/cpupri.c +++ b/kernel/sched/cpupri.c @@ -24,17 +24,37 @@ */ #include "sched.h" -/* Convert between a 140 based task->prio, and our 100 based cpupri */ +/* + * p->rt_priority p->prio newpri cpupri + * + * -1 -1 (CPUPRI_INVALID) + * + * 99 0 (CPUPRI_NORMAL) + * + * 1 98 98 1 + * ... + * 49 50 50 49 + * 50 49 49 50 + * ... + * 99 0 0 99 + */ static int convert_prio(int prio) { int cpupri; - if (prio == CPUPRI_INVALID) - cpupri = CPUPRI_INVALID; - else if (prio >= MAX_RT_PRIO) - cpupri = CPUPRI_NORMAL; - else - cpupri = MAX_RT_PRIO - prio - 1; + switch (prio) { + case CPUPRI_INVALID: + cpupri = CPUPRI_INVALID; /* -1 */ + break; + + case 0 ... 98: + cpupri = MAX_RT_PRIO-1 - prio; /* 1 ... 
99 */ + break; + + case MAX_RT_PRIO-1: + cpupri = CPUPRI_NORMAL; /* 0 */ + break; + } return cpupri; } diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 49ec096a8aa1..8a3b1ba09253 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -89,8 +89,8 @@ void init_rt_rq(struct rt_rq *rt_rq) __set_bit(MAX_RT_PRIO, array->bitmap); #if defined CONFIG_SMP - rt_rq->highest_prio.curr = MAX_RT_PRIO; - rt_rq->highest_prio.next = MAX_RT_PRIO; + rt_rq->highest_prio.curr = MAX_RT_PRIO-1; + rt_rq->highest_prio.next = MAX_RT_PRIO-1; rt_rq->rt_nr_migratory = 0; rt_rq->overloaded = 0; plist_head_init(&rt_rq->pushable_tasks); @@ -161,7 +161,7 @@ void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq, { struct rq *rq = cpu_rq(cpu); - rt_rq->highest_prio.curr = MAX_RT_PRIO; + rt_rq->highest_prio.curr = MAX_RT_PRIO-1; rt_rq->rt_nr_boosted = 0; rt_rq->rq = rq; rt_rq->tg = tg; @@ -393,8 +393,9 @@ static void dequeue_pushable_task(struct rq *rq, struct task_struct *p) p = plist_first_entry(&rq->rt.pushable_tasks, struct task_struct, pushable_tasks); rq->rt.highest_prio.next = p->prio; - } else - rq->rt.highest_prio.next = MAX_RT_PRIO; + } else { + rq->rt.highest_prio.next = MAX_RT_PRIO-1; + } } #else @@ -1147,8 +1148,9 @@ dec_rt_prio(struct rt_rq *rt_rq, int prio) sched_find_first_bit(array->bitmap); } - } else - rt_rq->highest_prio.curr = MAX_RT_PRIO; + } else { + rt_rq->highest_prio.curr = MAX_RT_PRIO-1; + } dec_rt_prio_smp(rt_rq, prio, prev_prio); } -- cgit v1.2.3 From b13772f8135633f273f0cf742143b19cffbf9e1d Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 14 Oct 2020 21:39:04 +0200 Subject: sched/cpupri: Add CPUPRI_HIGHER Add CPUPRI_HIGHER above the RT99 priority to denote the CPU is in use by higher priority tasks (specifically deadline). XXX: we should probably drive PUSH-PULL from cpupri, that would automagically result in an RT-PUSH when DL sets cpupri to CPUPRI_HIGHER. Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dietmar Eggemann --- kernel/sched/cpupri.c | 12 +++++++++--- kernel/sched/cpupri.h | 3 ++- kernel/sched/deadline.c | 3 +++ 3 files changed, 14 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c index e43491039226..9ca0835f260a 100644 --- a/kernel/sched/cpupri.c +++ b/kernel/sched/cpupri.c @@ -11,7 +11,7 @@ * This code tracks the priority of each CPU so that global migration * decisions are easy to calculate. Each CPU can be in a state as follows: * - * (INVALID), NORMAL, RT1, ... RT99 + * (INVALID), NORMAL, RT1, ... RT99, HIGHER * * going from the lowest priority to the highest. CPUs in the INVALID state * are not eligible for routing. The system maintains this state with @@ -19,7 +19,7 @@ * in that class). Therefore a typical application without affinity * restrictions can find a suitable CPU with O(1) complexity (e.g. two bit * searches). For tasks with affinity restrictions, the algorithm has a - * worst case complexity of O(min(100, nr_domcpus)), though the scenario that + * worst case complexity of O(min(101, nr_domcpus)), though the scenario that * yields the worst case search is fairly contrived. */ #include "sched.h" @@ -37,6 +37,8 @@ * 50 49 49 50 * ... 
* 99 0 0 99 + * + * 100 100 (CPUPRI_HIGHER) */ static int convert_prio(int prio) { @@ -54,6 +56,10 @@ static int convert_prio(int prio) case MAX_RT_PRIO-1: cpupri = CPUPRI_NORMAL; /* 0 */ break; + + case MAX_RT_PRIO: + cpupri = CPUPRI_HIGHER; /* 100 */ + break; } return cpupri; @@ -195,7 +201,7 @@ int cpupri_find_fitness(struct cpupri *cp, struct task_struct *p, * cpupri_set - update the CPU priority setting * @cp: The cpupri context * @cpu: The target CPU - * @newpri: The priority (INVALID-RT99) to assign to this CPU + * @newpri: The priority (INVALID,NORMAL,RT1-RT99,HIGHER) to assign to this CPU * * Note: Assumes cpu_rq(cpu)->lock is locked * diff --git a/kernel/sched/cpupri.h b/kernel/sched/cpupri.h index e28e1ed12e3d..d6cba0020064 100644 --- a/kernel/sched/cpupri.h +++ b/kernel/sched/cpupri.h @@ -1,10 +1,11 @@ /* SPDX-License-Identifier: GPL-2.0 */ -#define CPUPRI_NR_PRIORITIES MAX_RT_PRIO +#define CPUPRI_NR_PRIORITIES (MAX_RT_PRIO+1) #define CPUPRI_INVALID -1 #define CPUPRI_NORMAL 0 /* values 1-99 are for RT1-RT99 priorities */ +#define CPUPRI_HIGHER 100 struct cpupri_vec { atomic_t count; diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 0f75e95ae024..0b45dd1068f7 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1394,6 +1394,8 @@ static void inc_dl_deadline(struct dl_rq *dl_rq, u64 deadline) if (dl_rq->earliest_dl.curr == 0 || dl_time_before(deadline, dl_rq->earliest_dl.curr)) { + if (dl_rq->earliest_dl.curr == 0) + cpupri_set(&rq->rd->cpupri, rq->cpu, CPUPRI_HIGHER); dl_rq->earliest_dl.curr = deadline; cpudl_set(&rq->rd->cpudl, rq->cpu, deadline); } @@ -1411,6 +1413,7 @@ static void dec_dl_deadline(struct dl_rq *dl_rq, u64 deadline) dl_rq->earliest_dl.curr = 0; dl_rq->earliest_dl.next = 0; cpudl_clear(&rq->rd->cpudl, rq->cpu); + cpupri_set(&rq->rd->cpupri, rq->cpu, rq->rt.highest_prio.curr); } else { struct rb_node *leftmost = dl_rq->root.rb_leftmost; struct sched_dl_entity *entry; -- cgit v1.2.3 From 45da7a2b0af8fa29dff2e6ba8926322068350fce Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 18 Aug 2020 10:48:17 +0200 Subject: sched/fair: Exclude the current CPU from find_new_ilb() It is possible for find_new_ilb() to select the current CPU, however, this only happens from newidle balancing, in which case need_resched() will be true, and consequently nohz_csd_func() will not trigger the softirq. Exclude the current CPU from becoming an ILB target. Signed-off-by: Peter Zijlstra (Intel) --- kernel/sched/fair.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index b9368d123451..cd9a37c0601b 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -10056,6 +10056,10 @@ static inline int find_new_ilb(void) for_each_cpu_and(ilb, nohz.idle_cpus_mask, housekeeping_cpumask(HK_FLAG_MISC)) { + + if (ilb == smp_processor_id()) + continue; + if (idle_cpu(ilb)) return ilb; } -- cgit v1.2.3 From 5bc78502322a5e4eef3f1b2a2813751dc6434143 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Tue, 20 Oct 2020 09:47:13 -0400 Subject: sched: fix exit_mm vs membarrier (v4) exit_mm should issue memory barriers after user-space memory accesses, before clearing current->mm, to order user-space memory accesses performed prior to exit_mm before clearing tsk->mm, which has the effect of skipping the membarrier private expedited IPIs. exit_mm should also update the runqueue's membarrier_state so membarrier global expedited IPIs are not sent when they are not needed. 
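For readers who have not used the system call being discussed, the following is a minimal userspace sketch of registering for and issuing a private expedited membarrier (error handling omitted; it only shows the call pattern and is not part of this patch):

#include <linux/membarrier.h>
#include <sys/syscall.h>
#include <unistd.h>

static long membarrier(int cmd, unsigned int flags)
{
        return syscall(__NR_membarrier, cmd, flags);
}

int main(void)
{
        /* Register once; later calls act as a memory barrier on every
         * thread of this process that is running user code. */
        membarrier(MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED, 0);
        membarrier(MEMBARRIER_CMD_PRIVATE_EXPEDITED, 0);
        return 0;
}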
The membarrier system call can be issued concurrently with do_exit if we have thread groups created with CLONE_VM but not CLONE_THREAD. Here is the scenario I have in mind: Two thread groups are created, A and B. Thread group B is created by issuing clone from group A with flag CLONE_VM set, but not CLONE_THREAD. Let's assume we have a single thread within each thread group (Thread A and Thread B). The AFAIU we can have: Userspace variables: int x = 0, y = 0; CPU 0 CPU 1 Thread A Thread B (in thread group A) (in thread group B) x = 1 barrier() y = 1 exit() exit_mm() current->mm = NULL; r1 = load y membarrier() skips CPU 0 (no IPI) because its current mm is NULL r2 = load x BUG_ON(r1 == 1 && r2 == 0) Signed-off-by: Mathieu Desnoyers Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20201020134715.13909-2-mathieu.desnoyers@efficios.com --- include/linux/sched/mm.h | 5 +++++ kernel/exit.c | 16 +++++++++++++++- kernel/sched/membarrier.c | 12 ++++++++++++ 3 files changed, 32 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h index d5ece7a9a403..a91fb3ad9ec7 100644 --- a/include/linux/sched/mm.h +++ b/include/linux/sched/mm.h @@ -347,6 +347,8 @@ static inline void membarrier_mm_sync_core_before_usermode(struct mm_struct *mm) extern void membarrier_exec_mmap(struct mm_struct *mm); +extern void membarrier_update_current_mm(struct mm_struct *next_mm); + #else #ifdef CONFIG_ARCH_HAS_MEMBARRIER_CALLBACKS static inline void membarrier_arch_switch_mm(struct mm_struct *prev, @@ -361,6 +363,9 @@ static inline void membarrier_exec_mmap(struct mm_struct *mm) static inline void membarrier_mm_sync_core_before_usermode(struct mm_struct *mm) { } +static inline void membarrier_update_current_mm(struct mm_struct *next_mm) +{ +} #endif #endif /* _LINUX_SCHED_MM_H */ diff --git a/kernel/exit.c b/kernel/exit.c index 87a2d515de0d..a3dd6b36f99a 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -475,10 +475,24 @@ static void exit_mm(void) BUG_ON(mm != current->active_mm); /* more a memory barrier than a real lock */ task_lock(current); + /* + * When a thread stops operating on an address space, the loop + * in membarrier_private_expedited() may not observe that + * tsk->mm, and the loop in membarrier_global_expedited() may + * not observe a MEMBARRIER_STATE_GLOBAL_EXPEDITED + * rq->membarrier_state, so those would not issue an IPI. + * Membarrier requires a memory barrier after accessing + * user-space memory, before clearing tsk->mm or the + * rq->membarrier_state. 
+ */ + smp_mb__after_spinlock(); + local_irq_disable(); current->mm = NULL; - mmap_read_unlock(mm); + membarrier_update_current_mm(NULL); enter_lazy_tlb(mm, current); + local_irq_enable(); task_unlock(current); + mmap_read_unlock(mm); mm_update_next_owner(mm); mmput(mm); if (test_thread_flag(TIF_MEMDIE)) diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c index e23e74d52db5..aac329258af0 100644 --- a/kernel/sched/membarrier.c +++ b/kernel/sched/membarrier.c @@ -76,6 +76,18 @@ void membarrier_exec_mmap(struct mm_struct *mm) this_cpu_write(runqueues.membarrier_state, 0); } +void membarrier_update_current_mm(struct mm_struct *next_mm) +{ + struct rq *rq = this_rq(); + int membarrier_state = 0; + + if (next_mm) + membarrier_state = atomic_read(&next_mm->membarrier_state); + if (READ_ONCE(rq->membarrier_state) == membarrier_state) + return; + WRITE_ONCE(rq->membarrier_state, membarrier_state); +} + static int membarrier_global_expedited(void) { int cpu; -- cgit v1.2.3 From 618758ed3a4f7d790414d020b362111748ebbf9f Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Tue, 20 Oct 2020 09:47:14 -0400 Subject: sched: membarrier: cover kthread_use_mm (v4) Add comments and memory barrier to kthread_use_mm and kthread_unuse_mm to allow the effect of membarrier(2) to apply to kthreads accessing user-space memory as well. Given that no prior kthread use this guarantee and that it only affects kthreads, adding this guarantee does not affect user-space ABI. Refine the check in membarrier_global_expedited to exclude runqueues running the idle thread rather than all kthreads from the IPI cpumask. Now that membarrier_global_expedited can IPI kthreads, the scheduler also needs to update the runqueue's membarrier_state when entering lazy TLB state. Signed-off-by: Mathieu Desnoyers Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20201020134715.13909-3-mathieu.desnoyers@efficios.com --- kernel/kthread.c | 21 +++++++++++++++++++++ kernel/sched/idle.c | 1 + kernel/sched/membarrier.c | 7 +++---- 3 files changed, 25 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/kthread.c b/kernel/kthread.c index e29773c82b70..481428fe5f22 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -1248,6 +1248,7 @@ void kthread_use_mm(struct mm_struct *mm) tsk->active_mm = mm; } tsk->mm = mm; + membarrier_update_current_mm(mm); switch_mm_irqs_off(active_mm, mm, tsk); local_irq_enable(); task_unlock(tsk); @@ -1255,8 +1256,19 @@ void kthread_use_mm(struct mm_struct *mm) finish_arch_post_lock_switch(); #endif + /* + * When a kthread starts operating on an address space, the loop + * in membarrier_{private,global}_expedited() may not observe + * that tsk->mm, and not issue an IPI. Membarrier requires a + * memory barrier after storing to tsk->mm, before accessing + * user-space memory. A full memory barrier for membarrier + * {PRIVATE,GLOBAL}_EXPEDITED is implicitly provided by + * mmdrop(), or explicitly with smp_mb(). + */ if (active_mm != mm) mmdrop(active_mm); + else + smp_mb(); to_kthread(tsk)->oldfs = force_uaccess_begin(); } @@ -1276,9 +1288,18 @@ void kthread_unuse_mm(struct mm_struct *mm) force_uaccess_end(to_kthread(tsk)->oldfs); task_lock(tsk); + /* + * When a kthread stops operating on an address space, the loop + * in membarrier_{private,global}_expedited() may not observe + * that tsk->mm, and not issue an IPI. Membarrier requires a + * memory barrier after accessing user-space memory, before + * clearing tsk->mm. 
+ */ + smp_mb__after_spinlock(); sync_mm_rss(mm); local_irq_disable(); tsk->mm = NULL; + membarrier_update_current_mm(NULL); /* active_mm is still 'mm' */ enter_lazy_tlb(mm, tsk); local_irq_enable(); diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 24d0ee26377d..846743e39b3c 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -338,6 +338,7 @@ void play_idle_precise(u64 duration_ns, u64 latency_ns) WARN_ON_ONCE(!(current->flags & PF_KTHREAD)); WARN_ON_ONCE(!(current->flags & PF_NO_SETAFFINITY)); WARN_ON_ONCE(!duration_ns); + WARN_ON_ONCE(current->mm); rcu_sleep_check(); preempt_disable(); diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c index aac329258af0..f223f3590b8f 100644 --- a/kernel/sched/membarrier.c +++ b/kernel/sched/membarrier.c @@ -126,12 +126,11 @@ static int membarrier_global_expedited(void) continue; /* - * Skip the CPU if it runs a kernel thread. The scheduler - * leaves the prior task mm in place as an optimization when - * scheduling a kthread. + * Skip the CPU if it runs a kernel thread which is not using + * a task mm. */ p = rcu_dereference(cpu_rq(cpu)->curr); - if (p->flags & PF_KTHREAD) + if (!p->mm) continue; __cpumask_set_cpu(cpu, tmpmask); -- cgit v1.2.3 From 25595eb6aaa9fbb31330f1e0b400642694bc6574 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Tue, 20 Oct 2020 09:47:15 -0400 Subject: sched: membarrier: document memory ordering scenarios Document membarrier ordering scenarios in membarrier.c. Thanks to Alan Stern for refreshing my memory. Now that I have those in mind, it seems appropriate to serialize them to comments for posterity. Signed-off-by: Mathieu Desnoyers Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20201020134715.13909-4-mathieu.desnoyers@efficios.com --- kernel/sched/membarrier.c | 128 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 128 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c index f223f3590b8f..5a40b3828ff2 100644 --- a/kernel/sched/membarrier.c +++ b/kernel/sched/membarrier.c @@ -6,6 +6,134 @@ */ #include "sched.h" +/* + * For documentation purposes, here are some membarrier ordering + * scenarios to keep in mind: + * + * A) Userspace thread execution after IPI vs membarrier's memory + * barrier before sending the IPI + * + * Userspace variables: + * + * int x = 0, y = 0; + * + * The memory barrier at the start of membarrier() on CPU0 is necessary in + * order to enforce the guarantee that any writes occurring on CPU0 before + * the membarrier() is executed will be visible to any code executing on + * CPU1 after the IPI-induced memory barrier: + * + * CPU0 CPU1 + * + * x = 1 + * membarrier(): + * a: smp_mb() + * b: send IPI IPI-induced mb + * c: smp_mb() + * r2 = y + * y = 1 + * barrier() + * r1 = x + * + * BUG_ON(r1 == 0 && r2 == 0) + * + * The write to y and load from x by CPU1 are unordered by the hardware, + * so it's possible to have "r1 = x" reordered before "y = 1" at any + * point after (b). If the memory barrier at (a) is omitted, then "x = 1" + * can be reordered after (a) (although not after (c)), so we get r1 == 0 + * and r2 == 0. This violates the guarantee that membarrier() is + * supposed by provide. + * + * The timing of the memory barrier at (a) has to ensure that it executes + * before the IPI-induced memory barrier on CPU1. 
+ * + * B) Userspace thread execution before IPI vs membarrier's memory + * barrier after completing the IPI + * + * Userspace variables: + * + * int x = 0, y = 0; + * + * The memory barrier at the end of membarrier() on CPU0 is necessary in + * order to enforce the guarantee that any writes occurring on CPU1 before + * the membarrier() is executed will be visible to any code executing on + * CPU0 after the membarrier(): + * + * CPU0 CPU1 + * + * x = 1 + * barrier() + * y = 1 + * r2 = y + * membarrier(): + * a: smp_mb() + * b: send IPI IPI-induced mb + * c: smp_mb() + * r1 = x + * BUG_ON(r1 == 0 && r2 == 1) + * + * The writes to x and y are unordered by the hardware, so it's possible to + * have "r2 = 1" even though the write to x doesn't execute until (b). If + * the memory barrier at (c) is omitted then "r1 = x" can be reordered + * before (b) (although not before (a)), so we get "r1 = 0". This violates + * the guarantee that membarrier() is supposed to provide. + * + * The timing of the memory barrier at (c) has to ensure that it executes + * after the IPI-induced memory barrier on CPU1. + * + * C) Scheduling userspace thread -> kthread -> userspace thread vs membarrier + * + * CPU0 CPU1 + * + * membarrier(): + * a: smp_mb() + * d: switch to kthread (includes mb) + * b: read rq->curr->mm == NULL + * e: switch to user (includes mb) + * c: smp_mb() + * + * Using the scenario from (A), we can show that (a) needs to be paired + * with (e). Using the scenario from (B), we can show that (c) needs to + * be paired with (d). + * + * D) exit_mm vs membarrier + * + * Two thread groups are created, A and B. Thread group B is created by + * issuing clone from group A with flag CLONE_VM set, but not CLONE_THREAD. + * Let's assume we have a single thread within each thread group (Thread A + * and Thread B). Thread A runs on CPU0, Thread B runs on CPU1. + * + * CPU0 CPU1 + * + * membarrier(): + * a: smp_mb() + * exit_mm(): + * d: smp_mb() + * e: current->mm = NULL + * b: read rq->curr->mm == NULL + * c: smp_mb() + * + * Using scenario (B), we can show that (c) needs to be paired with (d). + * + * E) kthread_{use,unuse}_mm vs membarrier + * + * CPU0 CPU1 + * + * membarrier(): + * a: smp_mb() + * kthread_unuse_mm() + * d: smp_mb() + * e: current->mm = NULL + * b: read rq->curr->mm == NULL + * kthread_use_mm() + * f: current->mm = mm + * g: smp_mb() + * c: smp_mb() + * + * Using the scenario from (A), we can show that (a) needs to be paired + * with (g). Using the scenario from (B), we can show that (c) needs to + * be paired with (d). + */ + /* * Bitmask made from a "or" of all commands within enum membarrier_cmd, * except MEMBARRIER_CMD_QUERY. -- cgit v1.2.3 From 345a957fcc95630bf5535d7668a59ed983eb49a7 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 20 Oct 2020 16:46:55 +0200 Subject: sched: Reenable interrupts in do_sched_yield() do_sched_yield() invokes schedule() with interrupts disabled which is not allowed. This goes back to the pre git era to commit a6efb709806c ("[PATCH] irqlock patch 2.5.27-H6") in the history tree. Reenable interrupts and remove the misleading comment which "explains" it. 
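For clarity, after this change the tail of do_sched_yield() looks roughly like (see the hunk below):

	schedstat_inc(rq->yld_count);
	current->sched_class->yield_task(rq);

	preempt_disable();
	rq_unlock_irq(rq, &rf);		/* re-enables interrupts */
	sched_preempt_enable_no_resched();

	schedule();

so schedule() is now entered with interrupts enabled, as required.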
Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/87r1pt7y5c.fsf@nanos.tec.linutronix.de --- kernel/sched/core.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index d2003a7d5ab5..6f533bb7d3b9 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6094,12 +6094,8 @@ static void do_sched_yield(void) schedstat_inc(rq->yld_count); current->sched_class->yield_task(rq); - /* - * Since we are going to call schedule() anyway, there's - * no need to preempt or enable interrupts: - */ preempt_disable(); - rq_unlock(rq, &rf); + rq_unlock_irq(rq, &rf); sched_preempt_enable_no_resched(); schedule(); -- cgit v1.2.3 From 43c31ac0e665d942fcaba83a725a8b1aeeb7adf0 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 21 Oct 2020 15:45:33 +0200 Subject: sched: Remove relyance on STRUCT_ALIGNMENT Florian reported that all of kernel/sched/ is rebuild when CONFIG_BLK_DEV_INITRD is changed, which, while not a bug is unexpected. This is due to us including vmlinux.lds.h. Jakub explained that the problem is that we put the alignment requirement on the type instead of on a variable. Type alignment is a minimum, the compiler is free to pick any larger alignment for a specific instance of the type (eg. the variable). So force the type alignment on all individual variable definitions and remove the undesired dependency on vmlinux.lds.h. Fixes: 85c2ce9104eb ("sched, vmlinux.lds: Increase STRUCT_ALIGNMENT to 64 bytes for GCC-4.9") Reported-by: Florian Fainelli Suggested-by: Jakub Jelinek Signed-off-by: Peter Zijlstra (Intel) --- kernel/sched/deadline.c | 4 ++-- kernel/sched/fair.c | 4 ++-- kernel/sched/idle.c | 4 ++-- kernel/sched/rt.c | 4 ++-- kernel/sched/sched.h | 17 +++++++++++++++-- kernel/sched/stop_task.c | 3 +-- 6 files changed, 24 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 0b45dd1068f7..c6ce90fbb7ac 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -2522,8 +2522,8 @@ static void prio_changed_dl(struct rq *rq, struct task_struct *p, } } -const struct sched_class dl_sched_class - __section("__dl_sched_class") = { +DEFINE_SCHED_CLASS(dl) = { + .enqueue_task = enqueue_task_dl, .dequeue_task = dequeue_task_dl, .yield_task = yield_task_dl, diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index cd9a37c0601b..f30d35a43f73 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -11171,8 +11171,8 @@ static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task /* * All the scheduling class methods: */ -const struct sched_class fair_sched_class - __section("__fair_sched_class") = { +DEFINE_SCHED_CLASS(fair) = { + .enqueue_task = enqueue_task_fair, .dequeue_task = dequeue_task_fair, .yield_task = yield_task_fair, diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 846743e39b3c..9da69c4e0ee9 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -458,8 +458,8 @@ static void update_curr_idle(struct rq *rq) /* * Simple, special scheduling class for the per-CPU idle tasks: */ -const struct sched_class idle_sched_class - __section("__idle_sched_class") = { +DEFINE_SCHED_CLASS(idle) = { + /* no enqueue/yield_task for idle tasks */ /* dequeue is not valid, we print a debug message there: */ diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 8a3b1ba09253..9b27352e0c1b 100644 --- 
a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -2431,8 +2431,8 @@ static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task) return 0; } -const struct sched_class rt_sched_class - __section("__rt_sched_class") = { +DEFINE_SCHED_CLASS(rt) = { + .enqueue_task = enqueue_task_rt, .dequeue_task = dequeue_task_rt, .yield_task = yield_task_rt, diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 965b2968c13a..3e45055efbc5 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -67,7 +67,6 @@ #include #include -#include #ifdef CONFIG_PARAVIRT # include @@ -1836,7 +1835,7 @@ struct sched_class { #ifdef CONFIG_FAIR_GROUP_SCHED void (*task_change_group)(struct task_struct *p, int type); #endif -} __aligned(STRUCT_ALIGNMENT); /* STRUCT_ALIGN(), vmlinux.lds.h */ +}; static inline void put_prev_task(struct rq *rq, struct task_struct *prev) { @@ -1850,6 +1849,20 @@ static inline void set_next_task(struct rq *rq, struct task_struct *next) next->sched_class->set_next_task(rq, next, false); } + +/* + * Helper to define a sched_class instance; each one is placed in a separate + * section which is ordered by the linker script: + * + * include/asm-generic/vmlinux.lds.h + * + * Also enforce alignment on the instance, not the type, to guarantee layout. + */ +#define DEFINE_SCHED_CLASS(name) \ +const struct sched_class name##_sched_class \ + __aligned(__alignof__(struct sched_class)) \ + __section("__" #name "_sched_class") + /* Defined in include/asm-generic/vmlinux.lds.h */ extern struct sched_class __begin_sched_classes[]; extern struct sched_class __end_sched_classes[]; diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c index ceb5b6b12561..91bb10cc070e 100644 --- a/kernel/sched/stop_task.c +++ b/kernel/sched/stop_task.c @@ -109,8 +109,7 @@ static void update_curr_stop(struct rq *rq) /* * Simple, special scheduling class for the per-CPU stop tasks: */ -const struct sched_class stop_sched_class - __section("__stop_sched_class") = { +DEFINE_SCHED_CLASS(stop) = { .enqueue_task = enqueue_task_stop, .dequeue_task = dequeue_task_stop, -- cgit v1.2.3 From d8fcb81f1acf651a0e50eacecca43d0524984f87 Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Thu, 22 Oct 2020 15:15:50 +0200 Subject: sched/fair: Check for idle core in wake_affine In the case of a thread wakeup, wake_affine determines whether a core will be chosen for the thread on the socket where the thread ran previously or on the socket of the waker. This is done primarily by comparing the load of the core where th thread ran previously (prev) and the load of the waker (this). commit 11f10e5420f6 ("sched/fair: Use load instead of runnable load in wakeup path") changed the load computation from the runnable load to the load average, where the latter includes the load of threads that have already blocked on the core. When a short-running daemon processes happens to run on prev, this change raised the situation that prev could appear to have a greater load than this, even when prev is actually idle. When prev and this are on the same socket, the idle prev is detected later, in select_idle_sibling. But if that does not hold, prev is completely ignored, causing the waking thread to move to the socket of the waker. In the case of N mostly active threads on N cores, this triggers other migrations and hurts performance. 
In contrast, before commit 11f10e5420f6, the load on an idle core was 0, and in the case of a non-idle waker core, the effect of wake_affine was to select prev as the target for searching for a core for the waking thread. To avoid unnecessary migrations, extend wake_affine_idle to check whether the core where the thread previously ran is currently idle, and if so simply return that core as the target. [1] commit 11f10e5420f6ce ("sched/fair: Use load instead of runnable load in wakeup path") This particularly has an impact when using the ondemand power manager, where kworkers run every 0.004 seconds on all cores, increasing the likelihood that an idle core will be considered to have a load. The following numbers were obtained with the benchmarking tool hyperfine (https://github.com/sharkdp/hyperfine) on the NAS parallel benchmarks (https://www.nas.nasa.gov/publications/npb.html). The tests were run on an 80-core Intel(R) Xeon(R) CPU E7-8870 v4 @ 2.10GHz. Active (intel_pstate) and passive (intel_cpufreq) power management were used. Times are in seconds. All experiments use all 160 hardware threads. v5.9/intel-pstate v5.9+patch/intel-pstate bt.C.c 24.725724+-0.962340 23.349608+-1.607214 lu.C.x 29.105952+-4.804203 25.249052+-5.561617 sp.C.x 31.220696+-1.831335 30.227760+-2.429792 ua.C.x 26.606118+-1.767384 25.778367+-1.263850 v5.9/ondemand v5.9+patch/ondemand bt.C.c 25.330360+-1.028316 23.544036+-1.020189 lu.C.x 35.872659+-4.872090 23.719295+-3.883848 sp.C.x 32.141310+-2.289541 29.125363+-0.872300 ua.C.x 29.024597+-1.667049 25.728888+-1.539772 On the smaller data sets (A and B) and on the other NAS benchmarks there is no impact on performance. This also has a major impact on the splash2x.volrend benchmark of the parsec benchmark suite that goes from 1m25 without this patch to 0m45, in active (intel_pstate) mode. Fixes: 11f10e5420f6 ("sched/fair: Use load instead of runnable load in wakeup path") Signed-off-by: Julia Lawall Signed-off-by: Peter Zijlstra (Intel) Reviewed-by Vincent Guittot Acked-by: Mel Gorman Link: https://lkml.kernel.org/r/1603372550-14680-1-git-send-email-Julia.Lawall@inria.fr --- kernel/sched/fair.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index f30d35a43f73..52cacfc62922 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5813,6 +5813,9 @@ wake_affine_idle(int this_cpu, int prev_cpu, int sync) if (sync && cpu_rq(this_cpu)->nr_running == 1) return this_cpu; + if (available_idle_cpu(prev_cpu)) + return prev_cpu; + return nr_cpumask_bits; } -- cgit v1.2.3 From 8d97e71811aaafe4abf611dc24822fd6e73df1a1 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Thu, 1 Oct 2020 06:57:46 -0700 Subject: perf/core: Add PERF_SAMPLE_DATA_PAGE_SIZE Current perf can report both virtual addresses and physical addresses, but not the MMU page size. Without the MMU page size information of the utilized page, users cannot decide whether to promote/demote large pages to optimize memory usage. Add a new sample type for the data MMU page size. Current perf already has a facility to collect data virtual addresses. A page walker is required to walk the pages tables and calculate the MMU page size from a given virtual address. On some platforms, e.g., X86, the page walker is invoked in an NMI handler. So the page walker must be NMI-safe and low overhead. Besides, the page walker should work for both user and kernel virtual address. 
The existing generic page walker, e.g., walk_page_range_novma(), is a little bit complex and doesn't guarantee the NMI-safe. The follow_page() is only for user-virtual address. Add a new function perf_get_page_size() to walk the page tables and calculate the MMU page size. In the function: - Interrupts have to be disabled to prevent any teardown of the page tables. - For user space threads, the current->mm is used for the page walker. For kernel threads and the like, the current->mm is NULL. The init_mm is used for the page walker. The active_mm is not used here, because it can be NULL. Quote from Peter Zijlstra, "context_switch() can set prev->active_mm to NULL when it transfers it to @next. It does this before @current is updated. So an NMI that comes in between this active_mm swizzling and updating @current will see !active_mm." - The MMU page size is calculated from the page table level. The method should work for all architectures, but it has only been verified on X86. Should there be some architectures, which support perf, where the method doesn't work, it can be fixed later separately. Reporting the wrong page size would not be fatal for the architecture. Some under discussion features may impact the method in the future. Quote from Dave Hansen, "There are lots of weird things folks are trying to do with the page tables, like Address Space Isolation. For instance, if you get a perf NMI when running userspace, current->mm->pgd is *different* than the PGD that was in use when userspace was running. It's close enough today, but it might not stay that way." If the case happens later, lots of consecutive page walk errors will happen. The worst case is that lots of page-size '0' are returned, which would not be fatal. In the perf tool, a check is implemented to detect this case. Once it happens, a kernel patch could be implemented accordingly then. 
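For reference, a hedged user-space sketch of requesting the new sample type through perf_event_open(); the event encoding and sampling period below are placeholders, not something this patch prescribes:

	#include <linux/perf_event.h>
	#include <string.h>
	#include <sys/syscall.h>
	#include <unistd.h>

	static int open_page_size_sampling(void)
	{
		struct perf_event_attr attr;

		memset(&attr, 0, sizeof(attr));
		attr.size = sizeof(attr);
		attr.type = PERF_TYPE_RAW;	/* assumed: a precise memory event */
		attr.config = 0;		/* placeholder event encoding */
		attr.sample_period = 10007;
		attr.precise_ip = 2;
		attr.sample_type = PERF_SAMPLE_IP | PERF_SAMPLE_ADDR |
				   PERF_SAMPLE_DATA_PAGE_SIZE;

		return syscall(__NR_perf_event_open, &attr,
			       0 /* this task */, -1 /* any cpu */,
			       -1 /* no group */, 0);
	}

Each PERF_RECORD_SAMPLE then carries a u64 data_page_size field alongside the sampled data address.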
Suggested-by: Peter Zijlstra Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20201001135749.2804-2-kan.liang@linux.intel.com --- include/linux/perf_event.h | 1 + include/uapi/linux/perf_event.h | 4 +- kernel/events/core.c | 103 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 107 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 0c19d279b97f..7e3785dd27d9 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1034,6 +1034,7 @@ struct perf_sample_data { u64 phys_addr; u64 cgroup; + u64 data_page_size; } ____cacheline_aligned; /* default value for data source */ diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index 077e7ee69e3d..cc6ea346e9f9 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -143,8 +143,9 @@ enum perf_event_sample_format { PERF_SAMPLE_PHYS_ADDR = 1U << 19, PERF_SAMPLE_AUX = 1U << 20, PERF_SAMPLE_CGROUP = 1U << 21, + PERF_SAMPLE_DATA_PAGE_SIZE = 1U << 22, - PERF_SAMPLE_MAX = 1U << 22, /* non-ABI */ + PERF_SAMPLE_MAX = 1U << 23, /* non-ABI */ __PERF_SAMPLE_CALLCHAIN_EARLY = 1ULL << 63, /* non-ABI; internal use */ }; @@ -896,6 +897,7 @@ enum perf_event_type { * { u64 phys_addr;} && PERF_SAMPLE_PHYS_ADDR * { u64 size; * char data[size]; } && PERF_SAMPLE_AUX + * { u64 data_page_size;} && PERF_SAMPLE_DATA_PAGE_SIZE * }; */ PERF_RECORD_SAMPLE = 9, diff --git a/kernel/events/core.c b/kernel/events/core.c index fb662eb4fb69..a796db2f3b57 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -51,6 +51,7 @@ #include #include #include +#include #include "internal.h" @@ -1894,6 +1895,9 @@ static void __perf_event_header_size(struct perf_event *event, u64 sample_type) if (sample_type & PERF_SAMPLE_CGROUP) size += sizeof(data->cgroup); + if (sample_type & PERF_SAMPLE_DATA_PAGE_SIZE) + size += sizeof(data->data_page_size); + event->header_size = size; } @@ -6938,6 +6942,9 @@ void perf_output_sample(struct perf_output_handle *handle, if (sample_type & PERF_SAMPLE_CGROUP) perf_output_put(handle, data->cgroup); + if (sample_type & PERF_SAMPLE_DATA_PAGE_SIZE) + perf_output_put(handle, data->data_page_size); + if (sample_type & PERF_SAMPLE_AUX) { perf_output_put(handle, data->aux_size); @@ -6995,6 +7002,94 @@ static u64 perf_virt_to_phys(u64 virt) return phys_addr; } +#ifdef CONFIG_MMU + +/* + * Return the MMU page size of a given virtual address + */ +static u64 __perf_get_page_size(struct mm_struct *mm, unsigned long addr) +{ + pgd_t *pgd; + p4d_t *p4d; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + + pgd = pgd_offset(mm, addr); + if (pgd_none(*pgd)) + return 0; + + p4d = p4d_offset(pgd, addr); + if (!p4d_present(*p4d)) + return 0; + + if (p4d_leaf(*p4d)) + return 1ULL << P4D_SHIFT; + + pud = pud_offset(p4d, addr); + if (!pud_present(*pud)) + return 0; + + if (pud_leaf(*pud)) + return 1ULL << PUD_SHIFT; + + pmd = pmd_offset(pud, addr); + if (!pmd_present(*pmd)) + return 0; + + if (pmd_leaf(*pmd)) + return 1ULL << PMD_SHIFT; + + pte = pte_offset_map(pmd, addr); + if (!pte_present(*pte)) { + pte_unmap(pte); + return 0; + } + + pte_unmap(pte); + return PAGE_SIZE; +} + +#else + +static u64 __perf_get_page_size(struct mm_struct *mm, unsigned long addr) +{ + return 0; +} + +#endif + +static u64 perf_get_page_size(unsigned long addr) +{ + struct mm_struct *mm; + unsigned long flags; + u64 size; + + if (!addr) + return 0; + + /* + * Software page-table walkers must disable IRQs, 
+ * which prevents any tear down of the page tables. + */ + local_irq_save(flags); + + mm = current->mm; + if (!mm) { + /* + * For kernel threads and the like, use init_mm so that + * we can find kernel memory. + */ + mm = &init_mm; + } + + size = __perf_get_page_size(mm, addr); + + local_irq_restore(flags); + + return size; +} + static struct perf_callchain_entry __empty_callchain = { .nr = 0, }; struct perf_callchain_entry * @@ -7150,6 +7245,14 @@ void perf_prepare_sample(struct perf_event_header *header, } #endif + /* + * PERF_DATA_PAGE_SIZE requires PERF_SAMPLE_ADDR. If the user doesn't + * require PERF_SAMPLE_ADDR, kernel implicitly retrieve the data->addr, + * but the value will not dump to the userspace. + */ + if (sample_type & PERF_SAMPLE_DATA_PAGE_SIZE) + data->data_page_size = perf_get_page_size(data->addr); + if (sample_type & PERF_SAMPLE_AUX) { u64 size; -- cgit v1.2.3 From 995f088efebe1eba0282a6ffa12411b37f8990c2 Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Thu, 1 Oct 2020 06:57:49 -0700 Subject: perf/core: Add support for PERF_SAMPLE_CODE_PAGE_SIZE When studying code layout, it is useful to capture the page size of the sampled code address. Add a new sample type for code page size. The new sample type requires collecting the ip. The code page size can be calculated from the NMI-safe perf_get_page_size(). For large PEBS, it's very unlikely that the mapping is gone for the earlier PEBS records. Enable the feature for the large PEBS. The worst case is that page-size '0' is returned. Signed-off-by: Kan Liang Signed-off-by: Stephane Eranian Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20201001135749.2804-5-kan.liang@linux.intel.com --- arch/x86/events/perf_event.h | 2 +- include/linux/perf_event.h | 1 + include/uapi/linux/perf_event.h | 4 +++- kernel/events/core.c | 11 ++++++++++- 4 files changed, 15 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index ee2b9b9fc2a5..10032f023fcc 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -132,7 +132,7 @@ struct amd_nb { PERF_SAMPLE_DATA_SRC | PERF_SAMPLE_IDENTIFIER | \ PERF_SAMPLE_TRANSACTION | PERF_SAMPLE_PHYS_ADDR | \ PERF_SAMPLE_REGS_INTR | PERF_SAMPLE_REGS_USER | \ - PERF_SAMPLE_PERIOD) + PERF_SAMPLE_PERIOD | PERF_SAMPLE_CODE_PAGE_SIZE) #define PEBS_GP_REGS \ ((1ULL << PERF_REG_X86_AX) | \ diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 7e3785dd27d9..e533b03af053 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1035,6 +1035,7 @@ struct perf_sample_data { u64 phys_addr; u64 cgroup; u64 data_page_size; + u64 code_page_size; } ____cacheline_aligned; /* default value for data source */ diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index cc6ea346e9f9..c2f20ee3124d 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -144,8 +144,9 @@ enum perf_event_sample_format { PERF_SAMPLE_AUX = 1U << 20, PERF_SAMPLE_CGROUP = 1U << 21, PERF_SAMPLE_DATA_PAGE_SIZE = 1U << 22, + PERF_SAMPLE_CODE_PAGE_SIZE = 1U << 23, - PERF_SAMPLE_MAX = 1U << 23, /* non-ABI */ + PERF_SAMPLE_MAX = 1U << 24, /* non-ABI */ __PERF_SAMPLE_CALLCHAIN_EARLY = 1ULL << 63, /* non-ABI; internal use */ }; @@ -898,6 +899,7 @@ enum perf_event_type { * { u64 size; * char data[size]; } && PERF_SAMPLE_AUX * { u64 data_page_size;} && PERF_SAMPLE_DATA_PAGE_SIZE + * { u64 code_page_size;} && PERF_SAMPLE_CODE_PAGE_SIZE * }; */ 
PERF_RECORD_SAMPLE = 9, diff --git a/kernel/events/core.c b/kernel/events/core.c index a796db2f3b57..7f655d19b8c4 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1898,6 +1898,9 @@ static void __perf_event_header_size(struct perf_event *event, u64 sample_type) if (sample_type & PERF_SAMPLE_DATA_PAGE_SIZE) size += sizeof(data->data_page_size); + if (sample_type & PERF_SAMPLE_CODE_PAGE_SIZE) + size += sizeof(data->code_page_size); + event->header_size = size; } @@ -6945,6 +6948,9 @@ void perf_output_sample(struct perf_output_handle *handle, if (sample_type & PERF_SAMPLE_DATA_PAGE_SIZE) perf_output_put(handle, data->data_page_size); + if (sample_type & PERF_SAMPLE_CODE_PAGE_SIZE) + perf_output_put(handle, data->code_page_size); + if (sample_type & PERF_SAMPLE_AUX) { perf_output_put(handle, data->aux_size); @@ -7125,7 +7131,7 @@ void perf_prepare_sample(struct perf_event_header *header, __perf_event_header__init_id(header, data, event); - if (sample_type & PERF_SAMPLE_IP) + if (sample_type & (PERF_SAMPLE_IP | PERF_SAMPLE_CODE_PAGE_SIZE)) data->ip = perf_instruction_pointer(regs); if (sample_type & PERF_SAMPLE_CALLCHAIN) { @@ -7253,6 +7259,9 @@ void perf_prepare_sample(struct perf_event_header *header, if (sample_type & PERF_SAMPLE_DATA_PAGE_SIZE) data->data_page_size = perf_get_page_size(data->addr); + if (sample_type & PERF_SAMPLE_CODE_PAGE_SIZE) + data->code_page_size = perf_get_page_size(data->ip); + if (sample_type & PERF_SAMPLE_AUX) { u64 size; -- cgit v1.2.3 From 51b646b2d9f84d6ff6300e3c1d09f2be4329a424 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 9 Oct 2020 11:09:27 +0200 Subject: perf,mm: Handle non-page-table-aligned hugetlbfs A limited nunmber of architectures support hugetlbfs sizes that do not align with the page-tables (ARM64, Power, Sparc64). Add support for this to the generic perf_get_page_size() implementation, and also allow an architecture to override this implementation. This latter is only needed when it uses non-page-table aligned huge pages in its kernel map. Signed-off-by: Peter Zijlstra (Intel) --- include/linux/perf_event.h | 4 ++++ kernel/events/core.c | 39 +++++++++++++++++++++++++++++++++------ 2 files changed, 37 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index e533b03af053..0defb526cd0c 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1590,4 +1590,8 @@ extern void __weak arch_perf_update_userpage(struct perf_event *event, struct perf_event_mmap_page *userpg, u64 now); +#ifdef CONFIG_MMU +extern __weak u64 arch_perf_get_page_size(struct mm_struct *mm, unsigned long addr); +#endif + #endif /* _LINUX_PERF_EVENT_H */ diff --git a/kernel/events/core.c b/kernel/events/core.c index 7f655d19b8c4..b458ed3dc81b 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7011,10 +7011,18 @@ static u64 perf_virt_to_phys(u64 virt) #ifdef CONFIG_MMU /* - * Return the MMU page size of a given virtual address + * Return the MMU page size of a given virtual address. + * + * This generic implementation handles page-table aligned huge pages, as well + * as non-page-table aligned hugetlbfs compound pages. + * + * If an architecture supports and uses non-page-table aligned pages in their + * kernel mapping it will need to provide it's own implementation of this + * function. 
*/ -static u64 __perf_get_page_size(struct mm_struct *mm, unsigned long addr) +__weak u64 arch_perf_get_page_size(struct mm_struct *mm, unsigned long addr) { + struct page *page; pgd_t *pgd; p4d_t *p4d; pud_t *pud; @@ -7036,15 +7044,27 @@ static u64 __perf_get_page_size(struct mm_struct *mm, unsigned long addr) if (!pud_present(*pud)) return 0; - if (pud_leaf(*pud)) + if (pud_leaf(*pud)) { +#ifdef pud_page + page = pud_page(*pud); + if (PageHuge(page)) + return page_size(compound_head(page)); +#endif return 1ULL << PUD_SHIFT; + } pmd = pmd_offset(pud, addr); if (!pmd_present(*pmd)) return 0; - if (pmd_leaf(*pmd)) + if (pmd_leaf(*pmd)) { +#ifdef pmd_page + page = pmd_page(*pmd); + if (PageHuge(page)) + return page_size(compound_head(page)); +#endif return 1ULL << PMD_SHIFT; + } pte = pte_offset_map(pmd, addr); if (!pte_present(*pte)) { @@ -7052,13 +7072,20 @@ static u64 __perf_get_page_size(struct mm_struct *mm, unsigned long addr) return 0; } + page = pte_page(*pte); + if (PageHuge(page)) { + u64 size = page_size(compound_head(page)); + pte_unmap(pte); + return size; + } + pte_unmap(pte); return PAGE_SIZE; } #else -static u64 __perf_get_page_size(struct mm_struct *mm, unsigned long addr) +static u64 arch_perf_get_page_size(struct mm_struct *mm, unsigned long addr) { return 0; } @@ -7089,7 +7116,7 @@ static u64 perf_get_page_size(unsigned long addr) mm = &init_mm; } - size = __perf_get_page_size(mm, addr); + size = arch_perf_get_page_size(mm, addr); local_irq_restore(flags); -- cgit v1.2.3 From 45ff510517f3b1354a3d9c273ad5e5e8d08312cb Mon Sep 17 00:00:00 2001 From: Ira Weiny Date: Wed, 28 Oct 2020 09:36:32 -0700 Subject: entry: Fixup irqentry_enter() comment irq_enter_from_user_mode() was changed to irqentry_enter_from_user_mode(). Update the comment within irqentry_enter() to reflect this change. Suggested-by: Thomas Gleixner Signed-off-by: Ira Weiny Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20201028163632.965518-1-ira.weiny@intel.com --- kernel/entry/common.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/entry/common.c b/kernel/entry/common.c index 42eff115c426..f7ed415a6768 100644 --- a/kernel/entry/common.c +++ b/kernel/entry/common.c @@ -302,7 +302,7 @@ noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs) /* * If RCU is not watching then the same careful * sequence vs. lockdep and tracing is required - * as in irq_enter_from_user_mode(). + * as in irqentry_enter_from_user_mode(). */ lockdep_hardirqs_off(CALLER_ADDR0); rcu_irq_enter(); -- cgit v1.2.3 From 5e8ed280dab9eeabc1ba0b2db5dbe9fe6debb6b5 Mon Sep 17 00:00:00 2001 From: Miroslav Benes Date: Tue, 27 Oct 2020 15:03:36 +0100 Subject: module: set MODULE_STATE_GOING state when a module fails to load If a module fails to load due to an error in prepare_coming_module(), the following error handling in load_module() runs with MODULE_STATE_COMING in module's state. Fix it by correctly setting MODULE_STATE_GOING under "bug_cleanup" label. 
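For context, mod->state transitions are consumed elsewhere in the kernel, for example by module notifier callbacks that key their cleanup off MODULE_STATE_GOING; a hedged, illustrative sketch of such a consumer (not from this patch):

	static int example_module_cb(struct notifier_block *nb,
				     unsigned long state, void *data)
	{
		struct module *mod = data;

		if (state == MODULE_STATE_GOING) {
			/* undo whatever was set up at MODULE_STATE_COMING */
		}
		return NOTIFY_OK;
	}

	static struct notifier_block example_module_nb = {
		.notifier_call = example_module_cb,
	};
	/* registered with register_module_notifier(&example_module_nb) */

A failed module that is being torn down should therefore not remain marked MODULE_STATE_COMING.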
Signed-off-by: Miroslav Benes Signed-off-by: Jessica Yu --- kernel/module.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index a4fa44a652a7..b34235082394 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -3991,6 +3991,7 @@ static int load_module(struct load_info *info, const char __user *uargs, MODULE_STATE_GOING, mod); klp_module_going(mod); bug_cleanup: + mod->state = MODULE_STATE_GOING; /* module_bug_cleanup needs module_mutex protection */ mutex_lock(&module_mutex); module_bug_cleanup(mod); -- cgit v1.2.3 From d1e7c2996e988866e7ceceb4641a0886885b7889 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 29 Oct 2020 12:12:46 +0100 Subject: cpufreq: schedutil: Always call driver if CPUFREQ_NEED_UPDATE_LIMITS is set Because sugov_update_next_freq() may skip a frequency update even if the need_freq_update flag has been set for the policy at hand, policy limits updates may not take effect as expected. For example, if the intel_pstate driver operates in the passive mode with HWP enabled, it needs to update the HWP min and max limits when the policy min and max limits change, respectively, but that may not happen if the target frequency does not change along with the limit at hand. In particular, if the policy min is changed first, causing the target frequency to be adjusted to it, and the policy max limit is changed later to the same value, the HWP max limit will not be updated to follow it as expected, because the target frequency is still equal to the policy min limit and it will not change until that limit is updated. To address this issue, modify get_next_freq() to let the driver callback run if the CPUFREQ_NEED_UPDATE_LIMITS cpufreq driver flag is set regardless of whether or not the new frequency to set is equal to the previous one. Fixes: f6ebbcf08f37 ("cpufreq: intel_pstate: Implement passive mode with HWP enabled") Reported-by: Zhang Rui Tested-by: Zhang Rui Cc: 5.9+ # 5.9+: 1c534352f47f cpufreq: Introduce CPUFREQ_NEED_UPDATE_LIMITS ... Cc: 5.9+ # 5.9+: a62f68f5ca53 cpufreq: Introduce cpufreq_driver_test_flags() Signed-off-by: Rafael J. Wysocki Acked-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki --- kernel/sched/cpufreq_schedutil.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index e254745a82cb..c03a5775d019 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -102,7 +102,8 @@ static bool sugov_should_update_freq(struct sugov_policy *sg_policy, u64 time) static bool sugov_update_next_freq(struct sugov_policy *sg_policy, u64 time, unsigned int next_freq) { - if (sg_policy->next_freq == next_freq) + if (sg_policy->next_freq == next_freq && + !cpufreq_driver_test_flags(CPUFREQ_NEED_UPDATE_LIMITS)) return false; sg_policy->next_freq = next_freq; @@ -161,7 +162,8 @@ static unsigned int get_next_freq(struct sugov_policy *sg_policy, freq = map_util_freq(util, freq, max); - if (freq == sg_policy->cached_raw_freq && !sg_policy->need_freq_update) + if (freq == sg_policy->cached_raw_freq && !sg_policy->need_freq_update && + !cpufreq_driver_test_flags(CPUFREQ_NEED_UPDATE_LIMITS)) return sg_policy->next_freq; sg_policy->need_freq_update = false; -- cgit v1.2.3 From 9d0a49c7023c0905ea19116cf74beb7d9611d8ac Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. 
Silva" Date: Mon, 31 Aug 2020 10:22:41 -0500 Subject: tracepoint: Replace zero-length array with flexible-array member MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There is a regular need in the kernel to provide a way to declare having a dynamically sized set of trailing elements in a structure. Kernel code should always use “flexible array members”[1] for these cases. The older style of one-element or zero-length arrays should no longer be used[2]. [1] https://en.wikipedia.org/wiki/Flexible_array_member [2] https://www.kernel.org/doc/html/v5.9-rc1/process/deprecated.html#zero-length-and-one-element-arrays Signed-off-by: Gustavo A. R. Silva --- kernel/tracepoint.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index 26efd22f0633..3f659f855074 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -50,7 +50,7 @@ static bool ok_to_free_tracepoints; */ struct tp_probes { struct rcu_head rcu; - struct tracepoint_func probes[0]; + struct tracepoint_func probes[]; }; static inline void *allocate_probes(int count) -- cgit v1.2.3 From fa29c9c11d4e2ba514421758991e5b3095642844 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Mon, 31 Aug 2020 10:23:32 -0500 Subject: params: Replace zero-length array with flexible-array member MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There is a regular need in the kernel to provide a way to declare having a dynamically sized set of trailing elements in a structure. Kernel code should always use “flexible array members”[1] for these cases. The older style of one-element or zero-length arrays should no longer be used[2]. [1] https://en.wikipedia.org/wiki/Flexible_array_member [2] https://www.kernel.org/doc/html/v5.9-rc1/process/deprecated.html#zero-length-and-one-element-arrays Signed-off-by: Gustavo A. R. Silva --- kernel/params.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/params.c b/kernel/params.c index 3835fb82c64b..164d79330849 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -530,7 +530,7 @@ struct module_param_attrs { unsigned int num; struct attribute_group grp; - struct param_attribute attrs[0]; + struct param_attribute attrs[]; }; #ifdef CONFIG_SYSFS -- cgit v1.2.3 From 080b6f40763565f65ebb9540219c71ce885cf568 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Wed, 28 Oct 2020 18:15:05 +0100 Subject: bpf: Don't rely on GCC __attribute__((optimize)) to disable GCSE Commit 3193c0836 ("bpf: Disable GCC -fgcse optimization for ___bpf_prog_run()") introduced a __no_fgcse macro that expands to a function scope __attribute__((optimize("-fno-gcse"))), to disable a GCC specific optimization that was causing trouble on x86 builds, and was not expected to have any positive effect in the first place. However, as the GCC manual documents, __attribute__((optimize)) is not for production use, and results in all other optimization options to be forgotten for the function in question. This can cause all kinds of trouble, but in one particular reported case, it causes -fno-asynchronous-unwind-tables to be disregarded, resulting in .eh_frame info to be emitted for the function. This reverts commit 3193c0836, and instead, it disables the -fgcse optimization for the entire source file, but only when building for X86 using GCC with CONFIG_BPF_JIT_ALWAYS_ON disabled. 
Note that the original commit states that CONFIG_RETPOLINE=n triggers the issue, whereas CONFIG_RETPOLINE=y performs better without the optimization, so it is kept disabled in both cases. Fixes: 3193c0836f20 ("bpf: Disable GCC -fgcse optimization for ___bpf_prog_run()") Signed-off-by: Ard Biesheuvel Signed-off-by: Alexei Starovoitov Tested-by: Geert Uytterhoeven Reviewed-by: Nick Desaulniers Link: https://lore.kernel.org/lkml/CAMuHMdUg0WJHEcq6to0-eODpXPOywLot6UD2=GFHpzoj_hCoBQ@mail.gmail.com/ Link: https://lore.kernel.org/bpf/20201028171506.15682-2-ardb@kernel.org --- include/linux/compiler-gcc.h | 2 -- include/linux/compiler_types.h | 4 ---- kernel/bpf/Makefile | 6 +++++- kernel/bpf/core.c | 2 +- 4 files changed, 6 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h index d1e3c6896b71..5deb37024574 100644 --- a/include/linux/compiler-gcc.h +++ b/include/linux/compiler-gcc.h @@ -175,5 +175,3 @@ #else #define __diag_GCC_8(s) #endif - -#define __no_fgcse __attribute__((optimize("-fno-gcse"))) diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h index 6e390d58a9f8..ac3fa37a84f9 100644 --- a/include/linux/compiler_types.h +++ b/include/linux/compiler_types.h @@ -247,10 +247,6 @@ struct ftrace_likely_data { #define asm_inline asm #endif -#ifndef __no_fgcse -# define __no_fgcse -#endif - /* Are two types/vars the same type (ignoring qualifiers)? */ #define __same_type(a, b) __builtin_types_compatible_p(typeof(a), typeof(b)) diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index bdc8cd1b6767..c1b9f71ee6aa 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -1,6 +1,10 @@ # SPDX-License-Identifier: GPL-2.0 obj-y := core.o -CFLAGS_core.o += $(call cc-disable-warning, override-init) +ifneq ($(CONFIG_BPF_JIT_ALWAYS_ON),y) +# ___bpf_prog_run() needs GCSE disabled on x86; see 3193c0836f203 for details +cflags-nogcse-$(CONFIG_X86)$(CONFIG_CC_IS_GCC) := -fno-gcse +endif +CFLAGS_core.o += $(call cc-disable-warning, override-init) $(cflags-nogcse-yy) obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o bpf_iter.o map_iter.o task_iter.o prog_iter.o obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 9268d77898b7..55454d2278b1 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1369,7 +1369,7 @@ u64 __weak bpf_probe_read_kernel(void *dst, u32 size, const void *unsafe_ptr) * * Decode and execute eBPF instructions. */ -static u64 __no_fgcse ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn, u64 *stack) +static u64 ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn, u64 *stack) { #define BPF_INSN_2_LBL(x, y) [BPF_##x | BPF_##y] = &&x##_##y #define BPF_INSN_3_LBL(x, y, z) [BPF_##x | BPF_##y | BPF_##z] = &&x##_##y##_##z -- cgit v1.2.3 From d48e3850030623e1c20785bceaaf78f916d0b1a3 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 26 Oct 2020 16:22:56 +0100 Subject: locking/lockdep: Remove more raw_cpu_read() usage I initially thought raw_cpu_read() was OK, since if it is !0 we have IRQs disabled and can't get migrated, so if we get migrated both CPUs must have 0 and it doesn't matter which 0 we read. 
And while that is true; it isn't the whole store, on pretty much all architectures (except x86) this can result in computing the address for one CPU, getting migrated, the old CPU continuing execution with another task (possibly setting recursion) and then the new CPU reading the value of the old CPU, which is no longer 0. Similer to: baffd723e44d ("lockdep: Revert "lockdep: Use raw_cpu_*() for per-cpu variables"") Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20201026152256.GB2651@hirez.programming.kicks-ass.net --- kernel/locking/lockdep.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index fc206aefa970..11028497d4df 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -84,7 +84,7 @@ static inline bool lockdep_enabled(void) if (!debug_locks) return false; - if (raw_cpu_read(lockdep_recursion)) + if (this_cpu_read(lockdep_recursion)) return false; if (current->lockdep_recursion) -- cgit v1.2.3 From 1a39340865ce505a029b37aeb47a3e4c8db5f6c6 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 27 Oct 2020 13:48:34 +0100 Subject: lockdep: Fix nr_unused_locks accounting Chris reported that commit 24d5a3bffef1 ("lockdep: Fix usage_traceoverflow") breaks the nr_unused_locks validation code triggered by /proc/lockdep_stats. By fully splitting LOCK_USED and LOCK_USED_READ it becomes a bad indicator for accounting nr_unused_locks; simplyfy by using any first bit. Fixes: 24d5a3bffef1 ("lockdep: Fix usage_traceoverflow") Reported-by: Chris Wilson Signed-off-by: Peter Zijlstra (Intel) Tested-by: Chris Wilson Link: https://lkml.kernel.org/r/20201027124834.GL2628@hirez.programming.kicks-ass.net --- kernel/locking/lockdep.c | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 11028497d4df..b71ad8d9f1c9 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -4396,6 +4396,9 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this, if (unlikely(hlock_class(this)->usage_mask & new_mask)) goto unlock; + if (!hlock_class(this)->usage_mask) + debug_atomic_dec(nr_unused_locks); + hlock_class(this)->usage_mask |= new_mask; if (new_bit < LOCK_TRACE_STATES) { @@ -4403,19 +4406,10 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this, return 0; } - switch (new_bit) { - case 0 ... LOCK_USED-1: + if (new_bit < LOCK_USED) { ret = mark_lock_irq(curr, this, new_bit); if (!ret) return 0; - break; - - case LOCK_USED: - debug_atomic_dec(nr_unused_locks); - break; - - default: - break; } unlock: -- cgit v1.2.3 From c50eb518e262fa06bd334e6eec172eaf5d7a5bd9 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Thu, 29 Oct 2020 00:19:24 -0700 Subject: bpf: Use separate lockdep class for each hashtab If a hashtab is accessed in both NMI and non-NMI contexts, it may cause deadlock in bucket->lock. LOCKDEP NMI warning highlighted this issue: ./test_progs -t stacktrace [ 74.828970] [ 74.828971] ================================ [ 74.828972] WARNING: inconsistent lock state [ 74.828973] 5.9.0-rc8+ #275 Not tainted [ 74.828974] -------------------------------- [ 74.828975] inconsistent {INITIAL USE} -> {IN-NMI} usage. 
[ 74.828976] taskset/1174 [HC2[2]:SC0[0]:HE0:SE1] takes: [ 74.828977] ffffc90000ee96b0 (&htab->buckets[i].raw_lock){....}-{2:2}, at: htab_map_update_elem+0x271/0x5a0 [ 74.828981] {INITIAL USE} state was registered at: [ 74.828982] lock_acquire+0x137/0x510 [ 74.828983] _raw_spin_lock_irqsave+0x43/0x90 [ 74.828984] htab_map_update_elem+0x271/0x5a0 [ 74.828984] 0xffffffffa0040b34 [ 74.828985] trace_call_bpf+0x159/0x310 [ 74.828986] perf_trace_run_bpf_submit+0x5f/0xd0 [ 74.828987] perf_trace_urandom_read+0x1be/0x220 [ 74.828988] urandom_read_nowarn.isra.0+0x26f/0x380 [ 74.828989] vfs_read+0xf8/0x280 [ 74.828989] ksys_read+0xc9/0x160 [ 74.828990] do_syscall_64+0x33/0x40 [ 74.828991] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [ 74.828992] irq event stamp: 1766 [ 74.828993] hardirqs last enabled at (1765): [] asm_exc_page_fault+0x1e/0x30 [ 74.828994] hardirqs last disabled at (1766): [] irqentry_enter+0x37/0x60 [ 74.828995] softirqs last enabled at (856): [] fpu__clear+0xac/0x120 [ 74.828996] softirqs last disabled at (854): [] fpu__clear+0x20/0x120 [ 74.828997] [ 74.828998] other info that might help us debug this: [ 74.828999] Possible unsafe locking scenario: [ 74.828999] [ 74.829000] CPU0 [ 74.829001] ---- [ 74.829001] lock(&htab->buckets[i].raw_lock); [ 74.829003] [ 74.829004] lock(&htab->buckets[i].raw_lock); [ 74.829006] [ 74.829006] *** DEADLOCK *** [ 74.829007] [ 74.829008] 1 lock held by taskset/1174: [ 74.829008] #0: ffff8883ec3fd020 (&cpuctx_lock){-...}-{2:2}, at: perf_event_task_tick+0x101/0x650 [ 74.829012] [ 74.829013] stack backtrace: [ 74.829014] CPU: 0 PID: 1174 Comm: taskset Not tainted 5.9.0-rc8+ #275 [ 74.829015] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.11.0-2.el7 04/01/2014 [ 74.829016] Call Trace: [ 74.829016] [ 74.829017] dump_stack+0x9a/0xd0 [ 74.829018] lock_acquire+0x461/0x510 [ 74.829019] ? lock_release+0x6b0/0x6b0 [ 74.829020] ? stack_map_get_build_id_offset+0x45e/0x800 [ 74.829021] ? htab_map_update_elem+0x271/0x5a0 [ 74.829022] ? rcu_read_lock_held_common+0x1a/0x50 [ 74.829022] ? rcu_read_lock_held+0x5f/0xb0 [ 74.829023] _raw_spin_lock_irqsave+0x43/0x90 [ 74.829024] ? htab_map_update_elem+0x271/0x5a0 [ 74.829025] htab_map_update_elem+0x271/0x5a0 [ 74.829026] bpf_prog_1fd9e30e1438d3c5_oncpu+0x9c/0xe88 [ 74.829027] bpf_overflow_handler+0x127/0x320 [ 74.829028] ? perf_event_text_poke_output+0x4d0/0x4d0 [ 74.829029] ? sched_clock_cpu+0x18/0x130 [ 74.829030] __perf_event_overflow+0xae/0x190 [ 74.829030] handle_pmi_common+0x34c/0x470 [ 74.829031] ? intel_pmu_save_and_restart+0x90/0x90 [ 74.829032] ? lock_acquire+0x3f8/0x510 [ 74.829033] ? lock_release+0x6b0/0x6b0 [ 74.829034] intel_pmu_handle_irq+0x11e/0x240 [ 74.829034] perf_event_nmi_handler+0x40/0x60 [ 74.829035] nmi_handle+0x110/0x360 [ 74.829036] ? __intel_pmu_enable_all.constprop.0+0x72/0xf0 [ 74.829037] default_do_nmi+0x6b/0x170 [ 74.829038] exc_nmi+0x106/0x130 [ 74.829038] end_repeat_nmi+0x16/0x55 [ 74.829039] RIP: 0010:__intel_pmu_enable_all.constprop.0+0x72/0xf0 [ 74.829042] Code: 2f 1f 03 48 8d bb b8 0c 00 00 e8 29 09 41 00 48 ... [ 74.829043] RSP: 0000:ffff8880a604fc90 EFLAGS: 00000002 [ 74.829044] RAX: 000000070000000f RBX: ffff8883ec2195a0 RCX: 000000000000038f [ 74.829045] RDX: 0000000000000007 RSI: ffffffff82e72c20 RDI: ffff8883ec21a258 [ 74.829046] RBP: 000000070000000f R08: ffffffff8101b013 R09: fffffbfff0a7982d [ 74.829047] R10: ffffffff853cc167 R11: fffffbfff0a7982c R12: 0000000000000000 [ 74.829049] R13: ffff8883ec3f0af0 R14: ffff8883ec3fd120 R15: ffff8883e9c92098 [ 74.829049] ? 
intel_pmu_lbr_enable_all+0x43/0x240 [ 74.829050] ? __intel_pmu_enable_all.constprop.0+0x72/0xf0 [ 74.829051] ? __intel_pmu_enable_all.constprop.0+0x72/0xf0 [ 74.829052] [ 74.829053] perf_event_task_tick+0x48d/0x650 [ 74.829054] scheduler_tick+0x129/0x210 [ 74.829054] update_process_times+0x37/0x70 [ 74.829055] tick_sched_handle.isra.0+0x35/0x90 [ 74.829056] tick_sched_timer+0x8f/0xb0 [ 74.829057] __hrtimer_run_queues+0x364/0x7d0 [ 74.829058] ? tick_sched_do_timer+0xa0/0xa0 [ 74.829058] ? enqueue_hrtimer+0x1e0/0x1e0 [ 74.829059] ? recalibrate_cpu_khz+0x10/0x10 [ 74.829060] ? ktime_get_update_offsets_now+0x1a3/0x360 [ 74.829061] hrtimer_interrupt+0x1bb/0x360 [ 74.829062] ? rcu_read_lock_sched_held+0xa1/0xd0 [ 74.829063] __sysvec_apic_timer_interrupt+0xed/0x3d0 [ 74.829064] sysvec_apic_timer_interrupt+0x3f/0xd0 [ 74.829064] ? asm_sysvec_apic_timer_interrupt+0xa/0x20 [ 74.829065] asm_sysvec_apic_timer_interrupt+0x12/0x20 [ 74.829066] RIP: 0033:0x7fba18d579b4 [ 74.829068] Code: 74 54 44 0f b6 4a 04 41 83 e1 0f 41 80 f9 ... [ 74.829069] RSP: 002b:00007ffc9ba69570 EFLAGS: 00000206 [ 74.829071] RAX: 00007fba192084c0 RBX: 00007fba18c24d28 RCX: 00000000000007a4 [ 74.829072] RDX: 00007fba18c30488 RSI: 0000000000000000 RDI: 000000000000037b [ 74.829073] RBP: 00007fba18ca5760 R08: 00007fba18c248fc R09: 00007fba18c94c30 [ 74.829074] R10: 000000000000002f R11: 0000000000073c30 R12: 00007ffc9ba695e0 [ 74.829075] R13: 00000000000003f3 R14: 00007fba18c21ac8 R15: 00000000000058d6 However, such warning should not apply across multiple hashtabs. The system will not deadlock if one hashtab is used in NMI, while another hashtab is used in non-NMI. Use separate lockdep class for each hashtab, so that we don't get this false alert. Signed-off-by: Song Liu Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20201029071925.3103400-2-songliubraving@fb.com --- kernel/bpf/hashtab.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 1815e97d4c9c..278da031c91a 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -99,6 +99,7 @@ struct bpf_htab { u32 n_buckets; /* number of hash buckets */ u32 elem_size; /* size of each element in bytes */ u32 hashrnd; + struct lock_class_key lockdep_key; }; /* each htab element is struct htab_elem + key + value */ @@ -136,12 +137,18 @@ static void htab_init_buckets(struct bpf_htab *htab) { unsigned i; + lockdep_register_key(&htab->lockdep_key); for (i = 0; i < htab->n_buckets; i++) { INIT_HLIST_NULLS_HEAD(&htab->buckets[i].head, i); - if (htab_use_raw_lock(htab)) + if (htab_use_raw_lock(htab)) { raw_spin_lock_init(&htab->buckets[i].raw_lock); - else + lockdep_set_class(&htab->buckets[i].raw_lock, + &htab->lockdep_key); + } else { spin_lock_init(&htab->buckets[i].lock); + lockdep_set_class(&htab->buckets[i].lock, + &htab->lockdep_key); + } } } @@ -1312,6 +1319,7 @@ static void htab_map_free(struct bpf_map *map) free_percpu(htab->extra_elems); bpf_map_area_free(htab->buckets); + lockdep_unregister_key(&htab->lockdep_key); kfree(htab); } -- cgit v1.2.3 From 20b6cc34ea74b6a84599c1f8a70f3315b56a1883 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Thu, 29 Oct 2020 00:19:25 -0700 Subject: bpf: Avoid hashtab deadlock with map_locked If a hashtab is accessed in both non-NMI and NMI context, the system may deadlock on bucket->lock. Fix this issue with percpu counter map_locked. map_locked rejects concurrent access to the same bucket from the same CPU. 
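To illustrate the idea (a hedged sketch, not the patch code itself): the counter acts as a per-CPU re-entrancy guard, so a nested attempt on the same CPU (e.g. from NMI context) backs off with -EBUSY instead of spinning on a lock that CPU already holds:

	static DEFINE_PER_CPU(int, map_locked_demo);

	static int demo_lock_bucket(void)
	{
		migrate_disable();
		if (__this_cpu_inc_return(map_locked_demo) != 1) {
			__this_cpu_dec(map_locked_demo);
			migrate_enable();
			return -EBUSY;	/* already inside on this CPU */
		}
		/* ... take the bucket lock here ... */
		return 0;
	}

	static void demo_unlock_bucket(void)
	{
		/* ... drop the bucket lock here ... */
		__this_cpu_dec(map_locked_demo);
		migrate_enable();
	}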
To reduce memory overhead, map_locked is not added per bucket. Instead, 8 percpu counters are added to each hashtab. buckets are assigned to these counters based on the lower bits of its hash. Signed-off-by: Song Liu Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20201029071925.3103400-3-songliubraving@fb.com --- kernel/bpf/hashtab.c | 114 ++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 82 insertions(+), 32 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 278da031c91a..da59ba978d17 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -86,6 +86,9 @@ struct bucket { }; }; +#define HASHTAB_MAP_LOCK_COUNT 8 +#define HASHTAB_MAP_LOCK_MASK (HASHTAB_MAP_LOCK_COUNT - 1) + struct bpf_htab { struct bpf_map map; struct bucket *buckets; @@ -100,6 +103,7 @@ struct bpf_htab { u32 elem_size; /* size of each element in bytes */ u32 hashrnd; struct lock_class_key lockdep_key; + int __percpu *map_locked[HASHTAB_MAP_LOCK_COUNT]; }; /* each htab element is struct htab_elem + key + value */ @@ -152,26 +156,41 @@ static void htab_init_buckets(struct bpf_htab *htab) } } -static inline unsigned long htab_lock_bucket(const struct bpf_htab *htab, - struct bucket *b) +static inline int htab_lock_bucket(const struct bpf_htab *htab, + struct bucket *b, u32 hash, + unsigned long *pflags) { unsigned long flags; + hash = hash & HASHTAB_MAP_LOCK_MASK; + + migrate_disable(); + if (unlikely(__this_cpu_inc_return(*(htab->map_locked[hash])) != 1)) { + __this_cpu_dec(*(htab->map_locked[hash])); + migrate_enable(); + return -EBUSY; + } + if (htab_use_raw_lock(htab)) raw_spin_lock_irqsave(&b->raw_lock, flags); else spin_lock_irqsave(&b->lock, flags); - return flags; + *pflags = flags; + + return 0; } static inline void htab_unlock_bucket(const struct bpf_htab *htab, - struct bucket *b, + struct bucket *b, u32 hash, unsigned long flags) { + hash = hash & HASHTAB_MAP_LOCK_MASK; if (htab_use_raw_lock(htab)) raw_spin_unlock_irqrestore(&b->raw_lock, flags); else spin_unlock_irqrestore(&b->lock, flags); + __this_cpu_dec(*(htab->map_locked[hash])); + migrate_enable(); } static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node); @@ -429,8 +448,8 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) bool percpu_lru = (attr->map_flags & BPF_F_NO_COMMON_LRU); bool prealloc = !(attr->map_flags & BPF_F_NO_PREALLOC); struct bpf_htab *htab; + int err, i; u64 cost; - int err; htab = kzalloc(sizeof(*htab), GFP_USER); if (!htab) @@ -487,6 +506,13 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) if (!htab->buckets) goto free_charge; + for (i = 0; i < HASHTAB_MAP_LOCK_COUNT; i++) { + htab->map_locked[i] = __alloc_percpu_gfp(sizeof(int), + sizeof(int), GFP_USER); + if (!htab->map_locked[i]) + goto free_map_locked; + } + if (htab->map.map_flags & BPF_F_ZERO_SEED) htab->hashrnd = 0; else @@ -497,7 +523,7 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) if (prealloc) { err = prealloc_init(htab); if (err) - goto free_buckets; + goto free_map_locked; if (!percpu && !lru) { /* lru itself can remove the least used element, so @@ -513,7 +539,9 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) free_prealloc: prealloc_destroy(htab); -free_buckets: +free_map_locked: + for (i = 0; i < HASHTAB_MAP_LOCK_COUNT; i++) + free_percpu(htab->map_locked[i]); bpf_map_area_free(htab->buckets); free_charge: bpf_map_charge_finish(&htab->map.memory); @@ -694,12 +722,15 @@ static bool 
htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node) struct hlist_nulls_node *n; unsigned long flags; struct bucket *b; + int ret; tgt_l = container_of(node, struct htab_elem, lru_node); b = __select_bucket(htab, tgt_l->hash); head = &b->head; - flags = htab_lock_bucket(htab, b); + ret = htab_lock_bucket(htab, b, tgt_l->hash, &flags); + if (ret) + return false; hlist_nulls_for_each_entry_rcu(l, n, head, hash_node) if (l == tgt_l) { @@ -707,7 +738,7 @@ static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node) break; } - htab_unlock_bucket(htab, b, flags); + htab_unlock_bucket(htab, b, tgt_l->hash, flags); return l == tgt_l; } @@ -979,7 +1010,9 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value, */ } - flags = htab_lock_bucket(htab, b); + ret = htab_lock_bucket(htab, b, hash, &flags); + if (ret) + return ret; l_old = lookup_elem_raw(head, hash, key, key_size); @@ -1020,7 +1053,7 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value, } ret = 0; err: - htab_unlock_bucket(htab, b, flags); + htab_unlock_bucket(htab, b, hash, flags); return ret; } @@ -1058,7 +1091,9 @@ static int htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value, return -ENOMEM; memcpy(l_new->key + round_up(map->key_size, 8), value, map->value_size); - flags = htab_lock_bucket(htab, b); + ret = htab_lock_bucket(htab, b, hash, &flags); + if (ret) + return ret; l_old = lookup_elem_raw(head, hash, key, key_size); @@ -1077,7 +1112,7 @@ static int htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value, ret = 0; err: - htab_unlock_bucket(htab, b, flags); + htab_unlock_bucket(htab, b, hash, flags); if (ret) bpf_lru_push_free(&htab->lru, &l_new->lru_node); @@ -1112,7 +1147,9 @@ static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key, b = __select_bucket(htab, hash); head = &b->head; - flags = htab_lock_bucket(htab, b); + ret = htab_lock_bucket(htab, b, hash, &flags); + if (ret) + return ret; l_old = lookup_elem_raw(head, hash, key, key_size); @@ -1135,7 +1172,7 @@ static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key, } ret = 0; err: - htab_unlock_bucket(htab, b, flags); + htab_unlock_bucket(htab, b, hash, flags); return ret; } @@ -1175,7 +1212,9 @@ static int __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key, return -ENOMEM; } - flags = htab_lock_bucket(htab, b); + ret = htab_lock_bucket(htab, b, hash, &flags); + if (ret) + return ret; l_old = lookup_elem_raw(head, hash, key, key_size); @@ -1197,7 +1236,7 @@ static int __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key, } ret = 0; err: - htab_unlock_bucket(htab, b, flags); + htab_unlock_bucket(htab, b, hash, flags); if (l_new) bpf_lru_push_free(&htab->lru, &l_new->lru_node); return ret; @@ -1225,7 +1264,7 @@ static int htab_map_delete_elem(struct bpf_map *map, void *key) struct htab_elem *l; unsigned long flags; u32 hash, key_size; - int ret = -ENOENT; + int ret; WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held()); @@ -1235,17 +1274,20 @@ static int htab_map_delete_elem(struct bpf_map *map, void *key) b = __select_bucket(htab, hash); head = &b->head; - flags = htab_lock_bucket(htab, b); + ret = htab_lock_bucket(htab, b, hash, &flags); + if (ret) + return ret; l = lookup_elem_raw(head, hash, key, key_size); if (l) { hlist_nulls_del_rcu(&l->hash_node); free_htab_elem(htab, l); - ret = 0; + } else { + ret = -ENOENT; } - htab_unlock_bucket(htab, b, flags); + htab_unlock_bucket(htab, b, hash, flags); 
return ret; } @@ -1257,7 +1299,7 @@ static int htab_lru_map_delete_elem(struct bpf_map *map, void *key) struct htab_elem *l; unsigned long flags; u32 hash, key_size; - int ret = -ENOENT; + int ret; WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held()); @@ -1267,16 +1309,18 @@ static int htab_lru_map_delete_elem(struct bpf_map *map, void *key) b = __select_bucket(htab, hash); head = &b->head; - flags = htab_lock_bucket(htab, b); + ret = htab_lock_bucket(htab, b, hash, &flags); + if (ret) + return ret; l = lookup_elem_raw(head, hash, key, key_size); - if (l) { + if (l) hlist_nulls_del_rcu(&l->hash_node); - ret = 0; - } + else + ret = -ENOENT; - htab_unlock_bucket(htab, b, flags); + htab_unlock_bucket(htab, b, hash, flags); if (l) bpf_lru_push_free(&htab->lru, &l->lru_node); return ret; @@ -1302,6 +1346,7 @@ static void delete_all_elements(struct bpf_htab *htab) static void htab_map_free(struct bpf_map *map) { struct bpf_htab *htab = container_of(map, struct bpf_htab, map); + int i; /* bpf_free_used_maps() or close(map_fd) will trigger this map_free callback. * bpf_free_used_maps() is called after bpf prog is no longer executing. @@ -1320,6 +1365,8 @@ static void htab_map_free(struct bpf_map *map) free_percpu(htab->extra_elems); bpf_map_area_free(htab->buckets); lockdep_unregister_key(&htab->lockdep_key); + for (i = 0; i < HASHTAB_MAP_LOCK_COUNT; i++) + free_percpu(htab->map_locked[i]); kfree(htab); } @@ -1423,8 +1470,11 @@ again_nocopy: b = &htab->buckets[batch]; head = &b->head; /* do not grab the lock unless need it (bucket_cnt > 0). */ - if (locked) - flags = htab_lock_bucket(htab, b); + if (locked) { + ret = htab_lock_bucket(htab, b, batch, &flags); + if (ret) + goto next_batch; + } bucket_cnt = 0; hlist_nulls_for_each_entry_rcu(l, n, head, hash_node) @@ -1441,7 +1491,7 @@ again_nocopy: /* Note that since bucket_cnt > 0 here, it is implicit * that the locked was grabbed, so release it. */ - htab_unlock_bucket(htab, b, flags); + htab_unlock_bucket(htab, b, batch, flags); rcu_read_unlock(); bpf_enable_instrumentation(); goto after_loop; @@ -1452,7 +1502,7 @@ again_nocopy: /* Note that since bucket_cnt > 0 here, it is implicit * that the locked was grabbed, so release it. */ - htab_unlock_bucket(htab, b, flags); + htab_unlock_bucket(htab, b, batch, flags); rcu_read_unlock(); bpf_enable_instrumentation(); kvfree(keys); @@ -1505,7 +1555,7 @@ again_nocopy: dst_val += value_size; } - htab_unlock_bucket(htab, b, flags); + htab_unlock_bucket(htab, b, batch, flags); locked = false; while (node_to_free) { -- cgit v1.2.3 From 77f6c0b87479c4578ac0798fc249637092ac45a3 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 24 Sep 2020 12:30:50 +0200 Subject: timekeeping: remove arch_gettimeoffset With Arm EBSA110 gone, nothing uses it any more, so the corresponding code and the Kconfig option can be removed. 
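For readers unfamiliar with the interface being deleted, the hook worked roughly as sketched below. This is an illustration only, not code from any of these patches, and the board-specific names are invented: an architecture without a usable clocksource pointed arch_gettimeoffset at a routine returning the nanoseconds elapsed since the last tick, which the timekeeping core then added on top of jiffies-based time.

        /* Illustrative sketch of the legacy arch_gettimeoffset hook. */
        static u32 example_board_gettimeoffset(void)
        {
                /* Nanoseconds since the last timer tick, read from the
                 * board's interval timer (helper name assumed). */
                return example_read_interval_timer_ns();
        }

        static void __init example_board_time_init(void)
        {
                arch_gettimeoffset = example_board_gettimeoffset;
        }
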
Acked-by: Thomas Gleixner Reviewed-by: Linus Walleij Signed-off-by: Arnd Bergmann --- .../time/modern-timekeeping/arch-support.txt | 33 ---------------------- drivers/Makefile | 2 -- drivers/clocksource/Kconfig | 2 +- include/linux/time.h | 13 --------- kernel/time/Kconfig | 9 ------ kernel/time/clocksource.c | 8 ------ kernel/time/timekeeping.c | 25 +--------------- kernel/trace/Kconfig | 2 -- 8 files changed, 2 insertions(+), 92 deletions(-) delete mode 100644 Documentation/features/time/modern-timekeeping/arch-support.txt (limited to 'kernel') diff --git a/Documentation/features/time/modern-timekeeping/arch-support.txt b/Documentation/features/time/modern-timekeeping/arch-support.txt deleted file mode 100644 index a84c3b9d9a94..000000000000 --- a/Documentation/features/time/modern-timekeeping/arch-support.txt +++ /dev/null @@ -1,33 +0,0 @@ -# -# Feature name: modern-timekeeping -# Kconfig: !ARCH_USES_GETTIMEOFFSET -# description: arch does not use arch_gettimeoffset() anymore -# - ----------------------- - | arch |status| - ----------------------- - | alpha: | ok | - | arc: | ok | - | arm: | TODO | - | arm64: | ok | - | c6x: | ok | - | csky: | ok | - | h8300: | ok | - | hexagon: | ok | - | ia64: | ok | - | m68k: | ok | - | microblaze: | ok | - | mips: | ok | - | nds32: | ok | - | nios2: | ok | - | openrisc: | ok | - | parisc: | ok | - | powerpc: | ok | - | riscv: | ok | - | s390: | ok | - | sh: | ok | - | sparc: | ok | - | um: | ok | - | x86: | ok | - | xtensa: | ok | - ----------------------- diff --git a/drivers/Makefile b/drivers/Makefile index c0cd1b9075e3..4ff1e4459512 100644 --- a/drivers/Makefile +++ b/drivers/Makefile @@ -135,9 +135,7 @@ obj-$(CONFIG_INFINIBAND) += infiniband/ obj-y += firmware/ obj-$(CONFIG_CRYPTO) += crypto/ obj-$(CONFIG_SUPERH) += sh/ -ifndef CONFIG_ARCH_USES_GETTIMEOFFSET obj-y += clocksource/ -endif obj-$(CONFIG_DCA) += dca/ obj-$(CONFIG_HID) += hid/ obj-$(CONFIG_PPC_PS3) += ps3/ diff --git a/drivers/clocksource/Kconfig b/drivers/clocksource/Kconfig index 68b087bff59c..764936bfcb2c 100644 --- a/drivers/clocksource/Kconfig +++ b/drivers/clocksource/Kconfig @@ -563,7 +563,7 @@ config CLKSRC_QCOM config CLKSRC_VERSATILE bool "ARM Versatile (Express) reference platforms clock source" if COMPILE_TEST - depends on GENERIC_SCHED_CLOCK && !ARCH_USES_GETTIMEOFFSET + depends on GENERIC_SCHED_CLOCK select TIMER_OF default y if (ARCH_VEXPRESS || ARCH_VERSATILE) && ARM help diff --git a/include/linux/time.h b/include/linux/time.h index b142cb5f5a53..16cf4522d6f3 100644 --- a/include/linux/time.h +++ b/include/linux/time.h @@ -21,19 +21,6 @@ extern time64_t mktime64(const unsigned int year, const unsigned int mon, const unsigned int day, const unsigned int hour, const unsigned int min, const unsigned int sec); -/* Some architectures do not supply their own clocksource. - * This is mainly the case in architectures that get their - * inter-tick times by reading the counter on their interval - * timer. Since these timers wrap every tick, they're not really - * useful as clocksources. Wrapping them to act like one is possible - * but not very efficient. So we provide a callout these arches - * can implement for use with the jiffies clocksource to provide - * finer then tick granular time. 
- */ -#ifdef CONFIG_ARCH_USES_GETTIMEOFFSET -extern u32 (*arch_gettimeoffset)(void); -#endif - #ifdef CONFIG_POSIX_TIMERS extern void clear_itimer(void); #else diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index a09b1d61df6a..51d298ccbe05 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig @@ -26,10 +26,6 @@ config CLOCKSOURCE_VALIDATE_LAST_CYCLE config GENERIC_TIME_VSYSCALL bool -# Old style timekeeping -config ARCH_USES_GETTIMEOFFSET - bool - # The generic clock events infrastructure config GENERIC_CLOCKEVENTS bool @@ -72,7 +68,6 @@ config TICK_ONESHOT config NO_HZ_COMMON bool - depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS select TICK_ONESHOT choice @@ -87,7 +82,6 @@ config HZ_PERIODIC config NO_HZ_IDLE bool "Idle dynticks system (tickless idle)" - depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS select NO_HZ_COMMON help This option enables a tickless idle system: timer interrupts @@ -99,7 +93,6 @@ config NO_HZ_IDLE config NO_HZ_FULL bool "Full dynticks system (tickless)" # NO_HZ_COMMON dependency - depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS # We need at least one periodic CPU for timekeeping depends on SMP depends on HAVE_CONTEXT_TRACKING @@ -158,7 +151,6 @@ config CONTEXT_TRACKING_FORCE config NO_HZ bool "Old Idle dynticks config" - depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS help This is the old config entry that enables dynticks idle. We keep it around for a little while to enforce backward @@ -166,7 +158,6 @@ config NO_HZ config HIGH_RES_TIMERS bool "High Resolution Timer Support" - depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS select TICK_ONESHOT help This option enables high resolution timer support. If your diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 02441ead3c3b..cce484a2cc7c 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -705,8 +705,6 @@ static inline void clocksource_update_max_deferment(struct clocksource *cs) &cs->max_cycles); } -#ifndef CONFIG_ARCH_USES_GETTIMEOFFSET - static struct clocksource *clocksource_find_best(bool oneshot, bool skipcur) { struct clocksource *cs; @@ -798,12 +796,6 @@ static void clocksource_select_fallback(void) __clocksource_select(true); } -#else /* !CONFIG_ARCH_USES_GETTIMEOFFSET */ -static inline void clocksource_select(void) { } -static inline void clocksource_select_fallback(void) { } - -#endif - /* * clocksource_done_booting - Called near the end of core bootup * diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 6858a31364b6..52fff7e9edcd 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -369,13 +369,6 @@ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock) /* Timekeeper helper functions. 
*/ -#ifdef CONFIG_ARCH_USES_GETTIMEOFFSET -static u32 default_arch_gettimeoffset(void) { return 0; } -u32 (*arch_gettimeoffset)(void) = default_arch_gettimeoffset; -#else -static inline u32 arch_gettimeoffset(void) { return 0; } -#endif - static inline u64 timekeeping_delta_to_ns(const struct tk_read_base *tkr, u64 delta) { u64 nsec; @@ -383,8 +376,7 @@ static inline u64 timekeeping_delta_to_ns(const struct tk_read_base *tkr, u64 de nsec = delta * tkr->mult + tkr->xtime_nsec; nsec >>= tkr->shift; - /* If arch requires, add in get_arch_timeoffset() */ - return nsec + arch_gettimeoffset(); + return nsec; } static inline u64 timekeeping_get_ns(const struct tk_read_base *tkr) @@ -778,16 +770,8 @@ static void timekeeping_forward_now(struct timekeeper *tk) tk->tkr_raw.cycle_last = cycle_now; tk->tkr_mono.xtime_nsec += delta * tk->tkr_mono.mult; - - /* If arch requires, add in get_arch_timeoffset() */ - tk->tkr_mono.xtime_nsec += (u64)arch_gettimeoffset() << tk->tkr_mono.shift; - - tk->tkr_raw.xtime_nsec += delta * tk->tkr_raw.mult; - /* If arch requires, add in get_arch_timeoffset() */ - tk->tkr_raw.xtime_nsec += (u64)arch_gettimeoffset() << tk->tkr_raw.shift; - tk_normalize_xtime(tk); } @@ -2133,19 +2117,12 @@ static void timekeeping_advance(enum timekeeping_adv_mode mode) if (unlikely(timekeeping_suspended)) goto out; -#ifdef CONFIG_ARCH_USES_GETTIMEOFFSET - offset = real_tk->cycle_interval; - - if (mode != TK_ADV_TICK) - goto out; -#else offset = clocksource_delta(tk_clock_read(&tk->tkr_mono), tk->tkr_mono.cycle_last, tk->tkr_mono.mask); /* Check if there's really nothing to do */ if (offset < real_tk->cycle_interval && mode == TK_ADV_TICK) goto out; -#endif /* Do some additional sanity checking */ timekeeping_check_update(tk, offset); diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index a4020c0b4508..b74099f990bf 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -253,7 +253,6 @@ config IRQSOFF_TRACER bool "Interrupts-off Latency Tracer" default n depends on TRACE_IRQFLAGS_SUPPORT - depends on !ARCH_USES_GETTIMEOFFSET select TRACE_IRQFLAGS select GENERIC_TRACER select TRACER_MAX_TRACE @@ -277,7 +276,6 @@ config IRQSOFF_TRACER config PREEMPT_TRACER bool "Preemption-off Latency Tracer" default n - depends on !ARCH_USES_GETTIMEOFFSET depends on PREEMPTION select GENERIC_TRACER select TRACER_MAX_TRACE -- cgit v1.2.3 From b3550164a19d62e515af6cacb5a31f0b2b3f9501 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 24 Sep 2020 15:21:43 +0200 Subject: timekeeping: add CONFIG_LEGACY_TIMER_TICK All platforms that currently do not use generic clockevents roughly call the same set of functions in their timer interrupts: xtime_update(), update_process_times() and profile_tick(), sometimes in a different sequence. Add a helper function that performs all three of them, to make the callers more uniform and simplify the interface. 
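As a usage illustration (not part of this series), a not-yet-converted platform's timer interrupt handler would end up calling the new helper as sketched here; the handler name and the hardware-acknowledge step are assumptions.

        #include <linux/interrupt.h>
        #include <linux/timekeeping.h>

        /* Hypothetical timer interrupt for a legacy (non-clockevent) board. */
        static irqreturn_t example_board_timer_irq(int irq, void *dev_id)
        {
                /* Acknowledge/clear the board's timer hardware here. */

                /* One jiffy elapsed: update jiffies and wall time, then run
                 * process accounting and profiling in a single call instead
                 * of open-coding xtime_update(), update_process_times() and
                 * profile_tick(). */
                legacy_timer_tick(1);

                return IRQ_HANDLED;
        }
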
Reviewed-by: Geert Uytterhoeven Reviewed-by: Linus Walleij Signed-off-by: Arnd Bergmann --- include/linux/timekeeping.h | 1 + kernel/time/Kconfig | 7 +++++++ kernel/time/Makefile | 1 + kernel/time/tick-legacy.c | 19 +++++++++++++++++++ 4 files changed, 28 insertions(+) create mode 100644 kernel/time/tick-legacy.c (limited to 'kernel') diff --git a/include/linux/timekeeping.h b/include/linux/timekeeping.h index 7f7e4a3f4394..3670cb1670ff 100644 --- a/include/linux/timekeeping.h +++ b/include/linux/timekeeping.h @@ -12,6 +12,7 @@ extern int timekeeping_suspended; /* Architecture timer tick functions: */ extern void update_process_times(int user); extern void xtime_update(unsigned long ticks); +extern void legacy_timer_tick(unsigned long ticks); /* * Get and set timeofday diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index 51d298ccbe05..c6867f29d279 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig @@ -57,6 +57,13 @@ config POSIX_CPU_TIMERS_TASK_WORK bool default y if POSIX_TIMERS && HAVE_POSIX_CPU_TIMERS_TASK_WORK +config LEGACY_TIMER_TICK + bool + help + The legacy timer tick helper is used by platforms that + lack support for the generic clockevent framework. + New platforms should use generic clockevents instead. + if GENERIC_CLOCKEVENTS menu "Timers subsystem" diff --git a/kernel/time/Makefile b/kernel/time/Makefile index c8f00168afe8..1fb1c1ef6a19 100644 --- a/kernel/time/Makefile +++ b/kernel/time/Makefile @@ -16,6 +16,7 @@ ifeq ($(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST),y) endif obj-$(CONFIG_GENERIC_SCHED_CLOCK) += sched_clock.o obj-$(CONFIG_TICK_ONESHOT) += tick-oneshot.o tick-sched.o +obj-$(CONFIG_LEGACY_TIMER_TICK) += tick-legacy.o obj-$(CONFIG_HAVE_GENERIC_VDSO) += vsyscall.o obj-$(CONFIG_DEBUG_FS) += timekeeping_debug.o obj-$(CONFIG_TEST_UDELAY) += test_udelay.o diff --git a/kernel/time/tick-legacy.c b/kernel/time/tick-legacy.c new file mode 100644 index 000000000000..73c5a0af4743 --- /dev/null +++ b/kernel/time/tick-legacy.c @@ -0,0 +1,19 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Timer tick function for architectures that lack generic clockevents, + * consolidated here from m68k/ia64/parisc/arm. + */ + +#include +#include +#include + +#include "tick-internal.h" + +void legacy_timer_tick(unsigned long ticks) +{ + if (ticks) + xtime_update(ticks); + update_process_times(user_mode(get_irq_regs())); + profile_tick(CPU_PROFILING); +} -- cgit v1.2.3 From 56cc7b8acfb7c763f71c0492fa8da01dca7c1760 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 24 Sep 2020 17:39:11 +0200 Subject: timekeeping: remove xtime_update There are no more users of xtime_update aside from legacy_timer_tick(), so fold it into that function and remove the declaration. update_process_times() is now only called inside of the kernel/time/ code, so the declaration can be moved there. 
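The write-side sequence being folded into legacy_timer_tick() pairs with seqcount readers elsewhere; from memory, the reader in kernel/time/jiffies.c looks roughly like the simplified sketch below (shown only to illustrate why the jiffies update must be wrapped in jiffies_lock/jiffies_seq).

        u64 get_jiffies_64(void)
        {
                unsigned int seq;
                u64 ret;

                do {
                        /* Retry if do_timer() (now called from
                         * legacy_timer_tick()) updated jiffies_64
                         * concurrently. */
                        seq = read_seqcount_begin(&jiffies_seq);
                        ret = jiffies_64;
                } while (read_seqcount_retry(&jiffies_seq, seq));
                return ret;
        }
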
Reviewed-by: Geert Uytterhoeven Tested-by: Geert Uytterhoeven Reviewed-by: Linus Walleij Signed-off-by: Arnd Bergmann --- include/linux/timekeeping.h | 2 -- kernel/time/tick-legacy.c | 22 ++++++++++++++++++++-- kernel/time/timekeeping.c | 16 ---------------- kernel/time/timekeeping.h | 1 + 4 files changed, 21 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/include/linux/timekeeping.h b/include/linux/timekeeping.h index 3670cb1670ff..d47009611109 100644 --- a/include/linux/timekeeping.h +++ b/include/linux/timekeeping.h @@ -10,8 +10,6 @@ void timekeeping_init(void); extern int timekeeping_suspended; /* Architecture timer tick functions: */ -extern void update_process_times(int user); -extern void xtime_update(unsigned long ticks); extern void legacy_timer_tick(unsigned long ticks); /* diff --git a/kernel/time/tick-legacy.c b/kernel/time/tick-legacy.c index 73c5a0af4743..af225b32f5b3 100644 --- a/kernel/time/tick-legacy.c +++ b/kernel/time/tick-legacy.c @@ -10,10 +10,28 @@ #include "tick-internal.h" +/** + * legacy_timer_tick() - advances the timekeeping infrastructure + * @ticks: number of ticks, that have elapsed since the last call. + * + * This is used by platforms that have not been converted to + * generic clockevents. + * + * If 'ticks' is zero, the CPU is not handling timekeeping, so + * only perform process accounting and profiling. + * + * Must be called with interrupts disabled. + */ void legacy_timer_tick(unsigned long ticks) { - if (ticks) - xtime_update(ticks); + if (ticks) { + raw_spin_lock(&jiffies_lock); + write_seqcount_begin(&jiffies_seq); + do_timer(ticks); + write_seqcount_end(&jiffies_seq); + raw_spin_unlock(&jiffies_lock); + update_wall_time(); + } update_process_times(user_mode(get_irq_regs())); profile_tick(CPU_PROFILING); } diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 52fff7e9edcd..daa0ff017819 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -2438,19 +2438,3 @@ void hardpps(const struct timespec64 *phase_ts, const struct timespec64 *raw_ts) } EXPORT_SYMBOL(hardpps); #endif /* CONFIG_NTP_PPS */ - -/** - * xtime_update() - advances the timekeeping infrastructure - * @ticks: number of ticks, that have elapsed since the last call. - * - * Must be called with interrupts disabled. - */ -void xtime_update(unsigned long ticks) -{ - raw_spin_lock(&jiffies_lock); - write_seqcount_begin(&jiffies_seq); - do_timer(ticks); - write_seqcount_end(&jiffies_seq); - raw_spin_unlock(&jiffies_lock); - update_wall_time(); -} diff --git a/kernel/time/timekeeping.h b/kernel/time/timekeeping.h index 099737f6f10c..d94b69c5b869 100644 --- a/kernel/time/timekeeping.h +++ b/kernel/time/timekeeping.h @@ -22,6 +22,7 @@ static inline int sched_clock_suspend(void) { return 0; } static inline void sched_clock_resume(void) { } #endif +extern void update_process_times(int user); extern void do_timer(unsigned long ticks); extern void update_wall_time(void); -- cgit v1.2.3 From 0774a6ed294b963dc76df2d8342ab86d030759ec Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 24 Sep 2020 11:32:40 +0200 Subject: timekeeping: default GENERIC_CLOCKEVENTS to enabled Almost all machines use GENERIC_CLOCKEVENTS, so it feels wrong to require each one to select that symbol manually. Instead, enable it whenever CONFIG_LEGACY_TIMER_TICK is disabled as a simplification. It should be possible to select both GENERIC_CLOCKEVENTS and LEGACY_TIMER_TICK from an architecture now and decide at runtime between the two. 
For the clockevents arch-support.txt file, this means that additional architectures are marked as TODO when they have at least one machine that still uses LEGACY_TIMER_TICK, rather than being marked 'ok' when at least one machine has been converted. This means that both m68k and arm (for riscpc) revert to TODO. At this point, we could just always enable CONFIG_GENERIC_CLOCKEVENTS rather than leaving it off when not needed. I built an m68k defconfig kernel (using gcc-10.1.0) and found that this would add around 5.5KB in kernel image size: text data bss dec hex filename 3861936 1092236 196656 5150828 4e986c obj-m68k/vmlinux-no-clockevent 3866201 1093832 196184 5156217 4ead79 obj-m68k/vmlinux-clockevent On Arm (MACH_RPC), that difference appears to be twice as large, around 11KB on top of an 6MB vmlinux. Reviewed-by: Geert Uytterhoeven Acked-by: Geert Uytterhoeven Tested-by: Geert Uytterhoeven Reviewed-by: Linus Walleij Signed-off-by: Arnd Bergmann --- Documentation/features/time/clockevents/arch-support.txt | 6 +++--- arch/alpha/Kconfig | 1 - arch/arc/Kconfig | 1 - arch/arm/Kconfig | 12 ------------ arch/arm64/Kconfig | 1 - arch/arm64/Kconfig.platforms | 1 - arch/c6x/Kconfig | 1 - arch/csky/Kconfig | 1 - arch/h8300/Kconfig | 1 - arch/hexagon/Kconfig | 1 - arch/m68k/Kconfig.cpu | 1 - arch/microblaze/Kconfig | 1 - arch/mips/Kconfig | 1 - arch/nds32/Kconfig | 1 - arch/nios2/Kconfig | 1 - arch/openrisc/Kconfig | 1 - arch/powerpc/Kconfig | 1 - arch/riscv/Kconfig | 1 - arch/s390/Kconfig | 1 - arch/sh/Kconfig | 1 - arch/sparc/Kconfig | 1 - arch/um/Kconfig | 1 - arch/x86/Kconfig | 1 - arch/xtensa/Kconfig | 1 - kernel/time/Kconfig | 2 +- 25 files changed, 4 insertions(+), 38 deletions(-) (limited to 'kernel') diff --git a/Documentation/features/time/clockevents/arch-support.txt b/Documentation/features/time/clockevents/arch-support.txt index 61a5c9d68c15..6863a3fbddad 100644 --- a/Documentation/features/time/clockevents/arch-support.txt +++ b/Documentation/features/time/clockevents/arch-support.txt @@ -1,6 +1,6 @@ # # Feature name: clockevents -# Kconfig: GENERIC_CLOCKEVENTS +# Kconfig: !LEGACY_TIMER_TICK # description: arch support generic clock events # ----------------------- @@ -8,14 +8,14 @@ ----------------------- | alpha: | ok | | arc: | ok | - | arm: | ok | + | arm: | TODO | | arm64: | ok | | c6x: | ok | | csky: | ok | | h8300: | ok | | hexagon: | ok | | ia64: | TODO | - | m68k: | ok | + | m68k: | TODO | | microblaze: | ok | | mips: | ok | | nds32: | ok | diff --git a/arch/alpha/Kconfig b/arch/alpha/Kconfig index d6e9fc7a7b19..f0a700946cac 100644 --- a/arch/alpha/Kconfig +++ b/arch/alpha/Kconfig @@ -27,7 +27,6 @@ config ALPHA select ARCH_WANT_IPC_PARSE_VERSION select ARCH_HAVE_NMI_SAFE_CMPXCHG select AUDIT_ARCH - select GENERIC_CLOCKEVENTS select GENERIC_CPU_VULNERABILITIES select GENERIC_SMP_IDLE_THREAD select GENERIC_STRNCPY_FROM_USER diff --git a/arch/arc/Kconfig b/arch/arc/Kconfig index 0a89cc9def65..061eb8e23739 100644 --- a/arch/arc/Kconfig +++ b/arch/arc/Kconfig @@ -19,7 +19,6 @@ config ARC select COMMON_CLK select DMA_DIRECT_REMAP select GENERIC_ATOMIC64 if !ISA_ARCV2 || !(ARC_HAS_LL64 && ARC_HAS_LLSC) - select GENERIC_CLOCKEVENTS select GENERIC_FIND_FIRST_BIT # for now, we don't need GENERIC_IRQ_PROBE, CONFIG_GENERIC_IRQ_CHIP select GENERIC_IRQ_SHOW diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index a652686c3b32..446c6c88e47f 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -321,7 +321,6 @@ config ARCH_MULTIPLATFORM select AUTO_ZRELADDR select TIMER_OF select COMMON_CLK - 
select GENERIC_CLOCKEVENTS select GENERIC_IRQ_MULTI_HANDLER select HAVE_PCI select PCI_DOMAINS_GENERIC if PCI @@ -336,7 +335,6 @@ config ARM_SINGLE_ARMV7M select TIMER_OF select COMMON_CLK select CPU_V7M - select GENERIC_CLOCKEVENTS select NO_IOPORT_MAP select SPARSE_IRQ select USE_OF @@ -351,7 +349,6 @@ config ARCH_EP93XX select CLKDEV_LOOKUP select CLKSRC_MMIO select CPU_ARM920T - select GENERIC_CLOCKEVENTS select GPIOLIB select HAVE_LEGACY_CLK help @@ -361,7 +358,6 @@ config ARCH_FOOTBRIDGE bool "FootBridge" select CPU_SA110 select FOOTBRIDGE - select GENERIC_CLOCKEVENTS select HAVE_IDE select NEED_MACH_IO_H if !MMU select NEED_MACH_MEMORY_H @@ -389,7 +385,6 @@ config ARCH_IXP4XX select ARCH_SUPPORTS_BIG_ENDIAN select CPU_XSCALE select DMABOUNCE if PCI - select GENERIC_CLOCKEVENTS select GENERIC_IRQ_MULTI_HANDLER select GPIO_IXP4XX select GPIOLIB @@ -405,7 +400,6 @@ config ARCH_IXP4XX config ARCH_DOVE bool "Marvell Dove" select CPU_PJ4 - select GENERIC_CLOCKEVENTS select GENERIC_IRQ_MULTI_HANDLER select GPIOLIB select HAVE_PCI @@ -429,7 +423,6 @@ config ARCH_PXA select CLKSRC_MMIO select TIMER_OF select CPU_XSCALE if !CPU_XSC3 - select GENERIC_CLOCKEVENTS select GENERIC_IRQ_MULTI_HANDLER select GPIO_PXA select GPIOLIB @@ -470,7 +463,6 @@ config ARCH_SA1100 select COMMON_CLK select CPU_FREQ select CPU_SA1100 - select GENERIC_CLOCKEVENTS select GENERIC_IRQ_MULTI_HANDLER select GPIOLIB select HAVE_IDE @@ -485,7 +477,6 @@ config ARCH_S3C24XX bool "Samsung S3C24XX SoCs" select ATAGS select CLKSRC_SAMSUNG_PWM - select GENERIC_CLOCKEVENTS select GPIO_SAMSUNG select GPIOLIB select GENERIC_IRQ_MULTI_HANDLER @@ -509,7 +500,6 @@ config ARCH_OMAP1 select ARCH_OMAP select CLKDEV_LOOKUP select CLKSRC_MMIO - select GENERIC_CLOCKEVENTS select GENERIC_IRQ_CHIP select GENERIC_IRQ_MULTI_HANDLER select GPIOLIB @@ -772,7 +762,6 @@ config ARCH_ACORN config PLAT_IOP bool - select GENERIC_CLOCKEVENTS config PLAT_ORION bool @@ -1163,7 +1152,6 @@ config HAVE_SMP config SMP bool "Symmetric Multi-Processing" depends on CPU_V6K || CPU_V7 - depends on GENERIC_CLOCKEVENTS depends on HAVE_SMP depends on MMU || ARM_MPU select IRQ_WORK diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index f858c352f72a..fee87e4104fd 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -101,7 +101,6 @@ config ARM64 select FRAME_POINTER select GENERIC_ALLOCATOR select GENERIC_ARCH_TOPOLOGY - select GENERIC_CLOCKEVENTS select GENERIC_CLOCKEVENTS_BROADCAST select GENERIC_CPU_AUTOPROBE select GENERIC_CPU_VULNERABILITIES diff --git a/arch/arm64/Kconfig.platforms b/arch/arm64/Kconfig.platforms index 6f2494dd6d60..748e6d8c3b94 100644 --- a/arch/arm64/Kconfig.platforms +++ b/arch/arm64/Kconfig.platforms @@ -256,7 +256,6 @@ config ARCH_TEGRA select ARM_GIC_PM select CLKSRC_MMIO select TIMER_OF - select GENERIC_CLOCKEVENTS select GPIOLIB select PINCTRL select PM diff --git a/arch/c6x/Kconfig b/arch/c6x/Kconfig index 48d66bf0465d..bdeeac28b1be 100644 --- a/arch/c6x/Kconfig +++ b/arch/c6x/Kconfig @@ -19,7 +19,6 @@ config C6X select IRQ_DOMAIN select OF select OF_EARLY_FLATTREE - select GENERIC_CLOCKEVENTS select MODULES_USE_ELF_RELA select MMU_GATHER_NO_RANGE if MMU select SET_FS diff --git a/arch/csky/Kconfig b/arch/csky/Kconfig index 268fad5f51cf..28fdf8303dff 100644 --- a/arch/csky/Kconfig +++ b/arch/csky/Kconfig @@ -28,7 +28,6 @@ config CSKY select GENERIC_LIB_UCMPDI2 select GENERIC_ALLOCATOR select GENERIC_ATOMIC64 - select GENERIC_CLOCKEVENTS select GENERIC_CPU_DEVICES select GENERIC_IRQ_CHIP select GENERIC_IRQ_PROBE diff --git 
a/arch/h8300/Kconfig b/arch/h8300/Kconfig index 7945de067e9f..3e3e0f16f7e0 100644 --- a/arch/h8300/Kconfig +++ b/arch/h8300/Kconfig @@ -12,7 +12,6 @@ config H8300 select FRAME_POINTER select GENERIC_CPU_DEVICES select MODULES_USE_ELF_RELA - select GENERIC_CLOCKEVENTS select COMMON_CLK select ARCH_WANT_FRAME_POINTERS select OF diff --git a/arch/hexagon/Kconfig b/arch/hexagon/Kconfig index f2afabbadd43..6e00c16a36b5 100644 --- a/arch/hexagon/Kconfig +++ b/arch/hexagon/Kconfig @@ -27,7 +27,6 @@ config HEXAGON select GENERIC_IOMAP select GENERIC_SMP_IDLE_THREAD select STACKTRACE_SUPPORT - select GENERIC_CLOCKEVENTS select GENERIC_CLOCKEVENTS_BROADCAST select MODULES_USE_ELF_RELA select GENERIC_CPU_DEVICES diff --git a/arch/m68k/Kconfig.cpu b/arch/m68k/Kconfig.cpu index 648054d4f860..ce09f993d858 100644 --- a/arch/m68k/Kconfig.cpu +++ b/arch/m68k/Kconfig.cpu @@ -318,7 +318,6 @@ config M54xx config COLDFIRE_PIT_TIMER bool - select GENERIC_CLOCKEVENTS config COLDFIRE_TIMERS bool diff --git a/arch/microblaze/Kconfig b/arch/microblaze/Kconfig index 33925ffed68f..2f0d3f431faf 100644 --- a/arch/microblaze/Kconfig +++ b/arch/microblaze/Kconfig @@ -17,7 +17,6 @@ config MICROBLAZE select COMMON_CLK select DMA_DIRECT_REMAP if MMU select GENERIC_ATOMIC64 - select GENERIC_CLOCKEVENTS select GENERIC_CPU_DEVICES select GENERIC_IDLE_POLL_SETUP select GENERIC_IRQ_PROBE diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index 2000bb2b0220..077c4ae09550 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -21,7 +21,6 @@ config MIPS select CPU_NO_EFFICIENT_FFS if (TARGET_ISA_REV < 1) select CPU_PM if CPU_IDLE select GENERIC_ATOMIC64 if !64BIT - select GENERIC_CLOCKEVENTS select GENERIC_CMOS_UPDATE select GENERIC_CPU_AUTOPROBE select GENERIC_GETTIMEOFDAY diff --git a/arch/nds32/Kconfig b/arch/nds32/Kconfig index e8e541fd2267..62313902d75d 100644 --- a/arch/nds32/Kconfig +++ b/arch/nds32/Kconfig @@ -17,7 +17,6 @@ config NDS32 select DMA_DIRECT_REMAP select GENERIC_ATOMIC64 select GENERIC_CPU_DEVICES - select GENERIC_CLOCKEVENTS select GENERIC_IRQ_CHIP select GENERIC_IRQ_SHOW select GENERIC_IOREMAP diff --git a/arch/nios2/Kconfig b/arch/nios2/Kconfig index c7c6ba6bec9d..c24955c81c92 100644 --- a/arch/nios2/Kconfig +++ b/arch/nios2/Kconfig @@ -10,7 +10,6 @@ config NIOS2 select COMMON_CLK select TIMER_OF select GENERIC_ATOMIC64 - select GENERIC_CLOCKEVENTS select GENERIC_CPU_DEVICES select GENERIC_IRQ_PROBE select GENERIC_IRQ_SHOW diff --git a/arch/openrisc/Kconfig b/arch/openrisc/Kconfig index 6233c6293180..591acc5990dc 100644 --- a/arch/openrisc/Kconfig +++ b/arch/openrisc/Kconfig @@ -24,7 +24,6 @@ config OPENRISC select GENERIC_CPU_DEVICES select HAVE_UID16 select GENERIC_ATOMIC64 - select GENERIC_CLOCKEVENTS select GENERIC_CLOCKEVENTS_BROADCAST select GENERIC_STRNCPY_FROM_USER select GENERIC_STRNLEN_USER diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index e9f13fe08492..57e2c75f76e9 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -163,7 +163,6 @@ config PPC select EDAC_ATOMIC_SCRUB select EDAC_SUPPORT select GENERIC_ATOMIC64 if PPC32 - select GENERIC_CLOCKEVENTS select GENERIC_CLOCKEVENTS_BROADCAST if SMP select GENERIC_CMOS_UPDATE select GENERIC_CPU_AUTOPROBE diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index 44377fd7860e..3842bbb4fe62 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -37,7 +37,6 @@ config RISCV select EDAC_SUPPORT select GENERIC_ARCH_TOPOLOGY if SMP select GENERIC_ATOMIC64 if !64BIT - select GENERIC_CLOCKEVENTS select GENERIC_EARLY_IOREMAP select 
GENERIC_GETTIMEOFDAY if HAVE_GENERIC_VDSO select GENERIC_IOREMAP diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index 4a2a12be04c9..db246781844d 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -116,7 +116,6 @@ config S390 select CLONE_BACKWARDS2 select DMA_OPS if PCI select DYNAMIC_FTRACE if FUNCTION_TRACER - select GENERIC_CLOCKEVENTS select GENERIC_CPU_AUTOPROBE select GENERIC_CPU_VULNERABILITIES select GENERIC_FIND_FIRST_BIT diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig index 159da4ed578f..5fa580219a86 100644 --- a/arch/sh/Kconfig +++ b/arch/sh/Kconfig @@ -16,7 +16,6 @@ config SUPERH select CPU_NO_EFFICIENT_FFS select DMA_DECLARE_COHERENT select GENERIC_ATOMIC64 - select GENERIC_CLOCKEVENTS select GENERIC_CMOS_UPDATE if SH_SH03 || SH_DREAMCAST select GENERIC_IDLE_POLL_SETUP select GENERIC_IRQ_SHOW diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig index a6ca135442f9..718c51cf2c6c 100644 --- a/arch/sparc/Kconfig +++ b/arch/sparc/Kconfig @@ -39,7 +39,6 @@ config SPARC select HAVE_EBPF_JIT if SPARC64 select HAVE_DEBUG_BUGVERBOSE select GENERIC_SMP_IDLE_THREAD - select GENERIC_CLOCKEVENTS select GENERIC_STRNCPY_FROM_USER select GENERIC_STRNLEN_USER select MODULES_USE_ELF_RELA diff --git a/arch/um/Kconfig b/arch/um/Kconfig index 4b799fad8b48..43333e36e0ba 100644 --- a/arch/um/Kconfig +++ b/arch/um/Kconfig @@ -17,7 +17,6 @@ config UML select NO_DMA select GENERIC_IRQ_SHOW select GENERIC_CPU_DEVICES - select GENERIC_CLOCKEVENTS select HAVE_GCC_PLUGINS select SET_FS select TTY # Needed for line.c diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index f6946b81f74a..0498d7596ccc 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -108,7 +108,6 @@ config X86 select DCACHE_WORD_ACCESS select EDAC_ATOMIC_SCRUB select EDAC_SUPPORT - select GENERIC_CLOCKEVENTS select GENERIC_CLOCKEVENTS_BROADCAST if X86_64 || (X86_32 && X86_LOCAL_APIC) select GENERIC_CLOCKEVENTS_MIN_ADJUST select GENERIC_CMOS_UPDATE diff --git a/arch/xtensa/Kconfig b/arch/xtensa/Kconfig index d0dfa50bd0bb..2611ba336af8 100644 --- a/arch/xtensa/Kconfig +++ b/arch/xtensa/Kconfig @@ -16,7 +16,6 @@ config XTENSA select COMMON_CLK select DMA_REMAP if MMU select GENERIC_ATOMIC64 - select GENERIC_CLOCKEVENTS select GENERIC_IRQ_SHOW select GENERIC_PCI_IOMAP select GENERIC_SCHED_CLOCK diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index c6867f29d279..9a41848b6ebb 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig @@ -28,7 +28,7 @@ config GENERIC_TIME_VSYSCALL # The generic clock events infrastructure config GENERIC_CLOCKEVENTS - bool + def_bool !LEGACY_TIMER_TICK # Architecture can handle broadcast in a driver-agnostic way config ARCH_HAS_TICK_BROADCAST -- cgit v1.2.3 From a38283da05d321fa1fce38ea3cf41c9f1dbd1f21 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Tue, 27 Oct 2020 13:50:19 -0500 Subject: printk: ringbuffer: Replace zero-length array with flexible-array member MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There is a regular need in the kernel to provide a way to declare having a dynamically sized set of trailing elements in a structure. Kernel code should always use “flexible array members”[1] for these cases. The older style of one-element or zero-length arrays should no longer be used[2]. [1] https://en.wikipedia.org/wiki/Flexible_array_member [2] https://www.kernel.org/doc/html/v5.9/process/deprecated.html#zero-length-and-one-element-arrays Signed-off-by: Gustavo A. R. 
Silva --- kernel/printk/printk_ringbuffer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/printk/printk_ringbuffer.c b/kernel/printk/printk_ringbuffer.c index 24a960a89aa8..6b1525685277 100644 --- a/kernel/printk/printk_ringbuffer.c +++ b/kernel/printk/printk_ringbuffer.c @@ -345,7 +345,7 @@ DESC_ID((id) - DESCS_COUNT(desc_ring)) */ struct prb_data_block { unsigned long id; - char data[0]; + char data[]; }; /* -- cgit v1.2.3 From c1acb4ac1a892cf08d27efcb964ad281728b0545 Mon Sep 17 00:00:00 2001 From: Qiujun Huang Date: Fri, 30 Oct 2020 00:19:05 +0800 Subject: tracing: Fix out of bounds write in get_trace_buf The nesting count of trace_printk allows for 4 levels of nesting. The nesting counter starts at zero and is incremented before being used to retrieve the current context's buffer. But the index to the buffer uses the nesting counter after it was incremented, and not its original number, which in needs to do. Link: https://lkml.kernel.org/r/20201029161905.4269-1-hqjagain@gmail.com Cc: stable@vger.kernel.org Fixes: 3d9622c12c887 ("tracing: Add barrier to trace_printk() buffer nesting modification") Signed-off-by: Qiujun Huang Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 528971714fc6..daa96215e294 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3132,7 +3132,7 @@ static char *get_trace_buf(void) /* Interrupts must see nesting incremented before we use the buffer */ barrier(); - return &buffer->buffer[buffer->nesting][0]; + return &buffer->buffer[buffer->nesting - 1][0]; } static void put_trace_buf(void) -- cgit v1.2.3 From ee11b93f95eabdf8198edd4668bf9102e7248270 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 29 Oct 2020 17:31:45 -0400 Subject: ftrace: Fix recursion check for NMI test The code that checks recursion will work to only do the recursion check once if there's nested checks. The top one will do the check, the other nested checks will see recursion was already checked and return zero for its "bit". On the return side, nothing will be done if the "bit" is zero. The problem is that zero is returned for the "good" bit when in NMI context. This will set the bit for NMIs making it look like *all* NMI tracing is recursing, and prevent tracing of anything in NMI context! The simple fix is to return "bit + 1" and subtract that bit on the end to get the real bit. 
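For orientation, callers inside kernel/trace use this helper roughly as sketched below (simplified; the start/max constants vary per call site and the traced-work placeholder is an assumption). With the old convention, a "good" result of 0 in NMI context was indistinguishable from "nothing to clear", so the NMI bit was never released.

        /* Illustrative caller of the recursion-protection helpers. */
        static void example_traced_callback(void)
        {
                int bit;

                /* Negative return means this context is already inside the
                 * tracer (recursion); otherwise a cookie to hand back. */
                bit = trace_test_and_set_recursion(TRACE_LIST_START, TRACE_LIST_MAX);
                if (bit < 0)
                        return;

                /* ... the actual tracing work happens here ... */

                trace_clear_recursion(bit);
        }
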
Cc: stable@vger.kernel.org Fixes: edc15cafcbfa3 ("tracing: Avoid unnecessary multiple recursion checks") Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index f3f5e77123ad..fee535a89560 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -698,7 +698,7 @@ static __always_inline int trace_test_and_set_recursion(int start, int max) current->trace_recursion = val; barrier(); - return bit; + return bit + 1; } static __always_inline void trace_clear_recursion(int bit) @@ -708,6 +708,7 @@ static __always_inline void trace_clear_recursion(int bit) if (!bit) return; + bit--; bit = 1 << bit; val &= ~bit; -- cgit v1.2.3 From 726b3d3f141fba6f841d715fc4d8a4a84f02c02a Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 29 Oct 2020 19:35:08 -0400 Subject: ftrace: Handle tracing when switching between context When an interrupt or NMI comes in and switches the context, there's a delay from when the preempt_count() shows the update. As the preempt_count() is used to detect recursion having each context have its own bit get set when tracing starts, and if that bit is already set, it is considered a recursion and the function exits. But if this happens in that section where context has changed but preempt_count() has not been updated, this will be incorrectly flagged as a recursion. To handle this case, create another bit call TRANSITION and test it if the current context bit is already set. Flag the call as a recursion if the TRANSITION bit is already set, and if not, set it and continue. The TRANSITION bit will be cleared normally on the return of the function that set it, or if the current context bit is clear, set it and clear the TRANSITION bit to allow for another transition between the current context and an even higher one. Cc: stable@vger.kernel.org Fixes: edc15cafcbfa3 ("tracing: Avoid unnecessary multiple recursion checks") Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.h | 23 +++++++++++++++++++++-- kernel/trace/trace_selftest.c | 9 +++++++-- 2 files changed, 28 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index fee535a89560..1dadef445cd1 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -637,6 +637,12 @@ enum { * function is called to clear it. */ TRACE_GRAPH_NOTRACE_BIT, + + /* + * When transitioning between context, the preempt_count() may + * not be correct. Allow for a single recursion to cover this case. + */ + TRACE_TRANSITION_BIT, }; #define trace_recursion_set(bit) do { (current)->trace_recursion |= (1<<(bit)); } while (0) @@ -691,8 +697,21 @@ static __always_inline int trace_test_and_set_recursion(int start, int max) return 0; bit = trace_get_context_bit() + start; - if (unlikely(val & (1 << bit))) - return -1; + if (unlikely(val & (1 << bit))) { + /* + * It could be that preempt_count has not been updated during + * a switch between contexts. Allow for a single recursion. 
+ */ + bit = TRACE_TRANSITION_BIT; + if (trace_recursion_test(bit)) + return -1; + trace_recursion_set(bit); + barrier(); + return bit + 1; + } + + /* Normal check passed, clear the transition to allow it again */ + trace_recursion_clear(TRACE_TRANSITION_BIT); val |= 1 << bit; current->trace_recursion = val; diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index b5e3496cf803..4738ad48a667 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -492,8 +492,13 @@ trace_selftest_function_recursion(void) unregister_ftrace_function(&test_rec_probe); ret = -1; - if (trace_selftest_recursion_cnt != 1) { - pr_cont("*callback not called once (%d)* ", + /* + * Recursion allows for transitions between context, + * and may call the callback twice. + */ + if (trace_selftest_recursion_cnt != 1 && + trace_selftest_recursion_cnt != 2) { + pr_cont("*callback not called once (or twice) (%d)* ", trace_selftest_recursion_cnt); goto out; } -- cgit v1.2.3 From e9696d259d0fb5d239e8c28ca41089838ea76d13 Mon Sep 17 00:00:00 2001 From: Stefano Stabellini Date: Mon, 26 Oct 2020 17:02:14 -0700 Subject: swiotlb: fix "x86: Don't panic if can not alloc buffer for swiotlb" kernel/dma/swiotlb.c:swiotlb_init gets called first and tries to allocate a buffer for the swiotlb. It does so by calling memblock_alloc_low(PAGE_ALIGN(bytes), PAGE_SIZE); If the allocation must fail, no_iotlb_memory is set. Later during initialization swiotlb-xen comes in (drivers/xen/swiotlb-xen.c:xen_swiotlb_init) and given that io_tlb_start is != 0, it thinks the memory is ready to use when actually it is not. When the swiotlb is actually needed, swiotlb_tbl_map_single gets called and since no_iotlb_memory is set the kernel panics. Instead, if swiotlb-xen.c:xen_swiotlb_init knew the swiotlb hadn't been initialized, it would do the initialization itself, which might still succeed. Fix the panic by setting io_tlb_start to 0 on swiotlb initialization failure, and also by setting no_iotlb_memory to false on swiotlb initialization success. 
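The consumer-side check that this state reset protects looks roughly like the sketch below; this is illustrative, not the literal drivers/xen/swiotlb-xen.c code, and the helper name is assumed.

        static int __init example_xen_swiotlb_early_init(void)
        {
                /* Only build a Xen-specific bounce buffer when the core
                 * swiotlb did not already set one up.  Before this fix a
                 * failed swiotlb_init() left a stale io_tlb_start behind,
                 * so this test passed even though no_iotlb_memory was set,
                 * and the first swiotlb_tbl_map_single() call panicked. */
                if (io_tlb_start)
                        return 0;

                return example_xen_alloc_and_register_swiotlb();
        }
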
Fixes: ac2cbab21f31 ("x86: Don't panic if can not alloc buffer for swiotlb") Reported-by: Elliott Mitchell Tested-by: Elliott Mitchell Signed-off-by: Stefano Stabellini Reviewed-by: Christoph Hellwig Cc: stable@vger.kernel.org Signed-off-by: Konrad Rzeszutek Wilk --- kernel/dma/swiotlb.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index b4eea0abc3f0..54078f0d4c87 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -229,6 +229,7 @@ int __init swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose) io_tlb_orig_addr[i] = INVALID_PHYS_ADDR; } io_tlb_index = 0; + no_iotlb_memory = false; if (verbose) swiotlb_print_info(); @@ -260,9 +261,11 @@ swiotlb_init(int verbose) if (vstart && !swiotlb_init_with_tbl(vstart, io_tlb_nslabs, verbose)) return; - if (io_tlb_start) + if (io_tlb_start) { memblock_free_early(io_tlb_start, PAGE_ALIGN(io_tlb_nslabs << IO_TLB_SHIFT)); + io_tlb_start = 0; + } pr_warn("Cannot allocate buffer"); no_iotlb_memory = true; } @@ -360,6 +363,7 @@ swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs) io_tlb_orig_addr[i] = INVALID_PHYS_ADDR; } io_tlb_index = 0; + no_iotlb_memory = false; swiotlb_print_info(); -- cgit v1.2.3 From fc0021aa340af65a0a37d77be39e22aa886a6132 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 23 Oct 2020 08:33:09 +0200 Subject: swiotlb: remove the tbl_dma_addr argument to swiotlb_tbl_map_single The tbl_dma_addr argument is used to check the DMA boundary for the allocations, and thus needs to be a dma_addr_t. swiotlb-xen instead passed a physical address, which could lead to incorrect results for strange offsets. Fix this by removing the parameter entirely and hard code the DMA address for io_tlb_start instead. Fixes: 91ffe4ad534a ("swiotlb-xen: introduce phys_to_dma/dma_to_phys translations") Signed-off-by: Christoph Hellwig Reviewed-by: Stefano Stabellini Signed-off-by: Konrad Rzeszutek Wilk --- drivers/iommu/intel/iommu.c | 5 ++--- drivers/xen/swiotlb-xen.c | 3 +-- include/linux/swiotlb.h | 10 +++------- kernel/dma/swiotlb.c | 16 ++++++---------- 4 files changed, 12 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 8651f6d4dfa0..6b560e6f1930 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -3815,9 +3815,8 @@ bounce_map_single(struct device *dev, phys_addr_t paddr, size_t size, * page aligned, we don't need to use a bounce page. 
*/ if (!IS_ALIGNED(paddr | size, VTD_PAGE_SIZE)) { - tlb_addr = swiotlb_tbl_map_single(dev, - phys_to_dma_unencrypted(dev, io_tlb_start), - paddr, size, aligned_size, dir, attrs); + tlb_addr = swiotlb_tbl_map_single(dev, paddr, size, + aligned_size, dir, attrs); if (tlb_addr == DMA_MAPPING_ERROR) { goto swiotlb_error; } else { diff --git a/drivers/xen/swiotlb-xen.c b/drivers/xen/swiotlb-xen.c index 71ce1b7a23d1..2b385c1b4a99 100644 --- a/drivers/xen/swiotlb-xen.c +++ b/drivers/xen/swiotlb-xen.c @@ -395,8 +395,7 @@ static dma_addr_t xen_swiotlb_map_page(struct device *dev, struct page *page, */ trace_swiotlb_bounced(dev, dev_addr, size, swiotlb_force); - map = swiotlb_tbl_map_single(dev, virt_to_phys(xen_io_tlb_start), - phys, size, size, dir, attrs); + map = swiotlb_tbl_map_single(dev, phys, size, size, dir, attrs); if (map == (phys_addr_t)DMA_MAPPING_ERROR) return DMA_MAPPING_ERROR; diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h index 513913ff7486..3bb72266a75a 100644 --- a/include/linux/swiotlb.h +++ b/include/linux/swiotlb.h @@ -45,13 +45,9 @@ enum dma_sync_target { SYNC_FOR_DEVICE = 1, }; -extern phys_addr_t swiotlb_tbl_map_single(struct device *hwdev, - dma_addr_t tbl_dma_addr, - phys_addr_t phys, - size_t mapping_size, - size_t alloc_size, - enum dma_data_direction dir, - unsigned long attrs); +phys_addr_t swiotlb_tbl_map_single(struct device *hwdev, phys_addr_t phys, + size_t mapping_size, size_t alloc_size, + enum dma_data_direction dir, unsigned long attrs); extern void swiotlb_tbl_unmap_single(struct device *hwdev, phys_addr_t tlb_addr, diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index 54078f0d4c87..781b9dca197c 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -445,14 +445,11 @@ static void swiotlb_bounce(phys_addr_t orig_addr, phys_addr_t tlb_addr, } } -phys_addr_t swiotlb_tbl_map_single(struct device *hwdev, - dma_addr_t tbl_dma_addr, - phys_addr_t orig_addr, - size_t mapping_size, - size_t alloc_size, - enum dma_data_direction dir, - unsigned long attrs) +phys_addr_t swiotlb_tbl_map_single(struct device *hwdev, phys_addr_t orig_addr, + size_t mapping_size, size_t alloc_size, + enum dma_data_direction dir, unsigned long attrs) { + dma_addr_t tbl_dma_addr = phys_to_dma_unencrypted(hwdev, io_tlb_start); unsigned long flags; phys_addr_t tlb_addr; unsigned int nslots, stride, index, wrap; @@ -671,9 +668,8 @@ dma_addr_t swiotlb_map(struct device *dev, phys_addr_t paddr, size_t size, trace_swiotlb_bounced(dev, phys_to_dma(dev, paddr), size, swiotlb_force); - swiotlb_addr = swiotlb_tbl_map_single(dev, - phys_to_dma_unencrypted(dev, io_tlb_start), - paddr, size, size, dir, attrs); + swiotlb_addr = swiotlb_tbl_map_single(dev, paddr, size, size, dir, + attrs); if (swiotlb_addr == (phys_addr_t)DMA_MAPPING_ERROR) return DMA_MAPPING_ERROR; -- cgit v1.2.3 From 8aaeed81fcb917b5cf4976932c5baefa1471128b Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 2 Nov 2020 03:41:00 -0800 Subject: bpf: Fix error path in htab_map_alloc() syzbot was able to trigger a use-after-free in htab_map_alloc() [1] htab_map_alloc() lacks a call to lockdep_unregister_key() in its error path. 
lockdep_register_key() and lockdep_unregister_key() can not fail, it seems better to use them right after htab allocation and before htab freeing, avoiding more goto/labels in htab_map_alloc() [1] BUG: KASAN: use-after-free in lockdep_register_key+0x356/0x3e0 kernel/locking/lockdep.c:1182 Read of size 8 at addr ffff88805fa67ad8 by task syz-executor.3/2356 CPU: 1 PID: 2356 Comm: syz-executor.3 Not tainted 5.9.0-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:77 [inline] dump_stack+0x107/0x163 lib/dump_stack.c:118 print_address_description.constprop.0.cold+0xae/0x4c8 mm/kasan/report.c:385 __kasan_report mm/kasan/report.c:545 [inline] kasan_report.cold+0x1f/0x37 mm/kasan/report.c:562 lockdep_register_key+0x356/0x3e0 kernel/locking/lockdep.c:1182 htab_init_buckets kernel/bpf/hashtab.c:144 [inline] htab_map_alloc+0x6c5/0x14a0 kernel/bpf/hashtab.c:521 find_and_alloc_map kernel/bpf/syscall.c:122 [inline] map_create kernel/bpf/syscall.c:825 [inline] __do_sys_bpf+0xa80/0x5180 kernel/bpf/syscall.c:4381 do_syscall_64+0x2d/0x70 arch/x86/entry/common.c:46 entry_SYSCALL_64_after_hwframe+0x44/0xa9 RIP: 0033:0x45deb9 Code: 0d b4 fb ff c3 66 2e 0f 1f 84 00 00 00 00 00 66 90 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 0f 83 db b3 fb ff c3 66 2e 0f 1f 84 00 00 00 00 RSP: 002b:00007f0eafee1c78 EFLAGS: 00000246 ORIG_RAX: 0000000000000141 RAX: ffffffffffffffda RBX: 0000000000001a00 RCX: 000000000045deb9 RDX: 0000000000000040 RSI: 0000000020000040 RDI: 405a020000000000 RBP: 000000000118bf60 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000246 R12: 000000000118bf2c R13: 00007ffd3cf9eabf R14: 00007f0eafee29c0 R15: 000000000118bf2c Allocated by task 2053: kasan_save_stack+0x1b/0x40 mm/kasan/common.c:48 kasan_set_track mm/kasan/common.c:56 [inline] __kasan_kmalloc.constprop.0+0xc2/0xd0 mm/kasan/common.c:461 kmalloc include/linux/slab.h:554 [inline] kzalloc include/linux/slab.h:666 [inline] htab_map_alloc+0xdf/0x14a0 kernel/bpf/hashtab.c:454 find_and_alloc_map kernel/bpf/syscall.c:122 [inline] map_create kernel/bpf/syscall.c:825 [inline] __do_sys_bpf+0xa80/0x5180 kernel/bpf/syscall.c:4381 do_syscall_64+0x2d/0x70 arch/x86/entry/common.c:46 entry_SYSCALL_64_after_hwframe+0x44/0xa9 Freed by task 2053: kasan_save_stack+0x1b/0x40 mm/kasan/common.c:48 kasan_set_track+0x1c/0x30 mm/kasan/common.c:56 kasan_set_free_info+0x1b/0x30 mm/kasan/generic.c:355 __kasan_slab_free+0x102/0x140 mm/kasan/common.c:422 slab_free_hook mm/slub.c:1544 [inline] slab_free_freelist_hook+0x5d/0x150 mm/slub.c:1577 slab_free mm/slub.c:3142 [inline] kfree+0xdb/0x360 mm/slub.c:4124 htab_map_alloc+0x3f9/0x14a0 kernel/bpf/hashtab.c:549 find_and_alloc_map kernel/bpf/syscall.c:122 [inline] map_create kernel/bpf/syscall.c:825 [inline] __do_sys_bpf+0xa80/0x5180 kernel/bpf/syscall.c:4381 do_syscall_64+0x2d/0x70 arch/x86/entry/common.c:46 entry_SYSCALL_64_after_hwframe+0x44/0xa9 The buggy address belongs to the object at ffff88805fa67800 which belongs to the cache kmalloc-1k of size 1024 The buggy address is located 728 bytes inside of 1024-byte region [ffff88805fa67800, ffff88805fa67c00) The buggy address belongs to the page: page:000000003c5582c4 refcount:1 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x5fa60 head:000000003c5582c4 order:3 compound_mapcount:0 compound_pincount:0 flags: 0xfff00000010200(slab|head) raw: 00fff00000010200 ffffea0000bc1200 0000000200000002 
ffff888010041140 raw: 0000000000000000 0000000000100010 00000001ffffffff 0000000000000000 page dumped because: kasan: bad access detected Memory state around the buggy address: ffff88805fa67980: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ffff88805fa67a00: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ^ ffff88805fa67b00: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ffff88805fa67b80: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb Fixes: c50eb518e262 ("bpf: Use separate lockdep class for each hashtab") Reported-by: syzbot Signed-off-by: Eric Dumazet Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20201102114100.3103180-1-eric.dumazet@gmail.com --- kernel/bpf/hashtab.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index da59ba978d17..23f73d4649c9 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -141,7 +141,6 @@ static void htab_init_buckets(struct bpf_htab *htab) { unsigned i; - lockdep_register_key(&htab->lockdep_key); for (i = 0; i < htab->n_buckets; i++) { INIT_HLIST_NULLS_HEAD(&htab->buckets[i].head, i); if (htab_use_raw_lock(htab)) { @@ -455,6 +454,8 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) if (!htab) return ERR_PTR(-ENOMEM); + lockdep_register_key(&htab->lockdep_key); + bpf_map_init_from_attr(&htab->map, attr); if (percpu_lru) { @@ -546,6 +547,7 @@ free_map_locked: free_charge: bpf_map_charge_finish(&htab->map.memory); free_htab: + lockdep_unregister_key(&htab->lockdep_key); kfree(htab); return ERR_PTR(err); } @@ -1364,9 +1366,9 @@ static void htab_map_free(struct bpf_map *map) free_percpu(htab->extra_elems); bpf_map_area_free(htab->buckets); - lockdep_unregister_key(&htab->lockdep_key); for (i = 0; i < HASHTAB_MAP_LOCK_COUNT; i++) free_percpu(htab->map_locked[i]); + lockdep_unregister_key(&htab->lockdep_key); kfree(htab); } -- cgit v1.2.3 From 906695e59324635c62b5ae59df111151a546ca66 Mon Sep 17 00:00:00 2001 From: Qiujun Huang Date: Sat, 31 Oct 2020 16:57:14 +0800 Subject: tracing: Fix the checking of stackidx in __ftrace_trace_stack The array size is FTRACE_KSTACK_NESTING, so the index FTRACE_KSTACK_NESTING is illegal too. And fix two typos by the way. Link: https://lkml.kernel.org/r/20201031085714.2147-1-hqjagain@gmail.com Signed-off-by: Qiujun Huang Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index daa96215e294..410cfeb16db5 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2750,7 +2750,7 @@ trace_event_buffer_lock_reserve(struct trace_buffer **current_rb, /* * If tracing is off, but we have triggers enabled * we still need to look at the event data. Use the temp_buffer - * to store the trace event for the tigger to use. It's recusive + * to store the trace event for the trigger to use. It's recursive * safe and will not be recorded anywhere. */ if (!entry && trace_file->flags & EVENT_FILE_FL_TRIGGER_COND) { @@ -2952,7 +2952,7 @@ static void __ftrace_trace_stack(struct trace_buffer *buffer, stackidx = __this_cpu_inc_return(ftrace_stack_reserve) - 1; /* This should never happen. 
If it does, yell once and skip */ - if (WARN_ON_ONCE(stackidx > FTRACE_KSTACK_NESTING)) + if (WARN_ON_ONCE(stackidx >= FTRACE_KSTACK_NESTING)) goto out; /* -- cgit v1.2.3 From 23a881852f3eff6a7ba8d240b57de076763fdef9 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Fri, 30 Oct 2020 12:51:08 +0530 Subject: cpufreq: schedutil: Don't skip freq update if need_freq_update is set The cpufreq policy's frequency limits (min/max) can get changed at any point of time, while schedutil is trying to update the next frequency. Though the schedutil governor has necessary locking and support in place to make sure we don't miss any of those updates, there is a corner case where the governor will find that the CPU is already running at the desired frequency and so may skip an update. For example, consider that the CPU can run at 1 GHz, 1.2 GHz and 1.4 GHz and is running at 1 GHz currently. Schedutil tries to update the frequency to 1.2 GHz, during this time the policy limits get changed as policy->min = 1.4 GHz. As schedutil (and cpufreq core) does clamp the frequency at various instances, we will eventually set the frequency to 1.4 GHz, while we will save 1.2 GHz in sg_policy->next_freq. Now lets say the policy limits get changed back at this time with policy->min as 1 GHz. The next time schedutil is invoked by the scheduler, we will reevaluate the next frequency (because need_freq_update will get set due to limits change event) and lets say we want to set the frequency to 1.2 GHz again. At this point sugov_update_next_freq() will find the next_freq == current_freq and will abort the update, while the CPU actually runs at 1.4 GHz. Until now need_freq_update was used as a flag to indicate that the policy's frequency limits have changed, and that we should consider the new limits while reevaluating the next frequency. This patch fixes the above mentioned issue by extending the purpose of the need_freq_update flag. If this flag is set now, the schedutil governor will not try to abort a frequency change even if next_freq == current_freq. As similar behavior is required in the case of CPUFREQ_NEED_UPDATE_LIMITS flag as well, need_freq_update will never be set to false if that flag is set for the driver. We also don't need to consider the need_freq_update flag in sugov_update_single() anymore to handle the special case of busy CPU, as we won't abort a frequency update anymore. Reported-by: zhuguangqing Suggested-by: Rafael J. Wysocki Signed-off-by: Viresh Kumar [ rjw: Rearrange code to avoid a branch ] Signed-off-by: Rafael J. 
Wysocki --- kernel/sched/cpufreq_schedutil.c | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index c03a5775d019..d73bccde2720 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -102,9 +102,12 @@ static bool sugov_should_update_freq(struct sugov_policy *sg_policy, u64 time) static bool sugov_update_next_freq(struct sugov_policy *sg_policy, u64 time, unsigned int next_freq) { - if (sg_policy->next_freq == next_freq && - !cpufreq_driver_test_flags(CPUFREQ_NEED_UPDATE_LIMITS)) - return false; + if (!sg_policy->need_freq_update) { + if (sg_policy->next_freq == next_freq) + return false; + } else { + sg_policy->need_freq_update = cpufreq_driver_test_flags(CPUFREQ_NEED_UPDATE_LIMITS); + } sg_policy->next_freq = next_freq; sg_policy->last_freq_update_time = time; @@ -162,11 +165,9 @@ static unsigned int get_next_freq(struct sugov_policy *sg_policy, freq = map_util_freq(util, freq, max); - if (freq == sg_policy->cached_raw_freq && !sg_policy->need_freq_update && - !cpufreq_driver_test_flags(CPUFREQ_NEED_UPDATE_LIMITS)) + if (freq == sg_policy->cached_raw_freq && !sg_policy->need_freq_update) return sg_policy->next_freq; - sg_policy->need_freq_update = false; sg_policy->cached_raw_freq = freq; return cpufreq_driver_resolve_freq(policy, freq); } @@ -442,7 +443,6 @@ static void sugov_update_single(struct update_util_data *hook, u64 time, struct sugov_policy *sg_policy = sg_cpu->sg_policy; unsigned long util, max; unsigned int next_f; - bool busy; unsigned int cached_freq = sg_policy->cached_raw_freq; sugov_iowait_boost(sg_cpu, time, flags); @@ -453,9 +453,6 @@ static void sugov_update_single(struct update_util_data *hook, u64 time, if (!sugov_should_update_freq(sg_policy, time)) return; - /* Limits may have changed, don't skip frequency update */ - busy = !sg_policy->need_freq_update && sugov_cpu_is_busy(sg_cpu); - util = sugov_get_util(sg_cpu); max = sg_cpu->max; util = sugov_iowait_apply(sg_cpu, time, util, max); @@ -464,7 +461,7 @@ static void sugov_update_single(struct update_util_data *hook, u64 time, * Do not reduce the frequency if the CPU has not been idle * recently, as the reduction is likely to be premature then. 
*/ - if (busy && next_f < sg_policy->next_freq) { next_f = sg_policy->next_freq; /* Restore cached freq as next_freq has changed */ @@ -829,9 +826,10 @@ static int sugov_start(struct cpufreq_policy *policy) sg_policy->next_freq = 0; sg_policy->work_in_progress = false; sg_policy->limits_changed = false; - sg_policy->need_freq_update = false; sg_policy->cached_raw_freq = 0; + sg_policy->need_freq_update = cpufreq_driver_test_flags(CPUFREQ_NEED_UPDATE_LIMITS); + for_each_cpu(cpu, policy->cpus) { struct sugov_cpu *sg_cpu = &per_cpu(sugov_cpu, cpu); -- cgit v1.2.3 From 7b3c36fc4c231ca532120bbc0df67a12f09c1d96 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 1 Nov 2020 17:07:44 -0800 Subject: ptrace: fix task_join_group_stop() for the case when current is traced This testcase #include <stdio.h> #include <unistd.h> #include <signal.h> #include <sys/wait.h> #include <sys/ptrace.h> #include <pthread.h> #include <assert.h> void *tf(void *arg) { return NULL; } int main(void) { int pid = fork(); if (!pid) { kill(getpid(), SIGSTOP); pthread_t th; pthread_create(&th, NULL, tf, NULL); return 0; } waitpid(pid, NULL, WSTOPPED); ptrace(PTRACE_SEIZE, pid, 0, PTRACE_O_TRACECLONE); waitpid(pid, NULL, 0); ptrace(PTRACE_CONT, pid, 0,0); waitpid(pid, NULL, 0); int status; int thread = waitpid(-1, &status, 0); assert(thread > 0 && thread != pid); assert(status == 0x80137f); return 0; } fails and triggers WARN_ON_ONCE(!signr) in do_jobctl_trap(). This is because task_join_group_stop() has 2 problems when current is traced: 1. We can't rely on the "JOBCTL_STOP_PENDING" check, a stopped tracee can be woken up by the debugger and it can clone another thread which should join the group-stop. We need to check group_stop_count || SIGNAL_STOP_STOPPED. 2. If SIGNAL_STOP_STOPPED is already set, we should not increment sig->group_stop_count and add JOBCTL_STOP_CONSUME. The new thread should stop without another do_notify_parent_cldstop() report. To clarify, the problem is very old and we should blame ptrace_init_task(). But now that we have task_join_group_stop() it makes more sense to fix this helper to avoid the code duplication. Reported-by: syzbot+3485e3773f7da290eecc@syzkaller.appspotmail.com Signed-off-by: Oleg Nesterov Signed-off-by: Andrew Morton Cc: Jens Axboe Cc: Christian Brauner Cc: "Eric W .
Biederman" Cc: Zhiqiang Liu Cc: Tejun Heo Cc: Link: https://lkml.kernel.org/r/20201019134237.GA18810@redhat.com Signed-off-by: Linus Torvalds --- kernel/signal.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index a38b3edc6851..ef8f2a28d37c 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -391,16 +391,17 @@ static bool task_participate_group_stop(struct task_struct *task) void task_join_group_stop(struct task_struct *task) { + unsigned long mask = current->jobctl & JOBCTL_STOP_SIGMASK; + struct signal_struct *sig = current->signal; + + if (sig->group_stop_count) { + sig->group_stop_count++; + mask |= JOBCTL_STOP_CONSUME; + } else if (!(sig->flags & SIGNAL_STOP_STOPPED)) + return; + /* Have the new thread join an on-going signal group stop */ - unsigned long jobctl = current->jobctl; - if (jobctl & JOBCTL_STOP_PENDING) { - struct signal_struct *sig = current->signal; - unsigned long signr = jobctl & JOBCTL_STOP_SIGMASK; - unsigned long gstop = JOBCTL_STOP_PENDING | JOBCTL_STOP_CONSUME; - if (task_set_jobctl_pending(task, signr | gstop)) { - sig->group_stop_count++; - } - } + task_set_jobctl_pending(task, mask | JOBCTL_STOP_PENDING); } /* -- cgit v1.2.3 From 6993d0fdbee0eb38bfac350aa016f65ad11ed3b1 Mon Sep 17 00:00:00 2001 From: Zqiang Date: Sun, 1 Nov 2020 17:07:53 -0800 Subject: kthread_worker: prevent queuing delayed work from timer_fn when it is being canceled There is a small race window when a delayed work is being canceled and the work still might be queued from the timer_fn: CPU0 CPU1 kthread_cancel_delayed_work_sync() __kthread_cancel_work_sync() __kthread_cancel_work() work->canceling++; kthread_delayed_work_timer_fn() kthread_insert_work(); BUG: kthread_insert_work() should not get called when work->canceling is set. Signed-off-by: Zqiang Signed-off-by: Andrew Morton Reviewed-by: Petr Mladek Acked-by: Tejun Heo Cc: Link: https://lkml.kernel.org/r/20201014083030.16895-1-qiang.zhang@windriver.com Signed-off-by: Linus Torvalds --- kernel/kthread.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/kthread.c b/kernel/kthread.c index e29773c82b70..933a625621b8 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -897,7 +897,8 @@ void kthread_delayed_work_timer_fn(struct timer_list *t) /* Move the work from worker->delayed_work_list. */ WARN_ON_ONCE(list_empty(&work->node)); list_del_init(&work->node); - kthread_insert_work(worker, work, &worker->work_list); + if (!work->canceling) + kthread_insert_work(worker, work, &worker->work_list); raw_spin_unlock_irqrestore(&worker->lock, flags); } -- cgit v1.2.3 From 3b70ae4f5c4e050bdebeeefe0c369524f37917cf Mon Sep 17 00:00:00 2001 From: Lukas Bulwahn Date: Sun, 1 Nov 2020 17:08:10 -0800 Subject: kernel/hung_task.c: make type annotations consistent Commit 32927393dc1c ("sysctl: pass kernel pointers to ->proc_handler") removed various __user annotations from function signatures as part of its refactoring. It also removed the __user annotation for proc_dohung_task_timeout_secs() at its declaration in sched/sysctl.h, but not at its definition in kernel/hung_task.c. 
Hence, sparse complains: kernel/hung_task.c:271:5: error: symbol 'proc_dohung_task_timeout_secs' redeclared with different type (incompatible argument 3 (different address spaces)) Adjust the annotation at the definition fitting to that refactoring to make sparse happy again, which also resolves this warning from sparse: kernel/hung_task.c:277:52: warning: incorrect type in argument 3 (different address spaces) kernel/hung_task.c:277:52: expected void * kernel/hung_task.c:277:52: got void [noderef] __user *buffer No functional change. No change in object code. Signed-off-by: Lukas Bulwahn Signed-off-by: Andrew Morton Cc: Christoph Hellwig Cc: Tetsuo Handa Cc: Al Viro Cc: Andrey Ignatov Link: https://lkml.kernel.org/r/20201028130541.20320-1-lukas.bulwahn@gmail.com Signed-off-by: Linus Torvalds --- kernel/hung_task.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/hung_task.c b/kernel/hung_task.c index ce76f490126c..396ebaebea3f 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -225,8 +225,7 @@ static long hung_timeout_jiffies(unsigned long last_checked, * Process updating of timeout sysctl */ int proc_dohung_task_timeout_secs(struct ctl_table *table, int write, - void __user *buffer, - size_t *lenp, loff_t *ppos) + void *buffer, size_t *lenp, loff_t *ppos) { int ret; -- cgit v1.2.3 From b02414c8f045ab3b9afc816c3735bc98c5c3d262 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Mon, 2 Nov 2020 15:31:27 -0500 Subject: ring-buffer: Fix recursion protection transitions between interrupt context The recursion protection of the ring buffer depends on preempt_count() to be correct. But it is possible that the ring buffer gets called after an interrupt comes in but before it updates the preempt_count(). This will trigger a false positive in the recursion code. Use the same trick from the ftrace function callback recursion code which uses a "transition" bit that gets set, to allow for a single recursion for to handle transitions between contexts. Cc: stable@vger.kernel.org Fixes: 567cd4da54ff4 ("ring-buffer: User context bit recursion checking") Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ring_buffer.c | 58 ++++++++++++++++++++++++++++++++++++---------- 1 file changed, 46 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 7f45fd9d5a45..dc83b3fa9fe7 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -438,14 +438,16 @@ enum { }; /* * Used for which event context the event is in. - * NMI = 0 - * IRQ = 1 - * SOFTIRQ = 2 - * NORMAL = 3 + * TRANSITION = 0 + * NMI = 1 + * IRQ = 2 + * SOFTIRQ = 3 + * NORMAL = 4 * * See trace_recursive_lock() comment below for more details. */ enum { + RB_CTX_TRANSITION, RB_CTX_NMI, RB_CTX_IRQ, RB_CTX_SOFTIRQ, @@ -3014,10 +3016,10 @@ rb_wakeups(struct trace_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer) * a bit of overhead in something as critical as function tracing, * we use a bitmask trick. * - * bit 0 = NMI context - * bit 1 = IRQ context - * bit 2 = SoftIRQ context - * bit 3 = normal context. + * bit 1 = NMI context + * bit 2 = IRQ context + * bit 3 = SoftIRQ context + * bit 4 = normal context. * * This works because this is the order of contexts that can * preempt other contexts. 
A SoftIRQ never preempts an IRQ @@ -3040,6 +3042,30 @@ rb_wakeups(struct trace_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer) * The least significant bit can be cleared this way, and it * just so happens that it is the same bit corresponding to * the current context. + * + * Now the TRANSITION bit breaks the above slightly. The TRANSITION bit + * is set when a recursion is detected at the current context, and if + * the TRANSITION bit is already set, it will fail the recursion. + * This is needed because there's a lag between the changing of + * interrupt context and updating the preempt count. In this case, + * a false positive will be found. To handle this, one extra recursion + * is allowed, and this is done by the TRANSITION bit. If the TRANSITION + * bit is already set, then it is considered a recursion and the function + * ends. Otherwise, the TRANSITION bit is set, and that bit is returned. + * + * On the trace_recursive_unlock(), the TRANSITION bit will be the first + * to be cleared. Even if it wasn't the context that set it. That is, + * if an interrupt comes in while NORMAL bit is set and the ring buffer + * is called before preempt_count() is updated, since the check will + * be on the NORMAL bit, the TRANSITION bit will then be set. If an + * NMI then comes in, it will set the NMI bit, but when the NMI code + * does the trace_recursive_unlock() it will clear the TRANSTION bit + * and leave the NMI bit set. But this is fine, because the interrupt + * code that set the TRANSITION bit will then clear the NMI bit when it + * calls trace_recursive_unlock(). If another NMI comes in, it will + * set the TRANSITION bit and continue. + * + * Note: The TRANSITION bit only handles a single transition between context. */ static __always_inline int @@ -3055,8 +3081,16 @@ trace_recursive_lock(struct ring_buffer_per_cpu *cpu_buffer) bit = pc & NMI_MASK ? RB_CTX_NMI : pc & HARDIRQ_MASK ? RB_CTX_IRQ : RB_CTX_SOFTIRQ; - if (unlikely(val & (1 << (bit + cpu_buffer->nest)))) - return 1; + if (unlikely(val & (1 << (bit + cpu_buffer->nest)))) { + /* + * It is possible that this was called by transitioning + * between interrupt context, and preempt_count() has not + * been updated yet. In this case, use the TRANSITION bit. + */ + bit = RB_CTX_TRANSITION; + if (val & (1 << (bit + cpu_buffer->nest))) + return 1; + } val |= (1 << (bit + cpu_buffer->nest)); cpu_buffer->current_context = val; @@ -3071,8 +3105,8 @@ trace_recursive_unlock(struct ring_buffer_per_cpu *cpu_buffer) cpu_buffer->current_context - (1 << cpu_buffer->nest); } -/* The recursive locking above uses 4 bits */ -#define NESTED_BITS 4 +/* The recursive locking above uses 5 bits */ +#define NESTED_BITS 5 /** * ring_buffer_nest_start - Allow to trace while nested -- cgit v1.2.3 From 561ca66910bf597f170be5a7aa531c4e05f8e9be Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Mon, 2 Nov 2020 11:28:39 -0500 Subject: tracing: Make -ENOMEM the default error for parse_synth_field() parse_synth_field() returns a pointer and requires that errors get surrounded by ERR_PTR(). The ret variable is initialized to zero, but should never be used as zero, and if it is, it could cause a false return code and produce a NULL pointer dereference. It makes no sense to set ret to zero. Set ret to -ENOMEM (the most common error case), and have any other errors set it to something else. This removes the need to initialize ret on *every* error branch. 
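The "default to the most common error" pattern described above is easy to illustrate outside this file. The stand-alone sketch below is illustrative only (parse_name() is a hypothetical helper, not the kernel's parse_synth_field()): ret defaults to -ENOMEM so allocation-failure branches need no per-branch assignment, and only the uncommon paths override it.

	#include <errno.h>
	#include <stdlib.h>
	#include <string.h>

	/* Illustrative sketch: default the return code to the most common
	 * failure (-ENOMEM) and override it only on the uncommon paths.
	 */
	static int parse_name(const char *src, char **out)
	{
		int ret = -ENOMEM;	/* most error paths are allocation failures */
		char *name;

		name = strdup(src);
		if (!name)
			goto free;	/* no per-branch "ret = -ENOMEM;" needed */

		if (strchr(name, ' ')) {
			ret = -EINVAL;	/* only unusual paths set ret explicitly */
			goto free;
		}

		*out = name;
		return 0;
	free:
		free(name);
		return ret;
	}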
Fixes: 761a8c58db6b ("tracing, synthetic events: Replace buggy strcat() with seq_buf operations") Reported-by: Dan Carpenter Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_synth.c | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_synth.c b/kernel/trace/trace_events_synth.c index 84b7cab55291..881df991742a 100644 --- a/kernel/trace/trace_events_synth.c +++ b/kernel/trace/trace_events_synth.c @@ -584,7 +584,7 @@ static struct synth_field *parse_synth_field(int argc, const char **argv, { struct synth_field *field; const char *prefix = NULL, *field_type = argv[0], *field_name, *array; - int len, ret = 0; + int len, ret = -ENOMEM; struct seq_buf s; ssize_t size; @@ -617,10 +617,9 @@ static struct synth_field *parse_synth_field(int argc, const char **argv, len--; field->name = kmemdup_nul(field_name, len, GFP_KERNEL); - if (!field->name) { - ret = -ENOMEM; + if (!field->name) goto free; - } + if (!is_good_name(field->name)) { synth_err(SYNTH_ERR_BAD_NAME, errpos(field_name)); ret = -EINVAL; @@ -638,10 +637,9 @@ static struct synth_field *parse_synth_field(int argc, const char **argv, len += strlen(prefix); field->type = kzalloc(len, GFP_KERNEL); - if (!field->type) { - ret = -ENOMEM; + if (!field->type) goto free; - } + seq_buf_init(&s, field->type, len); if (prefix) seq_buf_puts(&s, prefix); @@ -653,6 +651,7 @@ static struct synth_field *parse_synth_field(int argc, const char **argv, } if (WARN_ON_ONCE(!seq_buf_buffer_left(&s))) goto free; + s.buffer[s.len] = '\0'; size = synth_field_size(field->type); @@ -666,10 +665,8 @@ static struct synth_field *parse_synth_field(int argc, const char **argv, len = sizeof("__data_loc ") + strlen(field->type) + 1; type = kzalloc(len, GFP_KERNEL); - if (!type) { - ret = -ENOMEM; + if (!type) goto free; - } seq_buf_init(&s, type, len); seq_buf_puts(&s, "__data_loc "); -- cgit v1.2.3 From 4761612ffe3c1655e58f1ef9cf867c6f67d46fe2 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Thu, 22 Oct 2020 13:45:52 +0200 Subject: kcsan: selftest: Ensure that address is at least PAGE_SIZE In preparation of supporting only addresses not within the NULL page, change the selftest to never use addresses that are less than PAGE_SIZE. Reviewed-by: Dmitry Vyukov Signed-off-by: Marco Elver Signed-off-by: Paul E. McKenney --- kernel/kcsan/selftest.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/kcsan/selftest.c b/kernel/kcsan/selftest.c index d98bc208d06d..9014a3a82cf9 100644 --- a/kernel/kcsan/selftest.c +++ b/kernel/kcsan/selftest.c @@ -33,6 +33,9 @@ static bool test_encode_decode(void) unsigned long addr; prandom_bytes(&addr, sizeof(addr)); + if (addr < PAGE_SIZE) + addr = PAGE_SIZE; + if (WARN_ON(!check_encodable(addr, size))) return false; -- cgit v1.2.3 From 55a2346c7ac4bbf6ee6972394237bf31e29a1c05 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Thu, 22 Oct 2020 13:45:53 +0200 Subject: kcsan: Never set up watchpoints on NULL pointers Avoid setting up watchpoints on NULL pointers, as otherwise we would crash inside the KCSAN runtime (when checking for value changes) instead of the instrumented code. Because that may be confusing, skip any address less than PAGE_SIZE. Reviewed-by: Dmitry Vyukov Signed-off-by: Marco Elver Signed-off-by: Paul E. 
McKenney --- kernel/kcsan/encoding.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/kcsan/encoding.h b/kernel/kcsan/encoding.h index 1a6db2f797ac..4f73db6d1407 100644 --- a/kernel/kcsan/encoding.h +++ b/kernel/kcsan/encoding.h @@ -48,7 +48,11 @@ static inline bool check_encodable(unsigned long addr, size_t size) { - return size <= MAX_ENCODABLE_SIZE; + /* + * While we can encode addrs<PAGE_SIZE, avoid crashing with a NULL + * pointer deref inside KCSAN. + */ + return addr >= PAGE_SIZE && size <= MAX_ENCODABLE_SIZE; } static inline long -- cgit v1.2.3 From f505d4346f6129d4708338491cf23ca9cf1d8f2a Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 16 Sep 2020 11:45:26 -0700 Subject: srcu: Use a more appropriate lockdep helper The lockdep_is_held() macro is defined as: #define lockdep_is_held(lock) lock_is_held(&(lock)->dep_map) This hides away the dereference, so that builds with !LOCKDEP don't break. This works in current kernels because the RCU_LOCKDEP_WARN() eliminates its condition at preprocessor time in !LOCKDEP kernels. However, later patches in this series will cause the compiler to see this condition even in !LOCKDEP kernels. This commit prepares for this upcoming change by switching from lock_is_held() to lockdep_is_held(). Signed-off-by: Jakub Kicinski -- CC: jiangshanlai@gmail.com CC: paulmck@kernel.org CC: josh@joshtriplett.org CC: rostedt@goodmis.org CC: mathieu.desnoyers@efficios.com Signed-off-by: Paul E. McKenney --- kernel/rcu/srcutree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index c13348ee80a5..6cd6fa2f272c 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -906,7 +906,7 @@ static void __synchronize_srcu(struct srcu_struct *ssp, bool do_norm) { struct rcu_synchronize rcu; - RCU_LOCKDEP_WARN(lock_is_held(&ssp->dep_map) || + RCU_LOCKDEP_WARN(lockdep_is_held(ssp) || lock_is_held(&rcu_bh_lock_map) || lock_is_held(&rcu_lock_map) || lock_is_held(&rcu_sched_lock_map), -- cgit v1.2.3 From 77dc174103fdb121c47621e9856d73704b7eddd2 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 15 Sep 2020 15:41:50 -0700 Subject: rcu-tasks: Convert rcu_tasks_wait_gp() for-loop to while-loop The infinite for-loop in rcu_tasks_wait_gp() has its only exit at the top of the loop, so this commit does the straightforward conversion to a while-loop, thus saving a few lines. Signed-off-by: Paul E. McKenney --- kernel/rcu/tasks.h | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index d5d9f2d03e8a..a93271fc2572 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -338,14 +338,11 @@ static void rcu_tasks_wait_gp(struct rcu_tasks *rtp) if (fract > HZ) fract = HZ; - for (;;) { + while (!list_empty(&holdouts)) { bool firstreport; bool needreport; int rtst; - if (list_empty(&holdouts)) - break; - /* Slowly back off waiting for holdouts */ set_tasks_gp_state(rtp, RTGS_WAIT_SCAN_HOLDOUTS); schedule_timeout_idle(HZ/fract); -- cgit v1.2.3 From 27c0f1448389baf7f309b69e62d4b531c9395e88 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 15 Sep 2020 17:08:03 -0700 Subject: rcutorture: Make grace-period kthread report match RCU flavor being tested At the end of the test and after rcu_torture_writer() stalls, rcutorture invokes show_rcu_gp_kthreads() in order to dump out information on the RCU grace-period kthread. This makes a lot of sense when testing vanilla RCU, but not so much for the other flavors.
This commit therefore allows per-flavor kthread-dump functions to be specified. [ paulmck: Apply feedback from kernel test robot . ] Signed-off-by: Paul E. McKenney --- kernel/rcu/rcu.h | 16 ++++++++++++++++ kernel/rcu/rcutorture.c | 11 +++++++++-- kernel/rcu/tasks.h | 30 ++++++++++++++---------------- 3 files changed, 39 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index e01cba5e4b52..59ef1ae6dc37 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -533,4 +533,20 @@ static inline bool rcu_is_nocb_cpu(int cpu) { return false; } static inline void rcu_bind_current_to_nocb(void) { } #endif +#if !defined(CONFIG_TINY_RCU) && defined(CONFIG_TASKS_RCU) +void show_rcu_tasks_classic_gp_kthread(void); +#else +static inline void show_rcu_tasks_classic_gp_kthread(void) {} +#endif +#if !defined(CONFIG_TINY_RCU) && defined(CONFIG_TASKS_RUDE_RCU) +void show_rcu_tasks_rude_gp_kthread(void); +#else +static inline void show_rcu_tasks_rude_gp_kthread(void) {} +#endif +#if !defined(CONFIG_TINY_RCU) && defined(CONFIG_TASKS_TRACE_RCU) +void show_rcu_tasks_trace_gp_kthread(void); +#else +static inline void show_rcu_tasks_trace_gp_kthread(void) {} +#endif + #endif /* __LINUX_RCU_H */ diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 916ea4f66e4b..c811f23692bf 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -317,6 +317,7 @@ struct rcu_torture_ops { void (*cb_barrier)(void); void (*fqs)(void); void (*stats)(void); + void (*gp_kthread_dbg)(void); int (*stall_dur)(void); int irq_capable; int can_boost; @@ -466,6 +467,7 @@ static struct rcu_torture_ops rcu_ops = { .cb_barrier = rcu_barrier, .fqs = rcu_force_quiescent_state, .stats = NULL, + .gp_kthread_dbg = show_rcu_gp_kthreads, .stall_dur = rcu_jiffies_till_stall_check, .irq_capable = 1, .can_boost = rcu_can_boost(), @@ -693,6 +695,7 @@ static struct rcu_torture_ops tasks_ops = { .exp_sync = synchronize_rcu_mult_test, .call = call_rcu_tasks, .cb_barrier = rcu_barrier_tasks, + .gp_kthread_dbg = show_rcu_tasks_classic_gp_kthread, .fqs = NULL, .stats = NULL, .irq_capable = 1, @@ -762,6 +765,7 @@ static struct rcu_torture_ops tasks_rude_ops = { .exp_sync = synchronize_rcu_tasks_rude, .call = call_rcu_tasks_rude, .cb_barrier = rcu_barrier_tasks_rude, + .gp_kthread_dbg = show_rcu_tasks_rude_gp_kthread, .fqs = NULL, .stats = NULL, .irq_capable = 1, @@ -800,6 +804,7 @@ static struct rcu_torture_ops tasks_tracing_ops = { .exp_sync = synchronize_rcu_tasks_trace, .call = call_rcu_tasks_trace, .cb_barrier = rcu_barrier_tasks_trace, + .gp_kthread_dbg = show_rcu_tasks_trace_gp_kthread, .fqs = NULL, .stats = NULL, .irq_capable = 1, @@ -1594,7 +1599,8 @@ rcu_torture_stats_print(void) sched_show_task(wtp); splatted = true; } - show_rcu_gp_kthreads(); + if (cur_ops->gp_kthread_dbg) + cur_ops->gp_kthread_dbg(); rcu_ftrace_dump(DUMP_ALL); } rtcv_snap = rcu_torture_current_version; @@ -2472,7 +2478,8 @@ rcu_torture_cleanup(void) return; } - show_rcu_gp_kthreads(); + if (cur_ops->gp_kthread_dbg) + cur_ops->gp_kthread_dbg(); rcu_torture_read_exit_cleanup(); rcu_torture_barrier_cleanup(); rcu_torture_fwd_prog_cleanup(); diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index a93271fc2572..0b459890fdcc 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -290,7 +290,7 @@ static void show_rcu_tasks_generic_gp_kthread(struct rcu_tasks *rtp, char *s) ".C"[!!data_race(rtp->cbs_head)], s); } -#endif /* #ifndef CONFIG_TINY_RCU */ +#endif // #ifndef CONFIG_TINY_RCU static void 
exit_tasks_rcu_finish_trace(struct task_struct *t); @@ -568,12 +568,13 @@ static int __init rcu_spawn_tasks_kthread(void) } core_initcall(rcu_spawn_tasks_kthread); -#ifndef CONFIG_TINY_RCU -static void show_rcu_tasks_classic_gp_kthread(void) +#if !defined(CONFIG_TINY_RCU) +void show_rcu_tasks_classic_gp_kthread(void) { show_rcu_tasks_generic_gp_kthread(&rcu_tasks, ""); } -#endif /* #ifndef CONFIG_TINY_RCU */ +EXPORT_SYMBOL_GPL(show_rcu_tasks_classic_gp_kthread); +#endif // !defined(CONFIG_TINY_RCU) /* Do the srcu_read_lock() for the above synchronize_srcu(). */ void exit_tasks_rcu_start(void) __acquires(&tasks_rcu_exit_srcu) @@ -595,7 +596,6 @@ void exit_tasks_rcu_finish(void) __releases(&tasks_rcu_exit_srcu) } #else /* #ifdef CONFIG_TASKS_RCU */ -static inline void show_rcu_tasks_classic_gp_kthread(void) { } void exit_tasks_rcu_start(void) { } void exit_tasks_rcu_finish(void) { exit_tasks_rcu_finish_trace(current); } #endif /* #else #ifdef CONFIG_TASKS_RCU */ @@ -696,16 +696,14 @@ static int __init rcu_spawn_tasks_rude_kthread(void) } core_initcall(rcu_spawn_tasks_rude_kthread); -#ifndef CONFIG_TINY_RCU -static void show_rcu_tasks_rude_gp_kthread(void) +#if !defined(CONFIG_TINY_RCU) +void show_rcu_tasks_rude_gp_kthread(void) { show_rcu_tasks_generic_gp_kthread(&rcu_tasks_rude, ""); } -#endif /* #ifndef CONFIG_TINY_RCU */ - -#else /* #ifdef CONFIG_TASKS_RUDE_RCU */ -static void show_rcu_tasks_rude_gp_kthread(void) {} -#endif /* #else #ifdef CONFIG_TASKS_RUDE_RCU */ +EXPORT_SYMBOL_GPL(show_rcu_tasks_rude_gp_kthread); +#endif // !defined(CONFIG_TINY_RCU) +#endif /* #ifdef CONFIG_TASKS_RUDE_RCU */ //////////////////////////////////////////////////////////////////////// // @@ -1199,8 +1197,8 @@ static int __init rcu_spawn_tasks_trace_kthread(void) } core_initcall(rcu_spawn_tasks_trace_kthread); -#ifndef CONFIG_TINY_RCU -static void show_rcu_tasks_trace_gp_kthread(void) +#if !defined(CONFIG_TINY_RCU) +void show_rcu_tasks_trace_gp_kthread(void) { char buf[64]; @@ -1210,11 +1208,11 @@ static void show_rcu_tasks_trace_gp_kthread(void) data_race(n_heavy_reader_attempts)); show_rcu_tasks_generic_gp_kthread(&rcu_tasks_trace, buf); } -#endif /* #ifndef CONFIG_TINY_RCU */ +EXPORT_SYMBOL_GPL(show_rcu_tasks_trace_gp_kthread); +#endif // !defined(CONFIG_TINY_RCU) #else /* #ifdef CONFIG_TASKS_TRACE_RCU */ static void exit_tasks_rcu_finish_trace(struct task_struct *t) { } -static inline void show_rcu_tasks_trace_gp_kthread(void) {} #endif /* #else #ifdef CONFIG_TASKS_TRACE_RCU */ #ifndef CONFIG_TINY_RCU -- cgit v1.2.3 From 0c6d18d84db11840dd0f3f65750c6ea0bb6b8e0d Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 27 Aug 2020 09:58:19 -0700 Subject: refscale: Bounds-check module parameters The default value for refscale.nreaders is -1, which results in the code setting the value to three-quarters of the number of CPUs. On single-CPU systems, this results in three-quarters of the value one, which the C language's integer arithmetic rounds to zero. This in turn results in a divide-by-zero error. This commit therefore adds bounds checking to the refscale module parameters, so that if they are less than one, they are set to the value one. Reported-by: kernel test robot Tested-by "Chen, Rong A" Signed-off-by: Paul E. 
McKenney --- kernel/rcu/refscale.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/refscale.c b/kernel/rcu/refscale.c index 952595c678b3..fb5f20d9486a 100644 --- a/kernel/rcu/refscale.c +++ b/kernel/rcu/refscale.c @@ -681,6 +681,12 @@ ref_scale_init(void) // Reader tasks (default to ~75% of online CPUs). if (nreaders < 0) nreaders = (num_online_cpus() >> 1) + (num_online_cpus() >> 2); + if (WARN_ONCE(loops <= 0, "%s: loops = %ld, adjusted to 1\n", __func__, loops)) + loops = 1; + if (WARN_ONCE(nreaders <= 0, "%s: nreaders = %d, adjusted to 1\n", __func__, nreaders)) + nreaders = 1; + if (WARN_ONCE(nruns <= 0, "%s: nruns = %d, adjusted to 1\n", __func__, nruns)) + nruns = 1; reader_tasks = kcalloc(nreaders, sizeof(reader_tasks[0]), GFP_KERNEL); if (!reader_tasks) { -- cgit v1.2.3 From b08e84da205023009c456bd7f33feb83c5191c60 Mon Sep 17 00:00:00 2001 From: Aleksandr Nogikh Date: Thu, 29 Oct 2020 17:36:18 +0000 Subject: kernel: make kcov_common_handle consider the current context kcov_common_handle is a method that is used to obtain a "default" KCOV remote handle of the current process. The handle can later be passed to kcov_remote_start in order to collect coverage for the processing that is initiated by one process, but done in another. For details see Documentation/dev-tools/kcov.rst and comments in kernel/kcov.c. Presently, if kcov_common_handle is called in an IRQ context, it will return a handle for the interrupted process. This may lead to unreliable and incorrect coverage collection. Adjust the behavior of kcov_common_handle in the following way. If it is called in a task context, return the common handle for the currently running task. Otherwise, return 0. Signed-off-by: Aleksandr Nogikh Reviewed-by: Andrey Konovalov Signed-off-by: Jakub Kicinski --- kernel/kcov.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/kcov.c b/kernel/kcov.c index 6b8368be89c8..80bfe71bbe13 100644 --- a/kernel/kcov.c +++ b/kernel/kcov.c @@ -1023,6 +1023,8 @@ EXPORT_SYMBOL(kcov_remote_stop); /* See the comment before kcov_remote_start() for usage details. */ u64 kcov_common_handle(void) { + if (!in_task()) + return 0; return current->kcov_handle; } EXPORT_SYMBOL(kcov_common_handle); -- cgit v1.2.3 From 705e9195187d85249fbb0eaa844b1604a98fbc9a Mon Sep 17 00:00:00 2001 From: Sergey Shtylyov Date: Sat, 31 Oct 2020 23:06:45 +0300 Subject: module: merge repetitive strings in module_sig_check() The 'reason' variable in module_sig_check() points to 3 strings across the *switch* statement, all needlessly starting with the same text. Let's put the starting text into the pr_notice() call -- it saves 21 bytes of the object code (x86 gcc 10.2.1). Suggested-by: Joe Perches Reviewed-by: Miroslav Benes Signed-off-by: Sergey Shtylyov Signed-off-by: Jessica Yu --- kernel/module.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index b34235082394..0e54d58babac 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2907,16 +2907,17 @@ static int module_sig_check(struct load_info *info, int flags) * enforcing, certain errors are non-fatal. 
*/ case -ENODATA: - reason = "Loading of unsigned module"; + reason = "unsigned module"; goto decide; case -ENOPKG: - reason = "Loading of module with unsupported crypto"; + reason = "module with unsupported crypto"; goto decide; case -ENOKEY: - reason = "Loading of module with unavailable key"; + reason = "module with unavailable key"; decide: if (is_module_sig_enforced()) { - pr_notice("%s: %s is rejected\n", info->name, reason); + pr_notice("%s: loading of %s is rejected\n", + info->name, reason); return -EKEYREJECTED; } -- cgit v1.2.3 From 10ccd1abb808599a6dc7c9389560016ea3568085 Mon Sep 17 00:00:00 2001 From: Sergey Shtylyov Date: Sat, 31 Oct 2020 23:09:31 +0300 Subject: module: avoid *goto*s in module_sig_check() Let's move the common handling of the non-fatal errors after the *switch* statement -- this avoids *goto*s inside that *switch*... Suggested-by: Joe Perches Reviewed-by: Miroslav Benes Signed-off-by: Sergey Shtylyov Signed-off-by: Jessica Yu --- kernel/module.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 0e54d58babac..02b87bc84a42 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2908,20 +2908,13 @@ static int module_sig_check(struct load_info *info, int flags) */ case -ENODATA: reason = "unsigned module"; - goto decide; + break; case -ENOPKG: reason = "module with unsupported crypto"; - goto decide; + break; case -ENOKEY: reason = "module with unavailable key"; - decide: - if (is_module_sig_enforced()) { - pr_notice("%s: loading of %s is rejected\n", - info->name, reason); - return -EKEYREJECTED; - } - - return security_locked_down(LOCKDOWN_MODULE_SIGNATURE); + break; /* All other errors are fatal, including nomem, unparseable * signatures and signature check failures - even if signatures @@ -2930,6 +2923,13 @@ static int module_sig_check(struct load_info *info, int flags) default: return err; } + + if (is_module_sig_enforced()) { + pr_notice("%s: loading of %s is rejected\n", info->name, reason); + return -EKEYREJECTED; + } + + return security_locked_down(LOCKDOWN_MODULE_SIGNATURE); } #else /* !CONFIG_MODULE_SIG */ static int module_sig_check(struct load_info *info, int flags) -- cgit v1.2.3 From 076aa52e402185e1e347bf5c62c61c6388fce4c7 Mon Sep 17 00:00:00 2001 From: Sergey Shtylyov Date: Sat, 31 Oct 2020 23:10:28 +0300 Subject: module: only handle errors with the *switch* statement in module_sig_check() Let's handle the successful call of mod_verify_sig() right after that call, making the *switch* statement only handle the real errors, and then move the comment from the first *case* before *switch* itself and the comment before *default* after it. Fix the comment style, add article/comma/dash, spell out "nomem" as "lack of memory" in these comments, while at it... 
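Taken together, the three changes above leave the function in roughly the shape sketched below. This is a simplified user-space stand-in, not the kernel function itself: the err argument plays the role of the mod_verify_sig() result, sig_enforced replaces is_module_sig_enforced(), and the real function additionally consults lockdown when not enforcing.

	#include <errno.h>
	#include <stdbool.h>
	#include <stdio.h>

	static bool sig_enforced = true;	/* stand-in for is_module_sig_enforced() */

	static int sig_check_sketch(int err, const char *name)
	{
		const char *reason;

		if (!err)
			return 0;		/* signature verified, nothing to decide */

		switch (err) {
		case -ENODATA:
			reason = "unsigned module";
			break;
		case -ENOPKG:
			reason = "module with unsupported crypto";
			break;
		case -ENOKEY:
			reason = "module with unavailable key";
			break;
		default:
			/* lack of memory, unparseable signatures, ... stay fatal */
			return err;
		}

		if (sig_enforced) {
			fprintf(stderr, "%s: loading of %s is rejected\n", name, reason);
			return -EKEYREJECTED;	/* Linux-specific errno value */
		}

		return 0;	/* tolerated here; the kernel checks lockdown instead */
	}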
Suggested-by: Joe Perches Reviewed-by: Miroslav Benes Signed-off-by: Sergey Shtylyov Signed-off-by: Jessica Yu --- kernel/module.c | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 02b87bc84a42..948d4bbbceb5 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2895,17 +2895,18 @@ static int module_sig_check(struct load_info *info, int flags) /* We truncate the module to discard the signature */ info->len -= markerlen; err = mod_verify_sig(mod, info); + if (!err) { + info->sig_ok = true; + return 0; + } } + /* + * We don't permit modules to be loaded into the trusted kernels + * without a valid signature on them, but if we're not enforcing, + * certain errors are non-fatal. + */ switch (err) { - case 0: - info->sig_ok = true; - return 0; - - /* We don't permit modules to be loaded into trusted kernels - * without a valid signature on them, but if we're not - * enforcing, certain errors are non-fatal. - */ case -ENODATA: reason = "unsigned module"; break; @@ -2916,11 +2917,12 @@ static int module_sig_check(struct load_info *info, int flags) reason = "module with unavailable key"; break; - /* All other errors are fatal, including nomem, unparseable - * signatures and signature check failures - even if signatures - * aren't required. - */ default: + /* + * All other errors are fatal, including lack of memory, + * unparseable signatures, and signature check failures -- + * even if signatures aren't required. + */ return err; } -- cgit v1.2.3 From 645f224e7ba2f4200bf163153d384ceb0de5462e Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Mon, 2 Nov 2020 09:17:49 -0500 Subject: kprobes: Tell lockdep about kprobe nesting Since the kprobe handlers have protection that prohibits other handlers from executing in other contexts (like if an NMI comes in while processing a kprobe, and executes the same kprobe, it will get fail with a "busy" return). Lockdep is unaware of this protection. Use lockdep's nesting api to differentiate between locks taken in INT3 context and other context to suppress the false warnings. Link: https://lore.kernel.org/r/20201102160234.fa0ae70915ad9e2b21c08b85@kernel.org Cc: Peter Zijlstra Acked-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/kprobes.c | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 8a12a25fa40d..41fdbb7953c6 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -1249,7 +1249,13 @@ __acquires(hlist_lock) *head = &kretprobe_inst_table[hash]; hlist_lock = kretprobe_table_lock_ptr(hash); - raw_spin_lock_irqsave(hlist_lock, *flags); + /* + * Nested is a workaround that will soon not be needed. + * There's other protections that make sure the same lock + * is not taken on the same CPU that lockdep is unaware of. + * Differentiate when it is taken in NMI context. + */ + raw_spin_lock_irqsave_nested(hlist_lock, *flags, !!in_nmi()); } NOKPROBE_SYMBOL(kretprobe_hash_lock); @@ -1258,7 +1264,13 @@ static void kretprobe_table_lock(unsigned long hash, __acquires(hlist_lock) { raw_spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash); - raw_spin_lock_irqsave(hlist_lock, *flags); + /* + * Nested is a workaround that will soon not be needed. + * There's other protections that make sure the same lock + * is not taken on the same CPU that lockdep is unaware of. + * Differentiate when it is taken in NMI context. 
+ */ + raw_spin_lock_irqsave_nested(hlist_lock, *flags, !!in_nmi()); } NOKPROBE_SYMBOL(kretprobe_table_lock); @@ -2028,7 +2040,12 @@ static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs) /* TODO: consider to only swap the RA after the last pre_handler fired */ hash = hash_ptr(current, KPROBE_HASH_BITS); - raw_spin_lock_irqsave(&rp->lock, flags); + /* + * Nested is a workaround that will soon not be needed. + * There's other protections that make sure the same lock + * is not taken on the same CPU that lockdep is unaware of. + */ + raw_spin_lock_irqsave_nested(&rp->lock, flags, 1); if (!hlist_empty(&rp->free_instances)) { ri = hlist_entry(rp->free_instances.first, struct kretprobe_instance, hlist); @@ -2039,7 +2056,7 @@ static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs) ri->task = current; if (rp->entry_handler && rp->entry_handler(ri, regs)) { - raw_spin_lock_irqsave(&rp->lock, flags); + raw_spin_lock_irqsave_nested(&rp->lock, flags, 1); hlist_add_head(&ri->hlist, &rp->free_instances); raw_spin_unlock_irqrestore(&rp->lock, flags); return 0; -- cgit v1.2.3 From 9d820f68b2bdba5b2e7bf135123c3f57c5051d05 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 4 Nov 2020 14:06:23 +0100 Subject: entry: Fix the incorrect ordering of lockdep and RCU check When an exception/interrupt hits kernel space and the kernel is not currently in the idle task then RCU must be watching. irqentry_enter() validates this via rcu_irq_enter_check_tick(), which in turn invokes lockdep when taking a lock. But at that point lockdep does not yet know about the fact that interrupts have been disabled by the CPU, which triggers a lockdep splat complaining about inconsistent state. Invoking trace_hardirqs_off() before rcu_irq_enter_check_tick() defeats the point of rcu_irq_enter_check_tick() because trace_hardirqs_off() uses RCU. So use the same sequence as for the idle case and tell lockdep about the irq state change first, invoke the RCU check and then do the lockdep and tracer update. Fixes: a5497bab5f72 ("entry: Provide generic interrupt entry/exit code") Reported-by: Mark Rutland Signed-off-by: Thomas Gleixner Tested-by: Mark Rutland Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/87y2jhl19s.fsf@nanos.tec.linutronix.de --- kernel/entry/common.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/entry/common.c b/kernel/entry/common.c index 2b8366693d5c..e9e2df3f3f9e 100644 --- a/kernel/entry/common.c +++ b/kernel/entry/common.c @@ -337,10 +337,10 @@ noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs) * already contains a warning when RCU is not watching, so no point * in having another one here. */ + lockdep_hardirqs_off(CALLER_ADDR0); instrumentation_begin(); rcu_irq_enter_check_tick(); - /* Use the combo lockdep/tracing function */ - trace_hardirqs_off(); + trace_hardirqs_off_finish(); instrumentation_end(); return ret; -- cgit v1.2.3 From b6be002bcd1dd1dedb926abf3c90c794eacb77dc Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 2 Nov 2020 12:53:16 -0800 Subject: x86/entry: Move nmi entry/exit into common code Lockdep state handling on NMI enter and exit is nothing specific to X86. It's not any different on other architectures. Also the extra state type is not necessary, irqentry_state_t can carry the necessary information as well. Move it to common code and extend irqentry_state_t to carry lockdep state. 
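After the move, an architecture's NMI-like exception handler brackets its work with the generic helpers roughly as in the sketch below, modeled on the exc_nmi() hunk further down and using x86's DEFINE_IDTENTRY_RAW() for concreteness; exc_arch_nmi() and handle_arch_nmi() are placeholders, not real entry points.

	/* Usage sketch of the generic NMI entry/exit helpers. */
	DEFINE_IDTENTRY_RAW(exc_arch_nmi)
	{
		irqentry_state_t irq_state;

		irq_state = irqentry_nmi_enter(regs);	/* lockdep/RCU/ftrace NMI entry */

		instrumentation_begin();
		handle_arch_nmi(regs);			/* placeholder for the real handling */
		instrumentation_end();

		irqentry_nmi_exit(regs, irq_state);	/* restores the saved lockdep state */
	}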
[ Ira: Make exit_rcu and lockdep a union as they are mutually exclusive between the IRQ and NMI exceptions, and add kernel documentation for struct irqentry_state_t ] Signed-off-by: Thomas Gleixner Signed-off-by: Ira Weiny Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20201102205320.1458656-7-ira.weiny@intel.com --- arch/x86/entry/common.c | 34 ---------------------------------- arch/x86/include/asm/idtentry.h | 3 --- arch/x86/kernel/cpu/mce/core.c | 6 +++--- arch/x86/kernel/nmi.c | 6 +++--- arch/x86/kernel/traps.c | 13 +++++++------ include/linux/entry-common.h | 39 ++++++++++++++++++++++++++++++++++++++- kernel/entry/common.c | 36 ++++++++++++++++++++++++++++++++++++ 7 files changed, 87 insertions(+), 50 deletions(-) (limited to 'kernel') diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index 870efeec8bda..18d8f17f755c 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -209,40 +209,6 @@ SYSCALL_DEFINE0(ni_syscall) return -ENOSYS; } -noinstr bool idtentry_enter_nmi(struct pt_regs *regs) -{ - bool irq_state = lockdep_hardirqs_enabled(); - - __nmi_enter(); - lockdep_hardirqs_off(CALLER_ADDR0); - lockdep_hardirq_enter(); - rcu_nmi_enter(); - - instrumentation_begin(); - trace_hardirqs_off_finish(); - ftrace_nmi_enter(); - instrumentation_end(); - - return irq_state; -} - -noinstr void idtentry_exit_nmi(struct pt_regs *regs, bool restore) -{ - instrumentation_begin(); - ftrace_nmi_exit(); - if (restore) { - trace_hardirqs_on_prepare(); - lockdep_hardirqs_on_prepare(CALLER_ADDR0); - } - instrumentation_end(); - - rcu_nmi_exit(); - lockdep_hardirq_exit(); - if (restore) - lockdep_hardirqs_on(CALLER_ADDR0); - __nmi_exit(); -} - #ifdef CONFIG_XEN_PV #ifndef CONFIG_PREEMPTION /* diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h index b2442eb0ac2f..247a60a47331 100644 --- a/arch/x86/include/asm/idtentry.h +++ b/arch/x86/include/asm/idtentry.h @@ -11,9 +11,6 @@ #include -bool idtentry_enter_nmi(struct pt_regs *regs); -void idtentry_exit_nmi(struct pt_regs *regs, bool irq_state); - /** * DECLARE_IDTENTRY - Declare functions for simple IDT entry points * No error code pushed by hardware diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index 4102b866e7c0..f5c860b1a50b 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -1983,7 +1983,7 @@ void (*machine_check_vector)(struct pt_regs *) = unexpected_machine_check; static __always_inline void exc_machine_check_kernel(struct pt_regs *regs) { - bool irq_state; + irqentry_state_t irq_state; WARN_ON_ONCE(user_mode(regs)); @@ -1995,7 +1995,7 @@ static __always_inline void exc_machine_check_kernel(struct pt_regs *regs) mce_check_crashing_cpu()) return; - irq_state = idtentry_enter_nmi(regs); + irq_state = irqentry_nmi_enter(regs); /* * The call targets are marked noinstr, but objtool can't figure * that out because it's an indirect call. Annotate it. 
@@ -2006,7 +2006,7 @@ static __always_inline void exc_machine_check_kernel(struct pt_regs *regs) if (regs->flags & X86_EFLAGS_IF) trace_hardirqs_on_prepare(); instrumentation_end(); - idtentry_exit_nmi(regs, irq_state); + irqentry_nmi_exit(regs, irq_state); } static __always_inline void exc_machine_check_user(struct pt_regs *regs) diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index 4bc77aaf1303..bf250a339655 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -475,7 +475,7 @@ static DEFINE_PER_CPU(unsigned long, nmi_dr7); DEFINE_IDTENTRY_RAW(exc_nmi) { - bool irq_state; + irqentry_state_t irq_state; /* * Re-enable NMIs right here when running as an SEV-ES guest. This might @@ -502,14 +502,14 @@ nmi_restart: this_cpu_write(nmi_dr7, local_db_save()); - irq_state = idtentry_enter_nmi(regs); + irq_state = irqentry_nmi_enter(regs); inc_irq_stat(__nmi_count); if (!ignore_nmis) default_do_nmi(regs); - idtentry_exit_nmi(regs, irq_state); + irqentry_nmi_exit(regs, irq_state); local_db_restore(this_cpu_read(nmi_dr7)); diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index e19df6cde35d..e1b78829d909 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -405,7 +405,7 @@ DEFINE_IDTENTRY_DF(exc_double_fault) } #endif - idtentry_enter_nmi(regs); + irqentry_nmi_enter(regs); instrumentation_begin(); notify_die(DIE_TRAP, str, regs, error_code, X86_TRAP_DF, SIGSEGV); @@ -651,12 +651,13 @@ DEFINE_IDTENTRY_RAW(exc_int3) instrumentation_end(); irqentry_exit_to_user_mode(regs); } else { - bool irq_state = idtentry_enter_nmi(regs); + irqentry_state_t irq_state = irqentry_nmi_enter(regs); + instrumentation_begin(); if (!do_int3(regs)) die("int3", regs, 0); instrumentation_end(); - idtentry_exit_nmi(regs, irq_state); + irqentry_nmi_exit(regs, irq_state); } } @@ -851,7 +852,7 @@ static __always_inline void exc_debug_kernel(struct pt_regs *regs, * includes the entry stack is excluded for everything. */ unsigned long dr7 = local_db_save(); - bool irq_state = idtentry_enter_nmi(regs); + irqentry_state_t irq_state = irqentry_nmi_enter(regs); instrumentation_begin(); /* @@ -908,7 +909,7 @@ static __always_inline void exc_debug_kernel(struct pt_regs *regs, regs->flags &= ~X86_EFLAGS_TF; out: instrumentation_end(); - idtentry_exit_nmi(regs, irq_state); + irqentry_nmi_exit(regs, irq_state); local_db_restore(dr7); } @@ -926,7 +927,7 @@ static __always_inline void exc_debug_user(struct pt_regs *regs, /* * NB: We can't easily clear DR7 here because - * idtentry_exit_to_usermode() can invoke ptrace, schedule, access + * irqentry_exit_to_usermode() can invoke ptrace, schedule, access * user memory, etc. This means that a recursive #DB is possible. If * this happens, that #DB will hit exc_debug_kernel() and clear DR7. * Since we're not on the IST stack right now, everything will be diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h index b9711e813ec2..1a128baf3628 100644 --- a/include/linux/entry-common.h +++ b/include/linux/entry-common.h @@ -346,8 +346,26 @@ void irqentry_enter_from_user_mode(struct pt_regs *regs); void irqentry_exit_to_user_mode(struct pt_regs *regs); #ifndef irqentry_state +/** + * struct irqentry_state - Opaque object for exception state storage + * @exit_rcu: Used exclusively in the irqentry_*() calls; signals whether the + * exit path has to invoke rcu_irq_exit(). + * @lockdep: Used exclusively in the irqentry_nmi_*() calls; ensures that + * lockdep state is restored correctly on exit from nmi. 
+ * + * This opaque object is filled in by the irqentry_*_enter() functions and + * must be passed back into the corresponding irqentry_*_exit() functions + * when the exception is complete. + * + * Callers of irqentry_*_[enter|exit]() must consider this structure opaque + * and all members private. Descriptions of the members are provided to aid in + * the maintenance of the irqentry_*() functions. + */ typedef struct irqentry_state { - bool exit_rcu; + union { + bool exit_rcu; + bool lockdep; + }; } irqentry_state_t; #endif @@ -407,4 +425,23 @@ void irqentry_exit_cond_resched(void); */ void noinstr irqentry_exit(struct pt_regs *regs, irqentry_state_t state); +/** + * irqentry_nmi_enter - Handle NMI entry + * @regs: Pointer to currents pt_regs + * + * Similar to irqentry_enter() but taking care of the NMI constraints. + */ +irqentry_state_t noinstr irqentry_nmi_enter(struct pt_regs *regs); + +/** + * irqentry_nmi_exit - Handle return from NMI handling + * @regs: Pointer to pt_regs (NMI entry regs) + * @irq_state: Return value from matching call to irqentry_nmi_enter() + * + * Last action before returning to the low level assmenbly code. + * + * Counterpart to irqentry_nmi_enter(). + */ +void noinstr irqentry_nmi_exit(struct pt_regs *regs, irqentry_state_t irq_state); + #endif diff --git a/kernel/entry/common.c b/kernel/entry/common.c index 3a1dfecc533e..bc75c114c1b3 100644 --- a/kernel/entry/common.c +++ b/kernel/entry/common.c @@ -405,3 +405,39 @@ noinstr void irqentry_exit(struct pt_regs *regs, irqentry_state_t state) rcu_irq_exit(); } } + +irqentry_state_t noinstr irqentry_nmi_enter(struct pt_regs *regs) +{ + irqentry_state_t irq_state; + + irq_state.lockdep = lockdep_hardirqs_enabled(); + + __nmi_enter(); + lockdep_hardirqs_off(CALLER_ADDR0); + lockdep_hardirq_enter(); + rcu_nmi_enter(); + + instrumentation_begin(); + trace_hardirqs_off_finish(); + ftrace_nmi_enter(); + instrumentation_end(); + + return irq_state; +} + +void noinstr irqentry_nmi_exit(struct pt_regs *regs, irqentry_state_t irq_state) +{ + instrumentation_begin(); + ftrace_nmi_exit(); + if (irq_state.lockdep) { + trace_hardirqs_on_prepare(); + lockdep_hardirqs_on_prepare(CALLER_ADDR0); + } + instrumentation_end(); + + rcu_nmi_exit(); + lockdep_hardirq_exit(); + if (irq_state.lockdep) + lockdep_hardirqs_on(CALLER_ADDR0); + __nmi_exit(); +} -- cgit v1.2.3 From 7c0afcad7507636529e6a5a2a5eef5482619a449 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Thu, 5 Nov 2020 11:51:09 -0800 Subject: bpf: BPF_PRELOAD depends on BPF_SYSCALL Fix build error when BPF_SYSCALL is not set/enabled but BPF_PRELOAD is by making BPF_PRELOAD depend on BPF_SYSCALL. ERROR: modpost: "bpf_preload_ops" [kernel/bpf/preload/bpf_preload.ko] undefined! 
Reported-by: kernel test robot Reported-by: Randy Dunlap Signed-off-by: Randy Dunlap Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20201105195109.26232-1-rdunlap@infradead.org --- kernel/bpf/preload/Kconfig | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/bpf/preload/Kconfig b/kernel/bpf/preload/Kconfig index ace49111d3a3..26bced262473 100644 --- a/kernel/bpf/preload/Kconfig +++ b/kernel/bpf/preload/Kconfig @@ -6,6 +6,7 @@ config USERMODE_DRIVER menuconfig BPF_PRELOAD bool "Preload BPF file system with kernel specific program and map iterators" depends on BPF + depends on BPF_SYSCALL # The dependency on !COMPILE_TEST prevents it from being enabled # in allmodconfig or allyesconfig configurations depends on !COMPILE_TEST -- cgit v1.2.3 From d3bec0138bfbe58606fc1d6f57a4cdc1a20218db Mon Sep 17 00:00:00 2001 From: David Verbeiren Date: Wed, 4 Nov 2020 12:23:32 +0100 Subject: bpf: Zero-fill re-used per-cpu map element Zero-fill element values for all other cpus than current, just as when not using prealloc. This is the only way the bpf program can ensure known initial values for all cpus ('onallcpus' cannot be set when coming from the bpf program). The scenario is: bpf program inserts some elements in a per-cpu map, then deletes some (or userspace does). When later adding new elements using bpf_map_update_elem(), the bpf program can only set the value of the new elements for the current cpu. When prealloc is enabled, previously deleted elements are re-used. Without the fix, values for other cpus remain whatever they were when the re-used entry was previously freed. A selftest is added to validate correct operation in above scenario as well as in case of LRU per-cpu map element re-use. Fixes: 6c9059817432 ("bpf: pre-allocate hash map elements") Signed-off-by: David Verbeiren Signed-off-by: Alexei Starovoitov Acked-by: Matthieu Baerts Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20201104112332.15191-1-david.verbeiren@tessares.net --- kernel/bpf/hashtab.c | 30 ++- tools/testing/selftests/bpf/prog_tests/map_init.c | 214 ++++++++++++++++++++++ tools/testing/selftests/bpf/progs/test_map_init.c | 33 ++++ 3 files changed, 275 insertions(+), 2 deletions(-) create mode 100644 tools/testing/selftests/bpf/prog_tests/map_init.c create mode 100644 tools/testing/selftests/bpf/progs/test_map_init.c (limited to 'kernel') diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 1815e97d4c9c..1fccba6e88c4 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -821,6 +821,32 @@ static void pcpu_copy_value(struct bpf_htab *htab, void __percpu *pptr, } } +static void pcpu_init_value(struct bpf_htab *htab, void __percpu *pptr, + void *value, bool onallcpus) +{ + /* When using prealloc and not setting the initial value on all cpus, + * zero-fill element values for other cpus (just as what happens when + * not using prealloc). Otherwise, bpf program has no way to ensure + * known initial values for cpus other than current one + * (onallcpus=false always when coming from bpf prog). 
+ */ + if (htab_is_prealloc(htab) && !onallcpus) { + u32 size = round_up(htab->map.value_size, 8); + int current_cpu = raw_smp_processor_id(); + int cpu; + + for_each_possible_cpu(cpu) { + if (cpu == current_cpu) + bpf_long_memcpy(per_cpu_ptr(pptr, cpu), value, + size); + else + memset(per_cpu_ptr(pptr, cpu), 0, size); + } + } else { + pcpu_copy_value(htab, pptr, value, onallcpus); + } +} + static bool fd_htab_map_needs_adjust(const struct bpf_htab *htab) { return htab->map.map_type == BPF_MAP_TYPE_HASH_OF_MAPS && @@ -891,7 +917,7 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, } } - pcpu_copy_value(htab, pptr, value, onallcpus); + pcpu_init_value(htab, pptr, value, onallcpus); if (!prealloc) htab_elem_set_ptr(l_new, key_size, pptr); @@ -1183,7 +1209,7 @@ static int __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key, pcpu_copy_value(htab, htab_elem_get_ptr(l_old, key_size), value, onallcpus); } else { - pcpu_copy_value(htab, htab_elem_get_ptr(l_new, key_size), + pcpu_init_value(htab, htab_elem_get_ptr(l_new, key_size), value, onallcpus); hlist_nulls_add_head_rcu(&l_new->hash_node, head); l_new = NULL; diff --git a/tools/testing/selftests/bpf/prog_tests/map_init.c b/tools/testing/selftests/bpf/prog_tests/map_init.c new file mode 100644 index 000000000000..14a31109dd0e --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/map_init.c @@ -0,0 +1,214 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (c) 2020 Tessares SA */ + +#include +#include "test_map_init.skel.h" + +#define TEST_VALUE 0x1234 +#define FILL_VALUE 0xdeadbeef + +static int nr_cpus; +static int duration; + +typedef unsigned long long map_key_t; +typedef unsigned long long map_value_t; +typedef struct { + map_value_t v; /* padding */ +} __bpf_percpu_val_align pcpu_map_value_t; + + +static int map_populate(int map_fd, int num) +{ + pcpu_map_value_t value[nr_cpus]; + int i, err; + map_key_t key; + + for (i = 0; i < nr_cpus; i++) + bpf_percpu(value, i) = FILL_VALUE; + + for (key = 1; key <= num; key++) { + err = bpf_map_update_elem(map_fd, &key, value, BPF_NOEXIST); + if (!ASSERT_OK(err, "bpf_map_update_elem")) + return -1; + } + + return 0; +} + +static struct test_map_init *setup(enum bpf_map_type map_type, int map_sz, + int *map_fd, int populate) +{ + struct test_map_init *skel; + int err; + + skel = test_map_init__open(); + if (!ASSERT_OK_PTR(skel, "skel_open")) + return NULL; + + err = bpf_map__set_type(skel->maps.hashmap1, map_type); + if (!ASSERT_OK(err, "bpf_map__set_type")) + goto error; + + err = bpf_map__set_max_entries(skel->maps.hashmap1, map_sz); + if (!ASSERT_OK(err, "bpf_map__set_max_entries")) + goto error; + + err = test_map_init__load(skel); + if (!ASSERT_OK(err, "skel_load")) + goto error; + + *map_fd = bpf_map__fd(skel->maps.hashmap1); + if (CHECK(*map_fd < 0, "bpf_map__fd", "failed\n")) + goto error; + + err = map_populate(*map_fd, populate); + if (!ASSERT_OK(err, "map_populate")) + goto error_map; + + return skel; + +error_map: + close(*map_fd); +error: + test_map_init__destroy(skel); + return NULL; +} + +/* executes bpf program that updates map with key, value */ +static int prog_run_insert_elem(struct test_map_init *skel, map_key_t key, + map_value_t value) +{ + struct test_map_init__bss *bss; + + bss = skel->bss; + + bss->inKey = key; + bss->inValue = value; + bss->inPid = getpid(); + + if (!ASSERT_OK(test_map_init__attach(skel), "skel_attach")) + return -1; + + /* Let tracepoint trigger */ + syscall(__NR_getpgid); + + test_map_init__detach(skel); + 
+ return 0; +} + +static int check_values_one_cpu(pcpu_map_value_t *value, map_value_t expected) +{ + int i, nzCnt = 0; + map_value_t val; + + for (i = 0; i < nr_cpus; i++) { + val = bpf_percpu(value, i); + if (val) { + if (CHECK(val != expected, "map value", + "unexpected for cpu %d: 0x%llx\n", i, val)) + return -1; + nzCnt++; + } + } + + if (CHECK(nzCnt != 1, "map value", "set for %d CPUs instead of 1!\n", + nzCnt)) + return -1; + + return 0; +} + +/* Add key=1 elem with values set for all CPUs + * Delete elem key=1 + * Run bpf prog that inserts new key=1 elem with value=0x1234 + * (bpf prog can only set value for current CPU) + * Lookup Key=1 and check value is as expected for all CPUs: + * value set by bpf prog for one CPU, 0 for all others + */ +static void test_pcpu_map_init(void) +{ + pcpu_map_value_t value[nr_cpus]; + struct test_map_init *skel; + int map_fd, err; + map_key_t key; + + /* max 1 elem in map so insertion is forced to reuse freed entry */ + skel = setup(BPF_MAP_TYPE_PERCPU_HASH, 1, &map_fd, 1); + if (!ASSERT_OK_PTR(skel, "prog_setup")) + return; + + /* delete element so the entry can be re-used*/ + key = 1; + err = bpf_map_delete_elem(map_fd, &key); + if (!ASSERT_OK(err, "bpf_map_delete_elem")) + goto cleanup; + + /* run bpf prog that inserts new elem, re-using the slot just freed */ + err = prog_run_insert_elem(skel, key, TEST_VALUE); + if (!ASSERT_OK(err, "prog_run_insert_elem")) + goto cleanup; + + /* check that key=1 was re-created by bpf prog */ + err = bpf_map_lookup_elem(map_fd, &key, value); + if (!ASSERT_OK(err, "bpf_map_lookup_elem")) + goto cleanup; + + /* and has expected values */ + check_values_one_cpu(value, TEST_VALUE); + +cleanup: + test_map_init__destroy(skel); +} + +/* Add key=1 and key=2 elems with values set for all CPUs + * Run bpf prog that inserts new key=3 elem + * (only for current cpu; other cpus should have initial value = 0) + * Lookup Key=1 and check value is as expected for all CPUs + */ +static void test_pcpu_lru_map_init(void) +{ + pcpu_map_value_t value[nr_cpus]; + struct test_map_init *skel; + int map_fd, err; + map_key_t key; + + /* Set up LRU map with 2 elements, values filled for all CPUs. 
+ * With these 2 elements, the LRU map is full + */ + skel = setup(BPF_MAP_TYPE_LRU_PERCPU_HASH, 2, &map_fd, 2); + if (!ASSERT_OK_PTR(skel, "prog_setup")) + return; + + /* run bpf prog that inserts new key=3 element, re-using LRU slot */ + key = 3; + err = prog_run_insert_elem(skel, key, TEST_VALUE); + if (!ASSERT_OK(err, "prog_run_insert_elem")) + goto cleanup; + + /* check that key=3 replaced one of earlier elements */ + err = bpf_map_lookup_elem(map_fd, &key, value); + if (!ASSERT_OK(err, "bpf_map_lookup_elem")) + goto cleanup; + + /* and has expected values */ + check_values_one_cpu(value, TEST_VALUE); + +cleanup: + test_map_init__destroy(skel); +} + +void test_map_init(void) +{ + nr_cpus = bpf_num_possible_cpus(); + if (nr_cpus <= 1) { + printf("%s:SKIP: >1 cpu needed for this test\n", __func__); + test__skip(); + return; + } + + if (test__start_subtest("pcpu_map_init")) + test_pcpu_map_init(); + if (test__start_subtest("pcpu_lru_map_init")) + test_pcpu_lru_map_init(); +} diff --git a/tools/testing/selftests/bpf/progs/test_map_init.c b/tools/testing/selftests/bpf/progs/test_map_init.c new file mode 100644 index 000000000000..c89d28ead673 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_map_init.c @@ -0,0 +1,33 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2020 Tessares SA */ + +#include "vmlinux.h" +#include + +__u64 inKey = 0; +__u64 inValue = 0; +__u32 inPid = 0; + +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_HASH); + __uint(max_entries, 2); + __type(key, __u64); + __type(value, __u64); +} hashmap1 SEC(".maps"); + + +SEC("tp/syscalls/sys_enter_getpgid") +int sysenter_getpgid(const void *ctx) +{ + /* Just do it for once, when called from our own test prog. This + * ensures the map value is only updated for a single CPU. + */ + int cur_pid = bpf_get_current_pid_tgid() >> 32; + + if (cur_pid == inPid) + bpf_map_update_elem(&hashmap1, &inKey, &inValue, BPF_NOEXIST); + + return 0; +} + +char _license[] SEC("license") = "GPL"; -- cgit v1.2.3 From c6bde958a62b8ca5ee8d2c1fe429aec4ad54efad Mon Sep 17 00:00:00 2001 From: Florian Lehner Date: Thu, 29 Oct 2020 21:14:42 +0100 Subject: bpf: Lift hashtab key_size limit Currently key_size of hashtab is limited to MAX_BPF_STACK. As the key of hashtab can also be a value from a per cpu map it can be larger than MAX_BPF_STACK. The use-case for this patch originates to implement allow/disallow lists for files and file paths. The maximum length of file paths is defined by PATH_MAX with 4096 chars including nul. This limit exceeds MAX_BPF_STACK. 
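For illustration only (the struct layout and map parameters below are invented, not taken from the patch): with the old key_size > MAX_BPF_STACK check removed, user space can create a hash map whose key embeds a full PATH_MAX path directly through the bpf() syscall, where older kernels returned E2BIG.

#include <linux/bpf.h>
#include <linux/limits.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

struct path_key {
	int type;
	char path[PATH_MAX];	/* 4096 bytes, far above the 512-byte MAX_BPF_STACK */
};

static int create_path_map(void)
{
	union bpf_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.map_type    = BPF_MAP_TYPE_HASH;
	attr.key_size    = sizeof(struct path_key);
	attr.value_size  = sizeof(unsigned int);
	attr.max_entries = 128;

	/* Returns a map fd on success; kernels without this patch reject
	 * the request with E2BIG because key_size exceeds MAX_BPF_STACK.
	 */
	return syscall(__NR_bpf, BPF_MAP_CREATE, &attr, sizeof(attr));
}

A BPF program still cannot build such a key on its 512-byte stack, so it stages the key in a per-cpu array element first, which is the pattern the selftest below uses.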
Changelog: v5: - Fix cast overflow v4: - Utilize BPF skeleton in tests - Rebase v3: - Rebase v2: - Add a test for bpf side Signed-off-by: Florian Lehner Signed-off-by: Alexei Starovoitov Acked-by: John Fastabend Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20201029201442.596690-1-dev@der-flo.net --- kernel/bpf/hashtab.c | 16 +++----- .../selftests/bpf/prog_tests/hash_large_key.c | 43 +++++++++++++++++++++ .../selftests/bpf/progs/test_hash_large_key.c | 44 ++++++++++++++++++++++ tools/testing/selftests/bpf/test_maps.c | 3 +- 4 files changed, 94 insertions(+), 12 deletions(-) create mode 100644 tools/testing/selftests/bpf/prog_tests/hash_large_key.c create mode 100644 tools/testing/selftests/bpf/progs/test_hash_large_key.c (limited to 'kernel') diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 23f73d4649c9..7bf18d92af41 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -415,17 +415,11 @@ static int htab_map_alloc_check(union bpf_attr *attr) attr->value_size == 0) return -EINVAL; - if (attr->key_size > MAX_BPF_STACK) - /* eBPF programs initialize keys on stack, so they cannot be - * larger than max stack size - */ - return -E2BIG; - - if (attr->value_size >= KMALLOC_MAX_SIZE - - MAX_BPF_STACK - sizeof(struct htab_elem)) - /* if value_size is bigger, the user space won't be able to - * access the elements via bpf syscall. This check also makes - * sure that the elem_size doesn't overflow and it's + if ((u64)attr->key_size + attr->value_size >= KMALLOC_MAX_SIZE - + sizeof(struct htab_elem)) + /* if key_size + value_size is bigger, the user space won't be + * able to access the elements via bpf syscall. This check + * also makes sure that the elem_size doesn't overflow and it's * kmalloc-able later in htab_map_update_elem() */ return -E2BIG; diff --git a/tools/testing/selftests/bpf/prog_tests/hash_large_key.c b/tools/testing/selftests/bpf/prog_tests/hash_large_key.c new file mode 100644 index 000000000000..34684c0fc76d --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/hash_large_key.c @@ -0,0 +1,43 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include "test_hash_large_key.skel.h" + +void test_hash_large_key(void) +{ + int err, value = 21, duration = 0, hash_map_fd; + struct test_hash_large_key *skel; + + struct bigelement { + int a; + char b[4096]; + long long c; + } key; + bzero(&key, sizeof(key)); + + skel = test_hash_large_key__open_and_load(); + if (CHECK(!skel, "skel_open_and_load", "skeleton open/load failed\n")) + return; + + hash_map_fd = bpf_map__fd(skel->maps.hash_map); + if (CHECK(hash_map_fd < 0, "bpf_map__fd", "failed\n")) + goto cleanup; + + err = test_hash_large_key__attach(skel); + if (CHECK(err, "attach_raw_tp", "err %d\n", err)) + goto cleanup; + + err = bpf_map_update_elem(hash_map_fd, &key, &value, BPF_ANY); + if (CHECK(err, "bpf_map_update_elem", "errno=%d\n", errno)) + goto cleanup; + + key.c = 1; + err = bpf_map_lookup_elem(hash_map_fd, &key, &value); + if (CHECK(err, "bpf_map_lookup_elem", "errno=%d\n", errno)) + goto cleanup; + + CHECK_FAIL(value != 42); + +cleanup: + test_hash_large_key__destroy(skel); +} diff --git a/tools/testing/selftests/bpf/progs/test_hash_large_key.c b/tools/testing/selftests/bpf/progs/test_hash_large_key.c new file mode 100644 index 000000000000..473a22794a62 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_hash_large_key.c @@ -0,0 +1,44 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include + +char _license[] SEC("license") = "GPL"; + +struct { + __uint(type, 
BPF_MAP_TYPE_HASH); + __uint(max_entries, 2); + __type(key, struct bigelement); + __type(value, __u32); +} hash_map SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __uint(max_entries, 1); + __type(key, __u32); + __type(value, struct bigelement); +} key_map SEC(".maps"); + +struct bigelement { + int a; + char b[4096]; + long long c; +}; + +SEC("raw_tracepoint/sys_enter") +int bpf_hash_large_key_test(void *ctx) +{ + int zero = 0, err = 1, value = 42; + struct bigelement *key; + + key = bpf_map_lookup_elem(&key_map, &zero); + if (!key) + return 0; + + key->c = 1; + if (bpf_map_update_elem(&hash_map, key, &value, BPF_ANY)) + return 0; + + return 0; +} + diff --git a/tools/testing/selftests/bpf/test_maps.c b/tools/testing/selftests/bpf/test_maps.c index 0d92ebcb335d..0ad3e6305ff0 100644 --- a/tools/testing/selftests/bpf/test_maps.c +++ b/tools/testing/selftests/bpf/test_maps.c @@ -1223,9 +1223,10 @@ out_map_in_map: static void test_map_large(void) { + struct bigkey { int a; - char b[116]; + char b[4096]; long long c; } key; int fd, i, value; -- cgit v1.2.3 From 90574a9c02f1ed46d9d8fec222fbcf375eb90e9b Mon Sep 17 00:00:00 2001 From: Lukas Bulwahn Date: Fri, 6 Nov 2020 04:40:05 +0100 Subject: printk: remove unneeded dead-store assignment make clang-analyzer on x86_64 defconfig caught my attention with: kernel/printk/printk_ringbuffer.c:885:3: warning: Value stored to 'desc' is never read [clang-analyzer-deadcode.DeadStores] desc = to_desc(desc_ring, head_id); ^ Commit b6cf8b3f3312 ("printk: add lockless ringbuffer") introduced desc_reserve() with this unneeded dead-store assignment. As discussed with John Ogness privately, this is probably just some minor left-over from previous iterations of the ringbuffer implementation. So, simply remove this unneeded dead assignment to make clang-analyzer happy. As compilers will detect this unneeded assignment and optimize this anyway, the resulting object code is identical before and after this change. No functional change. No change to object code. Signed-off-by: Lukas Bulwahn Reviewed-by: Sergey Senozhatsky Reviewed-by: John Ogness Reviewed-by: Nathan Chancellor Signed-off-by: Petr Mladek Link: https://lore.kernel.org/r/20201106034005.18822-1-lukas.bulwahn@gmail.com --- kernel/printk/printk_ringbuffer.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk_ringbuffer.c b/kernel/printk/printk_ringbuffer.c index 24a960a89aa8..dd43c4cf16fb 100644 --- a/kernel/printk/printk_ringbuffer.c +++ b/kernel/printk/printk_ringbuffer.c @@ -882,8 +882,6 @@ static bool desc_reserve(struct printk_ringbuffer *rb, unsigned long *id_out) head_id = atomic_long_read(&desc_ring->head_id); /* LMM(desc_reserve:A) */ do { - desc = to_desc(desc_ring, head_id); - id = DESC_ID(head_id + 1); id_prev_wrap = DESC_ID_PREV_WRAP(desc_ring, id); -- cgit v1.2.3 From 0264c8c9e1b53e9dbb41fae5e54756e84644bc60 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 5 Nov 2020 21:32:36 -0500 Subject: ftrace: Move the recursion testing into global headers Currently, if a callback is registered to a ftrace function and its ftrace_ops does not have the RECURSION flag set, it is encapsulated in a helper function that does the recursion for it. Really, all the callbacks should have their own recursion protection for performance reasons. But they should not all implement their own. Move the recursion helpers to global headers, so that all callbacks can use them. 
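As a rough sketch (hypothetical callback, not taken from the patch; a later patch in this series adds ip/parent_ip arguments to the helper), a callback outside kernel/trace/ can now include <linux/ftrace.h> and guard itself against recursion like this:

#include <linux/ftrace.h>

static void my_callback_func(unsigned long ip, unsigned long parent_ip,
			     struct ftrace_ops *op, struct pt_regs *regs)
{
	int bit;

	/* A negative bit means this context already recursed; bail out. */
	bit = trace_test_and_set_recursion(TRACE_FTRACE_START, TRACE_FTRACE_MAX);
	if (bit < 0)
		return;

	/* ... callback work; functions traced from here cannot re-enter ... */

	trace_clear_recursion(bit);
}

This is the same pattern the next patch wraps into the ftrace_test_recursion_trylock()/ftrace_test_recursion_unlock() helpers.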
Link: https://lkml.kernel.org/r/20201028115612.460535535@goodmis.org Link: https://lkml.kernel.org/r/20201106023546.166456258@goodmis.org Signed-off-by: Steven Rostedt (VMware) --- include/linux/ftrace.h | 1 + include/linux/trace_recursion.h | 187 ++++++++++++++++++++++++++++++++++++++++ kernel/trace/trace.h | 177 ------------------------------------- 3 files changed, 188 insertions(+), 177 deletions(-) create mode 100644 include/linux/trace_recursion.h (limited to 'kernel') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 1bd3a0356ae4..0e4164a7f56d 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -7,6 +7,7 @@ #ifndef _LINUX_FTRACE_H #define _LINUX_FTRACE_H +#include #include #include #include diff --git a/include/linux/trace_recursion.h b/include/linux/trace_recursion.h new file mode 100644 index 000000000000..dbb7b6d4c94c --- /dev/null +++ b/include/linux/trace_recursion.h @@ -0,0 +1,187 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_TRACE_RECURSION_H +#define _LINUX_TRACE_RECURSION_H + +#include +#include + +#ifdef CONFIG_TRACING + +/* Only current can touch trace_recursion */ + +/* + * For function tracing recursion: + * The order of these bits are important. + * + * When function tracing occurs, the following steps are made: + * If arch does not support a ftrace feature: + * call internal function (uses INTERNAL bits) which calls... + * If callback is registered to the "global" list, the list + * function is called and recursion checks the GLOBAL bits. + * then this function calls... + * The function callback, which can use the FTRACE bits to + * check for recursion. + * + * Now if the arch does not support a feature, and it calls + * the global list function which calls the ftrace callback + * all three of these steps will do a recursion protection. + * There's no reason to do one if the previous caller already + * did. The recursion that we are protecting against will + * go through the same steps again. + * + * To prevent the multiple recursion checks, if a recursion + * bit is set that is higher than the MAX bit of the current + * check, then we know that the check was made by the previous + * caller, and we can skip the current check. + */ +enum { + /* Function recursion bits */ + TRACE_FTRACE_BIT, + TRACE_FTRACE_NMI_BIT, + TRACE_FTRACE_IRQ_BIT, + TRACE_FTRACE_SIRQ_BIT, + + /* INTERNAL_BITs must be greater than FTRACE_BITs */ + TRACE_INTERNAL_BIT, + TRACE_INTERNAL_NMI_BIT, + TRACE_INTERNAL_IRQ_BIT, + TRACE_INTERNAL_SIRQ_BIT, + + TRACE_BRANCH_BIT, +/* + * Abuse of the trace_recursion. + * As we need a way to maintain state if we are tracing the function + * graph in irq because we want to trace a particular function that + * was called in irq context but we have irq tracing off. Since this + * can only be modified by current, we can reuse trace_recursion. + */ + TRACE_IRQ_BIT, + + /* Set if the function is in the set_graph_function file */ + TRACE_GRAPH_BIT, + + /* + * In the very unlikely case that an interrupt came in + * at a start of graph tracing, and we want to trace + * the function in that interrupt, the depth can be greater + * than zero, because of the preempted start of a previous + * trace. 
In an even more unlikely case, depth could be 2 + * if a softirq interrupted the start of graph tracing, + * followed by an interrupt preempting a start of graph + * tracing in the softirq, and depth can even be 3 + * if an NMI came in at the start of an interrupt function + * that preempted a softirq start of a function that + * preempted normal context!!!! Luckily, it can't be + * greater than 3, so the next two bits are a mask + * of what the depth is when we set TRACE_GRAPH_BIT + */ + + TRACE_GRAPH_DEPTH_START_BIT, + TRACE_GRAPH_DEPTH_END_BIT, + + /* + * To implement set_graph_notrace, if this bit is set, we ignore + * function graph tracing of called functions, until the return + * function is called to clear it. + */ + TRACE_GRAPH_NOTRACE_BIT, + + /* + * When transitioning between context, the preempt_count() may + * not be correct. Allow for a single recursion to cover this case. + */ + TRACE_TRANSITION_BIT, +}; + +#define trace_recursion_set(bit) do { (current)->trace_recursion |= (1<<(bit)); } while (0) +#define trace_recursion_clear(bit) do { (current)->trace_recursion &= ~(1<<(bit)); } while (0) +#define trace_recursion_test(bit) ((current)->trace_recursion & (1<<(bit))) + +#define trace_recursion_depth() \ + (((current)->trace_recursion >> TRACE_GRAPH_DEPTH_START_BIT) & 3) +#define trace_recursion_set_depth(depth) \ + do { \ + current->trace_recursion &= \ + ~(3 << TRACE_GRAPH_DEPTH_START_BIT); \ + current->trace_recursion |= \ + ((depth) & 3) << TRACE_GRAPH_DEPTH_START_BIT; \ + } while (0) + +#define TRACE_CONTEXT_BITS 4 + +#define TRACE_FTRACE_START TRACE_FTRACE_BIT +#define TRACE_FTRACE_MAX ((1 << (TRACE_FTRACE_START + TRACE_CONTEXT_BITS)) - 1) + +#define TRACE_LIST_START TRACE_INTERNAL_BIT +#define TRACE_LIST_MAX ((1 << (TRACE_LIST_START + TRACE_CONTEXT_BITS)) - 1) + +#define TRACE_CONTEXT_MASK TRACE_LIST_MAX + +static __always_inline int trace_get_context_bit(void) +{ + int bit; + + if (in_interrupt()) { + if (in_nmi()) + bit = 0; + + else if (in_irq()) + bit = 1; + else + bit = 2; + } else + bit = 3; + + return bit; +} + +static __always_inline int trace_test_and_set_recursion(int start, int max) +{ + unsigned int val = current->trace_recursion; + int bit; + + /* A previous recursion check was made */ + if ((val & TRACE_CONTEXT_MASK) > max) + return 0; + + bit = trace_get_context_bit() + start; + if (unlikely(val & (1 << bit))) { + /* + * It could be that preempt_count has not been updated during + * a switch between contexts. Allow for a single recursion. + */ + bit = TRACE_TRANSITION_BIT; + if (trace_recursion_test(bit)) + return -1; + trace_recursion_set(bit); + barrier(); + return bit + 1; + } + + /* Normal check passed, clear the transition to allow it again */ + trace_recursion_clear(TRACE_TRANSITION_BIT); + + val |= 1 << bit; + current->trace_recursion = val; + barrier(); + + return bit + 1; +} + +static __always_inline void trace_clear_recursion(int bit) +{ + unsigned int val = current->trace_recursion; + + if (!bit) + return; + + bit--; + bit = 1 << bit; + val &= ~bit; + + barrier(); + current->trace_recursion = val; +} + +#endif /* CONFIG_TRACING */ +#endif /* _LINUX_TRACE_RECURSION_H */ diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 1dadef445cd1..9462251cab92 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -558,183 +558,6 @@ struct tracer { bool noboot; }; - -/* Only current can touch trace_recursion */ - -/* - * For function tracing recursion: - * The order of these bits are important. 
- * - * When function tracing occurs, the following steps are made: - * If arch does not support a ftrace feature: - * call internal function (uses INTERNAL bits) which calls... - * If callback is registered to the "global" list, the list - * function is called and recursion checks the GLOBAL bits. - * then this function calls... - * The function callback, which can use the FTRACE bits to - * check for recursion. - * - * Now if the arch does not support a feature, and it calls - * the global list function which calls the ftrace callback - * all three of these steps will do a recursion protection. - * There's no reason to do one if the previous caller already - * did. The recursion that we are protecting against will - * go through the same steps again. - * - * To prevent the multiple recursion checks, if a recursion - * bit is set that is higher than the MAX bit of the current - * check, then we know that the check was made by the previous - * caller, and we can skip the current check. - */ -enum { - /* Function recursion bits */ - TRACE_FTRACE_BIT, - TRACE_FTRACE_NMI_BIT, - TRACE_FTRACE_IRQ_BIT, - TRACE_FTRACE_SIRQ_BIT, - - /* INTERNAL_BITs must be greater than FTRACE_BITs */ - TRACE_INTERNAL_BIT, - TRACE_INTERNAL_NMI_BIT, - TRACE_INTERNAL_IRQ_BIT, - TRACE_INTERNAL_SIRQ_BIT, - - TRACE_BRANCH_BIT, -/* - * Abuse of the trace_recursion. - * As we need a way to maintain state if we are tracing the function - * graph in irq because we want to trace a particular function that - * was called in irq context but we have irq tracing off. Since this - * can only be modified by current, we can reuse trace_recursion. - */ - TRACE_IRQ_BIT, - - /* Set if the function is in the set_graph_function file */ - TRACE_GRAPH_BIT, - - /* - * In the very unlikely case that an interrupt came in - * at a start of graph tracing, and we want to trace - * the function in that interrupt, the depth can be greater - * than zero, because of the preempted start of a previous - * trace. In an even more unlikely case, depth could be 2 - * if a softirq interrupted the start of graph tracing, - * followed by an interrupt preempting a start of graph - * tracing in the softirq, and depth can even be 3 - * if an NMI came in at the start of an interrupt function - * that preempted a softirq start of a function that - * preempted normal context!!!! Luckily, it can't be - * greater than 3, so the next two bits are a mask - * of what the depth is when we set TRACE_GRAPH_BIT - */ - - TRACE_GRAPH_DEPTH_START_BIT, - TRACE_GRAPH_DEPTH_END_BIT, - - /* - * To implement set_graph_notrace, if this bit is set, we ignore - * function graph tracing of called functions, until the return - * function is called to clear it. - */ - TRACE_GRAPH_NOTRACE_BIT, - - /* - * When transitioning between context, the preempt_count() may - * not be correct. Allow for a single recursion to cover this case. 
- */ - TRACE_TRANSITION_BIT, -}; - -#define trace_recursion_set(bit) do { (current)->trace_recursion |= (1<<(bit)); } while (0) -#define trace_recursion_clear(bit) do { (current)->trace_recursion &= ~(1<<(bit)); } while (0) -#define trace_recursion_test(bit) ((current)->trace_recursion & (1<<(bit))) - -#define trace_recursion_depth() \ - (((current)->trace_recursion >> TRACE_GRAPH_DEPTH_START_BIT) & 3) -#define trace_recursion_set_depth(depth) \ - do { \ - current->trace_recursion &= \ - ~(3 << TRACE_GRAPH_DEPTH_START_BIT); \ - current->trace_recursion |= \ - ((depth) & 3) << TRACE_GRAPH_DEPTH_START_BIT; \ - } while (0) - -#define TRACE_CONTEXT_BITS 4 - -#define TRACE_FTRACE_START TRACE_FTRACE_BIT -#define TRACE_FTRACE_MAX ((1 << (TRACE_FTRACE_START + TRACE_CONTEXT_BITS)) - 1) - -#define TRACE_LIST_START TRACE_INTERNAL_BIT -#define TRACE_LIST_MAX ((1 << (TRACE_LIST_START + TRACE_CONTEXT_BITS)) - 1) - -#define TRACE_CONTEXT_MASK TRACE_LIST_MAX - -static __always_inline int trace_get_context_bit(void) -{ - int bit; - - if (in_interrupt()) { - if (in_nmi()) - bit = 0; - - else if (in_irq()) - bit = 1; - else - bit = 2; - } else - bit = 3; - - return bit; -} - -static __always_inline int trace_test_and_set_recursion(int start, int max) -{ - unsigned int val = current->trace_recursion; - int bit; - - /* A previous recursion check was made */ - if ((val & TRACE_CONTEXT_MASK) > max) - return 0; - - bit = trace_get_context_bit() + start; - if (unlikely(val & (1 << bit))) { - /* - * It could be that preempt_count has not been updated during - * a switch between contexts. Allow for a single recursion. - */ - bit = TRACE_TRANSITION_BIT; - if (trace_recursion_test(bit)) - return -1; - trace_recursion_set(bit); - barrier(); - return bit + 1; - } - - /* Normal check passed, clear the transition to allow it again */ - trace_recursion_clear(TRACE_TRANSITION_BIT); - - val |= 1 << bit; - current->trace_recursion = val; - barrier(); - - return bit + 1; -} - -static __always_inline void trace_clear_recursion(int bit) -{ - unsigned int val = current->trace_recursion; - - if (!bit) - return; - - bit--; - bit = 1 << bit; - val &= ~bit; - - barrier(); - current->trace_recursion = val; -} - static inline struct ring_buffer_iter * trace_buffer_iter(struct trace_iterator *iter, int cpu) { -- cgit v1.2.3 From 6e4eb9cb22fc8a893cb708ed42644de5ee7c3827 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 5 Nov 2020 21:32:37 -0500 Subject: ftrace: Add ftrace_test_recursion_trylock() helper function To make it easier for ftrace callbacks to have recursion protection, provide a ftrace_test_recursion_trylock() and ftrace_test_recursion_unlock() helper that tests for recursion. Link: https://lkml.kernel.org/r/20201028115612.634927593@goodmis.org Link: https://lkml.kernel.org/r/20201106023546.378584067@goodmis.org Signed-off-by: Steven Rostedt (VMware) --- include/linux/trace_recursion.h | 25 +++++++++++++++++++++++++ kernel/trace/trace_functions.c | 12 +++++------- 2 files changed, 30 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/include/linux/trace_recursion.h b/include/linux/trace_recursion.h index dbb7b6d4c94c..f2a949dbfec7 100644 --- a/include/linux/trace_recursion.h +++ b/include/linux/trace_recursion.h @@ -183,5 +183,30 @@ static __always_inline void trace_clear_recursion(int bit) current->trace_recursion = val; } +/** + * ftrace_test_recursion_trylock - tests for recursion in same context + * + * Use this for ftrace callbacks. 
This will detect if the function + * tracing recursed in the same context (normal vs interrupt), + * + * Returns: -1 if a recursion happened. + * >= 0 if no recursion + */ +static __always_inline int ftrace_test_recursion_trylock(void) +{ + return trace_test_and_set_recursion(TRACE_FTRACE_START, TRACE_FTRACE_MAX); +} + +/** + * ftrace_test_recursion_unlock - called when function callback is complete + * @bit: The return of a successful ftrace_test_recursion_trylock() + * + * This is used at the end of a ftrace callback. + */ +static __always_inline void ftrace_test_recursion_unlock(int bit) +{ + trace_clear_recursion(bit); +} + #endif /* CONFIG_TRACING */ #endif /* _LINUX_TRACE_RECURSION_H */ diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index 2c2126e1871d..943756c01190 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -141,22 +141,20 @@ function_trace_call(unsigned long ip, unsigned long parent_ip, if (unlikely(!tr->function_enabled)) return; + bit = ftrace_test_recursion_trylock(); + if (bit < 0) + return; + pc = preempt_count(); preempt_disable_notrace(); - bit = trace_test_and_set_recursion(TRACE_FTRACE_START, TRACE_FTRACE_MAX); - if (bit < 0) - goto out; - cpu = smp_processor_id(); data = per_cpu_ptr(tr->array_buffer.data, cpu); if (!atomic_read(&data->disabled)) { local_save_flags(flags); trace_function(tr, ip, parent_ip, flags, pc); } - trace_clear_recursion(bit); - - out: + ftrace_test_recursion_unlock(bit); preempt_enable_notrace(); } -- cgit v1.2.3 From 13f3ea9a2c829f28610bb8772a8b9c328412930e Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 5 Nov 2020 21:32:41 -0500 Subject: livepatch/ftrace: Add recursion protection to the ftrace callback If a ftrace callback does not supply its own recursion protection and does not set the RECURSION_SAFE flag in its ftrace_ops, then ftrace will make a helper trampoline to do so before calling the callback instead of just calling the callback directly. The default for ftrace_ops is going to change. It will expect that handlers provide their own recursion protection, unless its ftrace_ops states otherwise. Link: https://lkml.kernel.org/r/20201028115613.291169246@goodmis.org Link: https://lkml.kernel.org/r/20201106023547.122802424@goodmis.org Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Masami Hiramatsu Cc: Andrew Morton Cc: Josh Poimboeuf Cc: Jiri Kosina Cc: Joe Lawrence Cc: live-patching@vger.kernel.org Reviewed-by: Petr Mladek Acked-by: Miroslav Benes Signed-off-by: Steven Rostedt (VMware) --- kernel/livepatch/patch.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/livepatch/patch.c b/kernel/livepatch/patch.c index b552cf2d85f8..6c0164d24bbd 100644 --- a/kernel/livepatch/patch.c +++ b/kernel/livepatch/patch.c @@ -45,9 +45,13 @@ static void notrace klp_ftrace_handler(unsigned long ip, struct klp_ops *ops; struct klp_func *func; int patch_state; + int bit; ops = container_of(fops, struct klp_ops, fops); + bit = ftrace_test_recursion_trylock(); + if (bit < 0) + return; /* * A variant of synchronize_rcu() is used to allow patching functions * where RCU is not watching, see klp_synchronize_transition(). 
@@ -117,6 +121,7 @@ static void notrace klp_ftrace_handler(unsigned long ip, unlock: preempt_enable_notrace(); + ftrace_test_recursion_unlock(bit); } /* -- cgit v1.2.3 From 4b750b573c5b3ee10e33c1573eaa94a9dad62f19 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 5 Nov 2020 21:32:42 -0500 Subject: livepatch: Trigger WARNING if livepatch function fails due to recursion If for some reason a function is called that triggers the recursion detection of live patching, trigger a warning. By not executing the live patch code, it is possible that the old unpatched function will be called placing the system into an unknown state. Link: https://lore.kernel.org/r/20201029145709.GD16774@alley Link: https://lkml.kernel.org/r/20201106023547.312639435@goodmis.org Cc: Masami Hiramatsu Cc: Andrew Morton Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Josh Poimboeuf Cc: Jiri Kosina Cc: Joe Lawrence Cc: live-patching@vger.kernel.org Suggested-by: Miroslav Benes Reviewed-by: Petr Mladek Acked-by: Miroslav Benes Signed-off-by: Steven Rostedt (VMware) --- kernel/livepatch/patch.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/livepatch/patch.c b/kernel/livepatch/patch.c index 6c0164d24bbd..15480bf3ce88 100644 --- a/kernel/livepatch/patch.c +++ b/kernel/livepatch/patch.c @@ -50,7 +50,7 @@ static void notrace klp_ftrace_handler(unsigned long ip, ops = container_of(fops, struct klp_ops, fops); bit = ftrace_test_recursion_trylock(); - if (bit < 0) + if (WARN_ON_ONCE(bit < 0)) return; /* * A variant of synchronize_rcu() is used to allow patching functions -- cgit v1.2.3 From 5d15a624c34b11c8d1c04c8cc004782e7ac2888d Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 5 Nov 2020 21:32:43 -0500 Subject: perf/ftrace: Add recursion protection to the ftrace callback If a ftrace callback does not supply its own recursion protection and does not set the RECURSION_SAFE flag in its ftrace_ops, then ftrace will make a helper trampoline to do so before calling the callback instead of just calling the callback directly. The default for ftrace_ops is going to change. It will expect that handlers provide their own recursion protection, unless its ftrace_ops states otherwise. 
Link: https://lkml.kernel.org/r/20201028115613.444477858@goodmis.org Link: https://lkml.kernel.org/r/20201106023547.466892083@goodmis.org Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Josh Poimboeuf Cc: Jiri Kosina Cc: Miroslav Benes Cc: Petr Mladek Cc: Masami Hiramatsu Cc: Andrew Morton Cc: Jiri Olsa Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_event_perf.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index 643e0b19920d..fd58d83861d8 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -439,10 +439,15 @@ perf_ftrace_function_call(unsigned long ip, unsigned long parent_ip, struct hlist_head head; struct pt_regs regs; int rctx; + int bit; if ((unsigned long)ops->private != smp_processor_id()) return; + bit = ftrace_test_recursion_trylock(); + if (bit < 0) + return; + event = container_of(ops, struct perf_event, ftrace_ops); /* @@ -463,13 +468,15 @@ perf_ftrace_function_call(unsigned long ip, unsigned long parent_ip, entry = perf_trace_buf_alloc(ENTRY_SIZE, NULL, &rctx); if (!entry) - return; + goto out; entry->ip = ip; entry->parent_ip = parent_ip; perf_trace_buf_submit(entry, ENTRY_SIZE, rctx, TRACE_FN, 1, ®s, &head, NULL); +out: + ftrace_test_recursion_unlock(bit); #undef ENTRY_SIZE } -- cgit v1.2.3 From 5d029b035bf112466541b844ee1b86197936db65 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 5 Nov 2020 21:32:44 -0500 Subject: perf/ftrace: Check for rcu_is_watching() in callback function If a ftrace callback requires "rcu_is_watching", then it adds the FTRACE_OPS_FL_RCU flag and it will not be called if RCU is not "watching". But this means that it will use a trampoline when called, and this slows down the function tracing a tad. By checking rcu_is_watching() from within the callback, it no longer needs the RCU flag set in the ftrace_ops and it can be safely called directly. Link: https://lkml.kernel.org/r/20201028115613.591878956@goodmis.org Link: https://lkml.kernel.org/r/20201106023547.711035826@goodmis.org Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Josh Poimboeuf Cc: Jiri Kosina Cc: Miroslav Benes Cc: Petr Mladek Cc: Masami Hiramatsu Cc: Andrew Morton Cc: Jiri Olsa Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_event_perf.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index fd58d83861d8..a2b9fddb8148 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -441,6 +441,9 @@ perf_ftrace_function_call(unsigned long ip, unsigned long parent_ip, int rctx; int bit; + if (!rcu_is_watching()) + return; + if ((unsigned long)ops->private != smp_processor_id()) return; @@ -484,7 +487,6 @@ static int perf_ftrace_function_register(struct perf_event *event) { struct ftrace_ops *ops = &event->ftrace_ops; - ops->flags = FTRACE_OPS_FL_RCU; ops->func = perf_ftrace_function_call; ops->private = (void *)(unsigned long)nr_cpu_ids; -- cgit v1.2.3 From a25d036d939a30623ff73ecad9c8b9116b02e823 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 5 Nov 2020 21:32:45 -0500 Subject: ftrace: Reverse what the RECURSION flag means in the ftrace_ops Now that all callbacks are recursion safe, reverse the meaning of the RECURSION flag and rename it from RECURSION_SAFE to simply RECURSION. 
Now only callbacks that request to have recursion protecting it will have the added trampoline to do so. Also remove the outdated comment about "PER_CPU" when determining to use the ftrace_ops_assist_func. Link: https://lkml.kernel.org/r/20201028115613.742454631@goodmis.org Link: https://lkml.kernel.org/r/20201106023547.904270143@goodmis.org Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Josh Poimboeuf Cc: Jiri Kosina Cc: Masami Hiramatsu Cc: Andrew Morton Cc: Jonathan Corbet Cc: Sebastian Andrzej Siewior Cc: Miroslav Benes Cc: Kamalesh Babulal Cc: Petr Mladek Cc: linux-doc@vger.kernel.org Signed-off-by: Steven Rostedt (VMware) --- Documentation/trace/ftrace-uses.rst | 82 +++++++++++++++++++++++++++---------- include/linux/ftrace.h | 12 +++--- kernel/trace/fgraph.c | 3 +- kernel/trace/ftrace.c | 20 ++++----- kernel/trace/trace_events.c | 1 - kernel/trace/trace_functions.c | 2 +- kernel/trace/trace_selftest.c | 7 +--- kernel/trace/trace_stack.c | 1 - 8 files changed, 79 insertions(+), 49 deletions(-) (limited to 'kernel') diff --git a/Documentation/trace/ftrace-uses.rst b/Documentation/trace/ftrace-uses.rst index a4955f7e3d19..86cd14b8e126 100644 --- a/Documentation/trace/ftrace-uses.rst +++ b/Documentation/trace/ftrace-uses.rst @@ -30,8 +30,8 @@ The ftrace context This requires extra care to what can be done inside a callback. A callback can be called outside the protective scope of RCU. -The ftrace infrastructure has some protections against recursions and RCU -but one must still be very careful how they use the callbacks. +There are helper functions to help against recursion, and making sure +RCU is watching. These are explained below. The ftrace_ops structure @@ -108,6 +108,50 @@ The prototype of the callback function is as follows (as of v4.14): at the start of the function where ftrace was tracing. Otherwise it either contains garbage, or NULL. +Protect your callback +===================== + +As functions can be called from anywhere, and it is possible that a function +called by a callback may also be traced, and call that same callback, +recursion protection must be used. There are two helper functions that +can help in this regard. If you start your code with: + + int bit; + + bit = ftrace_test_recursion_trylock(); + if (bit < 0) + return; + +and end it with: + + ftrace_test_recursion_unlock(bit); + +The code in between will be safe to use, even if it ends up calling a +function that the callback is tracing. Note, on success, +ftrace_test_recursion_trylock() will disable preemption, and the +ftrace_test_recursion_unlock() will enable it again (if it was previously +enabled). + +Alternatively, if the FTRACE_OPS_FL_RECURSION flag is set on the ftrace_ops +(as explained below), then a helper trampoline will be used to test +for recursion for the callback and no recursion test needs to be done. +But this is at the expense of a slightly more overhead from an extra +function call. + +If your callback accesses any data or critical section that requires RCU +protection, it is best to make sure that RCU is "watching", otherwise +that data or critical section will not be protected as expected. In this +case add: + + if (!rcu_is_watching()) + return; + +Alternatively, if the FTRACE_OPS_FL_RCU flag is set on the ftrace_ops +(as explained below), then a helper trampoline will be used to test +for rcu_is_watching for the callback and no other test needs to be done. +But this is at the expense of a slightly more overhead from an extra +function call. 
+ The ftrace FLAGS ================ @@ -128,26 +172,20 @@ FTRACE_OPS_FL_SAVE_REGS_IF_SUPPORTED will not fail with this flag set. But the callback must check if regs is NULL or not to determine if the architecture supports it. -FTRACE_OPS_FL_RECURSION_SAFE - By default, a wrapper is added around the callback to - make sure that recursion of the function does not occur. That is, - if a function that is called as a result of the callback's execution - is also traced, ftrace will prevent the callback from being called - again. But this wrapper adds some overhead, and if the callback is - safe from recursion, it can set this flag to disable the ftrace - protection. - - Note, if this flag is set, and recursion does occur, it could cause - the system to crash, and possibly reboot via a triple fault. - - It is OK if another callback traces a function that is called by a - callback that is marked recursion safe. Recursion safe callbacks - must never trace any function that are called by the callback - itself or any nested functions that those functions call. - - If this flag is set, it is possible that the callback will also - be called with preemption enabled (when CONFIG_PREEMPTION is set), - but this is not guaranteed. +FTRACE_OPS_FL_RECURSION + By default, it is expected that the callback can handle recursion. + But if the callback is not that worried about overehead, then + setting this bit will add the recursion protection around the + callback by calling a helper function that will do the recursion + protection and only call the callback if it did not recurse. + + Note, if this flag is not set, and recursion does occur, it could + cause the system to crash, and possibly reboot via a triple fault. + + Not, if this flag is set, then the callback will always be called + with preemption disabled. If it is not set, then it is possible + (but not guaranteed) that the callback will be called in + preemptable context. FTRACE_OPS_FL_IPMODIFY Requires FTRACE_OPS_FL_SAVE_REGS set. If the callback is to "hijack" diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 0e4164a7f56d..806196345c3f 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -98,7 +98,7 @@ ftrace_func_t ftrace_ops_get_func(struct ftrace_ops *ops); /* * FTRACE_OPS_FL_* bits denote the state of ftrace_ops struct and are * set in the flags member. - * CONTROL, SAVE_REGS, SAVE_REGS_IF_SUPPORTED, RECURSION_SAFE, STUB and + * CONTROL, SAVE_REGS, SAVE_REGS_IF_SUPPORTED, RECURSION, STUB and * IPMODIFY are a kind of attribute flags which can be set only before * registering the ftrace_ops, and can not be modified while registered. * Changing those attribute flags after registering ftrace_ops will @@ -121,10 +121,10 @@ ftrace_func_t ftrace_ops_get_func(struct ftrace_ops *ops); * passing regs to the handler. * Note, if this flag is set, the SAVE_REGS flag will automatically * get set upon registering the ftrace_ops, if the arch supports it. - * RECURSION_SAFE - The ftrace_ops can set this to tell the ftrace infrastructure - * that the call back has its own recursion protection. If it does - * not set this, then the ftrace infrastructure will add recursion - * protection for the caller. + * RECURSION - The ftrace_ops can set this to tell the ftrace infrastructure + * that the call back needs recursion protection. If it does + * not set this, then the ftrace infrastructure will assume + * that the callback can handle recursion on its own. * STUB - The ftrace_ops is just a place holder. 
* INITIALIZED - The ftrace_ops has already been initialized (first use time * register_ftrace_function() is called, it will initialized the ops) @@ -156,7 +156,7 @@ enum { FTRACE_OPS_FL_DYNAMIC = BIT(1), FTRACE_OPS_FL_SAVE_REGS = BIT(2), FTRACE_OPS_FL_SAVE_REGS_IF_SUPPORTED = BIT(3), - FTRACE_OPS_FL_RECURSION_SAFE = BIT(4), + FTRACE_OPS_FL_RECURSION = BIT(4), FTRACE_OPS_FL_STUB = BIT(5), FTRACE_OPS_FL_INITIALIZED = BIT(6), FTRACE_OPS_FL_DELETED = BIT(7), diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c index 5658f13037b3..73edb9e4f354 100644 --- a/kernel/trace/fgraph.c +++ b/kernel/trace/fgraph.c @@ -334,8 +334,7 @@ unsigned long ftrace_graph_ret_addr(struct task_struct *task, int *idx, static struct ftrace_ops graph_ops = { .func = ftrace_stub, - .flags = FTRACE_OPS_FL_RECURSION_SAFE | - FTRACE_OPS_FL_INITIALIZED | + .flags = FTRACE_OPS_FL_INITIALIZED | FTRACE_OPS_FL_PID | FTRACE_OPS_FL_STUB, #ifdef FTRACE_GRAPH_TRAMP_ADDR diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 8185f7240095..39f2bba89b76 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -80,7 +80,7 @@ enum { struct ftrace_ops ftrace_list_end __read_mostly = { .func = ftrace_stub, - .flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_STUB, + .flags = FTRACE_OPS_FL_STUB, INIT_OPS_HASH(ftrace_list_end) }; @@ -866,7 +866,7 @@ static void unregister_ftrace_profiler(void) #else static struct ftrace_ops ftrace_profile_ops __read_mostly = { .func = function_profile_call, - .flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_INITIALIZED, + .flags = FTRACE_OPS_FL_INITIALIZED, INIT_OPS_HASH(ftrace_profile_ops) }; @@ -1040,8 +1040,7 @@ struct ftrace_ops global_ops = { .local_hash.notrace_hash = EMPTY_HASH, .local_hash.filter_hash = EMPTY_HASH, INIT_OPS_HASH(global_ops) - .flags = FTRACE_OPS_FL_RECURSION_SAFE | - FTRACE_OPS_FL_INITIALIZED | + .flags = FTRACE_OPS_FL_INITIALIZED | FTRACE_OPS_FL_PID, }; @@ -2382,7 +2381,7 @@ static void call_direct_funcs(unsigned long ip, unsigned long pip, struct ftrace_ops direct_ops = { .func = call_direct_funcs, - .flags = FTRACE_OPS_FL_IPMODIFY | FTRACE_OPS_FL_RECURSION_SAFE + .flags = FTRACE_OPS_FL_IPMODIFY | FTRACE_OPS_FL_DIRECT | FTRACE_OPS_FL_SAVE_REGS | FTRACE_OPS_FL_PERMANENT, /* @@ -6864,8 +6863,7 @@ void ftrace_init_trace_array(struct trace_array *tr) struct ftrace_ops global_ops = { .func = ftrace_stub, - .flags = FTRACE_OPS_FL_RECURSION_SAFE | - FTRACE_OPS_FL_INITIALIZED | + .flags = FTRACE_OPS_FL_INITIALIZED | FTRACE_OPS_FL_PID, }; @@ -7023,11 +7021,11 @@ NOKPROBE_SYMBOL(ftrace_ops_assist_func); ftrace_func_t ftrace_ops_get_func(struct ftrace_ops *ops) { /* - * If the function does not handle recursion, needs to be RCU safe, - * or does per cpu logic, then we need to call the assist handler. + * If the function does not handle recursion or needs to be RCU safe, + * then we need to call the assist handler. 
*/ - if (!(ops->flags & FTRACE_OPS_FL_RECURSION_SAFE) || - ops->flags & FTRACE_OPS_FL_RCU) + if (ops->flags & (FTRACE_OPS_FL_RECURSION | + FTRACE_OPS_FL_RCU)) return ftrace_ops_assist_func; return ops->func; diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 47a71f96e5bc..244abbcd1db5 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -3712,7 +3712,6 @@ function_test_events_call(unsigned long ip, unsigned long parent_ip, static struct ftrace_ops trace_ops __initdata = { .func = function_test_events_call, - .flags = FTRACE_OPS_FL_RECURSION_SAFE, }; static __init void event_trace_self_test_with_function(void) diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index 943756c01190..89c414ce1388 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -48,7 +48,7 @@ int ftrace_allocate_ftrace_ops(struct trace_array *tr) /* Currently only the non stack version is supported */ ops->func = function_trace_call; - ops->flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_PID; + ops->flags = FTRACE_OPS_FL_PID; tr->ops = ops; ops->private = tr; diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 4738ad48a667..8ee3c0bb5d8a 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -150,17 +150,14 @@ static void trace_selftest_test_dyn_func(unsigned long ip, static struct ftrace_ops test_probe1 = { .func = trace_selftest_test_probe1_func, - .flags = FTRACE_OPS_FL_RECURSION_SAFE, }; static struct ftrace_ops test_probe2 = { .func = trace_selftest_test_probe2_func, - .flags = FTRACE_OPS_FL_RECURSION_SAFE, }; static struct ftrace_ops test_probe3 = { .func = trace_selftest_test_probe3_func, - .flags = FTRACE_OPS_FL_RECURSION_SAFE, }; static void print_counts(void) @@ -448,11 +445,11 @@ static void trace_selftest_test_recursion_safe_func(unsigned long ip, static struct ftrace_ops test_rec_probe = { .func = trace_selftest_test_recursion_func, + .flags = FTRACE_OPS_FL_RECURSION, }; static struct ftrace_ops test_recsafe_probe = { .func = trace_selftest_test_recursion_safe_func, - .flags = FTRACE_OPS_FL_RECURSION_SAFE, }; static int @@ -561,7 +558,7 @@ static void trace_selftest_test_regs_func(unsigned long ip, static struct ftrace_ops test_regs_probe = { .func = trace_selftest_test_regs_func, - .flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_SAVE_REGS, + .flags = FTRACE_OPS_FL_SAVE_REGS, }; static int diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index c408423e5d65..969db526a563 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -318,7 +318,6 @@ stack_trace_call(unsigned long ip, unsigned long parent_ip, static struct ftrace_ops trace_ops __read_mostly = { .func = stack_trace_call, - .flags = FTRACE_OPS_FL_RECURSION_SAFE, }; static ssize_t -- cgit v1.2.3 From 773c16705058e9be7b0f4ce124e89cd231c120a2 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 5 Nov 2020 21:32:46 -0500 Subject: ftrace: Add recording of functions that caused recursion This adds CONFIG_FTRACE_RECORD_RECURSION that will record to a file "recursed_functions" all the functions that caused recursion while a callback to the function tracer was running. Link: https://lkml.kernel.org/r/20201106023548.102375687@goodmis.org Cc: Masami Hiramatsu Cc: Andrew Morton Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Jonathan Corbet Cc: Guo Ren Cc: "James E.J. 
Bottomley" Cc: Helge Deller Cc: Michael Ellerman Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Heiko Carstens Cc: Vasily Gorbik Cc: Christian Borntraeger Cc: Thomas Gleixner Cc: Borislav Petkov Cc: x86@kernel.org Cc: "H. Peter Anvin" Cc: Kees Cook Cc: Anton Vorontsov Cc: Colin Cross Cc: Tony Luck Cc: Josh Poimboeuf Cc: Jiri Kosina Cc: Miroslav Benes Cc: Petr Mladek Cc: Joe Lawrence Cc: Kamalesh Babulal Cc: Mauro Carvalho Chehab Cc: Sebastian Andrzej Siewior Cc: linux-doc@vger.kernel.org Cc: linux-kernel@vger.kernel.org Cc: linux-csky@vger.kernel.org Cc: linux-parisc@vger.kernel.org Cc: linuxppc-dev@lists.ozlabs.org Cc: linux-s390@vger.kernel.org Cc: live-patching@vger.kernel.org Signed-off-by: Steven Rostedt (VMware) --- Documentation/trace/ftrace-uses.rst | 6 +- arch/csky/kernel/probes/ftrace.c | 2 +- arch/parisc/kernel/ftrace.c | 2 +- arch/powerpc/kernel/kprobes-ftrace.c | 2 +- arch/s390/kernel/ftrace.c | 2 +- arch/x86/kernel/kprobes/ftrace.c | 2 +- fs/pstore/ftrace.c | 2 +- include/linux/trace_recursion.h | 29 ++++- kernel/livepatch/patch.c | 2 +- kernel/trace/Kconfig | 25 ++++ kernel/trace/Makefile | 1 + kernel/trace/ftrace.c | 4 +- kernel/trace/trace_event_perf.c | 2 +- kernel/trace/trace_functions.c | 2 +- kernel/trace/trace_output.c | 6 +- kernel/trace/trace_output.h | 1 + kernel/trace/trace_recursion_record.c | 236 ++++++++++++++++++++++++++++++++++ 17 files changed, 306 insertions(+), 20 deletions(-) create mode 100644 kernel/trace/trace_recursion_record.c (limited to 'kernel') diff --git a/Documentation/trace/ftrace-uses.rst b/Documentation/trace/ftrace-uses.rst index 86cd14b8e126..5981d5691745 100644 --- a/Documentation/trace/ftrace-uses.rst +++ b/Documentation/trace/ftrace-uses.rst @@ -118,7 +118,7 @@ can help in this regard. If you start your code with: int bit; - bit = ftrace_test_recursion_trylock(); + bit = ftrace_test_recursion_trylock(ip, parent_ip); if (bit < 0) return; @@ -130,7 +130,9 @@ The code in between will be safe to use, even if it ends up calling a function that the callback is tracing. Note, on success, ftrace_test_recursion_trylock() will disable preemption, and the ftrace_test_recursion_unlock() will enable it again (if it was previously -enabled). +enabled). The instruction pointer (ip) and its parent (parent_ip) is passed to +ftrace_test_recursion_trylock() to record where the recursion happened +(if CONFIG_FTRACE_RECORD_RECURSION is set). 
Alternatively, if the FTRACE_OPS_FL_RECURSION flag is set on the ftrace_ops (as explained below), then a helper trampoline will be used to test diff --git a/arch/csky/kernel/probes/ftrace.c b/arch/csky/kernel/probes/ftrace.c index 5eb2604fdf71..f30b179924ef 100644 --- a/arch/csky/kernel/probes/ftrace.c +++ b/arch/csky/kernel/probes/ftrace.c @@ -18,7 +18,7 @@ void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, struct kprobe *p; struct kprobe_ctlblk *kcb; - bit = ftrace_test_recursion_trylock(); + bit = ftrace_test_recursion_trylock(ip, parent_ip); if (bit < 0) return; diff --git a/arch/parisc/kernel/ftrace.c b/arch/parisc/kernel/ftrace.c index 13d85042810a..1c5d3732bda2 100644 --- a/arch/parisc/kernel/ftrace.c +++ b/arch/parisc/kernel/ftrace.c @@ -210,7 +210,7 @@ void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, struct kprobe *p; int bit; - bit = ftrace_test_recursion_trylock(); + bit = ftrace_test_recursion_trylock(ip, parent_ip); if (bit < 0) return; diff --git a/arch/powerpc/kernel/kprobes-ftrace.c b/arch/powerpc/kernel/kprobes-ftrace.c index 5df8d50c65ae..fdfee39938ea 100644 --- a/arch/powerpc/kernel/kprobes-ftrace.c +++ b/arch/powerpc/kernel/kprobes-ftrace.c @@ -20,7 +20,7 @@ void kprobe_ftrace_handler(unsigned long nip, unsigned long parent_nip, struct kprobe_ctlblk *kcb; int bit; - bit = ftrace_test_recursion_trylock(); + bit = ftrace_test_recursion_trylock(nip, parent_nip); if (bit < 0) return; diff --git a/arch/s390/kernel/ftrace.c b/arch/s390/kernel/ftrace.c index 8f31c726537a..657c1ab45408 100644 --- a/arch/s390/kernel/ftrace.c +++ b/arch/s390/kernel/ftrace.c @@ -204,7 +204,7 @@ void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, struct kprobe *p; int bit; - bit = ftrace_test_recursion_trylock(); + bit = ftrace_test_recursion_trylock(ip, parent_ip); if (bit < 0) return; diff --git a/arch/x86/kernel/kprobes/ftrace.c b/arch/x86/kernel/kprobes/ftrace.c index a40a6cdfcca3..954d930a7127 100644 --- a/arch/x86/kernel/kprobes/ftrace.c +++ b/arch/x86/kernel/kprobes/ftrace.c @@ -20,7 +20,7 @@ void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, struct kprobe_ctlblk *kcb; int bit; - bit = ftrace_test_recursion_trylock(); + bit = ftrace_test_recursion_trylock(ip, parent_ip); if (bit < 0) return; diff --git a/fs/pstore/ftrace.c b/fs/pstore/ftrace.c index 816210fc5d3a..adb0935eb062 100644 --- a/fs/pstore/ftrace.c +++ b/fs/pstore/ftrace.c @@ -41,7 +41,7 @@ static void notrace pstore_ftrace_call(unsigned long ip, if (unlikely(oops_in_progress)) return; - bit = ftrace_test_recursion_trylock(); + bit = ftrace_test_recursion_trylock(ip, parent_ip); if (bit < 0) return; diff --git a/include/linux/trace_recursion.h b/include/linux/trace_recursion.h index ac3d73484cb2..228cc56ed66e 100644 --- a/include/linux/trace_recursion.h +++ b/include/linux/trace_recursion.h @@ -91,6 +91,9 @@ enum { * not be correct. Allow for a single recursion to cover this case. */ TRACE_TRANSITION_BIT, + + /* Used to prevent recursion recording from recursing. */ + TRACE_RECORD_RECURSION_BIT, }; #define trace_recursion_set(bit) do { (current)->trace_recursion |= (1<<(bit)); } while (0) @@ -142,7 +145,22 @@ static __always_inline int trace_get_context_bit(void) pc & HARDIRQ_MASK ? 
TRACE_CTX_IRQ : TRACE_CTX_SOFTIRQ; } -static __always_inline int trace_test_and_set_recursion(int start, int max) +#ifdef CONFIG_FTRACE_RECORD_RECURSION +extern void ftrace_record_recursion(unsigned long ip, unsigned long parent_ip); +# define do_ftrace_record_recursion(ip, pip) \ + do { \ + if (!trace_recursion_test(TRACE_RECORD_RECURSION_BIT)) { \ + trace_recursion_set(TRACE_RECORD_RECURSION_BIT); \ + ftrace_record_recursion(ip, pip); \ + trace_recursion_clear(TRACE_RECORD_RECURSION_BIT); \ + } \ + } while (0) +#else +# define do_ftrace_record_recursion(ip, pip) do { } while (0) +#endif + +static __always_inline int trace_test_and_set_recursion(unsigned long ip, unsigned long pip, + int start, int max) { unsigned int val = current->trace_recursion; int bit; @@ -158,8 +176,10 @@ static __always_inline int trace_test_and_set_recursion(int start, int max) * a switch between contexts. Allow for a single recursion. */ bit = TRACE_TRANSITION_BIT; - if (trace_recursion_test(bit)) + if (trace_recursion_test(bit)) { + do_ftrace_record_recursion(ip, pip); return -1; + } trace_recursion_set(bit); barrier(); return bit + 1; @@ -199,9 +219,10 @@ static __always_inline void trace_clear_recursion(int bit) * Returns: -1 if a recursion happened. * >= 0 if no recursion */ -static __always_inline int ftrace_test_recursion_trylock(void) +static __always_inline int ftrace_test_recursion_trylock(unsigned long ip, + unsigned long parent_ip) { - return trace_test_and_set_recursion(TRACE_FTRACE_START, TRACE_FTRACE_MAX); + return trace_test_and_set_recursion(ip, parent_ip, TRACE_FTRACE_START, TRACE_FTRACE_MAX); } /** diff --git a/kernel/livepatch/patch.c b/kernel/livepatch/patch.c index 15480bf3ce88..875c5dbbdd33 100644 --- a/kernel/livepatch/patch.c +++ b/kernel/livepatch/patch.c @@ -49,7 +49,7 @@ static void notrace klp_ftrace_handler(unsigned long ip, ops = container_of(fops, struct klp_ops, fops); - bit = ftrace_test_recursion_trylock(); + bit = ftrace_test_recursion_trylock(ip, parent_ip); if (WARN_ON_ONCE(bit < 0)) return; /* diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index a4020c0b4508..9b11c096d139 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -727,6 +727,31 @@ config TRACE_EVAL_MAP_FILE If unsure, say N. +config FTRACE_RECORD_RECURSION + bool "Record functions that recurse in function tracing" + depends on FUNCTION_TRACER + help + All callbacks that attach to the function tracing have some sort + of protection against recursion. Even though the protection exists, + it adds overhead. This option will create a file in the tracefs + file system called "recursed_functions" that will list the functions + that triggered a recursion. + + This will add more overhead to cases that have recursion. + + If unsure, say N + +config FTRACE_RECORD_RECURSION_SIZE + int "Max number of recursed functions to record" + default 128 + depends on FTRACE_RECORD_RECURSION + help + This defines the limit of number of functions that can be + listed in the "recursed_functions" file, that lists all + the functions that caused a recursion to happen. + This file can be reset, but the limit can not change in + size at runtime. 
+ config GCOV_PROFILE_FTRACE bool "Enable GCOV profiling on ftrace subsystem" depends on GCOV_KERNEL diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index e153be351548..7e44cea89fdc 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -92,6 +92,7 @@ obj-$(CONFIG_DYNAMIC_EVENTS) += trace_dynevent.o obj-$(CONFIG_PROBE_EVENTS) += trace_probe.o obj-$(CONFIG_UPROBE_EVENTS) += trace_uprobe.o obj-$(CONFIG_BOOTTIME_TRACING) += trace_boot.o +obj-$(CONFIG_FTRACE_RECORD_RECURSION) += trace_recursion_record.o obj-$(CONFIG_TRACEPOINT_BENCHMARK) += trace_benchmark.o diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 39f2bba89b76..03aad2b5cd5e 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -6918,7 +6918,7 @@ __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *op; int bit; - bit = trace_test_and_set_recursion(TRACE_LIST_START, TRACE_LIST_MAX); + bit = trace_test_and_set_recursion(ip, parent_ip, TRACE_LIST_START, TRACE_LIST_MAX); if (bit < 0) return; @@ -6993,7 +6993,7 @@ static void ftrace_ops_assist_func(unsigned long ip, unsigned long parent_ip, { int bit; - bit = trace_test_and_set_recursion(TRACE_LIST_START, TRACE_LIST_MAX); + bit = trace_test_and_set_recursion(ip, parent_ip, TRACE_LIST_START, TRACE_LIST_MAX); if (bit < 0) return; diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index a2b9fddb8148..1b202e28dfaa 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -447,7 +447,7 @@ perf_ftrace_function_call(unsigned long ip, unsigned long parent_ip, if ((unsigned long)ops->private != smp_processor_id()) return; - bit = ftrace_test_recursion_trylock(); + bit = ftrace_test_recursion_trylock(ip, parent_ip); if (bit < 0) return; diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index 89c414ce1388..646eda6c44a5 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -141,7 +141,7 @@ function_trace_call(unsigned long ip, unsigned long parent_ip, if (unlikely(!tr->function_enabled)) return; - bit = ftrace_test_recursion_trylock(); + bit = ftrace_test_recursion_trylock(ip, parent_ip); if (bit < 0) return; diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 000e9dc224c6..92b1575ae0ca 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -353,8 +353,8 @@ static inline const char *kretprobed(const char *name) } #endif /* CONFIG_KRETPROBES */ -static void -seq_print_sym(struct trace_seq *s, unsigned long address, bool offset) +void +trace_seq_print_sym(struct trace_seq *s, unsigned long address, bool offset) { #ifdef CONFIG_KALLSYMS char str[KSYM_SYMBOL_LEN]; @@ -420,7 +420,7 @@ seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags) goto out; } - seq_print_sym(s, ip, sym_flags & TRACE_ITER_SYM_OFFSET); + trace_seq_print_sym(s, ip, sym_flags & TRACE_ITER_SYM_OFFSET); if (sym_flags & TRACE_ITER_SYM_ADDR) trace_seq_printf(s, " <" IP_FMT ">", ip); diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h index 2f742b74e7e6..4c954636caf0 100644 --- a/kernel/trace/trace_output.h +++ b/kernel/trace/trace_output.h @@ -16,6 +16,7 @@ extern int seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags); +extern void trace_seq_print_sym(struct trace_seq *s, unsigned long address, bool offset); extern int trace_print_context(struct trace_iterator *iter); extern int trace_print_lat_context(struct trace_iterator 
*iter); diff --git a/kernel/trace/trace_recursion_record.c b/kernel/trace/trace_recursion_record.c new file mode 100644 index 000000000000..b2edac1fe156 --- /dev/null +++ b/kernel/trace/trace_recursion_record.c @@ -0,0 +1,236 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include +#include +#include + +#include "trace_output.h" + +struct recursed_functions { + unsigned long ip; + unsigned long parent_ip; +}; + +static struct recursed_functions recursed_functions[CONFIG_FTRACE_RECORD_RECURSION_SIZE]; +static atomic_t nr_records; + +/* + * Cache the last found function. Yes, updates to this is racey, but + * so is memory cache ;-) + */ +static unsigned long cached_function; + +void ftrace_record_recursion(unsigned long ip, unsigned long parent_ip) +{ + int index = 0; + int i; + unsigned long old; + + again: + /* First check the last one recorded */ + if (ip == cached_function) + return; + + i = atomic_read(&nr_records); + /* nr_records is -1 when clearing records */ + smp_mb__after_atomic(); + if (i < 0) + return; + + /* + * If there's two writers and this writer comes in second, + * the cmpxchg() below to update the ip will fail. Then this + * writer will try again. It is possible that index will now + * be greater than nr_records. This is because the writer + * that succeeded has not updated the nr_records yet. + * This writer could keep trying again until the other writer + * updates nr_records. But if the other writer takes an + * interrupt, and that interrupt locks up that CPU, we do + * not want this CPU to lock up due to the recursion protection, + * and have a bug report showing this CPU as the cause of + * locking up the computer. To not lose this record, this + * writer will simply use the next position to update the + * recursed_functions, and it will update the nr_records + * accordingly. + */ + if (index < i) + index = i; + if (index >= CONFIG_FTRACE_RECORD_RECURSION_SIZE) + return; + + for (i = index - 1; i >= 0; i--) { + if (recursed_functions[i].ip == ip) { + cached_function = ip; + return; + } + } + + cached_function = ip; + + /* + * We only want to add a function if it hasn't been added before. + * Add to the current location before incrementing the count. + * If it fails to add, then increment the index (save in i) + * and try again. + */ + old = cmpxchg(&recursed_functions[index].ip, 0, ip); + if (old != 0) { + /* Did something else already added this for us? */ + if (old == ip) + return; + /* Try the next location (use i for the next index) */ + index++; + goto again; + } + + recursed_functions[index].parent_ip = parent_ip; + + /* + * It's still possible that we could race with the clearing + * CPU0 CPU1 + * ---- ---- + * ip = func + * nr_records = -1; + * recursed_functions[0] = 0; + * i = -1 + * if (i < 0) + * nr_records = 0; + * (new recursion detected) + * recursed_functions[0] = func + * cmpxchg(recursed_functions[0], + * func, 0) + * + * But the worse that could happen is that we get a zero in + * the recursed_functions array, and it's likely that "func" will + * be recorded again. 
+ */ + i = atomic_read(&nr_records); + smp_mb__after_atomic(); + if (i < 0) + cmpxchg(&recursed_functions[index].ip, ip, 0); + else if (i <= index) + atomic_cmpxchg(&nr_records, i, index + 1); +} +EXPORT_SYMBOL_GPL(ftrace_record_recursion); + +static DEFINE_MUTEX(recursed_function_lock); +static struct trace_seq *tseq; + +static void *recursed_function_seq_start(struct seq_file *m, loff_t *pos) +{ + void *ret = NULL; + int index; + + mutex_lock(&recursed_function_lock); + index = atomic_read(&nr_records); + if (*pos < index) { + ret = &recursed_functions[*pos]; + } + + tseq = kzalloc(sizeof(*tseq), GFP_KERNEL); + if (!tseq) + return ERR_PTR(-ENOMEM); + + trace_seq_init(tseq); + + return ret; +} + +static void *recursed_function_seq_next(struct seq_file *m, void *v, loff_t *pos) +{ + int index; + int p; + + index = atomic_read(&nr_records); + p = ++(*pos); + + return p < index ? &recursed_functions[p] : NULL; +} + +static void recursed_function_seq_stop(struct seq_file *m, void *v) +{ + kfree(tseq); + mutex_unlock(&recursed_function_lock); +} + +static int recursed_function_seq_show(struct seq_file *m, void *v) +{ + struct recursed_functions *record = v; + int ret = 0; + + if (record) { + trace_seq_print_sym(tseq, record->parent_ip, true); + trace_seq_puts(tseq, ":\t"); + trace_seq_print_sym(tseq, record->ip, true); + trace_seq_putc(tseq, '\n'); + ret = trace_print_seq(m, tseq); + } + + return ret; +} + +static const struct seq_operations recursed_function_seq_ops = { + .start = recursed_function_seq_start, + .next = recursed_function_seq_next, + .stop = recursed_function_seq_stop, + .show = recursed_function_seq_show +}; + +static int recursed_function_open(struct inode *inode, struct file *file) +{ + int ret = 0; + + mutex_lock(&recursed_function_lock); + /* If this file was opened for write, then erase contents */ + if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) { + /* disable updating records */ + atomic_set(&nr_records, -1); + smp_mb__after_atomic(); + memset(recursed_functions, 0, sizeof(recursed_functions)); + smp_wmb(); + /* enable them again */ + atomic_set(&nr_records, 0); + } + if (file->f_mode & FMODE_READ) + ret = seq_open(file, &recursed_function_seq_ops); + mutex_unlock(&recursed_function_lock); + + return ret; +} + +static ssize_t recursed_function_write(struct file *file, + const char __user *buffer, + size_t count, loff_t *ppos) +{ + return count; +} + +static int recursed_function_release(struct inode *inode, struct file *file) +{ + if (file->f_mode & FMODE_READ) + seq_release(inode, file); + return 0; +} + +static const struct file_operations recursed_functions_fops = { + .open = recursed_function_open, + .write = recursed_function_write, + .read = seq_read, + .llseek = seq_lseek, + .release = recursed_function_release, +}; + +__init static int create_recursed_functions(void) +{ + struct dentry *dentry; + + dentry = trace_create_file("recursed_functions", 0644, NULL, NULL, + &recursed_functions_fops); + if (!dentry) + pr_warn("WARNING: Failed to create recursed_functions\n"); + return 0; +} + +fs_initcall(create_recursed_functions); -- cgit v1.2.3 From 9e7a4d9831e836eb03dedab89902277ee94eb7a6 Mon Sep 17 00:00:00 2001 From: KP Singh Date: Fri, 6 Nov 2020 10:37:39 +0000 Subject: bpf: Allow LSM programs to use bpf spin locks Usage of spin locks was not allowed for tracing programs due to insufficient preemption checks. The verifier does not currently prevent LSM programs from using spin locks, but the helpers are not exposed via bpf_lsm_func_proto. 
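For illustration, with these protos exposed a non-sleepable LSM program can serialize updates to a map value roughly as in the following minimal sketch (hook, map, and struct names are illustrative and not taken from this patch):

/* Hedged sketch: a non-sleepable BPF LSM program ("lsm/", not "lsm.s/")
 * bumping a counter under a bpf_spin_lock embedded in an array map value.
 */
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

struct counter_val {
	struct bpf_spin_lock lock;
	__u64 count;
};

struct {
	__uint(type, BPF_MAP_TYPE_ARRAY);
	__uint(max_entries, 1);
	__type(key, __u32);
	__type(value, struct counter_val);
} counters SEC(".maps");

SEC("lsm/file_open")
int BPF_PROG(count_file_open, struct file *file)
{
	__u32 key = 0;
	struct counter_val *val;

	val = bpf_map_lookup_elem(&counters, &key);
	if (!val)
		return 0;

	bpf_spin_lock(&val->lock);
	val->count++;
	bpf_spin_unlock(&val->lock);
	return 0;
}

char LICENSE[] SEC("license") = "GPL";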
Based on the discussion in [1], non-sleepable LSM programs should be able to use bpf_spin_{lock, unlock}. Sleepable LSM programs can be preempted which means that allowng spin locks will need more work (disabling preemption and the verifier ensuring that no sleepable helpers are called when a spin lock is held). [1]: https://lore.kernel.org/bpf/20201103153132.2717326-1-kpsingh@chromium.org/T/#md601a053229287659071600d3483523f752cd2fb Signed-off-by: KP Singh Signed-off-by: Alexei Starovoitov Acked-by: Song Liu Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20201106103747.2780972-2-kpsingh@chromium.org --- kernel/bpf/bpf_lsm.c | 4 ++++ kernel/bpf/verifier.c | 20 +++++++++++++++----- 2 files changed, 19 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c index 78ea8a7bd27f..cd8a617f2109 100644 --- a/kernel/bpf/bpf_lsm.c +++ b/kernel/bpf/bpf_lsm.c @@ -59,6 +59,10 @@ bpf_lsm_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_sk_storage_get_proto; case BPF_FUNC_sk_storage_delete: return &bpf_sk_storage_delete_proto; + case BPF_FUNC_spin_lock: + return &bpf_spin_lock_proto; + case BPF_FUNC_spin_unlock: + return &bpf_spin_unlock_proto; default: return tracing_prog_func_proto(func_id, prog); } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 6200519582a6..f863aa84d0a2 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -9719,11 +9719,21 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env, verbose(env, "trace type programs with run-time allocated hash maps are unsafe. Switch to preallocated hash maps.\n"); } - if ((is_tracing_prog_type(prog_type) || - prog_type == BPF_PROG_TYPE_SOCKET_FILTER) && - map_value_has_spin_lock(map)) { - verbose(env, "tracing progs cannot use bpf_spin_lock yet\n"); - return -EINVAL; + if (map_value_has_spin_lock(map)) { + if (prog_type == BPF_PROG_TYPE_SOCKET_FILTER) { + verbose(env, "socket filter progs cannot use bpf_spin_lock yet\n"); + return -EINVAL; + } + + if (is_tracing_prog_type(prog_type)) { + verbose(env, "tracing progs cannot use bpf_spin_lock yet\n"); + return -EINVAL; + } + + if (prog->aux->sleepable) { + verbose(env, "sleepable progs cannot use bpf_spin_lock yet\n"); + return -EINVAL; + } } if ((bpf_prog_is_dev_bound(prog->aux) || bpf_map_is_dev_bound(map)) && -- cgit v1.2.3 From 4cf1bc1f10452065a29d576fc5693fc4fab5b919 Mon Sep 17 00:00:00 2001 From: KP Singh Date: Fri, 6 Nov 2020 10:37:40 +0000 Subject: bpf: Implement task local storage Similar to bpf_local_storage for sockets and inodes add local storage for task_struct. The life-cycle of storage is managed with the life-cycle of the task_struct. i.e. the storage is destroyed along with the owning task with a callback to the bpf_task_storage_free from the task_free LSM hook. The BPF LSM allocates an __rcu pointer to the bpf_local_storage in the security blob which are now stackable and can co-exist with other LSMs. The userspace map operations can be done by using a pid fd as a key passed to the lookup, update and delete operations. 
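For example, a userspace agent can pin a value to a specific task by passing a pidfd as the map key, roughly as in this hedged sketch (the map fd source, the availability of SYS_pidfd_open, and the value layout are assumptions, not part of this patch):

/* Hedged userspace sketch: update a BPF_MAP_TYPE_TASK_STORAGE map with a
 * pidfd as the key. The map fd is assumed to come from a loaded skeleton
 * or bpf_obj_get(); error handling is trimmed.
 */
#include <sys/types.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <bpf/bpf.h>

static int update_task_value(int map_fd, pid_t pid, const void *value)
{
	int pidfd, err;

	pidfd = syscall(SYS_pidfd_open, pid, 0);	/* pidfd_open(2) */
	if (pidfd < 0)
		return -1;

	/* The kernel resolves the pidfd to the task and stores the value
	 * in that task's local storage for this map.
	 */
	err = bpf_map_update_elem(map_fd, &pidfd, value, BPF_ANY);

	close(pidfd);
	return err;
}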
Signed-off-by: KP Singh Signed-off-by: Alexei Starovoitov Acked-by: Song Liu Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20201106103747.2780972-3-kpsingh@chromium.org --- include/linux/bpf_lsm.h | 23 +++ include/linux/bpf_types.h | 1 + include/uapi/linux/bpf.h | 39 +++++ kernel/bpf/Makefile | 1 + kernel/bpf/bpf_lsm.c | 4 + kernel/bpf/bpf_task_storage.c | 315 +++++++++++++++++++++++++++++++++++++++++ kernel/bpf/syscall.c | 3 +- kernel/bpf/verifier.c | 10 ++ security/bpf/hooks.c | 2 + tools/include/uapi/linux/bpf.h | 39 +++++ 10 files changed, 436 insertions(+), 1 deletion(-) create mode 100644 kernel/bpf/bpf_task_storage.c (limited to 'kernel') diff --git a/include/linux/bpf_lsm.h b/include/linux/bpf_lsm.h index aaacb6aafc87..73226181b744 100644 --- a/include/linux/bpf_lsm.h +++ b/include/linux/bpf_lsm.h @@ -7,6 +7,7 @@ #ifndef _LINUX_BPF_LSM_H #define _LINUX_BPF_LSM_H +#include #include #include @@ -35,9 +36,21 @@ static inline struct bpf_storage_blob *bpf_inode( return inode->i_security + bpf_lsm_blob_sizes.lbs_inode; } +static inline struct bpf_storage_blob *bpf_task( + const struct task_struct *task) +{ + if (unlikely(!task->security)) + return NULL; + + return task->security + bpf_lsm_blob_sizes.lbs_task; +} + extern const struct bpf_func_proto bpf_inode_storage_get_proto; extern const struct bpf_func_proto bpf_inode_storage_delete_proto; +extern const struct bpf_func_proto bpf_task_storage_get_proto; +extern const struct bpf_func_proto bpf_task_storage_delete_proto; void bpf_inode_storage_free(struct inode *inode); +void bpf_task_storage_free(struct task_struct *task); #else /* !CONFIG_BPF_LSM */ @@ -53,10 +66,20 @@ static inline struct bpf_storage_blob *bpf_inode( return NULL; } +static inline struct bpf_storage_blob *bpf_task( + const struct task_struct *task) +{ + return NULL; +} + static inline void bpf_inode_storage_free(struct inode *inode) { } +static inline void bpf_task_storage_free(struct task_struct *task) +{ +} + #endif /* CONFIG_BPF_LSM */ #endif /* _LINUX_BPF_LSM_H */ diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index 2e6f568377f1..99f7fd657d87 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -109,6 +109,7 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_SOCKHASH, sock_hash_ops) #endif #ifdef CONFIG_BPF_LSM BPF_MAP_TYPE(BPF_MAP_TYPE_INODE_STORAGE, inode_storage_map_ops) +BPF_MAP_TYPE(BPF_MAP_TYPE_TASK_STORAGE, task_storage_map_ops) #endif BPF_MAP_TYPE(BPF_MAP_TYPE_CPUMAP, cpu_map_ops) #if defined(CONFIG_XDP_SOCKETS) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index e6ceac3f7d62..f4037b2161a6 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -157,6 +157,7 @@ enum bpf_map_type { BPF_MAP_TYPE_STRUCT_OPS, BPF_MAP_TYPE_RINGBUF, BPF_MAP_TYPE_INODE_STORAGE, + BPF_MAP_TYPE_TASK_STORAGE, }; /* Note that tracing related programs such as @@ -3742,6 +3743,42 @@ union bpf_attr { * Return * The helper returns **TC_ACT_REDIRECT** on success or * **TC_ACT_SHOT** on error. + * + * void *bpf_task_storage_get(struct bpf_map *map, struct task_struct *task, void *value, u64 flags) + * Description + * Get a bpf_local_storage from the *task*. + * + * Logically, it could be thought of as getting the value from + * a *map* with *task* as the **key**. From this + * perspective, the usage is not much different from + * **bpf_map_lookup_elem**\ (*map*, **&**\ *task*) except this + * helper enforces the key must be an task_struct and the map must also + * be a **BPF_MAP_TYPE_TASK_STORAGE**. 
+ * + * Underneath, the value is stored locally at *task* instead of + * the *map*. The *map* is used as the bpf-local-storage + * "type". The bpf-local-storage "type" (i.e. the *map*) is + * searched against all bpf_local_storage residing at *task*. + * + * An optional *flags* (**BPF_LOCAL_STORAGE_GET_F_CREATE**) can be + * used such that a new bpf_local_storage will be + * created if one does not exist. *value* can be used + * together with **BPF_LOCAL_STORAGE_GET_F_CREATE** to specify + * the initial value of a bpf_local_storage. If *value* is + * **NULL**, the new bpf_local_storage will be zero initialized. + * Return + * A bpf_local_storage pointer is returned on success. + * + * **NULL** if not found or there was an error in adding + * a new bpf_local_storage. + * + * long bpf_task_storage_delete(struct bpf_map *map, struct task_struct *task) + * Description + * Delete a bpf_local_storage from a *task*. + * Return + * 0 on success. + * + * **-ENOENT** if the bpf_local_storage cannot be found. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3900,6 +3937,8 @@ union bpf_attr { FN(bpf_per_cpu_ptr), \ FN(bpf_this_cpu_ptr), \ FN(redirect_peer), \ + FN(task_storage_get), \ + FN(task_storage_delete), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index bdc8cd1b6767..f0b93ced5a7f 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -6,6 +6,7 @@ obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o bpf_i obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o ringbuf.o obj-${CONFIG_BPF_LSM} += bpf_inode_storage.o +obj-${CONFIG_BPF_LSM} += bpf_task_storage.o obj-$(CONFIG_BPF_SYSCALL) += disasm.o obj-$(CONFIG_BPF_JIT) += trampoline.o obj-$(CONFIG_BPF_SYSCALL) += btf.o diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c index cd8a617f2109..e92c51bebb47 100644 --- a/kernel/bpf/bpf_lsm.c +++ b/kernel/bpf/bpf_lsm.c @@ -63,6 +63,10 @@ bpf_lsm_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_spin_lock_proto; case BPF_FUNC_spin_unlock: return &bpf_spin_unlock_proto; + case BPF_FUNC_task_storage_get: + return &bpf_task_storage_get_proto; + case BPF_FUNC_task_storage_delete: + return &bpf_task_storage_delete_proto; default: return tracing_prog_func_proto(func_id, prog); } diff --git a/kernel/bpf/bpf_task_storage.c b/kernel/bpf/bpf_task_storage.c new file mode 100644 index 000000000000..39a45fba4fb0 --- /dev/null +++ b/kernel/bpf/bpf_task_storage.c @@ -0,0 +1,315 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2020 Facebook + * Copyright 2020 Google LLC. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +DEFINE_BPF_STORAGE_CACHE(task_cache); + +static struct bpf_local_storage __rcu **task_storage_ptr(void *owner) +{ + struct task_struct *task = owner; + struct bpf_storage_blob *bsb; + + bsb = bpf_task(task); + if (!bsb) + return NULL; + return &bsb->storage; +} + +static struct bpf_local_storage_data * +task_storage_lookup(struct task_struct *task, struct bpf_map *map, + bool cacheit_lockit) +{ + struct bpf_local_storage *task_storage; + struct bpf_local_storage_map *smap; + struct bpf_storage_blob *bsb; + + bsb = bpf_task(task); + if (!bsb) + return NULL; + + task_storage = rcu_dereference(bsb->storage); + if (!task_storage) + return NULL; + + smap = (struct bpf_local_storage_map *)map; + return bpf_local_storage_lookup(task_storage, smap, cacheit_lockit); +} + +void bpf_task_storage_free(struct task_struct *task) +{ + struct bpf_local_storage_elem *selem; + struct bpf_local_storage *local_storage; + bool free_task_storage = false; + struct bpf_storage_blob *bsb; + struct hlist_node *n; + + bsb = bpf_task(task); + if (!bsb) + return; + + rcu_read_lock(); + + local_storage = rcu_dereference(bsb->storage); + if (!local_storage) { + rcu_read_unlock(); + return; + } + + /* Neither the bpf_prog nor the bpf-map's syscall + * could be modifying the local_storage->list now. + * Thus, no elem can be added-to or deleted-from the + * local_storage->list by the bpf_prog or by the bpf-map's syscall. + * + * It is racing with bpf_local_storage_map_free() alone + * when unlinking elem from the local_storage->list and + * the map's bucket->list. + */ + raw_spin_lock_bh(&local_storage->lock); + hlist_for_each_entry_safe(selem, n, &local_storage->list, snode) { + /* Always unlink from map before unlinking from + * local_storage. + */ + bpf_selem_unlink_map(selem); + free_task_storage = bpf_selem_unlink_storage_nolock( + local_storage, selem, false); + } + raw_spin_unlock_bh(&local_storage->lock); + rcu_read_unlock(); + + /* free_task_storage should always be true as long as + * local_storage->list was non-empty. + */ + if (free_task_storage) + kfree_rcu(local_storage, rcu); +} + +static void *bpf_pid_task_storage_lookup_elem(struct bpf_map *map, void *key) +{ + struct bpf_local_storage_data *sdata; + struct task_struct *task; + unsigned int f_flags; + struct pid *pid; + int fd, err; + + fd = *(int *)key; + pid = pidfd_get_pid(fd, &f_flags); + if (IS_ERR(pid)) + return ERR_CAST(pid); + + /* We should be in an RCU read side critical section, it should be safe + * to call pid_task. + */ + WARN_ON_ONCE(!rcu_read_lock_held()); + task = pid_task(pid, PIDTYPE_PID); + if (!task) { + err = -ENOENT; + goto out; + } + + sdata = task_storage_lookup(task, map, true); + put_pid(pid); + return sdata ? sdata->data : NULL; +out: + put_pid(pid); + return ERR_PTR(err); +} + +static int bpf_pid_task_storage_update_elem(struct bpf_map *map, void *key, + void *value, u64 map_flags) +{ + struct bpf_local_storage_data *sdata; + struct task_struct *task; + unsigned int f_flags; + struct pid *pid; + int fd, err; + + fd = *(int *)key; + pid = pidfd_get_pid(fd, &f_flags); + if (IS_ERR(pid)) + return PTR_ERR(pid); + + /* We should be in an RCU read side critical section, it should be safe + * to call pid_task. 
+ */ + WARN_ON_ONCE(!rcu_read_lock_held()); + task = pid_task(pid, PIDTYPE_PID); + if (!task) { + err = -ENOENT; + goto out; + } + + sdata = bpf_local_storage_update( + task, (struct bpf_local_storage_map *)map, value, map_flags); + + err = PTR_ERR_OR_ZERO(sdata); +out: + put_pid(pid); + return err; +} + +static int task_storage_delete(struct task_struct *task, struct bpf_map *map) +{ + struct bpf_local_storage_data *sdata; + + sdata = task_storage_lookup(task, map, false); + if (!sdata) + return -ENOENT; + + bpf_selem_unlink(SELEM(sdata)); + + return 0; +} + +static int bpf_pid_task_storage_delete_elem(struct bpf_map *map, void *key) +{ + struct task_struct *task; + unsigned int f_flags; + struct pid *pid; + int fd, err; + + fd = *(int *)key; + pid = pidfd_get_pid(fd, &f_flags); + if (IS_ERR(pid)) + return PTR_ERR(pid); + + /* We should be in an RCU read side critical section, it should be safe + * to call pid_task. + */ + WARN_ON_ONCE(!rcu_read_lock_held()); + task = pid_task(pid, PIDTYPE_PID); + if (!task) { + err = -ENOENT; + goto out; + } + + err = task_storage_delete(task, map); +out: + put_pid(pid); + return err; +} + +BPF_CALL_4(bpf_task_storage_get, struct bpf_map *, map, struct task_struct *, + task, void *, value, u64, flags) +{ + struct bpf_local_storage_data *sdata; + + if (flags & ~(BPF_LOCAL_STORAGE_GET_F_CREATE)) + return (unsigned long)NULL; + + /* explicitly check that the task_storage_ptr is not + * NULL as task_storage_lookup returns NULL in this case and + * bpf_local_storage_update expects the owner to have a + * valid storage pointer. + */ + if (!task_storage_ptr(task)) + return (unsigned long)NULL; + + sdata = task_storage_lookup(task, map, true); + if (sdata) + return (unsigned long)sdata->data; + + /* This helper must only be called from places where the lifetime of the task + * is guaranteed. Either by being refcounted or by being protected + * by an RCU read-side critical section. + */ + if (flags & BPF_LOCAL_STORAGE_GET_F_CREATE) { + sdata = bpf_local_storage_update( + task, (struct bpf_local_storage_map *)map, value, + BPF_NOEXIST); + return IS_ERR(sdata) ? (unsigned long)NULL : + (unsigned long)sdata->data; + } + + return (unsigned long)NULL; +} + +BPF_CALL_2(bpf_task_storage_delete, struct bpf_map *, map, struct task_struct *, + task) +{ + /* This helper must only be called from places where the lifetime of the task + * is guaranteed. Either by being refcounted or by being protected + * by an RCU read-side critical section. 
+ */ + return task_storage_delete(task, map); +} + +static int notsupp_get_next_key(struct bpf_map *map, void *key, void *next_key) +{ + return -ENOTSUPP; +} + +static struct bpf_map *task_storage_map_alloc(union bpf_attr *attr) +{ + struct bpf_local_storage_map *smap; + + smap = bpf_local_storage_map_alloc(attr); + if (IS_ERR(smap)) + return ERR_CAST(smap); + + smap->cache_idx = bpf_local_storage_cache_idx_get(&task_cache); + return &smap->map; +} + +static void task_storage_map_free(struct bpf_map *map) +{ + struct bpf_local_storage_map *smap; + + smap = (struct bpf_local_storage_map *)map; + bpf_local_storage_cache_idx_free(&task_cache, smap->cache_idx); + bpf_local_storage_map_free(smap); +} + +static int task_storage_map_btf_id; +const struct bpf_map_ops task_storage_map_ops = { + .map_meta_equal = bpf_map_meta_equal, + .map_alloc_check = bpf_local_storage_map_alloc_check, + .map_alloc = task_storage_map_alloc, + .map_free = task_storage_map_free, + .map_get_next_key = notsupp_get_next_key, + .map_lookup_elem = bpf_pid_task_storage_lookup_elem, + .map_update_elem = bpf_pid_task_storage_update_elem, + .map_delete_elem = bpf_pid_task_storage_delete_elem, + .map_check_btf = bpf_local_storage_map_check_btf, + .map_btf_name = "bpf_local_storage_map", + .map_btf_id = &task_storage_map_btf_id, + .map_owner_storage_ptr = task_storage_ptr, +}; + +BTF_ID_LIST_SINGLE(bpf_task_storage_btf_ids, struct, task_struct) + +const struct bpf_func_proto bpf_task_storage_get_proto = { + .func = bpf_task_storage_get, + .gpl_only = false, + .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_PTR_TO_BTF_ID, + .arg2_btf_id = &bpf_task_storage_btf_ids[0], + .arg3_type = ARG_PTR_TO_MAP_VALUE_OR_NULL, + .arg4_type = ARG_ANYTHING, +}; + +const struct bpf_func_proto bpf_task_storage_delete_proto = { + .func = bpf_task_storage_delete, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_PTR_TO_BTF_ID, + .arg2_btf_id = &bpf_task_storage_btf_ids[0], +}; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 8f50c9c19f1b..f3fe9f53f93c 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -773,7 +773,8 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf, map->map_type != BPF_MAP_TYPE_ARRAY && map->map_type != BPF_MAP_TYPE_CGROUP_STORAGE && map->map_type != BPF_MAP_TYPE_SK_STORAGE && - map->map_type != BPF_MAP_TYPE_INODE_STORAGE) + map->map_type != BPF_MAP_TYPE_INODE_STORAGE && + map->map_type != BPF_MAP_TYPE_TASK_STORAGE) return -ENOTSUPP; if (map->spin_lock_off + sizeof(struct bpf_spin_lock) > map->value_size) { diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index f863aa84d0a2..00960f6a83ec 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -4469,6 +4469,11 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, func_id != BPF_FUNC_inode_storage_delete) goto error; break; + case BPF_MAP_TYPE_TASK_STORAGE: + if (func_id != BPF_FUNC_task_storage_get && + func_id != BPF_FUNC_task_storage_delete) + goto error; + break; default: break; } @@ -4547,6 +4552,11 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, if (map->map_type != BPF_MAP_TYPE_INODE_STORAGE) goto error; break; + case BPF_FUNC_task_storage_get: + case BPF_FUNC_task_storage_delete: + if (map->map_type != BPF_MAP_TYPE_TASK_STORAGE) + goto error; + break; default: break; } diff --git a/security/bpf/hooks.c b/security/bpf/hooks.c index 788667d582ae..e5971fa74fd7 100644 --- 
a/security/bpf/hooks.c +++ b/security/bpf/hooks.c @@ -12,6 +12,7 @@ static struct security_hook_list bpf_lsm_hooks[] __lsm_ro_after_init = { #include #undef LSM_HOOK LSM_HOOK_INIT(inode_free_security, bpf_inode_storage_free), + LSM_HOOK_INIT(task_free, bpf_task_storage_free), }; static int __init bpf_lsm_init(void) @@ -23,6 +24,7 @@ static int __init bpf_lsm_init(void) struct lsm_blob_sizes bpf_lsm_blob_sizes __lsm_ro_after_init = { .lbs_inode = sizeof(struct bpf_storage_blob), + .lbs_task = sizeof(struct bpf_storage_blob), }; DEFINE_LSM(bpf) = { diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index e6ceac3f7d62..f4037b2161a6 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -157,6 +157,7 @@ enum bpf_map_type { BPF_MAP_TYPE_STRUCT_OPS, BPF_MAP_TYPE_RINGBUF, BPF_MAP_TYPE_INODE_STORAGE, + BPF_MAP_TYPE_TASK_STORAGE, }; /* Note that tracing related programs such as @@ -3742,6 +3743,42 @@ union bpf_attr { * Return * The helper returns **TC_ACT_REDIRECT** on success or * **TC_ACT_SHOT** on error. + * + * void *bpf_task_storage_get(struct bpf_map *map, struct task_struct *task, void *value, u64 flags) + * Description + * Get a bpf_local_storage from the *task*. + * + * Logically, it could be thought of as getting the value from + * a *map* with *task* as the **key**. From this + * perspective, the usage is not much different from + * **bpf_map_lookup_elem**\ (*map*, **&**\ *task*) except this + * helper enforces the key must be an task_struct and the map must also + * be a **BPF_MAP_TYPE_TASK_STORAGE**. + * + * Underneath, the value is stored locally at *task* instead of + * the *map*. The *map* is used as the bpf-local-storage + * "type". The bpf-local-storage "type" (i.e. the *map*) is + * searched against all bpf_local_storage residing at *task*. + * + * An optional *flags* (**BPF_LOCAL_STORAGE_GET_F_CREATE**) can be + * used such that a new bpf_local_storage will be + * created if one does not exist. *value* can be used + * together with **BPF_LOCAL_STORAGE_GET_F_CREATE** to specify + * the initial value of a bpf_local_storage. If *value* is + * **NULL**, the new bpf_local_storage will be zero initialized. + * Return + * A bpf_local_storage pointer is returned on success. + * + * **NULL** if not found or there was an error in adding + * a new bpf_local_storage. + * + * long bpf_task_storage_delete(struct bpf_map *map, struct task_struct *task) + * Description + * Delete a bpf_local_storage from a *task*. + * Return + * 0 on success. + * + * **-ENOENT** if the bpf_local_storage cannot be found. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3900,6 +3937,8 @@ union bpf_attr { FN(bpf_per_cpu_ptr), \ FN(bpf_this_cpu_ptr), \ FN(redirect_peer), \ + FN(task_storage_get), \ + FN(task_storage_delete), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper -- cgit v1.2.3 From 3ca1032ab7ab010eccb107aa515598788f7d93bb Mon Sep 17 00:00:00 2001 From: KP Singh Date: Fri, 6 Nov 2020 10:37:43 +0000 Subject: bpf: Implement get_current_task_btf and RET_PTR_TO_BTF_ID The currently available bpf_get_current_task returns an unsigned integer which can be used along with BPF_CORE_READ to read data from the task_struct but still cannot be used as an input argument to a helper that accepts an ARG_PTR_TO_BTF_ID of type task_struct. In order to implement this helper a new return type, RET_PTR_TO_BTF_ID, is added. This is similar to RET_PTR_TO_BTF_ID_OR_NULL but does not require checking the nullness of returned pointer. 
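As an example of the intended pairing, a program can feed the returned pointer straight into the task storage helpers added earlier in this series; a minimal hedged sketch follows (map, struct, and hook names are illustrative, and a libbpf new enough to declare these helpers is assumed):

/* Hedged sketch: use the BTF pointer from bpf_get_current_task_btf() as
 * the task_struct argument expected by bpf_task_storage_get().
 */
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

struct task_state {
	__u64 exec_count;
};

struct {
	__uint(type, BPF_MAP_TYPE_TASK_STORAGE);
	__uint(map_flags, BPF_F_NO_PREALLOC);
	__type(key, int);
	__type(value, struct task_state);
} task_states SEC(".maps");

SEC("lsm/bprm_committed_creds")
int BPF_PROG(track_exec, struct linux_binprm *bprm)
{
	struct task_struct *task = bpf_get_current_task_btf();
	struct task_state *state;

	/* Create the per-task entry on first use; no init value passed. */
	state = bpf_task_storage_get(&task_states, task, 0,
				     BPF_LOCAL_STORAGE_GET_F_CREATE);
	if (state)
		state->exec_count++;
	return 0;
}

char LICENSE[] SEC("license") = "GPL";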
Signed-off-by: KP Singh Signed-off-by: Alexei Starovoitov Acked-by: Song Liu Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20201106103747.2780972-6-kpsingh@chromium.org --- include/linux/bpf.h | 1 + include/uapi/linux/bpf.h | 9 +++++++++ kernel/bpf/verifier.c | 7 +++++-- kernel/trace/bpf_trace.c | 16 ++++++++++++++++ tools/include/uapi/linux/bpf.h | 9 +++++++++ 5 files changed, 40 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 2fffd30e13ac..73d5381a5d5c 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -310,6 +310,7 @@ enum bpf_return_type { RET_PTR_TO_BTF_ID_OR_NULL, /* returns a pointer to a btf_id or NULL */ RET_PTR_TO_MEM_OR_BTF_ID_OR_NULL, /* returns a pointer to a valid memory or a btf_id or NULL */ RET_PTR_TO_MEM_OR_BTF_ID, /* returns a pointer to a valid memory or a btf_id */ + RET_PTR_TO_BTF_ID, /* returns a pointer to a btf_id */ }; /* eBPF function prototype used by verifier to allow BPF_CALLs from eBPF programs diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index f4037b2161a6..9879d6793e90 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3779,6 +3779,14 @@ union bpf_attr { * 0 on success. * * **-ENOENT** if the bpf_local_storage cannot be found. + * + * struct task_struct *bpf_get_current_task_btf(void) + * Description + * Return a BTF pointer to the "current" task. + * This pointer can also be used in helpers that accept an + * *ARG_PTR_TO_BTF_ID* of type *task_struct*. + * Return + * Pointer to the current task. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3939,6 +3947,7 @@ union bpf_attr { FN(redirect_peer), \ FN(task_storage_get), \ FN(task_storage_delete), \ + FN(get_current_task_btf), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 00960f6a83ec..10da26e55130 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5186,11 +5186,14 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn PTR_TO_BTF_ID : PTR_TO_BTF_ID_OR_NULL; regs[BPF_REG_0].btf_id = meta.ret_btf_id; } - } else if (fn->ret_type == RET_PTR_TO_BTF_ID_OR_NULL) { + } else if (fn->ret_type == RET_PTR_TO_BTF_ID_OR_NULL || + fn->ret_type == RET_PTR_TO_BTF_ID) { int ret_btf_id; mark_reg_known_zero(env, regs, BPF_REG_0); - regs[BPF_REG_0].type = PTR_TO_BTF_ID_OR_NULL; + regs[BPF_REG_0].type = fn->ret_type == RET_PTR_TO_BTF_ID ? 
+ PTR_TO_BTF_ID : + PTR_TO_BTF_ID_OR_NULL; ret_btf_id = *fn->ret_btf_id; if (ret_btf_id == 0) { verbose(env, "invalid return type %d of func %s#%d\n", diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 4517c8b66518..e4515b0f62a8 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1022,6 +1022,20 @@ const struct bpf_func_proto bpf_get_current_task_proto = { .ret_type = RET_INTEGER, }; +BPF_CALL_0(bpf_get_current_task_btf) +{ + return (unsigned long) current; +} + +BTF_ID_LIST_SINGLE(bpf_get_current_btf_ids, struct, task_struct) + +static const struct bpf_func_proto bpf_get_current_task_btf_proto = { + .func = bpf_get_current_task_btf, + .gpl_only = true, + .ret_type = RET_PTR_TO_BTF_ID, + .ret_btf_id = &bpf_get_current_btf_ids[0], +}; + BPF_CALL_2(bpf_current_task_under_cgroup, struct bpf_map *, map, u32, idx) { struct bpf_array *array = container_of(map, struct bpf_array, map); @@ -1265,6 +1279,8 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_get_current_pid_tgid_proto; case BPF_FUNC_get_current_task: return &bpf_get_current_task_proto; + case BPF_FUNC_get_current_task_btf: + return &bpf_get_current_task_btf_proto; case BPF_FUNC_get_current_uid_gid: return &bpf_get_current_uid_gid_proto; case BPF_FUNC_get_current_comm: diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index f4037b2161a6..9879d6793e90 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -3779,6 +3779,14 @@ union bpf_attr { * 0 on success. * * **-ENOENT** if the bpf_local_storage cannot be found. + * + * struct task_struct *bpf_get_current_task_btf(void) + * Description + * Return a BTF pointer to the "current" task. + * This pointer can also be used in helpers that accept an + * *ARG_PTR_TO_BTF_ID* of type *task_struct*. + * Return + * Pointer to the current task. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3939,6 +3947,7 @@ union bpf_attr { FN(redirect_peer), \ FN(task_storage_get), \ FN(task_storage_delete), \ + FN(get_current_task_btf), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper -- cgit v1.2.3 From 6f64e477830000746c1f992050fbd45c03c89429 Mon Sep 17 00:00:00 2001 From: KP Singh Date: Thu, 5 Nov 2020 23:06:51 +0000 Subject: bpf: Update verification logic for LSM programs The current logic checks if the name of the BTF type passed in attach_btf_id starts with "bpf_lsm_", this is not sufficient as it also allows attachment to non-LSM hooks like the very function that performs this check, i.e. bpf_lsm_verify_prog. In order to ensure that this verification logic allows attachment to only LSM hooks, the LSM_HOOK definitions in lsm_hook_defs.h are used to generate a BTF_ID set. Upon verification, the attach_btf_id of the program being attached is checked for presence in this set. Fixes: 9e4e01dfd325 ("bpf: lsm: Implement attach, detach and execution") Signed-off-by: KP Singh Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20201105230651.2621917-1-kpsingh@chromium.org --- kernel/bpf/bpf_lsm.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c index 78ea8a7bd27f..56cc5a915f67 100644 --- a/kernel/bpf/bpf_lsm.c +++ b/kernel/bpf/bpf_lsm.c @@ -13,6 +13,7 @@ #include #include #include +#include /* For every LSM hook that allows attachment of BPF programs, declare a nop * function where a BPF program can be attached. 
@@ -26,7 +27,11 @@ noinline RET bpf_lsm_##NAME(__VA_ARGS__) \ #include #undef LSM_HOOK -#define BPF_LSM_SYM_PREFX "bpf_lsm_" +#define LSM_HOOK(RET, DEFAULT, NAME, ...) BTF_ID(func, bpf_lsm_##NAME) +BTF_SET_START(bpf_lsm_hooks) +#include +#undef LSM_HOOK +BTF_SET_END(bpf_lsm_hooks) int bpf_lsm_verify_prog(struct bpf_verifier_log *vlog, const struct bpf_prog *prog) @@ -37,8 +42,7 @@ int bpf_lsm_verify_prog(struct bpf_verifier_log *vlog, return -EINVAL; } - if (strncmp(BPF_LSM_SYM_PREFX, prog->aux->attach_func_name, - sizeof(BPF_LSM_SYM_PREFX) - 1)) { + if (!btf_id_set_contains(&bpf_lsm_hooks, prog->aux->attach_btf_id)) { bpf_log(vlog, "attach_btf_id %u points to wrong type name %s\n", prog->aux->attach_btf_id, prog->aux->attach_func_name); return -EINVAL; -- cgit v1.2.3 From 3fcd6a230fa7d03bffcb831a81b40435c146c12b Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 3 Sep 2020 15:23:29 -0700 Subject: x86/cpu: Avoid cpuinfo-induced IPIing of idle CPUs Currently, accessing /proc/cpuinfo sends IPIs to idle CPUs in order to learn their clock frequency. Which is a bit strange, given that waking them from idle likely significantly changes their clock frequency. This commit therefore avoids sending /proc/cpuinfo-induced IPIs to idle CPUs. [ paulmck: Also check for idle in arch_freq_prepare_all(). ] Signed-off-by: Paul E. McKenney Cc: Rafael J. Wysocki Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: "H. Peter Anvin" Cc: --- arch/x86/kernel/cpu/aperfmperf.c | 6 ++++++ include/linux/rcutiny.h | 2 ++ include/linux/rcutree.h | 1 + kernel/rcu/tree.c | 8 ++++++++ 4 files changed, 17 insertions(+) (limited to 'kernel') diff --git a/arch/x86/kernel/cpu/aperfmperf.c b/arch/x86/kernel/cpu/aperfmperf.c index dd3261dab0fb..22911deacb6e 100644 --- a/arch/x86/kernel/cpu/aperfmperf.c +++ b/arch/x86/kernel/cpu/aperfmperf.c @@ -14,6 +14,7 @@ #include #include #include +#include #include "cpu.h" @@ -93,6 +94,9 @@ unsigned int aperfmperf_get_khz(int cpu) if (!housekeeping_cpu(cpu, HK_FLAG_MISC)) return 0; + if (rcu_is_idle_cpu(cpu)) + return 0; /* Idle CPUs are completely uninteresting. */ + aperfmperf_snapshot_cpu(cpu, ktime_get(), true); return per_cpu(samples.khz, cpu); } @@ -112,6 +116,8 @@ void arch_freq_prepare_all(void) for_each_online_cpu(cpu) { if (!housekeeping_cpu(cpu, HK_FLAG_MISC)) continue; + if (rcu_is_idle_cpu(cpu)) + continue; /* Idle CPUs are completely uninteresting. 
*/ if (!aperfmperf_snapshot_cpu(cpu, now, false)) wait = true; } diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index 7c1ecdb356d8..2a97334eb786 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h @@ -89,6 +89,8 @@ static inline void rcu_irq_enter_irqson(void) { } static inline void rcu_irq_exit(void) { } static inline void rcu_irq_exit_preempt(void) { } static inline void rcu_irq_exit_check_preempt(void) { } +#define rcu_is_idle_cpu(cpu) \ + (is_idle_task(current) && !in_nmi() && !in_irq() && !in_serving_softirq()) static inline void exit_rcu(void) { } static inline bool rcu_preempt_need_deferred_qs(struct task_struct *t) { diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index 59eb5cd567d7..df578b73960f 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -50,6 +50,7 @@ void rcu_irq_exit(void); void rcu_irq_exit_preempt(void); void rcu_irq_enter_irqson(void); void rcu_irq_exit_irqson(void); +bool rcu_is_idle_cpu(int cpu); #ifdef CONFIG_PROVE_RCU void rcu_irq_exit_check_preempt(void); diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 06895ef85d69..1d84c0b6a9f3 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -341,6 +341,14 @@ static bool rcu_dynticks_in_eqs(int snap) return !(snap & RCU_DYNTICK_CTRL_CTR); } +/* Return true if the specified CPU is currently idle from an RCU viewpoint. */ +bool rcu_is_idle_cpu(int cpu) +{ + struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); + + return rcu_dynticks_in_eqs(rcu_dynticks_snap(rdp)); +} + /* * Return true if the CPU corresponding to the specified rcu_data * structure has spent some time in an extended quiescent state since -- cgit v1.2.3 From 3480d6774f07341e3e1cf3114f58bef98ea58ae0 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 30 Aug 2020 21:48:23 -0700 Subject: locktorture: Track time of last ->writeunlock() This commit adds a last_lock_release variable that tracks the time of the last ->writeunlock() call, which allows easier diagnosing of lock hangs when using a kernel debugger. Acked-by: Davidlohr Bueso Signed-off-by: Paul E. McKenney --- kernel/locking/locktorture.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c index 62d215b2e39f..316531de2a81 100644 --- a/kernel/locking/locktorture.c +++ b/kernel/locking/locktorture.c @@ -60,6 +60,7 @@ static struct task_struct **reader_tasks; static bool lock_is_write_held; static bool lock_is_read_held; +static unsigned long last_lock_release; struct lock_stress_stats { long n_lock_fail; @@ -632,6 +633,7 @@ static int lock_torture_writer(void *arg) lwsp->n_lock_acquired++; cxt.cur_ops->write_delay(&rand); lock_is_write_held = false; + WRITE_ONCE(last_lock_release, jiffies); cxt.cur_ops->writeunlock(); stutter_wait("lock_torture_writer"); -- cgit v1.2.3 From 19012b786ecccb29a9fa20c4ec0a67e2cdfbc010 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 1 Sep 2020 16:58:41 -0700 Subject: torture: Periodically pause in stutter_wait() Running locktorture scenario LOCK05 results in hangs: tools/testing/selftests/rcutorture/bin/kvm.sh --allcpus --torture lock --duration 3 --configs LOCK05 The lock_torture_writer() kthreads set themselves to MAX_NICE while running SCHED_OTHER. Other locktorture kthreads run at default niceness, also SCHED_OTHER. This results in these other locktorture kthreads indefinitely preempting the lock_torture_writer() kthreads. 
Note that the cond_resched() in the stutter_wait() function's loop is ineffective because this scenario is built with CONFIG_PREEMPT=y. It is not clear that such indefinite preemption is supposed to happen, but in the meantime this commit prevents kthreads running in stutter_wait() from being completely CPU-bound, thus allowing the other threads to get some CPU in a timely fashion. This commit also uses hrtimers to provide very short sleeps to avoid degrading the sudden-on testing that stutter is supposed to provide. Reviewed-by: Davidlohr Bueso Signed-off-by: Paul E. McKenney --- kernel/torture.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/torture.c b/kernel/torture.c index 1061492f14bd..be09377af6bc 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -602,8 +602,11 @@ static int stutter_gap; */ bool stutter_wait(const char *title) { - int spt; + ktime_t delay; + unsigned int i = 0; + int oldnice; bool ret = false; + int spt; cond_resched_tasks_rcu_qs(); spt = READ_ONCE(stutter_pause_test); @@ -612,8 +615,17 @@ bool stutter_wait(const char *title) if (spt == 1) { schedule_timeout_interruptible(1); } else if (spt == 2) { - while (READ_ONCE(stutter_pause_test)) + oldnice = task_nice(current); + set_user_nice(current, MAX_NICE); + while (READ_ONCE(stutter_pause_test)) { + if (!(i++ & 0xffff)) { + set_current_state(TASK_INTERRUPTIBLE); + delay = 10 * NSEC_PER_USEC; + schedule_hrtimeout(&delay, HRTIMER_MODE_REL); + } cond_resched(); + } + set_user_nice(current, oldnice); } else { schedule_timeout_interruptible(round_jiffies_relative(HZ)); } -- cgit v1.2.3 From fda5ba9ed254727ac5761b81455d8e93c78eba4a Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 2 Sep 2020 21:08:41 -0700 Subject: torture: Make torture_stutter() use hrtimer The torture_stutter() function uses schedule_timeout_interruptible() to time the stutter duration, but this can miss race conditions due to its being time-synchronized with everything else that is based on the timer wheels. This commit therefore converts torture_stutter() to use the high-resolution timers via schedule_hrtimeout(), and also to fuzz the stutter interval. While in the area, this commit also limits the spin-loop portion of the stutter_wait() function's wait loop to two jiffies, down from about one second. Signed-off-by: Paul E. 
McKenney --- kernel/torture.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/torture.c b/kernel/torture.c index be09377af6bc..56ff02bf444f 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -641,20 +641,27 @@ EXPORT_SYMBOL_GPL(stutter_wait); */ static int torture_stutter(void *arg) { + ktime_t delay; + DEFINE_TORTURE_RANDOM(rand); int wtime; VERBOSE_TOROUT_STRING("torture_stutter task started"); do { if (!torture_must_stop() && stutter > 1) { wtime = stutter; - if (stutter > HZ + 1) { + if (stutter > 2) { WRITE_ONCE(stutter_pause_test, 1); - wtime = stutter - HZ - 1; - schedule_timeout_interruptible(wtime); - wtime = HZ + 1; + wtime = stutter - 3; + delay = ktime_divns(NSEC_PER_SEC * wtime, HZ); + delay += (torture_random(&rand) >> 3) % NSEC_PER_MSEC; + set_current_state(TASK_INTERRUPTIBLE); + schedule_hrtimeout(&delay, HRTIMER_MODE_REL); + wtime = 2; } WRITE_ONCE(stutter_pause_test, 2); - schedule_timeout_interruptible(wtime); + delay = ktime_divns(NSEC_PER_SEC * wtime, HZ); + set_current_state(TASK_INTERRUPTIBLE); + schedule_hrtimeout(&delay, HRTIMER_MODE_REL); } WRITE_ONCE(stutter_pause_test, 0); if (!torture_must_stop()) -- cgit v1.2.3 From 1ac78b49d61d4a095ef8b861542549eef1823f36 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 3 Sep 2020 13:09:47 -0700 Subject: scftorture: Add an alternative IPI vector The scftorture tests currently use only smp_call_function() and friends, which means that these tests cannot locate bugs caused by interactions between different IPI vectors. This commit therefore adds the rescheduling IPI to the mix. Note that this commit permits resched_cpus() only when scftorture is built in. This is a workaround. Longer term, this will use real wakeups rather than resched_cpu(). Signed-off-by: Paul E. McKenney --- kernel/scftorture.c | 41 ++++++++++++++++++++++++++++++++--------- 1 file changed, 32 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/scftorture.c b/kernel/scftorture.c index 554a521ee235..3fbb7a7f8afa 100644 --- a/kernel/scftorture.c +++ b/kernel/scftorture.c @@ -62,6 +62,7 @@ torture_param(int, stat_interval, 60, "Number of seconds between stats printk()s torture_param(int, stutter_cpus, 5, "Number of jiffies to change CPUs under test, 0=disable"); torture_param(bool, use_cpus_read_lock, 0, "Use cpus_read_lock() to exclude CPU hotplug."); torture_param(int, verbose, 0, "Enable verbose debugging printk()s"); +torture_param(int, weight_resched, -1, "Testing weight for resched_cpu() operations."); torture_param(int, weight_single, -1, "Testing weight for single-CPU no-wait operations."); torture_param(int, weight_single_wait, -1, "Testing weight for single-CPU operations."); torture_param(int, weight_many, -1, "Testing weight for multi-CPU no-wait operations."); @@ -82,6 +83,7 @@ torture_param(bool, shutdown, SCFTORT_SHUTDOWN, "Shutdown at end of torture test struct scf_statistics { struct task_struct *task; int cpu; + long long n_resched; long long n_single; long long n_single_ofl; long long n_single_wait; @@ -97,12 +99,15 @@ static struct task_struct *scf_torture_stats_task; static DEFINE_PER_CPU(long long, scf_invoked_count); // Data for random primitive selection -#define SCF_PRIM_SINGLE 0 -#define SCF_PRIM_MANY 1 -#define SCF_PRIM_ALL 2 -#define SCF_NPRIMS (2 * 3) // Need wait and no-wait versions of each. 
+#define SCF_PRIM_RESCHED 0 +#define SCF_PRIM_SINGLE 1 +#define SCF_PRIM_MANY 2 +#define SCF_PRIM_ALL 3 +#define SCF_NPRIMS 7 // Need wait and no-wait versions of each, + // except for SCF_PRIM_RESCHED. static char *scf_prim_name[] = { + "resched_cpu", "smp_call_function_single", "smp_call_function_many", "smp_call_function", @@ -136,6 +141,8 @@ static char *bangstr = ""; static DEFINE_TORTURE_RANDOM_PERCPU(scf_torture_rand); +extern void resched_cpu(int cpu); // An alternative IPI vector. + // Print torture statistics. Caller must ensure serialization. static void scf_torture_stats_print(void) { @@ -148,6 +155,7 @@ static void scf_torture_stats_print(void) for_each_possible_cpu(cpu) invoked_count += data_race(per_cpu(scf_invoked_count, cpu)); for (i = 0; i < nthreads; i++) { + scfs.n_resched += scf_stats_p[i].n_resched; scfs.n_single += scf_stats_p[i].n_single; scfs.n_single_ofl += scf_stats_p[i].n_single_ofl; scfs.n_single_wait += scf_stats_p[i].n_single_wait; @@ -160,8 +168,8 @@ static void scf_torture_stats_print(void) if (atomic_read(&n_errs) || atomic_read(&n_mb_in_errs) || atomic_read(&n_mb_out_errs) || atomic_read(&n_alloc_errs)) bangstr = "!!! "; - pr_alert("%s %sscf_invoked_count %s: %lld single: %lld/%lld single_ofl: %lld/%lld many: %lld/%lld all: %lld/%lld ", - SCFTORT_FLAG, bangstr, isdone ? "VER" : "ver", invoked_count, + pr_alert("%s %sscf_invoked_count %s: %lld resched: %lld single: %lld/%lld single_ofl: %lld/%lld many: %lld/%lld all: %lld/%lld ", + SCFTORT_FLAG, bangstr, isdone ? "VER" : "ver", invoked_count, scfs.n_resched, scfs.n_single, scfs.n_single_wait, scfs.n_single_ofl, scfs.n_single_wait_ofl, scfs.n_many, scfs.n_many_wait, scfs.n_all, scfs.n_all_wait); torture_onoff_stats(); @@ -314,6 +322,13 @@ static void scftorture_invoke_one(struct scf_statistics *scfp, struct torture_ra } } switch (scfsp->scfs_prim) { + case SCF_PRIM_RESCHED: + if (IS_BUILTIN(CONFIG_SCF_TORTURE_TEST)) { + cpu = torture_random(trsp) % nr_cpu_ids; + scfp->n_resched++; + resched_cpu(cpu); + } + break; case SCF_PRIM_SINGLE: cpu = torture_random(trsp) % nr_cpu_ids; if (scfsp->scfs_wait) @@ -433,8 +448,8 @@ static void scftorture_print_module_parms(const char *tag) { pr_alert(SCFTORT_FLAG - "--- %s: verbose=%d holdoff=%d longwait=%d nthreads=%d onoff_holdoff=%d onoff_interval=%d shutdown_secs=%d stat_interval=%d stutter_cpus=%d use_cpus_read_lock=%d, weight_single=%d, weight_single_wait=%d, weight_many=%d, weight_many_wait=%d, weight_all=%d, weight_all_wait=%d\n", tag, - verbose, holdoff, longwait, nthreads, onoff_holdoff, onoff_interval, shutdown, stat_interval, stutter_cpus, use_cpus_read_lock, weight_single, weight_single_wait, weight_many, weight_many_wait, weight_all, weight_all_wait); + "--- %s: verbose=%d holdoff=%d longwait=%d nthreads=%d onoff_holdoff=%d onoff_interval=%d shutdown_secs=%d stat_interval=%d stutter_cpus=%d use_cpus_read_lock=%d, weight_resched=%d, weight_single=%d, weight_single_wait=%d, weight_many=%d, weight_many_wait=%d, weight_all=%d, weight_all_wait=%d\n", tag, + verbose, holdoff, longwait, nthreads, onoff_holdoff, onoff_interval, shutdown, stat_interval, stutter_cpus, use_cpus_read_lock, weight_resched, weight_single, weight_single_wait, weight_many, weight_many_wait, weight_all, weight_all_wait); } static void scf_cleanup_handler(void *unused) @@ -475,6 +490,7 @@ static int __init scf_torture_init(void) { long i; int firsterr = 0; + unsigned long weight_resched1 = weight_resched; unsigned long weight_single1 = weight_single; unsigned long weight_single_wait1 = 
weight_single_wait; unsigned long weight_many1 = weight_many; @@ -487,9 +503,10 @@ static int __init scf_torture_init(void) scftorture_print_module_parms("Start of test"); - if (weight_single == -1 && weight_single_wait == -1 && + if (weight_resched == -1 && weight_single == -1 && weight_single_wait == -1 && weight_many == -1 && weight_many_wait == -1 && weight_all == -1 && weight_all_wait == -1) { + weight_resched1 = 2 * nr_cpu_ids; weight_single1 = 2 * nr_cpu_ids; weight_single_wait1 = 2 * nr_cpu_ids; weight_many1 = 2; @@ -497,6 +514,8 @@ static int __init scf_torture_init(void) weight_all1 = 1; weight_all_wait1 = 1; } else { + if (weight_resched == -1) + weight_resched1 = 0; if (weight_single == -1) weight_single1 = 0; if (weight_single_wait == -1) @@ -517,6 +536,10 @@ static int __init scf_torture_init(void) firsterr = -EINVAL; goto unwind; } + if (IS_BUILTIN(CONFIG_SCF_TORTURE_TEST)) + scf_sel_add(weight_resched1, SCF_PRIM_RESCHED, false); + else if (weight_resched1) + VERBOSE_SCFTORTOUT_ERRSTRING("built as module, weight_resched ignored"); scf_sel_add(weight_single1, SCF_PRIM_SINGLE, false); scf_sel_add(weight_single_wait1, SCF_PRIM_SINGLE, true); scf_sel_add(weight_many1, SCF_PRIM_MANY, false); -- cgit v1.2.3 From 899f317e4886f916ed21027177177c11b577cea1 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 9 Sep 2020 12:27:03 -0700 Subject: rcuscale: Add RCU Tasks Trace This commit adds the ability to test performance and scalability of RCU Tasks Trace updaters. Reported-by: Alexei Starovoitov Signed-off-by: Paul E. McKenney --- kernel/rcu/rcuscale.c | 32 +++++++++++++++++++++- .../selftests/rcutorture/configs/rcuscale/CFcommon | 3 ++ .../selftests/rcutorture/configs/rcuscale/TRACE01 | 15 ++++++++++ .../rcutorture/configs/rcuscale/TRACE01.boot | 1 + 4 files changed, 50 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/rcutorture/configs/rcuscale/TRACE01 create mode 100644 tools/testing/selftests/rcutorture/configs/rcuscale/TRACE01.boot (limited to 'kernel') diff --git a/kernel/rcu/rcuscale.c b/kernel/rcu/rcuscale.c index 2819b95479af..c42f2401c374 100644 --- a/kernel/rcu/rcuscale.c +++ b/kernel/rcu/rcuscale.c @@ -38,6 +38,7 @@ #include #include #include +#include #include "rcu.h" @@ -294,6 +295,35 @@ static struct rcu_scale_ops tasks_ops = { .name = "tasks" }; +/* + * Definitions for RCU-tasks-trace scalability testing. 
+ */ + +static int tasks_trace_scale_read_lock(void) +{ + rcu_read_lock_trace(); + return 0; +} + +static void tasks_trace_scale_read_unlock(int idx) +{ + rcu_read_unlock_trace(); +} + +static struct rcu_scale_ops tasks_tracing_ops = { + .ptype = RCU_TASKS_FLAVOR, + .init = rcu_sync_scale_init, + .readlock = tasks_trace_scale_read_lock, + .readunlock = tasks_trace_scale_read_unlock, + .get_gp_seq = rcu_no_completed, + .gp_diff = rcu_seq_diff, + .async = call_rcu_tasks_trace, + .gp_barrier = rcu_barrier_tasks_trace, + .sync = synchronize_rcu_tasks_trace, + .exp_sync = synchronize_rcu_tasks_trace, + .name = "tasks-tracing" +}; + static unsigned long rcuscale_seq_diff(unsigned long new, unsigned long old) { if (!cur_ops->gp_diff) @@ -754,7 +784,7 @@ rcu_scale_init(void) long i; int firsterr = 0; static struct rcu_scale_ops *scale_ops[] = { - &rcu_ops, &srcu_ops, &srcud_ops, &tasks_ops, + &rcu_ops, &srcu_ops, &srcud_ops, &tasks_ops, &tasks_tracing_ops }; if (!torture_init_begin(scale_type, verbose)) diff --git a/tools/testing/selftests/rcutorture/configs/rcuscale/CFcommon b/tools/testing/selftests/rcutorture/configs/rcuscale/CFcommon index 87caa0e932c7..90942bb5bebc 100644 --- a/tools/testing/selftests/rcutorture/configs/rcuscale/CFcommon +++ b/tools/testing/selftests/rcutorture/configs/rcuscale/CFcommon @@ -1,2 +1,5 @@ CONFIG_RCU_SCALE_TEST=y CONFIG_PRINTK_TIME=y +CONFIG_TASKS_RCU_GENERIC=y +CONFIG_TASKS_RCU=y +CONFIG_TASKS_TRACE_RCU=y diff --git a/tools/testing/selftests/rcutorture/configs/rcuscale/TRACE01 b/tools/testing/selftests/rcutorture/configs/rcuscale/TRACE01 new file mode 100644 index 000000000000..e6baa2fbaeb3 --- /dev/null +++ b/tools/testing/selftests/rcutorture/configs/rcuscale/TRACE01 @@ -0,0 +1,15 @@ +CONFIG_SMP=y +CONFIG_PREEMPT_NONE=y +CONFIG_PREEMPT_VOLUNTARY=n +CONFIG_PREEMPT=n +CONFIG_HZ_PERIODIC=n +CONFIG_NO_HZ_IDLE=y +CONFIG_NO_HZ_FULL=n +CONFIG_RCU_FAST_NO_HZ=n +CONFIG_RCU_NOCB_CPU=n +CONFIG_DEBUG_LOCK_ALLOC=n +CONFIG_PROVE_LOCKING=n +CONFIG_RCU_BOOST=n +CONFIG_DEBUG_OBJECTS_RCU_HEAD=n +CONFIG_RCU_EXPERT=y +CONFIG_RCU_TRACE=y diff --git a/tools/testing/selftests/rcutorture/configs/rcuscale/TRACE01.boot b/tools/testing/selftests/rcutorture/configs/rcuscale/TRACE01.boot new file mode 100644 index 000000000000..af0aff1457a4 --- /dev/null +++ b/tools/testing/selftests/rcutorture/configs/rcuscale/TRACE01.boot @@ -0,0 +1 @@ +rcuscale.scale_type=tasks-tracing -- cgit v1.2.3 From 2f2214d43ccd27ac6d124287107c136a0f7c6053 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 17 Sep 2020 10:30:46 -0700 Subject: rcuscale: Prevent hangs for invalid arguments If an rcuscale torture-test run is given a bad kvm.sh argument, the test will complain to the console, which is good. What is bad is that from the user's perspective, it will just hang for the time specified by the --duration argument. This commit therefore forces an immediate kernel shutdown if a rcu_scale_init()-time error occurs, thus avoiding the appearance of a hang. It also forces a console splat in this case to clearly indicate the presence of an error. Signed-off-by: Paul E. 
McKenney --- kernel/rcu/rcuscale.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/rcuscale.c b/kernel/rcu/rcuscale.c index c42f2401c374..06491d5530db 100644 --- a/kernel/rcu/rcuscale.c +++ b/kernel/rcu/rcuscale.c @@ -802,7 +802,6 @@ rcu_scale_init(void) for (i = 0; i < ARRAY_SIZE(scale_ops); i++) pr_cont(" %s", scale_ops[i]->name); pr_cont("\n"); - WARN_ON(!IS_MODULE(CONFIG_RCU_SCALE_TEST)); firsterr = -EINVAL; cur_ops = NULL; goto unwind; @@ -876,6 +875,10 @@ rcu_scale_init(void) unwind: torture_init_end(); rcu_scale_cleanup(); + if (shutdown) { + WARN_ON(!IS_MODULE(CONFIG_RCU_SCALE_TEST)); + kernel_power_off(); + } return firsterr; } -- cgit v1.2.3 From bc80d353b3f565138cda7e95ed4020e6e69360b2 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 17 Sep 2020 10:37:10 -0700 Subject: refscale: Prevent hangs for invalid arguments If an refscale torture-test run is given a bad kvm.sh argument, the test will complain to the console, which is good. What is bad is that from the user's perspective, it will just hang for the time specified by the --duration argument. This commit therefore forces an immediate kernel shutdown if a ref_scale_init()-time error occurs, thus avoiding the appearance of a hang. It also forces a console splat in this case to clearly indicate the presence of an error. Signed-off-by: Paul E. McKenney --- kernel/rcu/refscale.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/refscale.c b/kernel/rcu/refscale.c index fb5f20d9486a..23ff36a66f97 100644 --- a/kernel/rcu/refscale.c +++ b/kernel/rcu/refscale.c @@ -658,7 +658,6 @@ ref_scale_init(void) for (i = 0; i < ARRAY_SIZE(scale_ops); i++) pr_cont(" %s", scale_ops[i]->name); pr_cont("\n"); - WARN_ON(!IS_MODULE(CONFIG_RCU_REF_SCALE_TEST)); firsterr = -EINVAL; cur_ops = NULL; goto unwind; @@ -718,6 +717,10 @@ ref_scale_init(void) unwind: torture_init_end(); ref_scale_cleanup(); + if (shutdown) { + WARN_ON(!IS_MODULE(CONFIG_RCU_REF_SCALE_TEST)); + kernel_power_off(); + } return firsterr; } -- cgit v1.2.3 From e5ace37d83af459bd491847df570b6763c602344 Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Fri, 18 Sep 2020 19:44:24 +0800 Subject: locktorture: Ignore nreaders_stress if no readlock support Exclusive locks do not have readlock support, which means that a locktorture run with the following module parameters will do nothing: torture_type=mutex_lock nwriters_stress=0 nreaders_stress=1 This commit therefore rejects this combination for exclusive locks by returning -EINVAL during module init. Signed-off-by: Hou Tao Signed-off-by: Paul E. McKenney --- kernel/locking/locktorture.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c index 316531de2a81..046ea2d2bc8c 100644 --- a/kernel/locking/locktorture.c +++ b/kernel/locking/locktorture.c @@ -870,7 +870,8 @@ static int __init lock_torture_init(void) goto unwind; } - if (nwriters_stress == 0 && nreaders_stress == 0) { + if (nwriters_stress == 0 && + (!cxt.cur_ops->readlock || nreaders_stress == 0)) { pr_alert("lock-torture: must run at least one locking thread\n"); firsterr = -EINVAL; goto unwind; -- cgit v1.2.3 From 6b74fa0a776e3715d385b23d29db469179c825b0 Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Fri, 18 Sep 2020 11:18:06 -0700 Subject: locktorture: Prevent hangs for invalid arguments If an locktorture torture-test run is given a bad kvm.sh argument, the test will complain to the console, which is good. What is bad is that from the user's perspective, it will just hang for the time specified by the --duration argument. This commit therefore forces an immediate kernel shutdown if a lock_torture_init()-time error occurs, thus avoiding the appearance of a hang. It also forces a console splat in this case to clearly indicate the presence of an error. Signed-off-by: Paul E. McKenney --- kernel/locking/locktorture.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c index 046ea2d2bc8c..79fbd97d3882 100644 --- a/kernel/locking/locktorture.c +++ b/kernel/locking/locktorture.c @@ -29,6 +29,7 @@ #include #include #include +#include MODULE_LICENSE("GPL"); MODULE_AUTHOR("Paul E. McKenney "); @@ -1041,6 +1042,10 @@ static int __init lock_torture_init(void) unwind: torture_init_end(); lock_torture_cleanup(); + if (shutdown_secs) { + WARN_ON(!IS_MODULE(CONFIG_LOCK_TORTURE_TEST)); + kernel_power_off(); + } return firsterr; } -- cgit v1.2.3 From 4994684ce10924a0302567c315c91b0a64eeef46 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 18 Sep 2020 13:30:33 -0700 Subject: rcutorture: Prevent hangs for invalid arguments If an rcutorture torture-test run is given a bad kvm.sh argument, the test will complain to the console, which is good. What is bad is that from the user's perspective, it will just hang for the time specified by the --duration argument. This commit therefore forces an immediate kernel shutdown if a rcu_torture_init()-time error occurs, thus avoiding the appearance of a hang. It also forces a console splat in this case to clearly indicate the presence of an error. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 916ea4f66e4b..db3767110c60 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -2647,7 +2647,6 @@ rcu_torture_init(void) for (i = 0; i < ARRAY_SIZE(torture_ops); i++) pr_cont(" %s", torture_ops[i]->name); pr_cont("\n"); - WARN_ON(!IS_MODULE(CONFIG_RCU_TORTURE_TEST)); firsterr = -EINVAL; cur_ops = NULL; goto unwind; @@ -2815,6 +2814,10 @@ rcu_torture_init(void) unwind: torture_init_end(); rcu_torture_cleanup(); + if (shutdown_secs) { + WARN_ON(!IS_MODULE(CONFIG_RCU_TORTURE_TEST)); + kernel_power_off(); + } return firsterr; } -- cgit v1.2.3 From ab1b7880dec86bbdacd31a4c5cf104de4cf903f2 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 22 Sep 2020 16:42:42 -0700 Subject: rcutorture: Make stutter_wait() caller restore priority Currently, stutter_wait() will happily spin waiting for the stutter interval to end even if the caller is running at a real-time priority level. This could starve normal-priority tasks for no good reason. This commit therefore drops the calling task's priority to SCHED_OTHER MAX_NICE if stutter_wait() needs to wait. But when it waits, stutter_wait() returns true, which allows the caller to restore the priority if needed. Callers that were already running at SCHED_OTHER MAX_NICE obviously do not need any changes, but this commit also restores priority for higher-priority callers. Signed-off-by: Paul E. 
McKenney --- kernel/rcu/rcutorture.c | 24 ++++++++++++++++++------ kernel/torture.c | 9 ++++----- 2 files changed, 22 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index db3767110c60..4391d2fab5de 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -912,7 +912,8 @@ static int rcu_torture_boost(void *arg) oldstarttime = boost_starttime; while (time_before(jiffies, oldstarttime)) { schedule_timeout_interruptible(oldstarttime - jiffies); - stutter_wait("rcu_torture_boost"); + if (stutter_wait("rcu_torture_boost")) + sched_set_fifo_low(current); if (torture_must_stop()) goto checkwait; } @@ -932,7 +933,8 @@ static int rcu_torture_boost(void *arg) jiffies); call_rcu_time = jiffies; } - stutter_wait("rcu_torture_boost"); + if (stutter_wait("rcu_torture_boost")) + sched_set_fifo_low(current); if (torture_must_stop()) goto checkwait; } @@ -964,7 +966,8 @@ static int rcu_torture_boost(void *arg) } /* Go do the stutter. */ -checkwait: stutter_wait("rcu_torture_boost"); +checkwait: if (stutter_wait("rcu_torture_boost")) + sched_set_fifo_low(current); } while (!torture_must_stop()); /* Clean up and exit. */ @@ -987,6 +990,7 @@ rcu_torture_fqs(void *arg) { unsigned long fqs_resume_time; int fqs_burst_remaining; + int oldnice = task_nice(current); VERBOSE_TOROUT_STRING("rcu_torture_fqs task started"); do { @@ -1002,7 +1006,8 @@ rcu_torture_fqs(void *arg) udelay(fqs_holdoff); fqs_burst_remaining -= fqs_holdoff; } - stutter_wait("rcu_torture_fqs"); + if (stutter_wait("rcu_torture_fqs")) + sched_set_normal(current, oldnice); } while (!torture_must_stop()); torture_kthread_stopping("rcu_torture_fqs"); return 0; @@ -1022,9 +1027,11 @@ rcu_torture_writer(void *arg) bool gp_cond1 = gp_cond, gp_exp1 = gp_exp, gp_normal1 = gp_normal; bool gp_sync1 = gp_sync; int i; + int oldnice = task_nice(current); struct rcu_torture *rp; struct rcu_torture *old_rp; static DEFINE_TORTURE_RANDOM(rand); + bool stutter_waited; int synctype[] = { RTWS_DEF_FREE, RTWS_EXP_SYNC, RTWS_COND_GET, RTWS_SYNC }; int nsynctypes = 0; @@ -1143,7 +1150,8 @@ rcu_torture_writer(void *arg) !rcu_gp_is_normal(); } rcu_torture_writer_state = RTWS_STUTTER; - if (stutter_wait("rcu_torture_writer") && + stutter_waited = stutter_wait("rcu_torture_writer"); + if (stutter_waited && !READ_ONCE(rcu_fwd_cb_nodelay) && !cur_ops->slow_gps && !torture_must_stop() && @@ -1155,6 +1163,8 @@ rcu_torture_writer(void *arg) rcu_ftrace_dump(DUMP_ALL); WARN(1, "%s: rtort_pipe_count: %d\n", __func__, rcu_tortures[i].rtort_pipe_count); } + if (stutter_waited) + sched_set_normal(current, oldnice); } while (!torture_must_stop()); rcu_torture_current = NULL; // Let stats task know that we are done. /* Reset expediting back to unexpedited. */ @@ -2103,6 +2113,7 @@ static struct notifier_block rcutorture_oom_nb = { /* Carry out grace-period forward-progress testing. */ static int rcu_torture_fwd_prog(void *args) { + int oldnice = task_nice(current); struct rcu_fwd *rfp = args; int tested = 0; int tested_tries = 0; @@ -2121,7 +2132,8 @@ static int rcu_torture_fwd_prog(void *args) rcu_torture_fwd_prog_cr(rfp); /* Avoid slow periods, better to test when busy. */ - stutter_wait("rcu_torture_fwd_prog"); + if (stutter_wait("rcu_torture_fwd_prog")) + sched_set_normal(current, oldnice); } while (!torture_must_stop()); /* Short runs might not contain a valid forward-progress attempt. 
*/ WARN_ON(!tested && tested_tries >= 5); diff --git a/kernel/torture.c b/kernel/torture.c index 56ff02bf444f..8562ac18d2eb 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -604,19 +604,19 @@ bool stutter_wait(const char *title) { ktime_t delay; unsigned int i = 0; - int oldnice; bool ret = false; int spt; cond_resched_tasks_rcu_qs(); spt = READ_ONCE(stutter_pause_test); for (; spt; spt = READ_ONCE(stutter_pause_test)) { - ret = true; + if (!ret) { + sched_set_normal(current, MAX_NICE); + ret = true; + } if (spt == 1) { schedule_timeout_interruptible(1); } else if (spt == 2) { - oldnice = task_nice(current); - set_user_nice(current, MAX_NICE); while (READ_ONCE(stutter_pause_test)) { if (!(i++ & 0xffff)) { set_current_state(TASK_INTERRUPTIBLE); @@ -625,7 +625,6 @@ bool stutter_wait(const char *title) } cond_resched(); } - set_user_nice(current, oldnice); } else { schedule_timeout_interruptible(round_jiffies_relative(HZ)); } -- cgit v1.2.3 From 293b93d66f149a9bd124aae195f048268e11870c Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 23 Sep 2020 16:46:36 -0700 Subject: rcutorture: Small code cleanups The rcu_torture_cleanup() function fails to NULL out the reader_tasks pointer after freeing it and its fakewriter_tasks loop has redundant braces. This commit therefore cleans these up. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 4391d2fab5de..e7d52fded3cd 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -2496,13 +2496,13 @@ rcu_torture_cleanup(void) torture_stop_kthread(rcu_torture_reader, reader_tasks[i]); kfree(reader_tasks); + reader_tasks = NULL; } if (fakewriter_tasks) { - for (i = 0; i < nfakewriters; i++) { + for (i = 0; i < nfakewriters; i++) torture_stop_kthread(rcu_torture_fakewriter, fakewriter_tasks[i]); - } kfree(fakewriter_tasks); fakewriter_tasks = NULL; } -- cgit v1.2.3 From 85558182d545fe9c0583a39dbb6359ee954e35d5 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 24 Sep 2020 12:11:57 -0700 Subject: scftorture: Add full-test stutter capability In virtual environments on systems with hardware assist, inter-processor interrupts must do very different things based on whether the target vCPU is running or not. This commit therefore enables torture-test stuttering to better test these running/not-running transitions. Suggested-by: Chris Mason Signed-off-by: Paul E. 
McKenney --- kernel/scftorture.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/scftorture.c b/kernel/scftorture.c index 3fbb7a7f8afa..d55a9f8cda3d 100644 --- a/kernel/scftorture.c +++ b/kernel/scftorture.c @@ -59,7 +59,7 @@ torture_param(int, onoff_holdoff, 0, "Time after boot before CPU hotplugs (s)"); torture_param(int, onoff_interval, 0, "Time between CPU hotplugs (s), 0=disable"); torture_param(int, shutdown_secs, 0, "Shutdown time (ms), <= zero to disable."); torture_param(int, stat_interval, 60, "Number of seconds between stats printk()s."); -torture_param(int, stutter_cpus, 5, "Number of jiffies to change CPUs under test, 0=disable"); +torture_param(int, stutter, 5, "Number of jiffies to run/halt test, 0=disable"); torture_param(bool, use_cpus_read_lock, 0, "Use cpus_read_lock() to exclude CPU hotplug."); torture_param(int, verbose, 0, "Enable verbose debugging printk()s"); torture_param(int, weight_resched, -1, "Testing weight for resched_cpu() operations."); @@ -436,6 +436,7 @@ static int scftorture_invoker(void *arg) was_offline = false; } cond_resched(); + stutter_wait("scftorture_invoker"); } while (!torture_must_stop()); VERBOSE_SCFTORTOUT("scftorture_invoker %d ended", scfp->cpu); @@ -448,8 +449,8 @@ static void scftorture_print_module_parms(const char *tag) { pr_alert(SCFTORT_FLAG - "--- %s: verbose=%d holdoff=%d longwait=%d nthreads=%d onoff_holdoff=%d onoff_interval=%d shutdown_secs=%d stat_interval=%d stutter_cpus=%d use_cpus_read_lock=%d, weight_resched=%d, weight_single=%d, weight_single_wait=%d, weight_many=%d, weight_many_wait=%d, weight_all=%d, weight_all_wait=%d\n", tag, - verbose, holdoff, longwait, nthreads, onoff_holdoff, onoff_interval, shutdown, stat_interval, stutter_cpus, use_cpus_read_lock, weight_resched, weight_single, weight_single_wait, weight_many, weight_many_wait, weight_all, weight_all_wait); + "--- %s: verbose=%d holdoff=%d longwait=%d nthreads=%d onoff_holdoff=%d onoff_interval=%d shutdown_secs=%d stat_interval=%d stutter=%d use_cpus_read_lock=%d, weight_resched=%d, weight_single=%d, weight_single_wait=%d, weight_many=%d, weight_many_wait=%d, weight_all=%d, weight_all_wait=%d\n", tag, + verbose, holdoff, longwait, nthreads, onoff_holdoff, onoff_interval, shutdown, stat_interval, stutter, use_cpus_read_lock, weight_resched, weight_single, weight_single_wait, weight_many, weight_many_wait, weight_all, weight_all_wait); } static void scf_cleanup_handler(void *unused) @@ -558,6 +559,11 @@ static int __init scf_torture_init(void) if (firsterr) goto unwind; } + if (stutter > 0) { + firsterr = torture_stutter_init(stutter, stutter); + if (firsterr) + goto unwind; + } // Worker tasks invoking smp_call_function(). if (nthreads < 0) -- cgit v1.2.3 From 0d7202876bcb968a68f5608b9ff7a824fbc7e94d Mon Sep 17 00:00:00 2001 From: Hou Tao Date: Thu, 24 Sep 2020 22:18:54 +0800 Subject: locktorture: Invoke percpu_free_rwsem() to do percpu-rwsem cleanup When executing the LOCK06 locktorture scenario featuring percpu-rwsem, the RCU callback rcu_sync_func() may still be pending after locktorture module is removed. 
This can in turn lead to the following Oops: BUG: unable to handle page fault for address: ffffffffc00eb920 #PF: supervisor read access in kernel mode #PF: error_code(0x0000) - not-present page PGD 6500a067 P4D 6500a067 PUD 6500c067 PMD 13a36c067 PTE 800000013691c163 Oops: 0000 [#1] PREEMPT SMP CPU: 1 PID: 0 Comm: swapper/1 Not tainted 5.9.0-rc5+ #4 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996) RIP: 0010:rcu_cblist_dequeue+0x12/0x30 Call Trace: rcu_core+0x1b1/0x860 __do_softirq+0xfe/0x326 asm_call_on_stack+0x12/0x20 do_softirq_own_stack+0x5f/0x80 irq_exit_rcu+0xaf/0xc0 sysvec_apic_timer_interrupt+0x2e/0xb0 asm_sysvec_apic_timer_interrupt+0x12/0x20 This commit avoids this problem by adding an exit hook in lock_torture_ops and using it to call percpu_free_rwsem() for percpu rwsem torture during the module-cleanup function, thus ensuring that rcu_sync_func() completes before the module exits. It is also necessary to call the exit hook if lock_torture_init() fails half-way, so this commit also adds an ->init_called field in lock_torture_cxt to indicate that the exit hook, if present, must be called. Signed-off-by: Hou Tao Signed-off-by: Paul E. McKenney --- kernel/locking/locktorture.c | 26 +++++++++++++++++++++----- 1 file changed, 21 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c index 79fbd97d3882..fd838cea3934 100644 --- a/kernel/locking/locktorture.c +++ b/kernel/locking/locktorture.c @@ -76,6 +76,7 @@ static void lock_torture_cleanup(void); */ struct lock_torture_ops { void (*init)(void); + void (*exit)(void); int (*writelock)(void); void (*write_delay)(struct torture_random_state *trsp); void (*task_boost)(struct torture_random_state *trsp); @@ -92,12 +93,13 @@ struct lock_torture_cxt { int nrealwriters_stress; int nrealreaders_stress; bool debug_lock; + bool init_called; atomic_t n_lock_torture_errors; struct lock_torture_ops *cur_ops; struct lock_stress_stats *lwsa; /* writer statistics */ struct lock_stress_stats *lrsa; /* reader statistics */ }; -static struct lock_torture_cxt cxt = { 0, 0, false, +static struct lock_torture_cxt cxt = { 0, 0, false, false, ATOMIC_INIT(0), NULL, NULL}; /* @@ -573,6 +575,11 @@ static void torture_percpu_rwsem_init(void) BUG_ON(percpu_init_rwsem(&pcpu_rwsem)); } +static void torture_percpu_rwsem_exit(void) +{ + percpu_free_rwsem(&pcpu_rwsem); +} + static int torture_percpu_rwsem_down_write(void) __acquires(pcpu_rwsem) { percpu_down_write(&pcpu_rwsem); @@ -597,6 +604,7 @@ static void torture_percpu_rwsem_up_read(void) __releases(pcpu_rwsem) static struct lock_torture_ops percpu_rwsem_lock_ops = { .init = torture_percpu_rwsem_init, + .exit = torture_percpu_rwsem_exit, .writelock = torture_percpu_rwsem_down_write, .write_delay = torture_rwsem_write_delay, .task_boost = torture_boost_dummy, @@ -789,9 +797,10 @@ static void lock_torture_cleanup(void) /* * Indicates early cleanup, meaning that the test has not run, - * such as when passing bogus args when loading the module. As - * such, only perform the underlying torture-specific cleanups, - * and avoid anything related to locktorture. + * such as when passing bogus args when loading the module. + * However cxt->cur_ops.init() may have been invoked, so beside + * perform the underlying torture-specific cleanups, cur_ops.exit() + * will be invoked if needed. 
*/ if (!cxt.lwsa && !cxt.lrsa) goto end; @@ -831,6 +840,11 @@ static void lock_torture_cleanup(void) cxt.lrsa = NULL; end: + if (cxt.init_called) { + if (cxt.cur_ops->exit) + cxt.cur_ops->exit(); + cxt.init_called = false; + } torture_cleanup_end(); } @@ -878,8 +892,10 @@ static int __init lock_torture_init(void) goto unwind; } - if (cxt.cur_ops->init) + if (cxt.cur_ops->init) { cxt.cur_ops->init(); + cxt.init_called = true; + } if (nwriters_stress >= 0) cxt.nrealwriters_stress = nwriters_stress; -- cgit v1.2.3 From a7eb937b67b64b8b4645f1ebca3ac2079c6de81b Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 9 Oct 2020 19:51:55 -0700 Subject: rcutorture: Don't do need_resched() testing if ->sync is NULL If cur_ops->sync is NULL, rcu_torture_fwd_prog_nr() will nevertheless attempt to call through it. This commit therefore flags cases where neither need_resched() nor call_rcu() forward-progress testing can be performed due to NULL function pointers, and also causes rcu_torture_fwd_prog_nr() to take an early exit if cur_ops->sync() is NULL. Reported-by: Tom Rix Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index e7d52fded3cd..4dfd113882aa 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1923,7 +1923,9 @@ static void rcu_torture_fwd_prog_nr(struct rcu_fwd *rfp, unsigned long stopat; static DEFINE_TORTURE_RANDOM(trs); - if (cur_ops->call && cur_ops->sync && cur_ops->cb_barrier) { + if (!cur_ops->sync) + return; // Cannot do need_resched() forward progress testing without ->sync. + if (cur_ops->call && cur_ops->cb_barrier) { init_rcu_head_on_stack(&fcs.rh); selfpropcb = true; } @@ -2149,8 +2151,8 @@ static int __init rcu_torture_fwd_prog_init(void) if (!fwd_progress) return 0; /* Not requested, so don't do it. */ - if (!cur_ops->stall_dur || cur_ops->stall_dur() <= 0 || - cur_ops == &rcu_busted_ops) { + if ((!cur_ops->sync && !cur_ops->call) || + !cur_ops->stall_dur || cur_ops->stall_dur() <= 0 || cur_ops == &rcu_busted_ops) { VERBOSE_TOROUT_STRING("rcu_torture_fwd_prog_init: Disabled, unsupported by RCU flavor under test"); return 0; } -- cgit v1.2.3 From 75dc2da5ecd65bdcbfc4d59b9d9b7342c61fe374 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 17 Sep 2020 16:17:17 -0700 Subject: rcu-tasks: Make the units of ->init_fract be jiffies Currently, the units of ->init_fract are milliseconds while those of ->gp_sleep are jiffies. For consistency with each other and with the argument of schedule_timeout_idle(), this commit changes the units of ->init_fract to jiffies. This change does affect the backoff algorithm, but only on systems where HZ is not 1000, and even there the change makes more sense, given that the current setup would "back off" to the same number of jiffies repeatedly. In contrast, with this change, the number of jiffies waited increases on each pass through the loop in the rcu_tasks_wait_gp() function. Signed-off-by: Paul E. McKenney --- kernel/rcu/tasks.h | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 0b459890fdcc..35bdcfd84d42 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -335,8 +335,6 @@ static void rcu_tasks_wait_gp(struct rcu_tasks *rtp) // Start off with initial wait and slowly back off to 1 HZ wait. 
fract = rtp->init_fract; - if (fract > HZ) - fract = HZ; while (!list_empty(&holdouts)) { bool firstreport; @@ -345,10 +343,10 @@ static void rcu_tasks_wait_gp(struct rcu_tasks *rtp) /* Slowly back off waiting for holdouts */ set_tasks_gp_state(rtp, RTGS_WAIT_SCAN_HOLDOUTS); - schedule_timeout_idle(HZ/fract); + schedule_timeout_idle(fract); - if (fract > 1) - fract--; + if (fract < HZ) + fract++; rtst = READ_ONCE(rcu_task_stall_timeout); needreport = rtst > 0 && time_after(jiffies, lastreport + rtst); @@ -557,7 +555,7 @@ EXPORT_SYMBOL_GPL(rcu_barrier_tasks); static int __init rcu_spawn_tasks_kthread(void) { rcu_tasks.gp_sleep = HZ / 10; - rcu_tasks.init_fract = 10; + rcu_tasks.init_fract = HZ / 10; rcu_tasks.pregp_func = rcu_tasks_pregp_step; rcu_tasks.pertask_func = rcu_tasks_pertask; rcu_tasks.postscan_func = rcu_tasks_postscan; @@ -1178,12 +1176,12 @@ static int __init rcu_spawn_tasks_trace_kthread(void) { if (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB)) { rcu_tasks_trace.gp_sleep = HZ / 10; - rcu_tasks_trace.init_fract = 10; + rcu_tasks_trace.init_fract = HZ / 10; } else { rcu_tasks_trace.gp_sleep = HZ / 200; if (rcu_tasks_trace.gp_sleep <= 0) rcu_tasks_trace.gp_sleep = 1; - rcu_tasks_trace.init_fract = HZ / 5; + rcu_tasks_trace.init_fract = HZ / 200; if (rcu_tasks_trace.init_fract <= 0) rcu_tasks_trace.init_fract = 1; } -- cgit v1.2.3 From 1d094cefc37e5ed4dec44a41841c8628f6b548a2 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Fri, 6 Nov 2020 10:34:56 +0100 Subject: kcsan: Fix encoding masks and regain address bit The watchpoint encoding masks for size and address were off-by-one bit each, with the size mask using 1 unnecessary bit and the address mask missing 1 bit. However, due to the way the size is shifted into the encoded watchpoint, we were effectively wasting and never using the extra bit. For example, on x86 with PAGE_SIZE==4K, we have 1 bit for the is-write bit, 14 bits for the size bits, and then 49 bits left for the address. Prior to this fix we would end up with this usage: [ write<1> | size<14> | wasted<1> | address<48> ] Fix it by subtracting 1 bit from the GENMASK() end and start ranges of size and address respectively. The added static_assert()s verify that the masks are as expected. With the fixed version, we get the expected usage: [ write<1> | size<14> | address<49> ] Functionally no change is expected, since that extra address bit is insignificant for enabled architectures. Acked-by: Boqun Feng Signed-off-by: Marco Elver Signed-off-by: Paul E. McKenney --- kernel/kcsan/encoding.h | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/kcsan/encoding.h b/kernel/kcsan/encoding.h index 4f73db6d1407..7ee405524904 100644 --- a/kernel/kcsan/encoding.h +++ b/kernel/kcsan/encoding.h @@ -37,14 +37,12 @@ */ #define WATCHPOINT_ADDR_BITS (BITS_PER_LONG-1 - WATCHPOINT_SIZE_BITS) -/* - * Masks to set/retrieve the encoded data. - */ -#define WATCHPOINT_WRITE_MASK BIT(BITS_PER_LONG-1) -#define WATCHPOINT_SIZE_MASK \ - GENMASK(BITS_PER_LONG-2, BITS_PER_LONG-2 - WATCHPOINT_SIZE_BITS) -#define WATCHPOINT_ADDR_MASK \ - GENMASK(BITS_PER_LONG-3 - WATCHPOINT_SIZE_BITS, 0) +/* Bitmasks for the encoded watchpoint access information. 
*/ +#define WATCHPOINT_WRITE_MASK BIT(BITS_PER_LONG-1) +#define WATCHPOINT_SIZE_MASK GENMASK(BITS_PER_LONG-2, WATCHPOINT_ADDR_BITS) +#define WATCHPOINT_ADDR_MASK GENMASK(WATCHPOINT_ADDR_BITS-1, 0) +static_assert(WATCHPOINT_ADDR_MASK == (1UL << WATCHPOINT_ADDR_BITS) - 1); +static_assert((WATCHPOINT_WRITE_MASK ^ WATCHPOINT_SIZE_MASK ^ WATCHPOINT_ADDR_MASK) == ~0UL); static inline bool check_encodable(unsigned long addr, size_t size) { -- cgit v1.2.3 From c3a877fea962d9d0fb1e3747334699978f566930 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 30 Oct 2020 18:59:17 +0200 Subject: irqdomain: Replace open coded of_node_to_fwnode() of_node_to_fwnode() should be used for conversion. Replace the open coded variant of it in of_phandle_args_to_fwspec(). Signed-off-by: Andy Shevchenko Signed-off-by: Thomas Gleixner Reviewed-by: Rafael J. Wysocki Link: https://lore.kernel.org/r/20201030165919.86234-4-andriy.shevchenko@linux.intel.com --- kernel/irq/irqdomain.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index cf8b374b892d..831526f2e728 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -737,7 +737,7 @@ static void of_phandle_args_to_fwspec(struct device_node *np, const u32 *args, { int i; - fwspec->fwnode = np ? &np->fwnode : NULL; + fwspec->fwnode = of_node_to_fwnode(np); fwspec->param_count = count; for (i = 0; i < count; i++) -- cgit v1.2.3 From b6e95788fde8c9bc9da729102085dd36a5a0cda6 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 30 Oct 2020 18:59:18 +0200 Subject: irqdomain: Introduce irq_domain_create_legacy() API Introduce irq_domain_create_legacy() API which is functional equivalent to the existing irq_domain_add_legacy(), but takes a pointer to the struct fwnode_handle as a parameter. This is useful for non OF systems. Signed-off-by: Andy Shevchenko Signed-off-by: Thomas Gleixner Reviewed-by: Rafael J. Wysocki Link: https://lore.kernel.org/r/20201030165919.86234-5-andriy.shevchenko@linux.intel.com --- Documentation/core-api/irq/irq-domain.rst | 6 ++++++ include/linux/irqdomain.h | 6 ++++++ kernel/irq/irqdomain.c | 17 ++++++++++++++--- 3 files changed, 26 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/Documentation/core-api/irq/irq-domain.rst b/Documentation/core-api/irq/irq-domain.rst index 096db12f32d5..a77c24c27f7b 100644 --- a/Documentation/core-api/irq/irq-domain.rst +++ b/Documentation/core-api/irq/irq-domain.rst @@ -147,6 +147,7 @@ Legacy irq_domain_add_simple() irq_domain_add_legacy() irq_domain_add_legacy_isa() + irq_domain_create_legacy() The Legacy mapping is a special case for drivers that already have a range of irq_descs allocated for the hwirqs. It is used when the @@ -185,6 +186,11 @@ that the driver using the simple domain call irq_create_mapping() before any irq_find_mapping() since the latter will actually work for the static IRQ assignment case. +irq_domain_add_legacy() and irq_domain_create_legacy() are functionally +equivalent, except for the first argument is different - the former +accepts an Open Firmware specific 'struct device_node', while the latter +accepts a more general abstraction 'struct fwnode_handle'. 
+ Hierarchy IRQ domain -------------------- diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h index d21f75d294d7..77bf7d84c673 100644 --- a/include/linux/irqdomain.h +++ b/include/linux/irqdomain.h @@ -271,6 +271,12 @@ struct irq_domain *irq_domain_add_legacy(struct device_node *of_node, irq_hw_number_t first_hwirq, const struct irq_domain_ops *ops, void *host_data); +struct irq_domain *irq_domain_create_legacy(struct fwnode_handle *fwnode, + unsigned int size, + unsigned int first_irq, + irq_hw_number_t first_hwirq, + const struct irq_domain_ops *ops, + void *host_data); extern struct irq_domain *irq_find_matching_fwspec(struct irq_fwspec *fwspec, enum irq_domain_bus_token bus_token); extern bool irq_domain_check_msi_remap(void); diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 831526f2e728..9c9cb8829f7a 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -350,17 +350,28 @@ struct irq_domain *irq_domain_add_legacy(struct device_node *of_node, irq_hw_number_t first_hwirq, const struct irq_domain_ops *ops, void *host_data) +{ + return irq_domain_create_legacy(of_node_to_fwnode(of_node), size, + first_irq, first_hwirq, ops, host_data); +} +EXPORT_SYMBOL_GPL(irq_domain_add_legacy); + +struct irq_domain *irq_domain_create_legacy(struct fwnode_handle *fwnode, + unsigned int size, + unsigned int first_irq, + irq_hw_number_t first_hwirq, + const struct irq_domain_ops *ops, + void *host_data) { struct irq_domain *domain; - domain = __irq_domain_add(of_node_to_fwnode(of_node), first_hwirq + size, - first_hwirq + size, 0, ops, host_data); + domain = __irq_domain_add(fwnode, first_hwirq + size, first_hwirq + size, 0, ops, host_data); if (domain) irq_domain_associate_many(domain, first_irq, first_hwirq, size); return domain; } -EXPORT_SYMBOL_GPL(irq_domain_add_legacy); +EXPORT_SYMBOL_GPL(irq_domain_create_legacy); /** * irq_find_matching_fwspec() - Locates a domain for a given fwspec -- cgit v1.2.3 From 7bdb157cdebbf95a1cd94ed2e01b338714075d00 Mon Sep 17 00:00:00 2001 From: "kiyin(尹亮)" Date: Wed, 4 Nov 2020 08:23:22 +0300 Subject: perf/core: Fix a memory leak in perf_event_parse_addr_filter() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit As shown through runtime testing, the "filename" allocation is not always freed in perf_event_parse_addr_filter(). There are three possible ways that this could happen: - It could be allocated twice on subsequent iterations through the loop, - or leaked on the success path, - or on the failure path. Clean up the code flow to make it obvious that 'filename' is always freed in the reallocation path and in the two return paths as well. We rely on the fact that kfree(NULL) is NOP and filename is initialized with NULL. This fixes the leak. No other side effects expected. [ Dan Carpenter: cleaned up the code flow & added a changelog. ] [ Ingo Molnar: updated the changelog some more. ] Fixes: 375637bc5249 ("perf/core: Introduce address range filtering") Signed-off-by: "kiyin(尹亮)" Signed-off-by: Dan Carpenter Signed-off-by: Ingo Molnar Cc: "Srivatsa S. 
Bhat" Cc: Anthony Liguori -- kernel/events/core.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) --- kernel/events/core.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index da467e1dd49a..5a29ab09e72d 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -10085,6 +10085,7 @@ perf_event_parse_addr_filter(struct perf_event *event, char *fstr, if (token == IF_SRC_FILE || token == IF_SRC_FILEADDR) { int fpos = token == IF_SRC_FILE ? 2 : 1; + kfree(filename); filename = match_strdup(&args[fpos]); if (!filename) { ret = -ENOMEM; @@ -10131,16 +10132,13 @@ perf_event_parse_addr_filter(struct perf_event *event, char *fstr, */ ret = -EOPNOTSUPP; if (!event->ctx->task) - goto fail_free_name; + goto fail; /* look up the path and grab its inode */ ret = kern_path(filename, LOOKUP_FOLLOW, &filter->path); if (ret) - goto fail_free_name; - - kfree(filename); - filename = NULL; + goto fail; ret = -EINVAL; if (!filter->path.dentry || @@ -10160,13 +10158,13 @@ perf_event_parse_addr_filter(struct perf_event *event, char *fstr, if (state != IF_STATE_ACTION) goto fail; + kfree(filename); kfree(orig); return 0; -fail_free_name: - kfree(filename); fail: + kfree(filename); free_filters_list(filters); kfree(orig); -- cgit v1.2.3 From 9f5d1c336a10c0d24e83e40b4c1b9539f7dba627 Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Wed, 4 Nov 2020 16:12:44 +0100 Subject: futex: Handle transient "ownerless" rtmutex state correctly Gratian managed to trigger the BUG_ON(!newowner) in fixup_pi_state_owner(). This is one possible chain of events leading to this: Task Prio Operation T1 120 lock(F) T2 120 lock(F) -> blocks (top waiter) T3 50 (RT) lock(F) -> boosts T1 and blocks (new top waiter) XX timeout/ -> wakes T2 signal T1 50 unlock(F) -> wakes T3 (rtmutex->owner == NULL, waiter bit is set) T2 120 cleanup -> try_to_take_mutex() fails because T3 is the top waiter and the lower priority T2 cannot steal the lock. -> fixup_pi_state_owner() sees newowner == NULL -> BUG_ON() The comment states that this is invalid and rt_mutex_real_owner() must return a non NULL owner when the trylock failed, but in case of a queued and woken up waiter rt_mutex_real_owner() == NULL is a valid transient state. The higher priority waiter has simply not yet managed to take over the rtmutex. The BUG_ON() is therefore wrong and this is just another retry condition in fixup_pi_state_owner(). Drop the locks, so that T3 can make progress, and then try the fixup again. Gratian provided a great analysis, traces and a reproducer. The analysis is to the point, but it confused the hell out of that tglx dude who had to page in all the futex horrors again. Condensed version is above. [ tglx: Wrote comment and changelog ] Fixes: c1e2f0eaf015 ("futex: Avoid violating the 10th rule of futex") Reported-by: Gratian Crisan Signed-off-by: Mike Galbraith Signed-off-by: Thomas Gleixner Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/87a6w6x7bb.fsf@ni.com Link: https://lore.kernel.org/r/87sg9pkvf7.fsf@nanos.tec.linutronix.de --- kernel/futex.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index f8614ef4ff31..ac328874f6e5 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -2380,10 +2380,22 @@ retry: } /* - * Since we just failed the trylock; there must be an owner. 
+ * The trylock just failed, so either there is an owner or + * there is a higher priority waiter than this one. */ newowner = rt_mutex_owner(&pi_state->pi_mutex); - BUG_ON(!newowner); + /* + * If the higher priority waiter has not yet taken over the + * rtmutex then newowner is NULL. We can't return here with + * that state because it's inconsistent vs. the user space + * state. So drop the locks and try again. It's a valid + * situation and not any different from the other retry + * conditions. + */ + if (unlikely(!newowner)) { + err = -EAGAIN; + goto handle_err; + } } else { WARN_ON_ONCE(argowner != current); if (oldowner == current) { -- cgit v1.2.3 From b4e00444cab4c3f3fec876dc0cccc8cbb0d1a948 Mon Sep 17 00:00:00 2001 From: Eddy Wu Date: Sat, 7 Nov 2020 14:47:22 +0800 Subject: fork: fix copy_process(CLONE_PARENT) race with the exiting ->real_parent current->group_leader->exit_signal may change during copy_process() if current->real_parent exits. Move the assignment inside tasklist_lock to avoid the race. Signed-off-by: Eddy Wu Acked-by: Oleg Nesterov Signed-off-by: Linus Torvalds --- kernel/fork.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 32083db7a2a2..6d266388d380 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2167,14 +2167,9 @@ static __latent_entropy struct task_struct *copy_process( /* ok, now we should be set up.. */ p->pid = pid_nr(pid); if (clone_flags & CLONE_THREAD) { - p->exit_signal = -1; p->group_leader = current->group_leader; p->tgid = current->tgid; } else { - if (clone_flags & CLONE_PARENT) - p->exit_signal = current->group_leader->exit_signal; - else - p->exit_signal = args->exit_signal; p->group_leader = p; p->tgid = p->pid; } @@ -2218,9 +2213,14 @@ static __latent_entropy struct task_struct *copy_process( if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) { p->real_parent = current->real_parent; p->parent_exec_id = current->parent_exec_id; + if (clone_flags & CLONE_THREAD) + p->exit_signal = -1; + else + p->exit_signal = current->group_leader->exit_signal; } else { p->real_parent = current; p->parent_exec_id = current->self_exec_id; + p->exit_signal = args->exit_signal; } klp_copy_process(p); -- cgit v1.2.3 From 24389b610be31536328c655ae0a2cb0ef94be2c8 Mon Sep 17 00:00:00 2001 From: Sergey Shtylyov Date: Wed, 4 Nov 2020 23:34:59 +0300 Subject: module: fix up 'kernel-doc' comments Some 'kernel-doc' function comments do not fully comply with the specified format due to: - missing () after the function name; - "RETURNS:"/"Returns:" instead of "Return:" when documenting the function's result. - empty line before describing the function's arguments. Signed-off-by: Sergey Shtylyov Signed-off-by: Jessica Yu --- kernel/module.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 948d4bbbceb5..98b9e2ba8c3d 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -727,13 +727,12 @@ bool __is_module_percpu_address(unsigned long addr, unsigned long *can_addr) } /** - * is_module_percpu_address - test whether address is from module static percpu + * is_module_percpu_address() - test whether address is from module static percpu * @addr: address to test * * Test whether @addr belongs to module static percpu area. 
* - * RETURNS: - * %true if @addr is from module static percpu area + * Return: %true if @addr is from module static percpu area */ bool is_module_percpu_address(unsigned long addr) { @@ -957,11 +956,10 @@ static int try_stop_module(struct module *mod, int flags, int *forced) } /** - * module_refcount - return the refcount or -1 if unloading - * + * module_refcount() - return the refcount or -1 if unloading * @mod: the module we're checking * - * Returns: + * Return: * -1 if the module is in the process of unloading * otherwise the number of references in the kernel to the module */ -- cgit v1.2.3 From 2541743e99c301f9b9659d0928bd8b22708d59df Mon Sep 17 00:00:00 2001 From: Sergey Shtylyov Date: Wed, 4 Nov 2020 23:35:51 +0300 Subject: module: add more 'kernel-doc' comments Some functions have the proper 'kernel-doc' comments but these don't start with proper /** -- fix that, along with adding () to the function name on the following lines to fully comply with the 'kernel-doc' format. Signed-off-by: Sergey Shtylyov Signed-off-by: Jessica Yu --- kernel/module.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 98b9e2ba8c3d..0310c80b90a3 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -4491,8 +4491,8 @@ out: return e; } -/* - * is_module_address - is this address inside a module? +/** + * is_module_address() - is this address inside a module? * @addr: the address to check. * * See is_module_text_address() if you simply want to see if the address @@ -4509,8 +4509,8 @@ bool is_module_address(unsigned long addr) return ret; } -/* - * __module_address - get the module which contains an address. +/** + * __module_address() - get the module which contains an address. * @addr: the address. * * Must be called with preempt disabled or module mutex held so that @@ -4534,8 +4534,8 @@ struct module *__module_address(unsigned long addr) return mod; } -/* - * is_module_text_address - is this address inside module code? +/** + * is_module_text_address() - is this address inside module code? * @addr: the address to check. * * See is_module_address() if you simply want to see if the address is @@ -4553,8 +4553,8 @@ bool is_module_text_address(unsigned long addr) return ret; } -/* - * __module_text_address - get the module whose code contains an address. +/** + * __module_text_address() - get the module whose code contains an address. * @addr: the address. * * Must be called with preempt disabled or module mutex held so that -- cgit v1.2.3 From 24b9f0d22081455b6fd739c8365958c207a69973 Mon Sep 17 00:00:00 2001 From: Sergey Shtylyov Date: Sat, 7 Nov 2020 23:20:52 +0300 Subject: module: fix comment style Many comments in this module do not comply with the preferred multi-line comment style as reported by 'scripts/checkpatch.pl': WARNING: Block comments use * on subsequent lines WARNING: Block comments use a trailing */ on a separate line Fix those comments, along with (unreported for some reason?) the starts of the multi-line comments not being /* on their own line... 
Signed-off-by: Sergey Shtylyov Signed-off-by: Jessica Yu --- kernel/module.c | 117 +++++++++++++++++++++++++++++++++++--------------------- 1 file changed, 74 insertions(+), 43 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 0310c80b90a3..a40ec708f8f2 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1,9 +1,8 @@ // SPDX-License-Identifier: GPL-2.0-or-later /* - Copyright (C) 2002 Richard Henderson - Copyright (C) 2001 Rusty Russell, 2002, 2010 Rusty Russell IBM. - -*/ + * Copyright (C) 2002 Richard Henderson + * Copyright (C) 2001 Rusty Russell, 2002, 2010 Rusty Russell IBM. + */ #define INCLUDE_VERMAGIC @@ -86,7 +85,8 @@ * 1) List of modules (also safely readable with preempt_disable), * 2) module_use links, * 3) module_addr_min/module_addr_max. - * (delete and add uses RCU list operations). */ + * (delete and add uses RCU list operations). + */ DEFINE_MUTEX(module_mutex); EXPORT_SYMBOL_GPL(module_mutex); static LIST_HEAD(modules); @@ -586,8 +586,10 @@ static bool find_exported_symbol_in_section(const struct symsearch *syms, return false; } -/* Find an exported symbol and return it, along with, (optional) crc and - * (optional) module which owns it. Needs preempt disabled or module_mutex. */ +/* + * Find an exported symbol and return it, along with, (optional) crc and + * (optional) module which owns it. Needs preempt disabled or module_mutex. + */ static const struct kernel_symbol *find_symbol(const char *name, struct module **owner, const s32 **crc, @@ -1644,8 +1646,10 @@ static void remove_sect_attrs(struct module *mod) if (mod->sect_attrs) { sysfs_remove_group(&mod->mkobj.kobj, &mod->sect_attrs->grp); - /* We are positive that no one is using any sect attrs - * at this point. Deallocate immediately. */ + /* + * We are positive that no one is using any sect attrs + * at this point. Deallocate immediately. + */ free_sect_attrs(mod->sect_attrs); mod->sect_attrs = NULL; } @@ -2216,8 +2220,10 @@ static void free_module(struct module *mod) mod_sysfs_teardown(mod); - /* We leave it in list to prevent duplicate loads, but make sure - * that noone uses it while it's being deconstructed. */ + /* + * We leave it in list to prevent duplicate loads, but make sure + * that noone uses it while it's being deconstructed. + */ mutex_lock(&module_mutex); mod->state = MODULE_STATE_UNFORMED; mutex_unlock(&module_mutex); @@ -2334,8 +2340,10 @@ static int simplify_symbols(struct module *mod, const struct load_info *info) if (!strncmp(name, "__gnu_lto", 9)) break; - /* We compiled with -fno-common. These are not - supposed to happen. */ + /* + * We compiled with -fno-common. These are not + * supposed to happen. + */ pr_debug("Common symbol: %s\n", name); pr_warn("%s: please compile with -fno-common\n", mod->name); @@ -2438,16 +2446,20 @@ static long get_offset(struct module *mod, unsigned int *size, return ret; } -/* Lay out the SHF_ALLOC sections in a way not dissimilar to how ld - might -- code, read-only data, read-write data, small data. Tally - sizes, and place the offsets into sh_entsize fields: high bit means it - belongs in init. */ +/* + * Lay out the SHF_ALLOC sections in a way not dissimilar to how ld + * might -- code, read-only data, read-write data, small data. Tally + * sizes, and place the offsets into sh_entsize fields: high bit means it + * belongs in init. 
+ */ static void layout_sections(struct module *mod, struct load_info *info) { static unsigned long const masks[][2] = { - /* NOTE: all executable code must be the first section + /* + * NOTE: all executable code must be the first section * in this array; otherwise modify the text_size - * finder in the two loops below */ + * finder in the two loops below + */ { SHF_EXECINSTR | SHF_ALLOC, ARCH_SHF_SMALL }, { SHF_ALLOC, SHF_WRITE | ARCH_SHF_SMALL }, { SHF_RO_AFTER_INIT | SHF_ALLOC, ARCH_SHF_SMALL }, @@ -3062,8 +3074,10 @@ static int rewrite_section_headers(struct load_info *info, int flags) return -ENOEXEC; } - /* Mark all sections sh_addr with their address in the - temporary image. */ + /* + * Mark all sections sh_addr with their address in the + * temporary image. + */ shdr->sh_addr = (size_t)info->hdr + shdr->sh_offset; #ifndef CONFIG_MODULE_UNLOAD @@ -3494,9 +3508,11 @@ static struct module *layout_and_allocate(struct load_info *info, int flags) if (ndx) info->sechdrs[ndx].sh_flags |= SHF_RO_AFTER_INIT; - /* Determine total sizes, and put offsets in sh_entsize. For now - this is done generically; there doesn't appear to be any - special cases for the architectures. */ + /* + * Determine total sizes, and put offsets in sh_entsize. For now + * this is done generically; there doesn't appear to be any + * special cases for the architectures. + */ layout_sections(info->mod, info); layout_symtab(info->mod, info); @@ -3780,8 +3796,10 @@ static int complete_formation(struct module *mod, struct load_info *info) module_enable_nx(mod); module_enable_x(mod); - /* Mark state as coming so strong_try_module_get() ignores us, - * but kallsyms etc. can see us. */ + /* + * Mark state as coming so strong_try_module_get() ignores us, + * but kallsyms etc. can see us. + */ mod->state = MODULE_STATE_COMING; mutex_unlock(&module_mutex); @@ -3828,8 +3846,10 @@ static int unknown_module_param_cb(char *param, char *val, const char *modname, return 0; } -/* Allocate and load the module: note that size of section 0 is always - zero, and we rely on this for optional sections. */ +/* + * Allocate and load the module: note that size of section 0 is always + * zero, and we rely on this for optional sections. + */ static int load_module(struct load_info *info, const char __user *uargs, int flags) { @@ -3903,8 +3923,10 @@ static int load_module(struct load_info *info, const char __user *uargs, init_param_lock(mod); - /* Now we've got everything in the final locations, we can - * find optional sections. */ + /* + * Now we've got everything in the final locations, we can + * find optional sections. + */ err = find_module_sections(mod, info); if (err) goto free_unload; @@ -4118,8 +4140,10 @@ static const char *find_kallsyms_symbol(struct module *mod, bestval = kallsyms_symbol_value(&kallsyms->symtab[best]); - /* Scan for closest preceding symbol, and next symbol. (ELF - starts real symbols at 1). */ + /* + * Scan for closest preceding symbol, and next symbol. (ELF + * starts real symbols at 1). + */ for (i = 1; i < kallsyms->num_symtab; i++) { const Elf_Sym *sym = &kallsyms->symtab[i]; unsigned long thisval = kallsyms_symbol_value(sym); @@ -4127,8 +4151,10 @@ static const char *find_kallsyms_symbol(struct module *mod, if (sym->st_shndx == SHN_UNDEF) continue; - /* We ignore unnamed symbols: they're uninformative - * and inserted at a whim. */ + /* + * We ignore unnamed symbols: they're uninformative + * and inserted at a whim. 
+ */ if (*kallsyms_symbol_name(kallsyms, i) == '\0' || is_arm_mapping_symbol(kallsyms_symbol_name(kallsyms, i))) continue; @@ -4158,8 +4184,10 @@ void * __weak dereference_module_function_descriptor(struct module *mod, return ptr; } -/* For kallsyms to ask for address resolution. NULL means not found. Careful - * not to lock to avoid deadlock on oopses, simply disable preemption. */ +/* + * For kallsyms to ask for address resolution. NULL means not found. Careful + * not to lock to avoid deadlock on oopses, simply disable preemption. + */ const char *module_address_lookup(unsigned long addr, unsigned long *size, unsigned long *offset, @@ -4417,11 +4445,12 @@ static int m_show(struct seq_file *m, void *p) return 0; } -/* Format: modulename size refcount deps address - - Where refcount is a number or -, and deps is a comma-separated list - of depends or -. -*/ +/* + * Format: modulename size refcount deps address + * + * Where refcount is a number or -, and deps is a comma-separated list + * of depends or -. + */ static const struct seq_operations modules_op = { .start = m_start, .next = m_next, @@ -4593,8 +4622,10 @@ void print_modules(void) } #ifdef CONFIG_MODVERSIONS -/* Generate the signature for all relevant module structures here. - * If these change, we don't want to try to parse the module. */ +/* + * Generate the signature for all relevant module structures here. + * If these change, we don't want to try to parse the module. + */ void module_layout(struct module *mod, struct modversion_info *ver, struct kernel_param *kp, -- cgit v1.2.3 From 1e106aa3509b86738769775969822ffc1ec21bf4 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 6 Nov 2020 11:52:05 +0300 Subject: futex: Don't enable IRQs unconditionally in put_pi_state() The exit_pi_state_list() function calls put_pi_state() with IRQs disabled and is not expecting that IRQs will be enabled inside the function. Use the _irqsave() variant so that IRQs are restored to the original state instead of being enabled unconditionally. Fixes: 153fbd1226fb ("futex: Fix more put_pi_state() vs. exit_pi_state_list() races") Signed-off-by: Dan Carpenter Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20201106085205.GA1159983@mwanda --- kernel/futex.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index ac328874f6e5..00259c7e288e 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -788,8 +788,9 @@ static void put_pi_state(struct futex_pi_state *pi_state) */ if (pi_state->owner) { struct task_struct *owner; + unsigned long flags; - raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); + raw_spin_lock_irqsave(&pi_state->pi_mutex.wait_lock, flags); owner = pi_state->owner; if (owner) { raw_spin_lock(&owner->pi_lock); @@ -797,7 +798,7 @@ static void put_pi_state(struct futex_pi_state *pi_state) raw_spin_unlock(&owner->pi_lock); } rt_mutex_proxy_unlock(&pi_state->pi_mutex, owner); - raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); + raw_spin_unlock_irqrestore(&pi_state->pi_mutex.wait_lock, flags); } if (current->pi_state_cache) { -- cgit v1.2.3 From 267fb27352b6fc9fdbad753127a239f75618ecbc Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 30 Oct 2020 15:50:32 +0100 Subject: perf: Reduce stack usage of perf_output_begin() __perf_output_begin() has an on-stack struct perf_sample_data in the unlikely case it needs to generate a LOST record. 
However, every call to perf_output_begin() must already have a perf_sample_data on-stack. Reported-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20201030151954.985416146@infradead.org --- arch/powerpc/perf/imc-pmu.c | 2 +- arch/s390/kernel/perf_cpum_sf.c | 2 +- arch/x86/events/intel/ds.c | 4 ++-- include/linux/perf_event.h | 7 +++++-- kernel/events/core.c | 32 +++++++++++++++++--------------- kernel/events/ring_buffer.c | 20 +++++++++++--------- 6 files changed, 37 insertions(+), 30 deletions(-) (limited to 'kernel') diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c index 9ed4fcccf8a9..7b25548ec42b 100644 --- a/arch/powerpc/perf/imc-pmu.c +++ b/arch/powerpc/perf/imc-pmu.c @@ -1336,7 +1336,7 @@ static void dump_trace_imc_data(struct perf_event *event) /* If this is a valid record, create the sample */ struct perf_output_handle handle; - if (perf_output_begin(&handle, event, header.size)) + if (perf_output_begin(&handle, &data, event, header.size)) return; perf_output_sample(&handle, &header, &data, event); diff --git a/arch/s390/kernel/perf_cpum_sf.c b/arch/s390/kernel/perf_cpum_sf.c index 4f9e4626df55..00255ae3979d 100644 --- a/arch/s390/kernel/perf_cpum_sf.c +++ b/arch/s390/kernel/perf_cpum_sf.c @@ -672,7 +672,7 @@ static void cpumsf_output_event_pid(struct perf_event *event, rcu_read_lock(); perf_prepare_sample(&header, data, event, regs); - if (perf_output_begin(&handle, event, header.size)) + if (perf_output_begin(&handle, data, event, header.size)) goto out; /* Update the process ID (see also kernel/events/core.c) */ diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index 404315df1e16..cd2ae14a0a98 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -642,8 +642,8 @@ int intel_pmu_drain_bts_buffer(void) rcu_read_lock(); perf_prepare_sample(&header, &data, event, ®s); - if (perf_output_begin(&handle, event, header.size * - (top - base - skip))) + if (perf_output_begin(&handle, &data, event, + header.size * (top - base - skip))) goto unlock; for (at = base; at < top; at++) { diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 0c19d279b97f..b775ae0a8c87 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1400,11 +1400,14 @@ perf_event_addr_filters(struct perf_event *event) extern void perf_event_addr_filters_sync(struct perf_event *event); extern int perf_output_begin(struct perf_output_handle *handle, + struct perf_sample_data *data, struct perf_event *event, unsigned int size); extern int perf_output_begin_forward(struct perf_output_handle *handle, - struct perf_event *event, - unsigned int size); + struct perf_sample_data *data, + struct perf_event *event, + unsigned int size); extern int perf_output_begin_backward(struct perf_output_handle *handle, + struct perf_sample_data *data, struct perf_event *event, unsigned int size); diff --git a/kernel/events/core.c b/kernel/events/core.c index 5a29ab09e72d..fc681c7c1e03 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7186,6 +7186,7 @@ __perf_event_output(struct perf_event *event, struct perf_sample_data *data, struct pt_regs *regs, int (*output_begin)(struct perf_output_handle *, + struct perf_sample_data *, struct perf_event *, unsigned int)) { @@ -7198,7 +7199,7 @@ __perf_event_output(struct perf_event *event, perf_prepare_sample(&header, data, event, regs); - err = output_begin(&handle, event, header.size); + err = output_begin(&handle, data, event, header.size); if 
(err) goto exit; @@ -7264,7 +7265,7 @@ perf_event_read_event(struct perf_event *event, int ret; perf_event_header__init_id(&read_event.header, &sample, event); - ret = perf_output_begin(&handle, event, read_event.header.size); + ret = perf_output_begin(&handle, &sample, event, read_event.header.size); if (ret) return; @@ -7533,7 +7534,7 @@ static void perf_event_task_output(struct perf_event *event, perf_event_header__init_id(&task_event->event_id.header, &sample, event); - ret = perf_output_begin(&handle, event, + ret = perf_output_begin(&handle, &sample, event, task_event->event_id.header.size); if (ret) goto out; @@ -7636,7 +7637,7 @@ static void perf_event_comm_output(struct perf_event *event, return; perf_event_header__init_id(&comm_event->event_id.header, &sample, event); - ret = perf_output_begin(&handle, event, + ret = perf_output_begin(&handle, &sample, event, comm_event->event_id.header.size); if (ret) @@ -7736,7 +7737,7 @@ static void perf_event_namespaces_output(struct perf_event *event, perf_event_header__init_id(&namespaces_event->event_id.header, &sample, event); - ret = perf_output_begin(&handle, event, + ret = perf_output_begin(&handle, &sample, event, namespaces_event->event_id.header.size); if (ret) goto out; @@ -7863,7 +7864,7 @@ static void perf_event_cgroup_output(struct perf_event *event, void *data) perf_event_header__init_id(&cgroup_event->event_id.header, &sample, event); - ret = perf_output_begin(&handle, event, + ret = perf_output_begin(&handle, &sample, event, cgroup_event->event_id.header.size); if (ret) goto out; @@ -7989,7 +7990,7 @@ static void perf_event_mmap_output(struct perf_event *event, } perf_event_header__init_id(&mmap_event->event_id.header, &sample, event); - ret = perf_output_begin(&handle, event, + ret = perf_output_begin(&handle, &sample, event, mmap_event->event_id.header.size); if (ret) goto out; @@ -8299,7 +8300,7 @@ void perf_event_aux_event(struct perf_event *event, unsigned long head, int ret; perf_event_header__init_id(&rec.header, &sample, event); - ret = perf_output_begin(&handle, event, rec.header.size); + ret = perf_output_begin(&handle, &sample, event, rec.header.size); if (ret) return; @@ -8333,7 +8334,7 @@ void perf_log_lost_samples(struct perf_event *event, u64 lost) perf_event_header__init_id(&lost_samples_event.header, &sample, event); - ret = perf_output_begin(&handle, event, + ret = perf_output_begin(&handle, &sample, event, lost_samples_event.header.size); if (ret) return; @@ -8388,7 +8389,7 @@ static void perf_event_switch_output(struct perf_event *event, void *data) perf_event_header__init_id(&se->event_id.header, &sample, event); - ret = perf_output_begin(&handle, event, se->event_id.header.size); + ret = perf_output_begin(&handle, &sample, event, se->event_id.header.size); if (ret) return; @@ -8463,7 +8464,7 @@ static void perf_log_throttle(struct perf_event *event, int enable) perf_event_header__init_id(&throttle_event.header, &sample, event); - ret = perf_output_begin(&handle, event, + ret = perf_output_begin(&handle, &sample, event, throttle_event.header.size); if (ret) return; @@ -8506,7 +8507,7 @@ static void perf_event_ksymbol_output(struct perf_event *event, void *data) perf_event_header__init_id(&ksymbol_event->event_id.header, &sample, event); - ret = perf_output_begin(&handle, event, + ret = perf_output_begin(&handle, &sample, event, ksymbol_event->event_id.header.size); if (ret) return; @@ -8596,7 +8597,7 @@ static void perf_event_bpf_output(struct perf_event *event, void *data) 
perf_event_header__init_id(&bpf_event->event_id.header, &sample, event); - ret = perf_output_begin(&handle, event, + ret = perf_output_begin(&handle, data, event, bpf_event->event_id.header.size); if (ret) return; @@ -8705,7 +8706,8 @@ static void perf_event_text_poke_output(struct perf_event *event, void *data) perf_event_header__init_id(&text_poke_event->event_id.header, &sample, event); - ret = perf_output_begin(&handle, event, text_poke_event->event_id.header.size); + ret = perf_output_begin(&handle, &sample, event, + text_poke_event->event_id.header.size); if (ret) return; @@ -8786,7 +8788,7 @@ static void perf_log_itrace_start(struct perf_event *event) rec.tid = perf_event_tid(event, current); perf_event_header__init_id(&rec.header, &sample, event); - ret = perf_output_begin(&handle, event, rec.header.size); + ret = perf_output_begin(&handle, &sample, event, rec.header.size); if (ret) return; diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index 192b8abc6330..ef91ae75ca56 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -147,6 +147,7 @@ ring_buffer_has_space(unsigned long head, unsigned long tail, static __always_inline int __perf_output_begin(struct perf_output_handle *handle, + struct perf_sample_data *data, struct perf_event *event, unsigned int size, bool backward) { @@ -237,18 +238,16 @@ __perf_output_begin(struct perf_output_handle *handle, handle->size = (1UL << page_shift) - offset; if (unlikely(have_lost)) { - struct perf_sample_data sample_data; - lost_event.header.size = sizeof(lost_event); lost_event.header.type = PERF_RECORD_LOST; lost_event.header.misc = 0; lost_event.id = event->id; lost_event.lost = local_xchg(&rb->lost, 0); - perf_event_header__init_id(&lost_event.header, - &sample_data, event); + /* XXX mostly redundant; @data is already fully initializes */ + perf_event_header__init_id(&lost_event.header, data, event); perf_output_put(handle, lost_event); - perf_event__output_id_sample(event, handle, &sample_data); + perf_event__output_id_sample(event, handle, data); } return 0; @@ -263,22 +262,25 @@ out: } int perf_output_begin_forward(struct perf_output_handle *handle, - struct perf_event *event, unsigned int size) + struct perf_sample_data *data, + struct perf_event *event, unsigned int size) { - return __perf_output_begin(handle, event, size, false); + return __perf_output_begin(handle, data, event, size, false); } int perf_output_begin_backward(struct perf_output_handle *handle, + struct perf_sample_data *data, struct perf_event *event, unsigned int size) { - return __perf_output_begin(handle, event, size, true); + return __perf_output_begin(handle, data, event, size, true); } int perf_output_begin(struct perf_output_handle *handle, + struct perf_sample_data *data, struct perf_event *event, unsigned int size) { - return __perf_output_begin(handle, event, size, + return __perf_output_begin(handle, data, event, size, unlikely(is_write_backward(event))); } -- cgit v1.2.3 From ce0f17fc93f63ee91428af10b7b2ddef38cd19e5 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 30 Oct 2020 12:49:45 +0100 Subject: perf: Fix get_recursion_context() One should use in_serving_softirq() to detect SoftIRQ context. 
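The distinction matters because in_softirq() is also true when bottom halves are merely disabled (for example under local_bh_disable()), whereas in_serving_softirq() is true only while a softirq handler is actually executing. The following stand-alone user-space sketch models the two predicates; the constants mirror the softirq field of the kernel's preempt_count() layout but are simplified and illustrative, not the authoritative values from include/linux/preempt.h:

#include <stdio.h>

/* Simplified preempt_count layout: softirq field assumed in bits 8..15. */
#define SOFTIRQ_SHIFT           8
#define SOFTIRQ_OFFSET          (1UL << SOFTIRQ_SHIFT)  /* serving a softirq handler */
#define SOFTIRQ_DISABLE_OFFSET  (2UL * SOFTIRQ_OFFSET)  /* bottom halves disabled */
#define SOFTIRQ_MASK            (0xffUL << SOFTIRQ_SHIFT)

static int model_in_softirq(unsigned long pc)          { return (pc & SOFTIRQ_MASK) != 0; }
static int model_in_serving_softirq(unsigned long pc)  { return (pc & SOFTIRQ_OFFSET) != 0; }

int main(void)
{
        unsigned long bh_disabled = SOFTIRQ_DISABLE_OFFSET;  /* task context, BHs off */
        unsigned long serving = SOFTIRQ_OFFSET;              /* running a softirq */

        printf("BHs disabled: in_softirq=%d in_serving_softirq=%d\n",
               model_in_softirq(bh_disabled), model_in_serving_softirq(bh_disabled));
        printf("serving:      in_softirq=%d in_serving_softirq=%d\n",
               model_in_softirq(serving), model_in_serving_softirq(serving));
        return 0;
}

With the old predicate, plain task context that had only disabled bottom halves would be classified as softirq context and pick the wrong recursion slot; the new predicate reports softirq context only while a handler is running.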
Fixes: 96f6d4444302 ("perf_counter: avoid recursion") Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20201030151955.120572175@infradead.org --- kernel/events/internal.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/internal.h b/kernel/events/internal.h index fcbf5616a441..402054e755f2 100644 --- a/kernel/events/internal.h +++ b/kernel/events/internal.h @@ -211,7 +211,7 @@ static inline int get_recursion_context(int *recursion) rctx = 3; else if (in_irq()) rctx = 2; - else if (in_softirq()) + else if (in_serving_softirq()) rctx = 1; else rctx = 0; -- cgit v1.2.3 From 09da9c81253dd8e43e0d2d7cea02de6f9f19499d Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 30 Oct 2020 13:43:16 +0100 Subject: perf: Optimize get_recursion_context() "Look ma, no branches!" Signed-off-by: Peter Zijlstra (Intel) Acked-by: Jesper Dangaard Brouer Link: https://lkml.kernel.org/r/20201030151955.187580298@infradead.org --- kernel/events/internal.h | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/events/internal.h b/kernel/events/internal.h index 402054e755f2..228801e20788 100644 --- a/kernel/events/internal.h +++ b/kernel/events/internal.h @@ -205,16 +205,12 @@ DEFINE_OUTPUT_COPY(__output_copy_user, arch_perf_out_copy_user) static inline int get_recursion_context(int *recursion) { - int rctx; - - if (unlikely(in_nmi())) - rctx = 3; - else if (in_irq()) - rctx = 2; - else if (in_serving_softirq()) - rctx = 1; - else - rctx = 0; + unsigned int pc = preempt_count(); + unsigned char rctx = 0; + + rctx += !!(pc & (NMI_MASK)); + rctx += !!(pc & (NMI_MASK | HARDIRQ_MASK)); + rctx += !!(pc & (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET)); if (recursion[rctx]) return -1; -- cgit v1.2.3 From 76a4efa80900fc40e0fdf243b42aec9fb8c35d24 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 30 Oct 2020 12:14:21 +0100 Subject: perf/arch: Remove perf_sample_data::regs_user_copy struct perf_sample_data lives on-stack, we should be careful about it's size. Furthermore, the pt_regs copy in there is only because x86_64 is a trainwreck, solve it differently. 
Reported-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Tested-by: Steven Rostedt Link: https://lkml.kernel.org/r/20201030151955.258178461@infradead.org --- arch/arm/kernel/perf_regs.c | 3 +-- arch/arm64/kernel/perf_regs.c | 3 +-- arch/csky/kernel/perf_regs.c | 3 +-- arch/powerpc/perf/perf_regs.c | 3 +-- arch/riscv/kernel/perf_regs.c | 3 +-- arch/s390/kernel/perf_regs.c | 3 +-- arch/x86/kernel/perf_regs.c | 15 +++++++++++---- include/linux/perf_event.h | 6 ------ include/linux/perf_regs.h | 6 ++---- kernel/events/core.c | 8 +++----- 10 files changed, 22 insertions(+), 31 deletions(-) (limited to 'kernel') diff --git a/arch/arm/kernel/perf_regs.c b/arch/arm/kernel/perf_regs.c index 05fe92aa7d98..0529f90395c9 100644 --- a/arch/arm/kernel/perf_regs.c +++ b/arch/arm/kernel/perf_regs.c @@ -32,8 +32,7 @@ u64 perf_reg_abi(struct task_struct *task) } void perf_get_regs_user(struct perf_regs *regs_user, - struct pt_regs *regs, - struct pt_regs *regs_user_copy) + struct pt_regs *regs) { regs_user->regs = task_pt_regs(current); regs_user->abi = perf_reg_abi(current); diff --git a/arch/arm64/kernel/perf_regs.c b/arch/arm64/kernel/perf_regs.c index 94e8718e7229..f6f58e6265df 100644 --- a/arch/arm64/kernel/perf_regs.c +++ b/arch/arm64/kernel/perf_regs.c @@ -73,8 +73,7 @@ u64 perf_reg_abi(struct task_struct *task) } void perf_get_regs_user(struct perf_regs *regs_user, - struct pt_regs *regs, - struct pt_regs *regs_user_copy) + struct pt_regs *regs) { regs_user->regs = task_pt_regs(current); regs_user->abi = perf_reg_abi(current); diff --git a/arch/csky/kernel/perf_regs.c b/arch/csky/kernel/perf_regs.c index eb32838b8210..09b7f88a2d6a 100644 --- a/arch/csky/kernel/perf_regs.c +++ b/arch/csky/kernel/perf_regs.c @@ -32,8 +32,7 @@ u64 perf_reg_abi(struct task_struct *task) } void perf_get_regs_user(struct perf_regs *regs_user, - struct pt_regs *regs, - struct pt_regs *regs_user_copy) + struct pt_regs *regs) { regs_user->regs = task_pt_regs(current); regs_user->abi = perf_reg_abi(current); diff --git a/arch/powerpc/perf/perf_regs.c b/arch/powerpc/perf/perf_regs.c index 8e53f2fc3fe0..6f681b105eec 100644 --- a/arch/powerpc/perf/perf_regs.c +++ b/arch/powerpc/perf/perf_regs.c @@ -144,8 +144,7 @@ u64 perf_reg_abi(struct task_struct *task) } void perf_get_regs_user(struct perf_regs *regs_user, - struct pt_regs *regs, - struct pt_regs *regs_user_copy) + struct pt_regs *regs) { regs_user->regs = task_pt_regs(current); regs_user->abi = (regs_user->regs) ? 
perf_reg_abi(current) : diff --git a/arch/riscv/kernel/perf_regs.c b/arch/riscv/kernel/perf_regs.c index 04a38fbeb9c7..fd304a248de6 100644 --- a/arch/riscv/kernel/perf_regs.c +++ b/arch/riscv/kernel/perf_regs.c @@ -36,8 +36,7 @@ u64 perf_reg_abi(struct task_struct *task) } void perf_get_regs_user(struct perf_regs *regs_user, - struct pt_regs *regs, - struct pt_regs *regs_user_copy) + struct pt_regs *regs) { regs_user->regs = task_pt_regs(current); regs_user->abi = perf_reg_abi(current); diff --git a/arch/s390/kernel/perf_regs.c b/arch/s390/kernel/perf_regs.c index 4352a504f235..6e9e5d5e927e 100644 --- a/arch/s390/kernel/perf_regs.c +++ b/arch/s390/kernel/perf_regs.c @@ -53,8 +53,7 @@ u64 perf_reg_abi(struct task_struct *task) } void perf_get_regs_user(struct perf_regs *regs_user, - struct pt_regs *regs, - struct pt_regs *regs_user_copy) + struct pt_regs *regs) { /* * Use the regs from the first interruption and let diff --git a/arch/x86/kernel/perf_regs.c b/arch/x86/kernel/perf_regs.c index bb7e1132290b..f9e5352b3bef 100644 --- a/arch/x86/kernel/perf_regs.c +++ b/arch/x86/kernel/perf_regs.c @@ -101,8 +101,7 @@ u64 perf_reg_abi(struct task_struct *task) } void perf_get_regs_user(struct perf_regs *regs_user, - struct pt_regs *regs, - struct pt_regs *regs_user_copy) + struct pt_regs *regs) { regs_user->regs = task_pt_regs(current); regs_user->abi = perf_reg_abi(current); @@ -129,12 +128,20 @@ u64 perf_reg_abi(struct task_struct *task) return PERF_SAMPLE_REGS_ABI_64; } +static DEFINE_PER_CPU(struct pt_regs, nmi_user_regs); + void perf_get_regs_user(struct perf_regs *regs_user, - struct pt_regs *regs, - struct pt_regs *regs_user_copy) + struct pt_regs *regs) { + struct pt_regs *regs_user_copy = this_cpu_ptr(&nmi_user_regs); struct pt_regs *user_regs = task_pt_regs(current); + if (!in_nmi()) { + regs_user->regs = user_regs; + regs_user->abi = perf_reg_abi(current); + return; + } + /* * If we're in an NMI that interrupted task_pt_regs setup, then * we can't sample user regs at all. This check isn't really diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index b775ae0a8c87..96450f6fb1de 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1022,13 +1022,7 @@ struct perf_sample_data { struct perf_callchain_entry *callchain; u64 aux_size; - /* - * regs_user may point to task_pt_regs or to regs_user_copy, depending - * on arch details. 
- */ struct perf_regs regs_user; - struct pt_regs regs_user_copy; - struct perf_regs regs_intr; u64 stack_user_size; diff --git a/include/linux/perf_regs.h b/include/linux/perf_regs.h index 2d12e97d5e7b..f632c5725f16 100644 --- a/include/linux/perf_regs.h +++ b/include/linux/perf_regs.h @@ -20,8 +20,7 @@ u64 perf_reg_value(struct pt_regs *regs, int idx); int perf_reg_validate(u64 mask); u64 perf_reg_abi(struct task_struct *task); void perf_get_regs_user(struct perf_regs *regs_user, - struct pt_regs *regs, - struct pt_regs *regs_user_copy); + struct pt_regs *regs); #else #define PERF_REG_EXTENDED_MASK 0 @@ -42,8 +41,7 @@ static inline u64 perf_reg_abi(struct task_struct *task) } static inline void perf_get_regs_user(struct perf_regs *regs_user, - struct pt_regs *regs, - struct pt_regs *regs_user_copy) + struct pt_regs *regs) { regs_user->regs = task_pt_regs(current); regs_user->abi = perf_reg_abi(current); diff --git a/kernel/events/core.c b/kernel/events/core.c index fc681c7c1e03..d67c9cbb0f6a 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -6374,14 +6374,13 @@ perf_output_sample_regs(struct perf_output_handle *handle, } static void perf_sample_regs_user(struct perf_regs *regs_user, - struct pt_regs *regs, - struct pt_regs *regs_user_copy) + struct pt_regs *regs) { if (user_mode(regs)) { regs_user->abi = perf_reg_abi(current); regs_user->regs = regs; } else if (!(current->flags & PF_KTHREAD)) { - perf_get_regs_user(regs_user, regs, regs_user_copy); + perf_get_regs_user(regs_user, regs); } else { regs_user->abi = PERF_SAMPLE_REGS_ABI_NONE; regs_user->regs = NULL; @@ -7083,8 +7082,7 @@ void perf_prepare_sample(struct perf_event_header *header, } if (sample_type & (PERF_SAMPLE_REGS_USER | PERF_SAMPLE_STACK_USER)) - perf_sample_regs_user(&data->regs_user, regs, - &data->regs_user_copy); + perf_sample_regs_user(&data->regs_user, regs); if (sample_type & PERF_SAMPLE_REGS_USER) { /* regs dump ABI info */ -- cgit v1.2.3 From 8c7855d82933bab7fa5e96f0e568fc125c2e1ab4 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 29 Oct 2020 16:28:25 +0100 Subject: perf: Simplify group_sched_out() Since event_sched_out() clears cpuctx->exclusive upon removal of an exclusive event (and only group leaders can be exclusive), there is no point in group_sched_out() trying to do it too. It is impossible for cpuctx->exclusive to still be set here. Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20201029162901.904060564@infradead.org --- kernel/events/core.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index d67c9cbb0f6a..9a5736617a82 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -2312,9 +2312,6 @@ group_sched_out(struct perf_event *group_event, event_sched_out(event, cpuctx, ctx); perf_pmu_enable(ctx->pmu); - - if (group_event->attr.exclusive) - cpuctx->exclusive = 0; } #define DETACH_GROUP 0x01UL -- cgit v1.2.3 From 251ff2d49347793d348babcff745289b11910e96 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 29 Oct 2020 16:29:15 +0100 Subject: perf: Simplify group_sched_in() Collate the error paths. Code duplication only leads to divergence and extra bugs. 
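(Editorial sketch of the single-error-label shape this patch moves to; the helpers below are hypothetical stand-ins for the pmu transaction and event_sched_in() calls, not the scheduler code itself.)

	static void start_txn(void)  { }
	static void cancel_txn(void) { }
	static int  step_one(void)   { return 0; }
	static int  step_two(void)   { return -1; }

	static int txn_example(void)
	{
		start_txn();

		if (step_one())
			goto error;
		if (step_two())
			goto error;

		return 0;

	error:
		cancel_txn();	/* one rollback site instead of several copies */
		return -1;
	}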
Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20201029162901.972161394@infradead.org --- kernel/events/core.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 9a5736617a82..f0e526866a1c 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -2580,11 +2580,8 @@ group_sched_in(struct perf_event *group_event, pmu->start_txn(pmu, PERF_PMU_TXN_ADD); - if (event_sched_in(group_event, cpuctx, ctx)) { - pmu->cancel_txn(pmu); - perf_mux_hrtimer_restart(cpuctx); - return -EAGAIN; - } + if (event_sched_in(group_event, cpuctx, ctx)) + goto error; /* * Schedule in siblings as one group (if any): @@ -2613,10 +2610,9 @@ group_error: } event_sched_out(group_event, cpuctx, ctx); +error: pmu->cancel_txn(pmu); - perf_mux_hrtimer_restart(cpuctx); - return -EAGAIN; } -- cgit v1.2.3 From 2714c3962f304d031d5016c963c4b459337b0749 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 29 Oct 2020 16:29:53 +0100 Subject: perf: Fix event multiplexing for exclusive groups Commit 9e6302056f80 ("perf: Use hrtimers for event multiplexing") placed the hrtimer (re)start call in the wrong place. Instead of capturing all scheduling failures, it only considered the PMU failure. The result is that groups using perf_event_attr::exclusive are no longer rotated. Fixes: 9e6302056f80 ("perf: Use hrtimers for event multiplexing") Reported-by: Andi Kleen Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20201029162902.038667689@infradead.org --- kernel/events/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index f0e526866a1c..00be48acdc36 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -2612,7 +2612,6 @@ group_error: error: pmu->cancel_txn(pmu); - perf_mux_hrtimer_restart(cpuctx); return -EAGAIN; } @@ -3672,6 +3671,7 @@ static int merge_sched_in(struct perf_event *event, void *data) *can_add_hw = 0; ctx->rotate_necessary = 1; + perf_mux_hrtimer_restart(cpuctx); } return 0; -- cgit v1.2.3 From 1908dc911792067287458fdb0800f036f4f4e0f6 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 29 Oct 2020 16:32:22 +0100 Subject: perf: Tweak perf_event_attr::exclusive semantics Currently perf_event_attr::exclusive can be used to ensure an event(group) is the sole group scheduled on the PMU. One consequence is that when you have a pinned event (say the watchdog) you can no longer have regular exclusive event(group)s. Inspired by the fact that !pinned events are considered less strict, allow !pinned,exclusive events to share the PMU with pinned,!exclusive events. Pinned,exclusive is still fully exclusive. Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20201029162902.105962225@infradead.org --- kernel/events/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 00be48acdc36..dc568ca295bd 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -2637,7 +2637,7 @@ static int group_can_go_on(struct perf_event *event, * If this group is exclusive and there are already * events on the CPU, it can't go on. 
*/ - if (event->attr.exclusive && cpuctx->active_oncpu) + if (event->attr.exclusive && !list_empty(get_event_list(event))) return 0; /* * Otherwise, try to add it if all previous groups were able -- cgit v1.2.3 From abbaa433de07076fb8ef524b77ce55d94bad5fc5 Mon Sep 17 00:00:00 2001 From: Wang Qing Date: Sat, 7 Nov 2020 15:45:44 +0800 Subject: bpf: Fix passing zero to PTR_ERR() in bpf_btf_printf_prepare There is a bug when passing zero to PTR_ERR() and return. Fix the smatch error. Fixes: c4d0bfb45068 ("bpf: Add bpf_snprintf_btf helper") Signed-off-by: Wang Qing Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/1604735144-686-1-git-send-email-wangqing@vivo.com --- kernel/trace/bpf_trace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 4517c8b66518..5113fd423cdf 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1198,7 +1198,7 @@ static int bpf_btf_printf_prepare(struct btf_ptr *ptr, u32 btf_ptr_size, *btf = bpf_get_btf_vmlinux(); if (IS_ERR_OR_NULL(*btf)) - return PTR_ERR(*btf); + return IS_ERR(*btf) ? PTR_ERR(*btf) : -EINVAL; if (ptr->type_id > 0) *btf_id = ptr->type_id; -- cgit v1.2.3 From 666475ccbf1dc99c1e61e47975d5fbf86d6236aa Mon Sep 17 00:00:00 2001 From: Wang Qing Date: Sat, 7 Nov 2020 16:10:50 +0800 Subject: bpf, btf: Remove the duplicate btf_ids.h include Remove duplicate btf_ids.h header which is included twice. Signed-off-by: Wang Qing Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/1604736650-11197-1-git-send-email-wangqing@vivo.com --- kernel/bpf/btf.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index ed7d02e8bc93..6324de8c59f7 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -22,7 +22,6 @@ #include #include #include -#include #include /* BTF (BPF Type Format) is the meta data format which describes -- cgit v1.2.3 From b6d37a764a5b852db63101b3f2db0e699574b903 Mon Sep 17 00:00:00 2001 From: Peng Wang Date: Tue, 10 Nov 2020 10:11:59 +0800 Subject: sched/fair: Reorder throttle_cfs_rq() path As commit: 39f23ce07b93 ("sched/fair: Fix unthrottle_cfs_rq() for leaf_cfs_rq list") does in unthrottle_cfs_rq(), throttle_cfs_rq() can also use the same pattern as dequeue_task_fair(). No functional changes. 
Signed-off-by: Peng Wang Signed-off-by: Ingo Molnar Cc: Vincent Guittot Cc: Peter Zijlstra (Intel) Cc: Phil Auld Cc: Ben Segall Link: https://lore.kernel.org/r/f11dd2e3ab35cc538e2eb57bf0c99b6eaffce127.1604973978.git.rocking@linux.alibaba.com --- kernel/sched/fair.c | 34 +++++++++++++++++++++++----------- 1 file changed, 23 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 52cacfc62922..2755a7e0f1ce 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4788,25 +4788,37 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq) struct cfs_rq *qcfs_rq = cfs_rq_of(se); /* throttled entity or throttle-on-deactivate */ if (!se->on_rq) - break; + goto done; - if (dequeue) { - dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP); - } else { - update_load_avg(qcfs_rq, se, 0); - se_update_runnable(se); - } + dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP); qcfs_rq->h_nr_running -= task_delta; qcfs_rq->idle_h_nr_running -= idle_task_delta; - if (qcfs_rq->load.weight) - dequeue = 0; + if (qcfs_rq->load.weight) { + /* Avoid re-evaluating load for this entity: */ + se = parent_entity(se); + break; + } } - if (!se) - sub_nr_running(rq, task_delta); + for_each_sched_entity(se) { + struct cfs_rq *qcfs_rq = cfs_rq_of(se); + /* throttled entity or throttle-on-deactivate */ + if (!se->on_rq) + goto done; + + update_load_avg(qcfs_rq, se, 0); + se_update_runnable(se); + qcfs_rq->h_nr_running -= task_delta; + qcfs_rq->idle_h_nr_running -= idle_task_delta; + } + + /* At this point se is NULL and we are at root level*/ + sub_nr_running(rq, task_delta); + +done: /* * Note: distribution will already see us throttled via the * throttled-list. rq->lock protects completion. -- cgit v1.2.3 From 04e613ded8c26489b3e0f9101b44462f780d1a35 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Fri, 6 Nov 2020 10:25:49 +0000 Subject: arm64: smp: Tell RCU about CPUs that fail to come online Commit ce3d31ad3cac ("arm64/smp: Move rcu_cpu_starting() earlier") ensured that RCU is informed early about incoming CPUs that might end up calling into printk() before they are online. However, if such a CPU fails the early CPU feature compatibility checks in check_local_cpu_capabilities(), then it will be powered off or parked without informing RCU, leading to an endless stream of stalls: | rcu: INFO: rcu_preempt detected stalls on CPUs/tasks: | rcu: 2-O...: (0 ticks this GP) idle=002/1/0x4000000000000000 softirq=0/0 fqs=2593 | (detected by 0, t=5252 jiffies, g=9317, q=136) | Task dump for CPU 2: | task:swapper/2 state:R running task stack: 0 pid: 0 ppid: 1 flags:0x00000028 | Call trace: | ret_from_fork+0x0/0x30 Ensure that the dying CPU invokes rcu_report_dead() prior to being powered off or parked. Cc: Qian Cai Cc: "Paul E. McKenney" Reviewed-by: Paul E. 
McKenney Suggested-by: Qian Cai Link: https://lore.kernel.org/r/20201105222242.GA8842@willie-the-truck Link: https://lore.kernel.org/r/20201106103602.9849-3-will@kernel.org Signed-off-by: Will Deacon --- arch/arm64/kernel/smp.c | 1 + kernel/rcu/tree.c | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c index 09c96f57818c..18e9727d3f64 100644 --- a/arch/arm64/kernel/smp.c +++ b/arch/arm64/kernel/smp.c @@ -413,6 +413,7 @@ void cpu_die_early(void) /* Mark this CPU absent */ set_cpu_present(cpu, 0); + rcu_report_dead(cpu); if (IS_ENABLED(CONFIG_HOTPLUG_CPU)) { update_cpu_boot_status(CPU_KILL_ME); diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 06895ef85d69..946e7c0c4cf7 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -4077,7 +4077,6 @@ void rcu_cpu_starting(unsigned int cpu) smp_mb(); /* Ensure RCU read-side usage follows above initialization. */ } -#ifdef CONFIG_HOTPLUG_CPU /* * The outgoing function has no further need of RCU, so remove it from * the rcu_node tree's ->qsmaskinitnext bit masks. @@ -4117,6 +4116,7 @@ void rcu_report_dead(unsigned int cpu) rdp->cpu_started = false; } +#ifdef CONFIG_HOTPLUG_CPU /* * The outgoing CPU has just passed through the dying-idle state, and we * are being invoked from the CPU that was IPIed to continue the offline -- cgit v1.2.3 From 9a2a9ebc0a758d887ee06e067e9f7f0b36ff7574 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 10 Nov 2020 18:25:57 +0100 Subject: cpufreq: Introduce governor flags A new cpufreq governor flag will be added subsequently, so replace the bool dynamic_switching field in struct cpufreq_governor with a flags field and introduce CPUFREQ_GOV_DYNAMIC_SWITCHING for the "dynamic switching" governors to set instead of it. No intentional functional impact. Signed-off-by: Rafael J. Wysocki Acked-by: Viresh Kumar --- drivers/cpufreq/cpufreq.c | 2 +- drivers/cpufreq/cpufreq_governor.h | 2 +- include/linux/cpufreq.h | 9 +++++++-- kernel/sched/cpufreq_schedutil.c | 2 +- 4 files changed, 10 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index 336b5e94cbc8..0252903f1b43 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -2254,7 +2254,7 @@ static int cpufreq_init_governor(struct cpufreq_policy *policy) return -EINVAL; /* Platform doesn't want dynamic frequency switching ?
*/ - if (policy->governor->dynamic_switching && + if (policy->governor->flags & CPUFREQ_GOV_DYNAMIC_SWITCHING && cpufreq_driver->flags & CPUFREQ_NO_AUTO_DYNAMIC_SWITCHING) { struct cpufreq_governor *gov = cpufreq_fallback_governor(); diff --git a/drivers/cpufreq/cpufreq_governor.h b/drivers/cpufreq/cpufreq_governor.h index c56773c25757..bab8e6140377 100644 --- a/drivers/cpufreq/cpufreq_governor.h +++ b/drivers/cpufreq/cpufreq_governor.h @@ -156,7 +156,7 @@ void cpufreq_dbs_governor_limits(struct cpufreq_policy *policy); #define CPUFREQ_DBS_GOVERNOR_INITIALIZER(_name_) \ { \ .name = _name_, \ - .dynamic_switching = true, \ + .flags = CPUFREQ_GOV_DYNAMIC_SWITCHING, \ .owner = THIS_MODULE, \ .init = cpufreq_dbs_governor_init, \ .exit = cpufreq_dbs_governor_exit, \ diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index 1eaa04f1bae6..9bdfcf3c4748 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -570,12 +570,17 @@ struct cpufreq_governor { char *buf); int (*store_setspeed) (struct cpufreq_policy *policy, unsigned int freq); - /* For governors which change frequency dynamically by themselves */ - bool dynamic_switching; struct list_head governor_list; struct module *owner; + u8 flags; }; +/* Governor flags */ + +/* For governors which change frequency dynamically by themselves */ +#define CPUFREQ_GOV_DYNAMIC_SWITCHING BIT(0) + + /* Pass a target to the cpufreq driver */ unsigned int cpufreq_driver_fast_switch(struct cpufreq_policy *policy, unsigned int target_freq); diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index d73bccde2720..97d318b0cd0c 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -881,7 +881,7 @@ static void sugov_limits(struct cpufreq_policy *policy) struct cpufreq_governor schedutil_gov = { .name = "schedutil", .owner = THIS_MODULE, - .dynamic_switching = true, + .flags = CPUFREQ_GOV_DYNAMIC_SWITCHING, .init = sugov_init, .exit = sugov_exit, .start = sugov_start, -- cgit v1.2.3 From d61fc96a37603384cd531622c1e89de1096b5123 Mon Sep 17 00:00:00 2001 From: Boqun Feng Date: Mon, 2 Nov 2020 13:37:41 +0800 Subject: lockdep: Avoid to modify chain keys in validate_chain() Chris Wilson reported a problem spotted by check_chain_key(): a chain key got changed in validate_chain() because we modify the ->read in validate_chain() to skip checks for dependency adding, and ->read is taken into calculation for chain key since commit f611e8cf98ec ("lockdep: Take read/write status in consideration when generate chainkey"). Fix this by avoiding to modify ->read in validate_chain() based on two facts: a) since we now support recursive read lock detection, there is no need to skip checks for dependency adding for recursive readers, b) since we have a), there is only one case left (nest_lock) where we want to skip checks in validate_chain(), we simply remove the modification for ->read and rely on the return value of check_deadlock() to skip the dependency adding. 
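(Editorial aside: the nest_lock case referred to above is the "many locks of one class serialized by an outer lock" pattern; a hedged sketch using the stock mutex API, with illustrative parent/child structures.)

	/*
	 * All child mutexes share one lock class; taking them while holding the
	 * parent's mutex is declared safe via the nest_lock annotation.  This is
	 * the situation where check_deadlock() now returns 2 and validate_chain()
	 * skips adding another dependency.
	 */
	mutex_lock(&parent->lock);
	list_for_each_entry(child, &parent->children, node)
		mutex_lock_nest_lock(&child->lock, &parent->lock);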
Reported-by: Chris Wilson Signed-off-by: Boqun Feng Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20201102053743.450459-1-boqun.feng@gmail.com --- kernel/locking/lockdep.c | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index b71ad8d9f1c9..d9fb9e19d2ed 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -2765,7 +2765,9 @@ print_deadlock_bug(struct task_struct *curr, struct held_lock *prev, * (Note that this has to be done separately, because the graph cannot * detect such classes of deadlocks.) * - * Returns: 0 on deadlock detected, 1 on OK, 2 on recursive read + * Returns: 0 on deadlock detected, 1 on OK, 2 if another lock with the same + * lock class is held but nest_lock is also held, i.e. we rely on the + * nest_lock to avoid the deadlock. */ static int check_deadlock(struct task_struct *curr, struct held_lock *next) @@ -2788,7 +2790,7 @@ check_deadlock(struct task_struct *curr, struct held_lock *next) * lock class (i.e. read_lock(lock)+read_lock(lock)): */ if ((next->read == 2) && prev->read) - return 2; + continue; /* * We're holding the nest_lock, which serializes this lock's @@ -3592,16 +3594,13 @@ static int validate_chain(struct task_struct *curr, if (!ret) return 0; - /* - * Mark recursive read, as we jump over it when - * building dependencies (just like we jump over - * trylock entries): - */ - if (ret == 2) - hlock->read = 2; /* * Add dependency only if this lock is not the head - * of the chain, and if it's not a secondary read-lock: + * of the chain, and if the new lock introduces no more + * lock dependency (because we already hold a lock with the + * same lock class) nor deadlock (because the nest_lock + * serializes nesting locks), see the comments for + * check_deadlock(). */ if (!chain_head && ret != 2) { if (!check_prevs_add(curr, hlock)) -- cgit v1.2.3 From 16b0a7a1a0af9db6e008fecd195fe4d6cb366d83 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Mon, 2 Nov 2020 11:24:57 +0100 Subject: sched/fair: Ensure tasks spreading in LLC during LB schbench shows latency increase for 95 percentile above since: commit 0b0695f2b34a ("sched/fair: Rework load_balance()") Align the behavior of the load balancer with the wake up path, which tries to select an idle CPU which belongs to the LLC for a waking task. calculate_imbalance() will use nr_running instead of the spare capacity when CPUs share resources (ie cache) at the domain level. This will ensure a better spread of tasks on idle CPUs. 
Running schbench on a hikey (8cores arm64) shows the problem: tip/sched/core : schbench -m 2 -t 4 -s 10000 -c 1000000 -r 10 Latency percentiles (usec) 50.0th: 33 75.0th: 45 90.0th: 51 95.0th: 4152 *99.0th: 14288 99.5th: 14288 99.9th: 14288 min=0, max=14276 tip/sched/core + patch : schbench -m 2 -t 4 -s 10000 -c 1000000 -r 10 Latency percentiles (usec) 50.0th: 34 75.0th: 47 90.0th: 52 95.0th: 78 *99.0th: 94 99.5th: 94 99.9th: 94 min=0, max=94 Fixes: 0b0695f2b34a ("sched/fair: Rework load_balance()") Reported-by: Chris Mason Suggested-by: Rik van Riel Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Rik van Riel Tested-by: Rik van Riel Link: https://lkml.kernel.org/r/20201102102457.28808-1-vincent.guittot@linaro.org --- kernel/sched/fair.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index aa4c6227cd6d..210b15f068a6 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -9031,7 +9031,8 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s * emptying busiest. */ if (local->group_type == group_has_spare) { - if (busiest->group_type > group_fully_busy) { + if ((busiest->group_type > group_fully_busy) && + !(env->sd->flags & SD_SHARE_PKG_RESOURCES)) { /* * If busiest is overloaded, try to fill spare * capacity. This might end up creating spare capacity -- cgit v1.2.3 From b4c9c9f15649c98a5b45408919d1ff4fd7f5531c Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Thu, 29 Oct 2020 17:18:24 +0100 Subject: sched/fair: Prefer prev cpu in asymmetric wakeup path During the fast wakeup path, the scheduler always checks whether the local or prev cpus are good candidates for the task before looking for other cpus in the domain. With commit b7a331615d25 ("sched/fair: Add asymmetric CPU capacity wakeup scan") the heterogeneous system gains a dedicated path but doesn't try to reuse the prev cpu whenever possible. If the previous cpu is idle and belongs to the LLC domain, we should check it 1st before looking for another cpu because it stays one of the best candidates and this also stabilizes task placement on the system. This change aligns the asymmetric path behavior with the symmetric one and reduces cases where the task migrates across all cpus of the sd_asym_cpucapacity domains at wakeup. This change does not impact normal EAS mode but only the overloaded case or when EAS is not used. - On hikey960 with performance governor (EAS disabled) ./perf bench sched pipe -T -l 50000 mainline w/ patch # migrations 999364 0 ops/sec 149313(+/-0.28%) 182587(+/- 0.40) +22% - On hikey with performance governor ./perf bench sched pipe -T -l 50000 mainline w/ patch # migrations 0 0 ops/sec 47721(+/-0.76%) 47899(+/- 0.56) +0.4% According to tests on hikey, the patch doesn't impact symmetric systems compared to the current implementation (only tested on arm64). Also read the uclamped value of the task's utilization at most twice instead of each time we compare the task's utilization with the cpu's capacity.
Fixes: b7a331615d25 ("sched/fair: Add asymmetric CPU capacity wakeup scan") Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Tested-by: Dietmar Eggemann Reviewed-by: Valentin Schneider Link: https://lkml.kernel.org/r/20201029161824.26389-1-vincent.guittot@linaro.org --- kernel/sched/fair.c | 67 ++++++++++++++++++++++++++++++++++------------------- 1 file changed, 43 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 210b15f068a6..8e563cf2b5e7 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6172,21 +6172,21 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t static int select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target) { - unsigned long best_cap = 0; + unsigned long task_util, best_cap = 0; int cpu, best_cpu = -1; struct cpumask *cpus; - sync_entity_load_avg(&p->se); - cpus = this_cpu_cpumask_var_ptr(select_idle_mask); cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr); + task_util = uclamp_task_util(p); + for_each_cpu_wrap(cpu, cpus, target) { unsigned long cpu_cap = capacity_of(cpu); if (!available_idle_cpu(cpu) && !sched_idle_cpu(cpu)) continue; - if (task_fits_capacity(p, cpu_cap)) + if (fits_capacity(task_util, cpu_cap)) return cpu; if (cpu_cap > best_cap) { @@ -6198,44 +6198,42 @@ select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target) return best_cpu; } +static inline bool asym_fits_capacity(int task_util, int cpu) +{ + if (static_branch_unlikely(&sched_asym_cpucapacity)) + return fits_capacity(task_util, capacity_of(cpu)); + + return true; +} + /* * Try and locate an idle core/thread in the LLC cache domain. */ static int select_idle_sibling(struct task_struct *p, int prev, int target) { struct sched_domain *sd; + unsigned long task_util; int i, recent_used_cpu; /* - * For asymmetric CPU capacity systems, our domain of interest is - * sd_asym_cpucapacity rather than sd_llc. + * On asymmetric system, update task utilization because we will check + * that the task fits with cpu's capacity. */ if (static_branch_unlikely(&sched_asym_cpucapacity)) { - sd = rcu_dereference(per_cpu(sd_asym_cpucapacity, target)); - /* - * On an asymmetric CPU capacity system where an exclusive - * cpuset defines a symmetric island (i.e. one unique - * capacity_orig value through the cpuset), the key will be set - * but the CPUs within that cpuset will not have a domain with - * SD_ASYM_CPUCAPACITY. These should follow the usual symmetric - * capacity path. - */ - if (!sd) - goto symmetric; - - i = select_idle_capacity(p, sd, target); - return ((unsigned)i < nr_cpumask_bits) ? 
i : target; + sync_entity_load_avg(&p->se); + task_util = uclamp_task_util(p); } -symmetric: - if (available_idle_cpu(target) || sched_idle_cpu(target)) + if ((available_idle_cpu(target) || sched_idle_cpu(target)) && + asym_fits_capacity(task_util, target)) return target; /* * If the previous CPU is cache affine and idle, don't be stupid: */ if (prev != target && cpus_share_cache(prev, target) && - (available_idle_cpu(prev) || sched_idle_cpu(prev))) + (available_idle_cpu(prev) || sched_idle_cpu(prev)) && + asym_fits_capacity(task_util, prev)) return prev; /* @@ -6258,7 +6256,8 @@ symmetric: recent_used_cpu != target && cpus_share_cache(recent_used_cpu, target) && (available_idle_cpu(recent_used_cpu) || sched_idle_cpu(recent_used_cpu)) && - cpumask_test_cpu(p->recent_used_cpu, p->cpus_ptr)) { + cpumask_test_cpu(p->recent_used_cpu, p->cpus_ptr) && + asym_fits_capacity(task_util, recent_used_cpu)) { /* * Replace recent_used_cpu with prev as it is a potential * candidate for the next wake: @@ -6267,6 +6266,26 @@ symmetric: return recent_used_cpu; } + /* + * For asymmetric CPU capacity systems, our domain of interest is + * sd_asym_cpucapacity rather than sd_llc. + */ + if (static_branch_unlikely(&sched_asym_cpucapacity)) { + sd = rcu_dereference(per_cpu(sd_asym_cpucapacity, target)); + /* + * On an asymmetric CPU capacity system where an exclusive + * cpuset defines a symmetric island (i.e. one unique + * capacity_orig value through the cpuset), the key will be set + * but the CPUs within that cpuset will not have a domain with + * SD_ASYM_CPUCAPACITY. These should follow the usual symmetric + * capacity path. + */ + if (sd) { + i = select_idle_capacity(p, sd, target); + return ((unsigned)i < nr_cpumask_bits) ? i : target; + } + } + sd = rcu_dereference(per_cpu(sd_llc, target)); if (!sd) return target; -- cgit v1.2.3 From 8d4d9c7b4333abccb3bf310d76ef7ea2edb9828f Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Thu, 29 Oct 2020 15:11:03 +0000 Subject: sched/debug: Fix memory corruption caused by multiple small reads of flags Reading /proc/sys/kernel/sched_domain/cpu*/domain0/flags mutliple times with small reads causes oopses with slub corruption issues because the kfree is free'ing an offset from a previous allocation. Fix this by adding in a new pointer 'buf' for the allocation and kfree and use the temporary pointer tmp to handle memory copies of the buf offsets. 
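(Editorial illustration of the underlying bug pattern, reduced to a few lines; the real fix below keeps the allocation pointer for kfree() and advances only a separate cursor.)

	/*
	 * The bug, in miniature:
	 *
	 *	char *tmp = kcalloc(n, 1, GFP_KERNEL);
	 *	...
	 *	tmp += *ppos;	// the only pointer to the allocation moves
	 *	...
	 *	kfree(tmp);	// frees an interior pointer -> slab corruption
	 *
	 * Keeping 'buf' as the owner handed to kfree() and using 'tmp = buf + *ppos'
	 * purely as a read cursor avoids it.
	 */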
Fixes: 5b9f8ff7b320 ("sched/debug: Output SD flag names rather than their values") Reported-by: Jeff Bastian Signed-off-by: Colin Ian King Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Valentin Schneider Link: https://lkml.kernel.org/r/20201029151103.373410-1-colin.king@canonical.com --- kernel/sched/debug.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 0655524700d2..2357921580f9 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -251,7 +251,7 @@ static int sd_ctl_doflags(struct ctl_table *table, int write, unsigned long flags = *(unsigned long *)table->data; size_t data_size = 0; size_t len = 0; - char *tmp; + char *tmp, *buf; int idx; if (write) @@ -269,17 +269,17 @@ static int sd_ctl_doflags(struct ctl_table *table, int write, return 0; } - tmp = kcalloc(data_size + 1, sizeof(*tmp), GFP_KERNEL); - if (!tmp) + buf = kcalloc(data_size + 1, sizeof(*buf), GFP_KERNEL); + if (!buf) return -ENOMEM; for_each_set_bit(idx, &flags, __SD_FLAG_CNT) { char *name = sd_flag_debug[idx].name; - len += snprintf(tmp + len, strlen(name) + 2, "%s ", name); + len += snprintf(buf + len, strlen(name) + 2, "%s ", name); } - tmp += *ppos; + tmp = buf + *ppos; len -= *ppos; if (len > *lenp) @@ -294,7 +294,7 @@ static int sd_ctl_doflags(struct ctl_table *table, int write, *lenp = len; *ppos += len; - kfree(tmp); + kfree(buf); return 0; } -- cgit v1.2.3 From a8b62fd0850503cf1e557d7e5a98d3f1f5c25eef Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 21 Sep 2020 12:58:17 +0200 Subject: stop_machine: Add function and caller debug info Crashes in stop-machine are hard to connect to the calling code, add a little something to help with that. Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Valentin Schneider Reviewed-by: Daniel Bristot de Oliveira Link: https://lkml.kernel.org/r/20201023102346.116513635@infradead.org --- include/linux/stop_machine.h | 5 +++++ kernel/sched/core.c | 1 + kernel/stop_machine.c | 27 ++++++++++++++++++++++++--- lib/dump_stack.c | 2 ++ 4 files changed, 32 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/include/linux/stop_machine.h b/include/linux/stop_machine.h index 76d8b09384a7..30577c3aecf8 100644 --- a/include/linux/stop_machine.h +++ b/include/linux/stop_machine.h @@ -24,6 +24,7 @@ typedef int (*cpu_stop_fn_t)(void *arg); struct cpu_stop_work { struct list_head list; /* cpu_stopper->works */ cpu_stop_fn_t fn; + unsigned long caller; void *arg; struct cpu_stop_done *done; }; @@ -36,6 +37,8 @@ void stop_machine_park(int cpu); void stop_machine_unpark(int cpu); void stop_machine_yield(const struct cpumask *cpumask); +extern void print_stop_info(const char *log_lvl, struct task_struct *task); + #else /* CONFIG_SMP */ #include @@ -80,6 +83,8 @@ static inline bool stop_one_cpu_nowait(unsigned int cpu, return false; } +static inline void print_stop_info(const char *log_lvl, struct task_struct *task) { } + #endif /* CONFIG_SMP */ /* diff --git a/kernel/sched/core.c b/kernel/sched/core.c index d2003a7d5ab5..5e24104faafd 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6447,6 +6447,7 @@ void sched_show_task(struct task_struct *p) (unsigned long)task_thread_info(p)->flags); print_worker_info(KERN_INFO, p); + print_stop_info(KERN_INFO, p); show_stack(p, NULL, KERN_INFO); put_task_stack(p); } diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 865bb0228ab6..3cf567c2019d 100644 --- a/kernel/stop_machine.c +++ 
b/kernel/stop_machine.c @@ -42,11 +42,27 @@ struct cpu_stopper { struct list_head works; /* list of pending works */ struct cpu_stop_work stop_work; /* for stop_cpus */ + unsigned long caller; + cpu_stop_fn_t fn; }; static DEFINE_PER_CPU(struct cpu_stopper, cpu_stopper); static bool stop_machine_initialized = false; +void print_stop_info(const char *log_lvl, struct task_struct *task) +{ + /* + * If @task is a stopper task, it cannot migrate and task_cpu() is + * stable. + */ + struct cpu_stopper *stopper = per_cpu_ptr(&cpu_stopper, task_cpu(task)); + + if (task != stopper->thread) + return; + + printk("%sStopper: %pS <- %pS\n", log_lvl, stopper->fn, (void *)stopper->caller); +} + /* static data for stop_cpus */ static DEFINE_MUTEX(stop_cpus_mutex); static bool stop_cpus_in_progress; @@ -123,7 +139,7 @@ static bool cpu_stop_queue_work(unsigned int cpu, struct cpu_stop_work *work) int stop_one_cpu(unsigned int cpu, cpu_stop_fn_t fn, void *arg) { struct cpu_stop_done done; - struct cpu_stop_work work = { .fn = fn, .arg = arg, .done = &done }; + struct cpu_stop_work work = { .fn = fn, .arg = arg, .done = &done, .caller = _RET_IP_ }; cpu_stop_init_done(&done, 1); if (!cpu_stop_queue_work(cpu, &work)) @@ -331,7 +347,8 @@ int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void * work1 = work2 = (struct cpu_stop_work){ .fn = multi_cpu_stop, .arg = &msdata, - .done = &done + .done = &done, + .caller = _RET_IP_, }; cpu_stop_init_done(&done, 2); @@ -367,7 +384,7 @@ int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void * bool stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg, struct cpu_stop_work *work_buf) { - *work_buf = (struct cpu_stop_work){ .fn = fn, .arg = arg, }; + *work_buf = (struct cpu_stop_work){ .fn = fn, .arg = arg, .caller = _RET_IP_, }; return cpu_stop_queue_work(cpu, work_buf); } @@ -487,6 +504,8 @@ repeat: int ret; /* cpu stop callbacks must not sleep, make in_atomic() == T */ + stopper->caller = work->caller; + stopper->fn = fn; preempt_count_inc(); ret = fn(arg); if (done) { @@ -495,6 +514,8 @@ repeat: cpu_stop_signal_done(done); } preempt_count_dec(); + stopper->fn = NULL; + stopper->caller = 0; WARN_ONCE(preempt_count(), "cpu_stop: %ps(%p) leaked preempt count\n", fn, arg); goto repeat; diff --git a/lib/dump_stack.c b/lib/dump_stack.c index a00ee6eedc7c..f5a33b6f773f 100644 --- a/lib/dump_stack.c +++ b/lib/dump_stack.c @@ -12,6 +12,7 @@ #include #include #include +#include static char dump_stack_arch_desc_str[128]; @@ -57,6 +58,7 @@ void dump_stack_print_info(const char *log_lvl) log_lvl, dump_stack_arch_desc_str); print_worker_info(log_lvl, current); + print_stop_info(log_lvl, current); } /** -- cgit v1.2.3 From 565790d28b1e33ee2f77bad5348b99f6dfc366fd Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 11 May 2020 14:13:00 +0200 Subject: sched: Fix balance_callback() The intent of balance_callback() has always been to delay executing balancing operations until the end of the current rq->lock section. This is because balance operations must often drop rq->lock, and that isn't safe in general. However, as noted by Scott, there were a few holes in that scheme; balance_callback() was called after rq->lock was dropped, which means another CPU can interleave and touch the callback list. Rework code to call the balance callbacks before dropping rq->lock where possible, and otherwise splice the balance list onto a local stack. This guarantees that the balance list must be empty when we take rq->lock. 
IOW, we'll only ever run our own balance callbacks. Reported-by: Scott Wood Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Valentin Schneider Reviewed-by: Daniel Bristot de Oliveira Link: https://lkml.kernel.org/r/20201023102346.203901269@infradead.org --- kernel/sched/core.c | 119 ++++++++++++++++++++++++++++++++------------------- kernel/sched/sched.h | 3 ++ 2 files changed, 78 insertions(+), 44 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 5e24104faafd..0196a3fba087 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3485,6 +3485,69 @@ static inline void finish_task(struct task_struct *prev) #endif } +#ifdef CONFIG_SMP + +static void do_balance_callbacks(struct rq *rq, struct callback_head *head) +{ + void (*func)(struct rq *rq); + struct callback_head *next; + + lockdep_assert_held(&rq->lock); + + while (head) { + func = (void (*)(struct rq *))head->func; + next = head->next; + head->next = NULL; + head = next; + + func(rq); + } +} + +static inline struct callback_head *splice_balance_callbacks(struct rq *rq) +{ + struct callback_head *head = rq->balance_callback; + + lockdep_assert_held(&rq->lock); + if (head) + rq->balance_callback = NULL; + + return head; +} + +static void __balance_callbacks(struct rq *rq) +{ + do_balance_callbacks(rq, splice_balance_callbacks(rq)); +} + +static inline void balance_callbacks(struct rq *rq, struct callback_head *head) +{ + unsigned long flags; + + if (unlikely(head)) { + raw_spin_lock_irqsave(&rq->lock, flags); + do_balance_callbacks(rq, head); + raw_spin_unlock_irqrestore(&rq->lock, flags); + } +} + +#else + +static inline void __balance_callbacks(struct rq *rq) +{ +} + +static inline struct callback_head *splice_balance_callbacks(struct rq *rq) +{ + return NULL; +} + +static inline void balance_callbacks(struct rq *rq, struct callback_head *head) +{ +} + +#endif + static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next, struct rq_flags *rf) { @@ -3510,6 +3573,7 @@ static inline void finish_lock_switch(struct rq *rq) * prev into current: */ spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_); + __balance_callbacks(rq); raw_spin_unlock_irq(&rq->lock); } @@ -3651,43 +3715,6 @@ static struct rq *finish_task_switch(struct task_struct *prev) return rq; } -#ifdef CONFIG_SMP - -/* rq->lock is NOT held, but preemption is disabled */ -static void __balance_callback(struct rq *rq) -{ - struct callback_head *head, *next; - void (*func)(struct rq *rq); - unsigned long flags; - - raw_spin_lock_irqsave(&rq->lock, flags); - head = rq->balance_callback; - rq->balance_callback = NULL; - while (head) { - func = (void (*)(struct rq *))head->func; - next = head->next; - head->next = NULL; - head = next; - - func(rq); - } - raw_spin_unlock_irqrestore(&rq->lock, flags); -} - -static inline void balance_callback(struct rq *rq) -{ - if (unlikely(rq->balance_callback)) - __balance_callback(rq); -} - -#else - -static inline void balance_callback(struct rq *rq) -{ -} - -#endif - /** * schedule_tail - first thing a freshly forked thread must call. * @prev: the thread we just switched away from. 
@@ -3707,7 +3734,6 @@ asmlinkage __visible void schedule_tail(struct task_struct *prev) */ rq = finish_task_switch(prev); - balance_callback(rq); preempt_enable(); if (current->set_child_tid) @@ -4523,10 +4549,11 @@ static void __sched notrace __schedule(bool preempt) rq = context_switch(rq, prev, next, &rf); } else { rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP); - rq_unlock_irq(rq, &rf); - } - balance_callback(rq); + rq_unpin_lock(rq, &rf); + __balance_callbacks(rq); + raw_spin_unlock_irq(&rq->lock); + } } void __noreturn do_task_dead(void) @@ -4937,9 +4964,11 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task) out_unlock: /* Avoid rq from going away on us: */ preempt_disable(); - __task_rq_unlock(rq, &rf); - balance_callback(rq); + rq_unpin_lock(rq, &rf); + __balance_callbacks(rq); + raw_spin_unlock(&rq->lock); + preempt_enable(); } #else @@ -5213,6 +5242,7 @@ static int __sched_setscheduler(struct task_struct *p, int retval, oldprio, oldpolicy = -1, queued, running; int new_effective_prio, policy = attr->sched_policy; const struct sched_class *prev_class; + struct callback_head *head; struct rq_flags rf; int reset_on_fork; int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK; @@ -5451,6 +5481,7 @@ change: /* Avoid rq from going away on us: */ preempt_disable(); + head = splice_balance_callbacks(rq); task_rq_unlock(rq, p, &rf); if (pi) { @@ -5459,7 +5490,7 @@ change: } /* Run balance callbacks after we've adjusted the PI chain: */ - balance_callback(rq); + balance_callbacks(rq, head); preempt_enable(); return 0; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index df80bfcea92e..738a00b9237a 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1221,6 +1221,9 @@ static inline void rq_pin_lock(struct rq *rq, struct rq_flags *rf) rq->clock_update_flags &= (RQCF_REQ_SKIP|RQCF_ACT_SKIP); rf->clock_update_flags = 0; #endif +#ifdef CONFIG_SMP + SCHED_WARN_ON(rq->balance_callback); +#endif } static inline void rq_unpin_lock(struct rq *rq, struct rq_flags *rf) -- cgit v1.2.3 From 2558aacff8586699bcd248b406febb28b0a25de2 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 11 Sep 2020 09:54:27 +0200 Subject: sched/hotplug: Ensure only per-cpu kthreads run during hotplug In preparation for migrate_disable(), make sure only per-cpu kthreads are allowed to run on !active CPUs. This is ran (as one of the very first steps) from the cpu-hotplug task which is a per-cpu kthread and completion of the hotplug operation only requires such tasks. This constraint enables the migrate_disable() implementation to wait for completion of all migrate_disable regions on this CPU at hotplug time without fear of any new ones starting. This replaces the unlikely(rq->balance_callbacks) test at the tail of context_switch with an unlikely(rq->balance_work), the fast path is not affected. 
Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Valentin Schneider Reviewed-by: Daniel Bristot de Oliveira Link: https://lkml.kernel.org/r/20201023102346.292709163@infradead.org --- kernel/sched/core.c | 114 ++++++++++++++++++++++++++++++++++++++++++++++++++- kernel/sched/sched.h | 7 +++- 2 files changed, 118 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 0196a3fba087..1f8bfc952015 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3509,8 +3509,10 @@ static inline struct callback_head *splice_balance_callbacks(struct rq *rq) struct callback_head *head = rq->balance_callback; lockdep_assert_held(&rq->lock); - if (head) + if (head) { rq->balance_callback = NULL; + rq->balance_flags &= ~BALANCE_WORK; + } return head; } @@ -3531,6 +3533,21 @@ static inline void balance_callbacks(struct rq *rq, struct callback_head *head) } } +static void balance_push(struct rq *rq); + +static inline void balance_switch(struct rq *rq) +{ + if (likely(!rq->balance_flags)) + return; + + if (rq->balance_flags & BALANCE_PUSH) { + balance_push(rq); + return; + } + + __balance_callbacks(rq); +} + #else static inline void __balance_callbacks(struct rq *rq) @@ -3546,6 +3563,10 @@ static inline void balance_callbacks(struct rq *rq, struct callback_head *head) { } +static inline void balance_switch(struct rq *rq) +{ +} + #endif static inline void @@ -3573,7 +3594,7 @@ static inline void finish_lock_switch(struct rq *rq) * prev into current: */ spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_); - __balance_callbacks(rq); + balance_switch(rq); raw_spin_unlock_irq(&rq->lock); } @@ -6833,6 +6854,90 @@ static void migrate_tasks(struct rq *dead_rq, struct rq_flags *rf) rq->stop = stop; } + +static int __balance_push_cpu_stop(void *arg) +{ + struct task_struct *p = arg; + struct rq *rq = this_rq(); + struct rq_flags rf; + int cpu; + + raw_spin_lock_irq(&p->pi_lock); + rq_lock(rq, &rf); + + update_rq_clock(rq); + + if (task_rq(p) == rq && task_on_rq_queued(p)) { + cpu = select_fallback_rq(rq->cpu, p); + rq = __migrate_task(rq, &rf, p, cpu); + } + + rq_unlock(rq, &rf); + raw_spin_unlock_irq(&p->pi_lock); + + put_task_struct(p); + + return 0; +} + +static DEFINE_PER_CPU(struct cpu_stop_work, push_work); + +/* + * Ensure we only run per-cpu kthreads once the CPU goes !active. + */ +static void balance_push(struct rq *rq) +{ + struct task_struct *push_task = rq->curr; + + lockdep_assert_held(&rq->lock); + SCHED_WARN_ON(rq->cpu != smp_processor_id()); + + /* + * Both the cpu-hotplug and stop task are in this case and are + * required to complete the hotplug process. + */ + if (is_per_cpu_kthread(push_task)) + return; + + get_task_struct(push_task); + /* + * Temporarily drop rq->lock such that we can wake-up the stop task. + * Both preemption and IRQs are still disabled. + */ + raw_spin_unlock(&rq->lock); + stop_one_cpu_nowait(rq->cpu, __balance_push_cpu_stop, push_task, + this_cpu_ptr(&push_work)); + /* + * At this point need_resched() is true and we'll take the loop in + * schedule(). The next pick is obviously going to be the stop task + * which is_per_cpu_kthread() and will push this task away. 
+ */ + raw_spin_lock(&rq->lock); +} + +static void balance_push_set(int cpu, bool on) +{ + struct rq *rq = cpu_rq(cpu); + struct rq_flags rf; + + rq_lock_irqsave(rq, &rf); + if (on) + rq->balance_flags |= BALANCE_PUSH; + else + rq->balance_flags &= ~BALANCE_PUSH; + rq_unlock_irqrestore(rq, &rf); +} + +#else + +static inline void balance_push(struct rq *rq) +{ +} + +static inline void balance_push_set(int cpu, bool on) +{ +} + #endif /* CONFIG_HOTPLUG_CPU */ void set_rq_online(struct rq *rq) @@ -6918,6 +7023,8 @@ int sched_cpu_activate(unsigned int cpu) struct rq *rq = cpu_rq(cpu); struct rq_flags rf; + balance_push_set(cpu, false); + #ifdef CONFIG_SCHED_SMT /* * When going up, increment the number of cores with SMT present. @@ -6965,6 +7072,8 @@ int sched_cpu_deactivate(unsigned int cpu) */ synchronize_rcu(); + balance_push_set(cpu, true); + #ifdef CONFIG_SCHED_SMT /* * When going down, decrement the number of cores with SMT present. @@ -6978,6 +7087,7 @@ int sched_cpu_deactivate(unsigned int cpu) ret = cpuset_cpu_inactive(cpu); if (ret) { + balance_push_set(cpu, false); set_cpu_active(cpu, true); return ret; } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 738a00b9237a..a71ac84acc1e 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -973,6 +973,7 @@ struct rq { unsigned long cpu_capacity_orig; struct callback_head *balance_callback; + unsigned char balance_flags; unsigned char nohz_idle_balance; unsigned char idle_balance; @@ -1385,6 +1386,9 @@ init_numa_balancing(unsigned long clone_flags, struct task_struct *p) #ifdef CONFIG_SMP +#define BALANCE_WORK 0x01 +#define BALANCE_PUSH 0x02 + static inline void queue_balance_callback(struct rq *rq, struct callback_head *head, @@ -1392,12 +1396,13 @@ queue_balance_callback(struct rq *rq, { lockdep_assert_held(&rq->lock); - if (unlikely(head->next)) + if (unlikely(head->next || (rq->balance_flags & BALANCE_PUSH))) return; head->func = (void (*)(struct callback_head *))func; head->next = rq->balance_callback; rq->balance_callback = head; + rq->balance_flags |= BALANCE_WORK; } #define rcu_dereference_check_sched_domain(p) \ -- cgit v1.2.3 From f2469a1fb43f85d243ce72638367fb6e15c33491 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 14 Sep 2020 14:47:28 +0200 Subject: sched/core: Wait for tasks being pushed away on hotplug RT kernels need to ensure that all tasks which are not per CPU kthreads have left the outgoing CPU to guarantee that no tasks are force migrated within a migrate disabled section. There is also some desire to (ab)use fine grained CPU hotplug control to clear a CPU from active state to force migrate tasks which are not per CPU kthreads away for power control purposes. Add a mechanism which waits until all tasks which should leave the CPU after the CPU active flag is cleared have moved to a different online CPU. 
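(Editorial condensation of the wait/wake handshake the hunks below add; identifiers are taken from the diff, heavily abridged.)

	/* hotplug control thread, after the CPU has been marked inactive: */
	rcuwait_wait_event(&rq->hotplug_wait, rq->nr_running == 1,
			   TASK_UNINTERRUPTIBLE);

	/* balance_push(), once only the idle task is left on the outgoing CPU: */
	if (!rq->nr_running && rcuwait_active(&rq->hotplug_wait)) {
		raw_spin_unlock(&rq->lock);
		rcuwait_wake_up(&rq->hotplug_wait);
		raw_spin_lock(&rq->lock);
	}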
Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Valentin Schneider Reviewed-by: Daniel Bristot de Oliveira Link: https://lkml.kernel.org/r/20201023102346.377836842@infradead.org --- kernel/sched/core.c | 40 +++++++++++++++++++++++++++++++++++++++- kernel/sched/sched.h | 4 ++++ 2 files changed, 43 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 1f8bfc952015..e1093c443ff9 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6896,8 +6896,21 @@ static void balance_push(struct rq *rq) * Both the cpu-hotplug and stop task are in this case and are * required to complete the hotplug process. */ - if (is_per_cpu_kthread(push_task)) + if (is_per_cpu_kthread(push_task)) { + /* + * If this is the idle task on the outgoing CPU try to wake + * up the hotplug control thread which might wait for the + * last task to vanish. The rcuwait_active() check is + * accurate here because the waiter is pinned on this CPU + * and can't obviously be running in parallel. + */ + if (!rq->nr_running && rcuwait_active(&rq->hotplug_wait)) { + raw_spin_unlock(&rq->lock); + rcuwait_wake_up(&rq->hotplug_wait); + raw_spin_lock(&rq->lock); + } return; + } get_task_struct(push_task); /* @@ -6928,6 +6941,20 @@ static void balance_push_set(int cpu, bool on) rq_unlock_irqrestore(rq, &rf); } +/* + * Invoked from a CPUs hotplug control thread after the CPU has been marked + * inactive. All tasks which are not per CPU kernel threads are either + * pushed off this CPU now via balance_push() or placed on a different CPU + * during wakeup. Wait until the CPU is quiescent. + */ +static void balance_hotplug_wait(void) +{ + struct rq *rq = this_rq(); + + rcuwait_wait_event(&rq->hotplug_wait, rq->nr_running == 1, + TASK_UNINTERRUPTIBLE); +} + #else static inline void balance_push(struct rq *rq) @@ -6938,6 +6965,10 @@ static inline void balance_push_set(int cpu, bool on) { } +static inline void balance_hotplug_wait(void) +{ +} + #endif /* CONFIG_HOTPLUG_CPU */ void set_rq_online(struct rq *rq) @@ -7092,6 +7123,10 @@ int sched_cpu_deactivate(unsigned int cpu) return ret; } sched_domains_numa_masks_clear(cpu); + + /* Wait for all non per CPU kernel threads to vanish. */ + balance_hotplug_wait(); + return 0; } @@ -7332,6 +7367,9 @@ void __init sched_init(void) rq_csd_init(rq, &rq->nohz_csd, nohz_csd_func); #endif +#ifdef CONFIG_HOTPLUG_CPU + rcuwait_init(&rq->hotplug_wait); +#endif #endif /* CONFIG_SMP */ hrtick_rq_init(rq); atomic_set(&rq->nr_iowait, 0); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index a71ac84acc1e..c6f707a6d9d4 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1004,6 +1004,10 @@ struct rq { /* This is used to determine avg_idle's max value */ u64 max_idle_balance_cost; + +#ifdef CONFIG_HOTPLUG_CPU + struct rcuwait hotplug_wait; +#endif #endif /* CONFIG_SMP */ #ifdef CONFIG_IRQ_TIME_ACCOUNTING -- cgit v1.2.3 From 06249738a41a70f2201a148866899f84cbebc45e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 25 Sep 2020 15:45:11 +0200 Subject: workqueue: Manually break affinity on hotplug Don't rely on the scheduler to force break affinity for us -- it will stop doing that for per-cpu-kthreads. 
Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Valentin Schneider Acked-by: Tejun Heo Reviewed-by: Daniel Bristot de Oliveira Link: https://lkml.kernel.org/r/20201023102346.464718669@infradead.org --- kernel/workqueue.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 437935e7a199..c71da2a59e12 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -4908,6 +4908,10 @@ static void unbind_workers(int cpu) pool->flags |= POOL_DISASSOCIATED; raw_spin_unlock_irq(&pool->lock); + + for_each_pool_worker(worker, pool) + WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, cpu_active_mask) < 0); + mutex_unlock(&wq_pool_attach_mutex); /* -- cgit v1.2.3 From 1cf12e08bc4d50a76b80c42a3109c53d8794a0c9 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 16 Sep 2020 09:27:18 +0200 Subject: sched/hotplug: Consolidate task migration on CPU unplug With the new mechanism which kicks tasks off the outgoing CPU at the end of schedule() the situation on an outgoing CPU right before the stopper thread brings it down completely is: - All user tasks and all unbound kernel threads have either been migrated away or are not running and the next wakeup will move them to a online CPU. - All per CPU kernel threads, except cpu hotplug thread and the stopper thread have either been unbound or parked by the responsible CPU hotplug callback. That means that at the last step before the stopper thread is invoked the cpu hotplug thread is the last legitimate running task on the outgoing CPU. Add a final wait step right before the stopper thread is kicked which ensures that any still running tasks on the way to park or on the way to kick themself of the CPU are either sleeping or gone. This allows to remove the migrate_tasks() crutch in sched_cpu_dying(). If sched_cpu_dying() detects that there is still another running task aside of the stopper thread then it will explode with the appropriate fireworks. 
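As background, not from the patch: the "parked by the responsible CPU hotplug callback" step relies on the standard kthread parking protocol. A minimal, hypothetical per-CPU thread main loop (my_percpu_thread_fn is an invented name) looks roughly like the sketch below; kthread_park() issued from the hotplug callback is what makes such threads stop running before sched_cpu_wait_empty() declares the CPU quiescent.

#include <linux/jiffies.h>
#include <linux/kthread.h>
#include <linux/sched.h>

/* Hypothetical per-CPU kthread body using the parking protocol. */
static int my_percpu_thread_fn(void *data)
{
	while (!kthread_should_stop()) {
		if (kthread_should_park()) {
			/* Sleeps here until kthread_unpark() on CPU-up. */
			kthread_parkme();
			continue;
		}

		/* ... per-CPU work would go here ... */
		schedule_timeout_interruptible(HZ);
	}
	return 0;
}
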
Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Valentin Schneider Reviewed-by: Daniel Bristot de Oliveira Link: https://lkml.kernel.org/r/20201023102346.547163969@infradead.org --- include/linux/cpuhotplug.h | 1 + include/linux/sched/hotplug.h | 2 + kernel/cpu.c | 9 ++- kernel/sched/core.c | 154 ++++++++++-------------------------------- 4 files changed, 46 insertions(+), 120 deletions(-) (limited to 'kernel') diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index bc56287a1ed1..0042ef362511 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -152,6 +152,7 @@ enum cpuhp_state { CPUHP_AP_ONLINE, CPUHP_TEARDOWN_CPU, CPUHP_AP_ONLINE_IDLE, + CPUHP_AP_SCHED_WAIT_EMPTY, CPUHP_AP_SMPBOOT_THREADS, CPUHP_AP_X86_VDSO_VMA_ONLINE, CPUHP_AP_IRQ_AFFINITY_ONLINE, diff --git a/include/linux/sched/hotplug.h b/include/linux/sched/hotplug.h index 9a62ffdd296f..412cdaba33eb 100644 --- a/include/linux/sched/hotplug.h +++ b/include/linux/sched/hotplug.h @@ -11,8 +11,10 @@ extern int sched_cpu_activate(unsigned int cpu); extern int sched_cpu_deactivate(unsigned int cpu); #ifdef CONFIG_HOTPLUG_CPU +extern int sched_cpu_wait_empty(unsigned int cpu); extern int sched_cpu_dying(unsigned int cpu); #else +# define sched_cpu_wait_empty NULL # define sched_cpu_dying NULL #endif diff --git a/kernel/cpu.c b/kernel/cpu.c index 6ff2578ecf17..fa535eaa4826 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -1602,7 +1602,7 @@ static struct cpuhp_step cpuhp_hp_states[] = { .name = "ap:online", }, /* - * Handled on controll processor until the plugged processor manages + * Handled on control processor until the plugged processor manages * this itself. */ [CPUHP_TEARDOWN_CPU] = { @@ -1611,6 +1611,13 @@ static struct cpuhp_step cpuhp_hp_states[] = { .teardown.single = takedown_cpu, .cant_stop = true, }, + + [CPUHP_AP_SCHED_WAIT_EMPTY] = { + .name = "sched:waitempty", + .startup.single = NULL, + .teardown.single = sched_cpu_wait_empty, + }, + /* Handle smpboot threads park/unpark */ [CPUHP_AP_SMPBOOT_THREADS] = { .name = "smpboot/threads:online", diff --git a/kernel/sched/core.c b/kernel/sched/core.c index e1093c443ff9..6c89806c834b 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6741,120 +6741,6 @@ void idle_task_exit(void) /* finish_cpu(), as ran on the BP, will clean up the active_mm state */ } -/* - * Since this CPU is going 'away' for a while, fold any nr_active delta - * we might have. Assumes we're called after migrate_tasks() so that the - * nr_active count is stable. We need to take the teardown thread which - * is calling this into account, so we hand in adjust = 1 to the load - * calculation. - * - * Also see the comment "Global load-average calculations". - */ -static void calc_load_migrate(struct rq *rq) -{ - long delta = calc_load_fold_active(rq, 1); - if (delta) - atomic_long_add(delta, &calc_load_tasks); -} - -static struct task_struct *__pick_migrate_task(struct rq *rq) -{ - const struct sched_class *class; - struct task_struct *next; - - for_each_class(class) { - next = class->pick_next_task(rq); - if (next) { - next->sched_class->put_prev_task(rq, next); - return next; - } - } - - /* The idle class should always have a runnable task */ - BUG(); -} - -/* - * Migrate all tasks from the rq, sleeping tasks will be migrated by - * try_to_wake_up()->select_task_rq(). 
- * - * Called with rq->lock held even though we'er in stop_machine() and - * there's no concurrency possible, we hold the required locks anyway - * because of lock validation efforts. - */ -static void migrate_tasks(struct rq *dead_rq, struct rq_flags *rf) -{ - struct rq *rq = dead_rq; - struct task_struct *next, *stop = rq->stop; - struct rq_flags orf = *rf; - int dest_cpu; - - /* - * Fudge the rq selection such that the below task selection loop - * doesn't get stuck on the currently eligible stop task. - * - * We're currently inside stop_machine() and the rq is either stuck - * in the stop_machine_cpu_stop() loop, or we're executing this code, - * either way we should never end up calling schedule() until we're - * done here. - */ - rq->stop = NULL; - - /* - * put_prev_task() and pick_next_task() sched - * class method both need to have an up-to-date - * value of rq->clock[_task] - */ - update_rq_clock(rq); - - for (;;) { - /* - * There's this thread running, bail when that's the only - * remaining thread: - */ - if (rq->nr_running == 1) - break; - - next = __pick_migrate_task(rq); - - /* - * Rules for changing task_struct::cpus_mask are holding - * both pi_lock and rq->lock, such that holding either - * stabilizes the mask. - * - * Drop rq->lock is not quite as disastrous as it usually is - * because !cpu_active at this point, which means load-balance - * will not interfere. Also, stop-machine. - */ - rq_unlock(rq, rf); - raw_spin_lock(&next->pi_lock); - rq_relock(rq, rf); - - /* - * Since we're inside stop-machine, _nothing_ should have - * changed the task, WARN if weird stuff happened, because in - * that case the above rq->lock drop is a fail too. - */ - if (WARN_ON(task_rq(next) != rq || !task_on_rq_queued(next))) { - raw_spin_unlock(&next->pi_lock); - continue; - } - - /* Find suitable destination for @next, with force if needed. */ - dest_cpu = select_fallback_rq(dead_rq->cpu, next); - rq = __migrate_task(rq, rf, next, dest_cpu); - if (rq != dead_rq) { - rq_unlock(rq, rf); - rq = dead_rq; - *rf = orf; - rq_relock(rq, rf); - } - raw_spin_unlock(&next->pi_lock); - } - - rq->stop = stop; -} - static int __balance_push_cpu_stop(void *arg) { struct task_struct *p = arg; @@ -7123,10 +7009,6 @@ int sched_cpu_deactivate(unsigned int cpu) return ret; } sched_domains_numa_masks_clear(cpu); - - /* Wait for all non per CPU kernel threads to vanish. */ - balance_hotplug_wait(); - return 0; } @@ -7146,6 +7028,41 @@ int sched_cpu_starting(unsigned int cpu) } #ifdef CONFIG_HOTPLUG_CPU + +/* + * Invoked immediately before the stopper thread is invoked to bring the + * CPU down completely. At this point all per CPU kthreads except the + * hotplug thread (current) and the stopper thread (inactive) have been + * either parked or have been unbound from the outgoing CPU. Ensure that + * any of those which might be on the way out are gone. + * + * If after this point a bound task is being woken on this CPU then the + * responsible hotplug callback has failed to do it's job. + * sched_cpu_dying() will catch it with the appropriate fireworks. + */ +int sched_cpu_wait_empty(unsigned int cpu) +{ + balance_hotplug_wait(); + return 0; +} + +/* + * Since this CPU is going 'away' for a while, fold any nr_active delta we + * might have. Called from the CPU stopper task after ensuring that the + * stopper is the last running task on the CPU, so nr_active count is + * stable. We need to take the teardown thread which is calling this into + * account, so we hand in adjust = 1 to the load calculation. 
+ * + * Also see the comment "Global load-average calculations". + */ +static void calc_load_migrate(struct rq *rq) +{ + long delta = calc_load_fold_active(rq, 1); + + if (delta) + atomic_long_add(delta, &calc_load_tasks); +} + int sched_cpu_dying(unsigned int cpu) { struct rq *rq = cpu_rq(cpu); @@ -7159,7 +7076,6 @@ int sched_cpu_dying(unsigned int cpu) BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); set_rq_offline(rq); } - migrate_tasks(rq, &rf); BUG_ON(rq->nr_running != 1); rq_unlock_irqrestore(rq, &rf); -- cgit v1.2.3 From 120455c514f7321981c907a01c543b05aff3f254 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 25 Sep 2020 16:42:31 +0200 Subject: sched: Fix hotplug vs CPU bandwidth control Since we now migrate tasks away before DYING, we should also move bandwidth unthrottle, otherwise we can gain tasks from unthrottle after we expect all tasks to be gone already. Also; it looks like the RT balancers don't respect cpu_active() and instead rely on rq->online in part, complete this. This too requires we do set_rq_offline() earlier to match the cpu_active() semantics. (The bigger patch is to convert RT to cpu_active() entirely) Since set_rq_online() is called from sched_cpu_activate(), place set_rq_offline() in sched_cpu_deactivate(). Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Valentin Schneider Reviewed-by: Daniel Bristot de Oliveira Link: https://lkml.kernel.org/r/20201023102346.639538965@infradead.org --- kernel/sched/core.c | 14 ++++++++++---- kernel/sched/deadline.c | 2 +- kernel/sched/rt.c | 2 +- 3 files changed, 12 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 6c89806c834b..dcb88a06ef14 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6977,6 +6977,8 @@ int sched_cpu_activate(unsigned int cpu) int sched_cpu_deactivate(unsigned int cpu) { + struct rq *rq = cpu_rq(cpu); + struct rq_flags rf; int ret; set_cpu_active(cpu, false); @@ -6991,6 +6993,14 @@ int sched_cpu_deactivate(unsigned int cpu) balance_push_set(cpu, true); + rq_lock_irqsave(rq, &rf); + if (rq->rd) { + update_rq_clock(rq); + BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); + set_rq_offline(rq); + } + rq_unlock_irqrestore(rq, &rf); + #ifdef CONFIG_SCHED_SMT /* * When going down, decrement the number of cores with SMT present. 
@@ -7072,10 +7082,6 @@ int sched_cpu_dying(unsigned int cpu) sched_tick_stop(cpu); rq_lock_irqsave(rq, &rf); - if (rq->rd) { - BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); - set_rq_offline(rq); - } BUG_ON(rq->nr_running != 1); rq_unlock_irqrestore(rq, &rf); diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index f232305dcefe..77880fea569f 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -543,7 +543,7 @@ static int push_dl_task(struct rq *rq); static inline bool need_pull_dl_task(struct rq *rq, struct task_struct *prev) { - return dl_task(prev); + return rq->online && dl_task(prev); } static DEFINE_PER_CPU(struct callback_head, dl_push_head); diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 49ec096a8aa1..40a46639f78a 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -265,7 +265,7 @@ static void pull_rt_task(struct rq *this_rq); static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev) { /* Try to pull RT tasks here if we lower this rq's prio */ - return rq->rt.highest_prio.curr > prev->prio; + return rq->online && rq->rt.highest_prio.curr > prev->prio; } static inline int rt_overloaded(struct rq *rq) -- cgit v1.2.3 From 9cfc3e18adb0362533e911bf3ce6ec8c821cfccc Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 16 Sep 2020 14:59:08 +0200 Subject: sched: Massage set_cpus_allowed() Thread a u32 flags word through the *set_cpus_allowed*() callchain. This will allow adding behavioural tweaks for future users. Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Valentin Schneider Reviewed-by: Daniel Bristot de Oliveira Link: https://lkml.kernel.org/r/20201023102346.729082820@infradead.org --- kernel/sched/core.c | 28 ++++++++++++++++++---------- kernel/sched/deadline.c | 5 +++-- kernel/sched/sched.h | 7 +++++-- 3 files changed, 26 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index dcb88a06ef14..396accb1d69c 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1824,13 +1824,14 @@ static int migration_cpu_stop(void *data) * sched_class::set_cpus_allowed must do the below, but is not required to * actually call this function. */ -void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask) +void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask, u32 flags) { cpumask_copy(&p->cpus_mask, new_mask); p->nr_cpus_allowed = cpumask_weight(new_mask); } -void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) +static void +__do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask, u32 flags) { struct rq *rq = task_rq(p); bool queued, running; @@ -1851,7 +1852,7 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) if (running) put_prev_task(rq, p); - p->sched_class->set_cpus_allowed(p, new_mask); + p->sched_class->set_cpus_allowed(p, new_mask, flags); if (queued) enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK); @@ -1859,6 +1860,11 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) set_next_task(rq, p); } +void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) +{ + __do_set_cpus_allowed(p, new_mask, 0); +} + /* * Change a given task's CPU affinity. Migrate the thread to a * proper CPU and schedule it away if the CPU it's executing on @@ -1869,7 +1875,8 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) * call is not atomic; no spinlocks may be held. 
*/ static int __set_cpus_allowed_ptr(struct task_struct *p, - const struct cpumask *new_mask, bool check) + const struct cpumask *new_mask, + u32 flags) { const struct cpumask *cpu_valid_mask = cpu_active_mask; unsigned int dest_cpu; @@ -1891,7 +1898,7 @@ static int __set_cpus_allowed_ptr(struct task_struct *p, * Must re-check here, to close a race against __kthread_bind(), * sched_setaffinity() is not guaranteed to observe the flag. */ - if (check && (p->flags & PF_NO_SETAFFINITY)) { + if ((flags & SCA_CHECK) && (p->flags & PF_NO_SETAFFINITY)) { ret = -EINVAL; goto out; } @@ -1910,7 +1917,7 @@ static int __set_cpus_allowed_ptr(struct task_struct *p, goto out; } - do_set_cpus_allowed(p, new_mask); + __do_set_cpus_allowed(p, new_mask, flags); if (p->flags & PF_KTHREAD) { /* @@ -1947,7 +1954,7 @@ out: int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) { - return __set_cpus_allowed_ptr(p, new_mask, false); + return __set_cpus_allowed_ptr(p, new_mask, 0); } EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr); @@ -2406,7 +2413,8 @@ void sched_set_stop_task(int cpu, struct task_struct *stop) #else static inline int __set_cpus_allowed_ptr(struct task_struct *p, - const struct cpumask *new_mask, bool check) + const struct cpumask *new_mask, + u32 flags) { return set_cpus_allowed_ptr(p, new_mask); } @@ -6006,7 +6014,7 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) } #endif again: - retval = __set_cpus_allowed_ptr(p, new_mask, true); + retval = __set_cpus_allowed_ptr(p, new_mask, SCA_CHECK); if (!retval) { cpuset_cpus_allowed(p, cpus_allowed); @@ -6590,7 +6598,7 @@ void init_idle(struct task_struct *idle, int cpu) * * And since this is boot we can forgo the serialization. */ - set_cpus_allowed_common(idle, cpumask_of(cpu)); + set_cpus_allowed_common(idle, cpumask_of(cpu), 0); #endif /* * We're having a chicken and egg problem, even though we are diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 77880fea569f..e97c7c2708bc 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -2301,7 +2301,8 @@ static void task_woken_dl(struct rq *rq, struct task_struct *p) } static void set_cpus_allowed_dl(struct task_struct *p, - const struct cpumask *new_mask) + const struct cpumask *new_mask, + u32 flags) { struct root_domain *src_rd; struct rq *rq; @@ -2330,7 +2331,7 @@ static void set_cpus_allowed_dl(struct task_struct *p, raw_spin_unlock(&src_dl_b->lock); } - set_cpus_allowed_common(p, new_mask); + set_cpus_allowed_common(p, new_mask, flags); } /* Assumes rq->lock is held */ diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index c6f707a6d9d4..0420d80fb250 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1814,7 +1814,8 @@ struct sched_class { void (*task_woken)(struct rq *this_rq, struct task_struct *task); void (*set_cpus_allowed)(struct task_struct *p, - const struct cpumask *newmask); + const struct cpumask *newmask, + u32 flags); void (*rq_online)(struct rq *rq); void (*rq_offline)(struct rq *rq); @@ -1907,7 +1908,9 @@ extern void update_group_capacity(struct sched_domain *sd, int cpu); extern void trigger_load_balance(struct rq *rq); -extern void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask); +#define SCA_CHECK 0x01 + +extern void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask, u32 flags); #endif -- cgit v1.2.3 From af449901b84c98cbd84a0113223ba3bcfcb12a26 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 17 Sep 2020 10:38:30 +0200 Subject: 
sched: Add migrate_disable() Add the base migrate_disable() support (under protest). While migrate_disable() is (currently) required for PREEMPT_RT, it is also one of the biggest flaws in the system. Notably this is just the base implementation, it is broken vs sched_setaffinity() and hotplug, both solved in additional patches for ease of review. Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Valentin Schneider Reviewed-by: Daniel Bristot de Oliveira Link: https://lkml.kernel.org/r/20201023102346.818170844@infradead.org --- include/linux/preempt.h | 65 ++++++++++++++++++++++++++++ include/linux/sched.h | 3 ++ kernel/sched/core.c | 112 +++++++++++++++++++++++++++++++++++++++++++++--- kernel/sched/sched.h | 6 ++- lib/smp_processor_id.c | 5 +++ 5 files changed, 183 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/include/linux/preempt.h b/include/linux/preempt.h index 7d9c1c0e149c..97ba7c920653 100644 --- a/include/linux/preempt.h +++ b/include/linux/preempt.h @@ -322,6 +322,69 @@ static inline void preempt_notifier_init(struct preempt_notifier *notifier, #endif +#if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT) + +/* + * Migrate-Disable and why it is (strongly) undesired. + * + * The premise of the Real-Time schedulers we have on Linux + * (SCHED_FIFO/SCHED_DEADLINE) is that M CPUs can/will run M tasks + * concurrently, provided there are sufficient runnable tasks, also known as + * work-conserving. For instance SCHED_DEADLINE tries to schedule the M + * earliest deadline threads, and SCHED_FIFO the M highest priority threads. + * + * The correctness of various scheduling models depends on this, but is it + * broken by migrate_disable() that doesn't imply preempt_disable(). Where + * preempt_disable() implies an immediate priority ceiling, preemptible + * migrate_disable() allows nesting. + * + * The worst case is that all tasks preempt one another in a migrate_disable() + * region and stack on a single CPU. This then reduces the available bandwidth + * to a single CPU. And since Real-Time schedulability theory considers the + * Worst-Case only, all Real-Time analysis shall revert to single-CPU + * (instantly solving the SMP analysis problem). + * + * + * The reason we have it anyway. + * + * PREEMPT_RT breaks a number of assumptions traditionally held. By forcing a + * number of primitives into becoming preemptible, they would also allow + * migration. This turns out to break a bunch of per-cpu usage. To this end, + * all these primitives employ migirate_disable() to restore this implicit + * assumption. + * + * This is a 'temporary' work-around at best. The correct solution is getting + * rid of the above assumptions and reworking the code to employ explicit + * per-cpu locking or short preempt-disable regions. + * + * The end goal must be to get rid of migrate_disable(), alternatively we need + * a schedulability theory that does not depend on abritrary migration. + * + * + * Notes on the implementation. + * + * The implementation is particularly tricky since existing code patterns + * dictate neither migrate_disable() nor migrate_enable() is allowed to block. + * This means that it cannot use cpus_read_lock() to serialize against hotplug, + * nor can it easily migrate itself into a pending affinity mask change on + * migrate_enable(). + * + * + * Note: even non-work-conserving schedulers like semi-partitioned depends on + * migration, so migrate_disable() is not only a problem for + * work-conserving schedulers. 
+ * + */ +extern void migrate_disable(void); +extern void migrate_enable(void); + +#elif defined(CONFIG_PREEMPT_RT) + +static inline void migrate_disable(void) { } +static inline void migrate_enable(void) { } + +#else /* !CONFIG_PREEMPT_RT */ + /** * migrate_disable - Prevent migration of the current task * @@ -352,4 +415,6 @@ static __always_inline void migrate_enable(void) preempt_enable(); } +#endif /* CONFIG_SMP && CONFIG_PREEMPT_RT */ + #endif /* __LINUX_PREEMPT_H */ diff --git a/include/linux/sched.h b/include/linux/sched.h index 063cd120b459..0732356c0eca 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -714,6 +714,9 @@ struct task_struct { int nr_cpus_allowed; const cpumask_t *cpus_ptr; cpumask_t cpus_mask; +#if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT) + int migration_disabled; +#endif #ifdef CONFIG_PREEMPT_RCU int rcu_read_lock_nesting; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 396accb1d69c..6a3f1c2e185b 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1696,6 +1696,61 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) #ifdef CONFIG_SMP +#ifdef CONFIG_PREEMPT_RT + +static void +__do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask, u32 flags); + +static int __set_cpus_allowed_ptr(struct task_struct *p, + const struct cpumask *new_mask, + u32 flags); + +static void migrate_disable_switch(struct rq *rq, struct task_struct *p) +{ + if (likely(!p->migration_disabled)) + return; + + if (p->cpus_ptr != &p->cpus_mask) + return; + + /* + * Violates locking rules! see comment in __do_set_cpus_allowed(). + */ + __do_set_cpus_allowed(p, cpumask_of(rq->cpu), SCA_MIGRATE_DISABLE); +} + +void migrate_disable(void) +{ + if (current->migration_disabled++) + return; + + barrier(); +} +EXPORT_SYMBOL_GPL(migrate_disable); + +void migrate_enable(void) +{ + struct task_struct *p = current; + + if (--p->migration_disabled) + return; + + barrier(); + + if (p->cpus_ptr == &p->cpus_mask) + return; + + __set_cpus_allowed_ptr(p, &p->cpus_mask, SCA_MIGRATE_ENABLE); +} +EXPORT_SYMBOL_GPL(migrate_enable); + +static inline bool is_migration_disabled(struct task_struct *p) +{ + return p->migration_disabled; +} + +#endif + /* * Per-CPU kthreads are allowed to run on !active && online CPUs, see * __set_cpus_allowed_ptr() and select_fallback_rq(). @@ -1705,7 +1760,7 @@ static inline bool is_cpu_allowed(struct task_struct *p, int cpu) if (!cpumask_test_cpu(cpu, p->cpus_ptr)) return false; - if (is_per_cpu_kthread(p)) + if (is_per_cpu_kthread(p) || is_migration_disabled(p)) return cpu_online(cpu); return cpu_active(cpu); @@ -1826,6 +1881,11 @@ static int migration_cpu_stop(void *data) */ void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask, u32 flags) { + if (flags & (SCA_MIGRATE_ENABLE | SCA_MIGRATE_DISABLE)) { + p->cpus_ptr = new_mask; + return; + } + cpumask_copy(&p->cpus_mask, new_mask); p->nr_cpus_allowed = cpumask_weight(new_mask); } @@ -1836,7 +1896,22 @@ __do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask, u32 struct rq *rq = task_rq(p); bool queued, running; - lockdep_assert_held(&p->pi_lock); + /* + * This here violates the locking rules for affinity, since we're only + * supposed to change these variables while holding both rq->lock and + * p->pi_lock. 
+ * + * HOWEVER, it magically works, because ttwu() is the only code that + * accesses these variables under p->pi_lock and only does so after + * smp_cond_load_acquire(&p->on_cpu, !VAL), and we're in __schedule() + * before finish_task(). + * + * XXX do further audits, this smells like something putrid. + */ + if (flags & SCA_MIGRATE_DISABLE) + SCHED_WARN_ON(!p->on_cpu); + else + lockdep_assert_held(&p->pi_lock); queued = task_on_rq_queued(p); running = task_current(rq, p); @@ -1887,9 +1962,14 @@ static int __set_cpus_allowed_ptr(struct task_struct *p, rq = task_rq_lock(p, &rf); update_rq_clock(rq); - if (p->flags & PF_KTHREAD) { + if (p->flags & PF_KTHREAD || is_migration_disabled(p)) { /* - * Kernel threads are allowed on online && !active CPUs + * Kernel threads are allowed on online && !active CPUs. + * + * Specifically, migration_disabled() tasks must not fail the + * cpumask_any_and_distribute() pick below, esp. so on + * SCA_MIGRATE_ENABLE, otherwise we'll not call + * set_cpus_allowed_common() and actually reset p->cpus_ptr. */ cpu_valid_mask = cpu_online_mask; } @@ -1903,7 +1983,7 @@ static int __set_cpus_allowed_ptr(struct task_struct *p, goto out; } - if (cpumask_equal(&p->cpus_mask, new_mask)) + if (!(flags & SCA_MIGRATE_ENABLE) && cpumask_equal(&p->cpus_mask, new_mask)) goto out; /* @@ -1995,6 +2075,8 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) * Clearly, migrating tasks to offline CPUs is a fairly daft thing. */ WARN_ON_ONCE(!cpu_online(new_cpu)); + + WARN_ON_ONCE(is_migration_disabled(p)); #endif trace_sched_migrate_task(p, new_cpu); @@ -2325,6 +2407,12 @@ static int select_fallback_rq(int cpu, struct task_struct *p) } fallthrough; case possible: + /* + * XXX When called from select_task_rq() we only + * hold p->pi_lock and again violate locking order. + * + * More yuck to audit. 
+ */ do_set_cpus_allowed(p, cpu_possible_mask); state = fail; break; @@ -2359,7 +2447,7 @@ int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags) { lockdep_assert_held(&p->pi_lock); - if (p->nr_cpus_allowed > 1) + if (p->nr_cpus_allowed > 1 && !is_migration_disabled(p)) cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags); else cpu = cpumask_any(p->cpus_ptr); @@ -2421,6 +2509,17 @@ static inline int __set_cpus_allowed_ptr(struct task_struct *p, #endif /* CONFIG_SMP */ +#if !defined(CONFIG_SMP) || !defined(CONFIG_PREEMPT_RT) + +static inline void migrate_disable_switch(struct rq *rq, struct task_struct *p) { } + +static inline bool is_migration_disabled(struct task_struct *p) +{ + return false; +} + +#endif + static void ttwu_stat(struct task_struct *p, int cpu, int wake_flags) { @@ -4570,6 +4669,7 @@ static void __sched notrace __schedule(bool preempt) */ ++*switch_count; + migrate_disable_switch(rq, prev); psi_sched_switch(prev, next, !task_on_rq_queued(prev)); trace_sched_switch(preempt, prev, next); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 0420d80fb250..72d8e47cf0bb 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1902,14 +1902,16 @@ static inline bool sched_fair_runnable(struct rq *rq) extern struct task_struct *pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf); extern struct task_struct *pick_next_task_idle(struct rq *rq); +#define SCA_CHECK 0x01 +#define SCA_MIGRATE_DISABLE 0x02 +#define SCA_MIGRATE_ENABLE 0x04 + #ifdef CONFIG_SMP extern void update_group_capacity(struct sched_domain *sd, int cpu); extern void trigger_load_balance(struct rq *rq); -#define SCA_CHECK 0x01 - extern void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask, u32 flags); #endif diff --git a/lib/smp_processor_id.c b/lib/smp_processor_id.c index 525222e4f409..faaa927ac2c8 100644 --- a/lib/smp_processor_id.c +++ b/lib/smp_processor_id.c @@ -26,6 +26,11 @@ unsigned int check_preemption_disabled(const char *what1, const char *what2) if (current->nr_cpus_allowed == 1) goto out; +#if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT) + if (current->migration_disabled) + goto out; +#endif + /* * It is valid to assume CPU-locality during early bootup: */ -- cgit v1.2.3 From 6d337eab041d56bb8f0e7794f39906c21054c512 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 18 Sep 2020 17:24:31 +0200 Subject: sched: Fix migrate_disable() vs set_cpus_allowed_ptr() Concurrent migrate_disable() and set_cpus_allowed_ptr() has interesting features. We rely on set_cpus_allowed_ptr() to not return until the task runs inside the provided mask. This expectation is exported to userspace. This means that any set_cpus_allowed_ptr() caller must wait until migrate_enable() allows migrations. At the same time, we don't want migrate_enable() to schedule, due to patterns like: preempt_disable(); migrate_disable(); ... migrate_enable(); preempt_enable(); And: raw_spin_lock(&B); spin_unlock(&A); this means that when migrate_enable() must restore the affinity mask, it cannot wait for completion thereof. Luck will have it that that is exactly the case where there is a pending set_cpus_allowed_ptr(), so let that provide storage for the async stop machine. Much thanks to Valentin who used TLA+ most effective and found lots of 'interesting' cases. 
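To make the constraint concrete, a hypothetical caller (pinned_critical_section() is an invented name) showing the nesting described above: because preemption is still disabled when migrate_enable() runs, it must not block or schedule, even if a concurrent set_cpus_allowed_ptr() is waiting for the task to leave the region.

#include <linux/preempt.h>
#include <linux/printk.h>
#include <linux/smp.h>

static void pinned_critical_section(void)
{
	preempt_disable();
	migrate_disable();

	/* The CPU cannot change for the whole pinned region. */
	pr_info("pinned on CPU%d\n", smp_processor_id());

	migrate_enable();	/* must not schedule: preemption is still off */
	preempt_enable();
}
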
Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Valentin Schneider Reviewed-by: Daniel Bristot de Oliveira Link: https://lkml.kernel.org/r/20201023102346.921768277@infradead.org --- include/linux/sched.h | 1 + kernel/sched/core.c | 236 +++++++++++++++++++++++++++++++++++++++++++------- 2 files changed, 207 insertions(+), 30 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index 0732356c0eca..90a0c92741d7 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -714,6 +714,7 @@ struct task_struct { int nr_cpus_allowed; const cpumask_t *cpus_ptr; cpumask_t cpus_mask; + void *migration_pending; #if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT) int migration_disabled; #endif diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 6a3f1c2e185b..0efc1e41bb60 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1732,15 +1732,26 @@ void migrate_enable(void) { struct task_struct *p = current; - if (--p->migration_disabled) + if (p->migration_disabled > 1) { + p->migration_disabled--; return; + } + /* + * Ensure stop_task runs either before or after this, and that + * __set_cpus_allowed_ptr(SCA_MIGRATE_ENABLE) doesn't schedule(). + */ + preempt_disable(); + if (p->cpus_ptr != &p->cpus_mask) + __set_cpus_allowed_ptr(p, &p->cpus_mask, SCA_MIGRATE_ENABLE); + /* + * Mustn't clear migration_disabled() until cpus_ptr points back at the + * regular cpus_mask, otherwise things that race (eg. + * select_fallback_rq) get confused. + */ barrier(); - - if (p->cpus_ptr == &p->cpus_mask) - return; - - __set_cpus_allowed_ptr(p, &p->cpus_mask, SCA_MIGRATE_ENABLE); + p->migration_disabled = 0; + preempt_enable(); } EXPORT_SYMBOL_GPL(migrate_enable); @@ -1805,8 +1816,16 @@ static struct rq *move_queued_task(struct rq *rq, struct rq_flags *rf, } struct migration_arg { - struct task_struct *task; - int dest_cpu; + struct task_struct *task; + int dest_cpu; + struct set_affinity_pending *pending; +}; + +struct set_affinity_pending { + refcount_t refs; + struct completion done; + struct cpu_stop_work stop_work; + struct migration_arg arg; }; /* @@ -1838,16 +1857,19 @@ static struct rq *__migrate_task(struct rq *rq, struct rq_flags *rf, */ static int migration_cpu_stop(void *data) { + struct set_affinity_pending *pending; struct migration_arg *arg = data; struct task_struct *p = arg->task; + int dest_cpu = arg->dest_cpu; struct rq *rq = this_rq(); + bool complete = false; struct rq_flags rf; /* * The original target CPU might have gone down and we might * be on another CPU but it doesn't matter. */ - local_irq_disable(); + local_irq_save(rf.flags); /* * We need to explicitly wake pending tasks before running * __migrate_task() such that we will not miss enforcing cpus_ptr @@ -1857,21 +1879,83 @@ static int migration_cpu_stop(void *data) raw_spin_lock(&p->pi_lock); rq_lock(rq, &rf); + + pending = p->migration_pending; /* * If task_rq(p) != rq, it cannot be migrated here, because we're * holding rq->lock, if p->on_rq == 0 it cannot get enqueued because * we're holding p->pi_lock. */ if (task_rq(p) == rq) { + if (is_migration_disabled(p)) + goto out; + + if (pending) { + p->migration_pending = NULL; + complete = true; + } + + /* migrate_enable() -- we must not race against SCA */ + if (dest_cpu < 0) { + /* + * When this was migrate_enable() but we no longer + * have a @pending, a concurrent SCA 'fixed' things + * and we should be valid again. Nothing to do. 
+ */ + if (!pending) { + WARN_ON_ONCE(!is_cpu_allowed(p, cpu_of(rq))); + goto out; + } + + dest_cpu = cpumask_any_distribute(&p->cpus_mask); + } + if (task_on_rq_queued(p)) - rq = __migrate_task(rq, &rf, p, arg->dest_cpu); + rq = __migrate_task(rq, &rf, p, dest_cpu); else - p->wake_cpu = arg->dest_cpu; + p->wake_cpu = dest_cpu; + + } else if (dest_cpu < 0) { + /* + * This happens when we get migrated between migrate_enable()'s + * preempt_enable() and scheduling the stopper task. At that + * point we're a regular task again and not current anymore. + * + * A !PREEMPT kernel has a giant hole here, which makes it far + * more likely. + */ + + /* + * When this was migrate_enable() but we no longer have an + * @pending, a concurrent SCA 'fixed' things and we should be + * valid again. Nothing to do. + */ + if (!pending) { + WARN_ON_ONCE(!is_cpu_allowed(p, cpu_of(rq))); + goto out; + } + + /* + * When migrate_enable() hits a rq mis-match we can't reliably + * determine is_migration_disabled() and so have to chase after + * it. + */ + task_rq_unlock(rq, p, &rf); + stop_one_cpu_nowait(task_cpu(p), migration_cpu_stop, + &pending->arg, &pending->stop_work); + return 0; } - rq_unlock(rq, &rf); - raw_spin_unlock(&p->pi_lock); +out: + task_rq_unlock(rq, p, &rf); + + if (complete) + complete_all(&pending->done); + + /* For pending->{arg,stop_work} */ + pending = arg->pending; + if (pending && refcount_dec_and_test(&pending->refs)) + wake_up_var(&pending->refs); - local_irq_enable(); return 0; } @@ -1940,6 +2024,112 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) __do_set_cpus_allowed(p, new_mask, 0); } +/* + * This function is wildly self concurrent, consider at least 3 times. + */ +static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flags *rf, + int dest_cpu, unsigned int flags) +{ + struct set_affinity_pending my_pending = { }, *pending = NULL; + struct migration_arg arg = { + .task = p, + .dest_cpu = dest_cpu, + }; + bool complete = false; + + /* Can the task run on the task's current CPU? If so, we're done */ + if (cpumask_test_cpu(task_cpu(p), &p->cpus_mask)) { + pending = p->migration_pending; + if (pending) { + refcount_inc(&pending->refs); + p->migration_pending = NULL; + complete = true; + } + task_rq_unlock(rq, p, rf); + + if (complete) + goto do_complete; + + return 0; + } + + if (!(flags & SCA_MIGRATE_ENABLE)) { + /* serialized by p->pi_lock */ + if (!p->migration_pending) { + refcount_set(&my_pending.refs, 1); + init_completion(&my_pending.done); + p->migration_pending = &my_pending; + } else { + pending = p->migration_pending; + refcount_inc(&pending->refs); + } + } + pending = p->migration_pending; + /* + * - !MIGRATE_ENABLE: + * we'll have installed a pending if there wasn't one already. + * + * - MIGRATE_ENABLE: + * we're here because the current CPU isn't matching anymore, + * the only way that can happen is because of a concurrent + * set_cpus_allowed_ptr() call, which should then still be + * pending completion. + * + * Either way, we really should have a @pending here. 
+ */ + if (WARN_ON_ONCE(!pending)) { + task_rq_unlock(rq, p, rf); + return -EINVAL; + } + + if (flags & SCA_MIGRATE_ENABLE) { + + refcount_inc(&pending->refs); /* pending->{arg,stop_work} */ + task_rq_unlock(rq, p, rf); + + pending->arg = (struct migration_arg) { + .task = p, + .dest_cpu = -1, + .pending = pending, + }; + + stop_one_cpu_nowait(cpu_of(rq), migration_cpu_stop, + &pending->arg, &pending->stop_work); + + return 0; + } + + if (task_running(rq, p) || p->state == TASK_WAKING) { + + task_rq_unlock(rq, p, rf); + stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); + + } else { + + if (!is_migration_disabled(p)) { + if (task_on_rq_queued(p)) + rq = move_queued_task(rq, rf, p, dest_cpu); + + p->migration_pending = NULL; + complete = true; + } + task_rq_unlock(rq, p, rf); + +do_complete: + if (complete) + complete_all(&pending->done); + } + + wait_for_completion(&pending->done); + + if (refcount_dec_and_test(&pending->refs)) + wake_up_var(&pending->refs); + + wait_var_event(&my_pending.refs, !refcount_read(&my_pending.refs)); + + return 0; +} + /* * Change a given task's CPU affinity. Migrate the thread to a * proper CPU and schedule it away if the CPU it's executing on @@ -2009,23 +2199,8 @@ static int __set_cpus_allowed_ptr(struct task_struct *p, p->nr_cpus_allowed != 1); } - /* Can the task run on the task's current CPU? If so, we're done */ - if (cpumask_test_cpu(task_cpu(p), new_mask)) - goto out; + return affine_move_task(rq, p, &rf, dest_cpu, flags); - if (task_running(rq, p) || p->state == TASK_WAKING) { - struct migration_arg arg = { p, dest_cpu }; - /* Need help from migration thread: drop lock and wait. */ - task_rq_unlock(rq, p, &rf); - stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); - return 0; - } else if (task_on_rq_queued(p)) { - /* - * OK, since we're going to drop the lock immediately - * afterwards anyway. - */ - rq = move_queued_task(rq, &rf, p, dest_cpu); - } out: task_rq_unlock(rq, p, &rf); @@ -3205,6 +3380,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) init_numa_balancing(clone_flags, p); #ifdef CONFIG_SMP p->wake_entry.u_flags = CSD_TYPE_TTWU; + p->migration_pending = NULL; #endif } -- cgit v1.2.3 From 3015ef4b98f53fe7eba4f5f82f562c0e074d213c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 26 Aug 2020 14:08:10 +0200 Subject: sched/core: Make migrate disable and CPU hotplug cooperative On CPU unplug tasks which are in a migrate disabled region cannot be pushed to a different CPU until they returned to migrateable state. Account the number of tasks on a runqueue which are in a migrate disabled section and make the hotplug wait mechanism respect that. 
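Illustrative only, not the patch itself: the shape of the nr_pinned accounting, with a generic per-CPU counter standing in for the runqueue field (pinned_count and the sketch_* names are invented). Both ends run with preemption disabled, so the increment and the later decrement land on the CPU the task is pinned to, and the hotplug side can treat a zero count as 'no migrate-disabled task is still parked here'.

#include <linux/percpu.h>
#include <linux/preempt.h>
#include <linux/types.h>

static DEFINE_PER_CPU(unsigned int, pinned_count);

static void sketch_migrate_disable(void)
{
	preempt_disable();
	__this_cpu_inc(pinned_count);	/* rq->nr_pinned++ in the real code */
	preempt_enable();
}

static void sketch_migrate_enable(void)
{
	preempt_disable();
	__this_cpu_dec(pinned_count);	/* rq->nr_pinned-- in the real code */
	preempt_enable();
}

/* Hotplug side: the CPU is only quiescent once nothing is pinned to it. */
static bool sketch_cpu_quiescent(int cpu)
{
	return per_cpu(pinned_count, cpu) == 0;
}
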
Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Valentin Schneider Reviewed-by: Daniel Bristot de Oliveira Link: https://lkml.kernel.org/r/20201023102347.067278757@infradead.org --- kernel/sched/core.c | 36 ++++++++++++++++++++++++++++++------ kernel/sched/sched.h | 4 ++++ 2 files changed, 34 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 0efc1e41bb60..6ea593c79f83 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1721,10 +1721,17 @@ static void migrate_disable_switch(struct rq *rq, struct task_struct *p) void migrate_disable(void) { - if (current->migration_disabled++) + struct task_struct *p = current; + + if (p->migration_disabled) { + p->migration_disabled++; return; + } - barrier(); + preempt_disable(); + this_rq()->nr_pinned++; + p->migration_disabled = 1; + preempt_enable(); } EXPORT_SYMBOL_GPL(migrate_disable); @@ -1751,6 +1758,7 @@ void migrate_enable(void) */ barrier(); p->migration_disabled = 0; + this_rq()->nr_pinned--; preempt_enable(); } EXPORT_SYMBOL_GPL(migrate_enable); @@ -1760,6 +1768,11 @@ static inline bool is_migration_disabled(struct task_struct *p) return p->migration_disabled; } +static inline bool rq_has_pinned_tasks(struct rq *rq) +{ + return rq->nr_pinned; +} + #endif /* @@ -2693,6 +2706,11 @@ static inline bool is_migration_disabled(struct task_struct *p) return false; } +static inline bool rq_has_pinned_tasks(struct rq *rq) +{ + return false; +} + #endif static void @@ -7066,15 +7084,20 @@ static void balance_push(struct rq *rq) * Both the cpu-hotplug and stop task are in this case and are * required to complete the hotplug process. */ - if (is_per_cpu_kthread(push_task)) { + if (is_per_cpu_kthread(push_task) || is_migration_disabled(push_task)) { /* * If this is the idle task on the outgoing CPU try to wake * up the hotplug control thread which might wait for the * last task to vanish. The rcuwait_active() check is * accurate here because the waiter is pinned on this CPU * and can't obviously be running in parallel. + * + * On RT kernels this also has to check whether there are + * pinned and scheduled out tasks on the runqueue. They + * need to leave the migrate disabled section first. 
*/ - if (!rq->nr_running && rcuwait_active(&rq->hotplug_wait)) { + if (!rq->nr_running && !rq_has_pinned_tasks(rq) && + rcuwait_active(&rq->hotplug_wait)) { raw_spin_unlock(&rq->lock); rcuwait_wake_up(&rq->hotplug_wait); raw_spin_lock(&rq->lock); @@ -7121,7 +7144,8 @@ static void balance_hotplug_wait(void) { struct rq *rq = this_rq(); - rcuwait_wait_event(&rq->hotplug_wait, rq->nr_running == 1, + rcuwait_wait_event(&rq->hotplug_wait, + rq->nr_running == 1 && !rq_has_pinned_tasks(rq), TASK_UNINTERRUPTIBLE); } @@ -7366,7 +7390,7 @@ int sched_cpu_dying(unsigned int cpu) sched_tick_stop(cpu); rq_lock_irqsave(rq, &rf); - BUG_ON(rq->nr_running != 1); + BUG_ON(rq->nr_running != 1 || rq_has_pinned_tasks(rq)); rq_unlock_irqrestore(rq, &rf); calc_load_migrate(rq); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 72d8e47cf0bb..42de1406c0dc 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1053,6 +1053,10 @@ struct rq { /* Must be inspected within a rcu lock section */ struct cpuidle_state *idle_state; #endif + +#if defined(CONFIG_PREEMPT_RT) && defined(CONFIG_SMP) + unsigned int nr_pinned; +#endif }; #ifdef CONFIG_FAIR_GROUP_SCHED -- cgit v1.2.3 From 14e292f8d45380c519a83d9b0f37089a17eedcdf Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 1 Oct 2020 15:54:14 +0200 Subject: sched,rt: Use cpumask_any*_distribute() Replace a bunch of cpumask_any*() instances with cpumask_any*_distribute(), by injecting this little bit of random in cpu selection, we reduce the chance two competing balance operations working off the same lowest_mask pick the same CPU. Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Valentin Schneider Reviewed-by: Daniel Bristot de Oliveira Link: https://lkml.kernel.org/r/20201023102347.190759694@infradead.org --- include/linux/cpumask.h | 6 ++++++ kernel/sched/deadline.c | 6 +++--- kernel/sched/rt.c | 6 +++--- lib/cpumask.c | 18 ++++++++++++++++++ 4 files changed, 30 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index f0d895d6ac39..383684e30f12 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -199,6 +199,11 @@ static inline int cpumask_any_and_distribute(const struct cpumask *src1p, return cpumask_next_and(-1, src1p, src2p); } +static inline int cpumask_any_distribute(const struct cpumask *srcp) +{ + return cpumask_first(srcp); +} + #define for_each_cpu(cpu, mask) \ for ((cpu) = 0; (cpu) < 1; (cpu)++, (void)mask) #define for_each_cpu_not(cpu, mask) \ @@ -252,6 +257,7 @@ int cpumask_any_but(const struct cpumask *mask, unsigned int cpu); unsigned int cpumask_local_spread(unsigned int i, int node); int cpumask_any_and_distribute(const struct cpumask *src1p, const struct cpumask *src2p); +int cpumask_any_distribute(const struct cpumask *srcp); /** * for_each_cpu - iterate over every cpu in a mask diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index e97c7c2708bc..206a0703fcbc 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -2002,8 +2002,8 @@ static int find_later_rq(struct task_struct *task) return this_cpu; } - best_cpu = cpumask_first_and(later_mask, - sched_domain_span(sd)); + best_cpu = cpumask_any_and_distribute(later_mask, + sched_domain_span(sd)); /* * Last chance: if a CPU being in both later_mask * and current sd span is valid, that becomes our @@ -2025,7 +2025,7 @@ static int find_later_rq(struct task_struct *task) if (this_cpu != -1) return this_cpu; - cpu = cpumask_any(later_mask); + cpu = 
cpumask_any_distribute(later_mask); if (cpu < nr_cpu_ids) return cpu; diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 40a46639f78a..2525a1beed26 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -1752,8 +1752,8 @@ static int find_lowest_rq(struct task_struct *task) return this_cpu; } - best_cpu = cpumask_first_and(lowest_mask, - sched_domain_span(sd)); + best_cpu = cpumask_any_and_distribute(lowest_mask, + sched_domain_span(sd)); if (best_cpu < nr_cpu_ids) { rcu_read_unlock(); return best_cpu; @@ -1770,7 +1770,7 @@ static int find_lowest_rq(struct task_struct *task) if (this_cpu != -1) return this_cpu; - cpu = cpumask_any(lowest_mask); + cpu = cpumask_any_distribute(lowest_mask); if (cpu < nr_cpu_ids) return cpu; diff --git a/lib/cpumask.c b/lib/cpumask.c index 85da6ab4fbb5..35924025097b 100644 --- a/lib/cpumask.c +++ b/lib/cpumask.c @@ -267,3 +267,21 @@ int cpumask_any_and_distribute(const struct cpumask *src1p, return next; } EXPORT_SYMBOL(cpumask_any_and_distribute); + +int cpumask_any_distribute(const struct cpumask *srcp) +{ + int next, prev; + + /* NOTE: our first selection will skip 0. */ + prev = __this_cpu_read(distribute_cpu_mask_prev); + + next = cpumask_next(prev, srcp); + if (next >= nr_cpu_ids) + next = cpumask_first(srcp); + + if (next < nr_cpu_ids) + __this_cpu_write(distribute_cpu_mask_prev, next); + + return next; +} +EXPORT_SYMBOL(cpumask_any_distribute); -- cgit v1.2.3 From 95158a89dd50035b4ff5b8aa913854166b50fe6d Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 1 Oct 2020 16:05:39 +0200 Subject: sched,rt: Use the full cpumask for balancing We want migrate_disable() tasks to get PULLs in order for them to PUSH away the higher priority task. Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Valentin Schneider Reviewed-by: Daniel Bristot de Oliveira Link: https://lkml.kernel.org/r/20201023102347.310519774@infradead.org --- kernel/sched/cpudeadline.c | 4 ++-- kernel/sched/cpupri.c | 4 ++-- kernel/sched/deadline.c | 4 ++-- kernel/sched/rt.c | 4 ++-- 4 files changed, 8 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c index 8cb06c8c7eb1..ceb03d76c0cc 100644 --- a/kernel/sched/cpudeadline.c +++ b/kernel/sched/cpudeadline.c @@ -120,7 +120,7 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p, const struct sched_dl_entity *dl_se = &p->dl; if (later_mask && - cpumask_and(later_mask, cp->free_cpus, p->cpus_ptr)) { + cpumask_and(later_mask, cp->free_cpus, &p->cpus_mask)) { unsigned long cap, max_cap = 0; int cpu, max_cpu = -1; @@ -151,7 +151,7 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p, WARN_ON(best_cpu != -1 && !cpu_present(best_cpu)); - if (cpumask_test_cpu(best_cpu, p->cpus_ptr) && + if (cpumask_test_cpu(best_cpu, &p->cpus_mask) && dl_time_before(dl_se->deadline, cp->elements[0].dl)) { if (later_mask) cpumask_set_cpu(best_cpu, later_mask); diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c index 0033731a0797..11c4df2010de 100644 --- a/kernel/sched/cpupri.c +++ b/kernel/sched/cpupri.c @@ -73,11 +73,11 @@ static inline int __cpupri_find(struct cpupri *cp, struct task_struct *p, if (skip) return 0; - if (cpumask_any_and(p->cpus_ptr, vec->mask) >= nr_cpu_ids) + if (cpumask_any_and(&p->cpus_mask, vec->mask) >= nr_cpu_ids) return 0; if (lowest_mask) { - cpumask_and(lowest_mask, p->cpus_ptr, vec->mask); + cpumask_and(lowest_mask, &p->cpus_mask, vec->mask); /* * We have to ensure that we have at least one bit diff --git a/kernel/sched/deadline.c 
b/kernel/sched/deadline.c index 206a0703fcbc..3d3fd8370447 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1912,7 +1912,7 @@ static void task_fork_dl(struct task_struct *p) static int pick_dl_task(struct rq *rq, struct task_struct *p, int cpu) { if (!task_running(rq, p) && - cpumask_test_cpu(cpu, p->cpus_ptr)) + cpumask_test_cpu(cpu, &p->cpus_mask)) return 1; return 0; } @@ -2062,7 +2062,7 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq) /* Retry if something changed. */ if (double_lock_balance(rq, later_rq)) { if (unlikely(task_rq(task) != rq || - !cpumask_test_cpu(later_rq->cpu, task->cpus_ptr) || + !cpumask_test_cpu(later_rq->cpu, &task->cpus_mask) || task_running(rq, task) || !dl_task(task) || !task_on_rq_queued(task))) { diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 2525a1beed26..cf63346a07e4 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -1658,7 +1658,7 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p) static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu) { if (!task_running(rq, p) && - cpumask_test_cpu(cpu, p->cpus_ptr)) + cpumask_test_cpu(cpu, &p->cpus_mask)) return 1; return 0; @@ -1811,7 +1811,7 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq) * Also make sure that it wasn't scheduled on its rq. */ if (unlikely(task_rq(task) != rq || - !cpumask_test_cpu(lowest_rq->cpu, task->cpus_ptr) || + !cpumask_test_cpu(lowest_rq->cpu, &task->cpus_mask) || task_running(rq, task) || !rt_task(task) || !task_on_rq_queued(task))) { -- cgit v1.2.3 From ded467dc83ac7173f1532bb0faa25022ff8769e5 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 1 Oct 2020 16:13:01 +0200 Subject: sched, lockdep: Annotate ->pi_lock recursion There's a valid ->pi_lock recursion issue where the actual PI code tries to wake up the stop task. Make lockdep aware so it doesn't complain about this. Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Valentin Schneider Reviewed-by: Daniel Bristot de Oliveira Link: https://lkml.kernel.org/r/20201023102347.406912197@infradead.org --- kernel/sched/core.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 6ea593c79f83..9ce2fc7d3d51 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2658,6 +2658,7 @@ int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags) void sched_set_stop_task(int cpu, struct task_struct *stop) { + static struct lock_class_key stop_pi_lock; struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 }; struct task_struct *old_stop = cpu_rq(cpu)->stop; @@ -2673,6 +2674,20 @@ void sched_set_stop_task(int cpu, struct task_struct *stop) sched_setscheduler_nocheck(stop, SCHED_FIFO, ¶m); stop->sched_class = &stop_sched_class; + + /* + * The PI code calls rt_mutex_setprio() with ->pi_lock held to + * adjust the effective priority of a task. As a result, + * rt_mutex_setprio() can trigger (RT) balancing operations, + * which can then trigger wakeups of the stop thread to push + * around the current task. + * + * The stop task itself will never be part of the PI-chain, it + * never blocks, therefore that ->pi_lock recursion is safe. + * Tell lockdep about this by placing the stop->pi_lock in its + * own class. 
+ */ + lockdep_set_class(&stop->pi_lock, &stop_pi_lock); } cpu_rq(cpu)->stop = stop; -- cgit v1.2.3 From a7c81556ec4d341dfdbf2cc478ead89d73e474a7 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 28 Sep 2020 17:06:07 +0200 Subject: sched: Fix migrate_disable() vs rt/dl balancing In order to minimize the interference of migrate_disable() on lower priority tasks, which can be deprived of runtime due to being stuck below a higher priority task. Teach the RT/DL balancers to push away these higher priority tasks when a lower priority task gets selected to run on a freshly demoted CPU (pull). This adds migration interference to the higher priority task, but restores bandwidth to system that would otherwise be irrevocably lost. Without this it would be possible to have all tasks on the system stuck on a single CPU, each task preempted in a migrate_disable() section with a single high priority task running. This way we can still approximate running the M highest priority tasks on the system. Migrating the top task away is (ofcourse) still subject to migrate_disable() too, which means the lower task is subject to an interference equivalent to the worst case migrate_disable() section. Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Daniel Bristot de Oliveira Link: https://lkml.kernel.org/r/20201023102347.499155098@infradead.org --- include/linux/preempt.h | 40 ++++++++++++++++------------- include/linux/sched.h | 3 ++- kernel/sched/core.c | 67 +++++++++++++++++++++++++++++++++++++++++-------- kernel/sched/deadline.c | 29 +++++++++++++++------ kernel/sched/rt.c | 63 +++++++++++++++++++++++++++++++++++++--------- kernel/sched/sched.h | 32 +++++++++++++++++++++++ 6 files changed, 186 insertions(+), 48 deletions(-) (limited to 'kernel') diff --git a/include/linux/preempt.h b/include/linux/preempt.h index 97ba7c920653..8b43922e65df 100644 --- a/include/linux/preempt.h +++ b/include/linux/preempt.h @@ -325,24 +325,28 @@ static inline void preempt_notifier_init(struct preempt_notifier *notifier, #if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT) /* - * Migrate-Disable and why it is (strongly) undesired. - * - * The premise of the Real-Time schedulers we have on Linux - * (SCHED_FIFO/SCHED_DEADLINE) is that M CPUs can/will run M tasks - * concurrently, provided there are sufficient runnable tasks, also known as - * work-conserving. For instance SCHED_DEADLINE tries to schedule the M - * earliest deadline threads, and SCHED_FIFO the M highest priority threads. - * - * The correctness of various scheduling models depends on this, but is it - * broken by migrate_disable() that doesn't imply preempt_disable(). Where - * preempt_disable() implies an immediate priority ceiling, preemptible - * migrate_disable() allows nesting. - * - * The worst case is that all tasks preempt one another in a migrate_disable() - * region and stack on a single CPU. This then reduces the available bandwidth - * to a single CPU. And since Real-Time schedulability theory considers the - * Worst-Case only, all Real-Time analysis shall revert to single-CPU - * (instantly solving the SMP analysis problem). + * Migrate-Disable and why it is undesired. + * + * When a preempted task becomes elegible to run under the ideal model (IOW it + * becomes one of the M highest priority tasks), it might still have to wait + * for the preemptee's migrate_disable() section to complete. Thereby suffering + * a reduction in bandwidth in the exact duration of the migrate_disable() + * section. 
+ * + * Per this argument, the change from preempt_disable() to migrate_disable() + * gets us: + * + * - a higher priority tasks gains reduced wake-up latency; with preempt_disable() + * it would have had to wait for the lower priority task. + * + * - a lower priority tasks; which under preempt_disable() could've instantly + * migrated away when another CPU becomes available, is now constrained + * by the ability to push the higher priority task away, which might itself be + * in a migrate_disable() section, reducing it's available bandwidth. + * + * IOW it trades latency / moves the interference term, but it stays in the + * system, and as long as it remains unbounded, the system is not fully + * deterministic. * * * The reason we have it anyway. diff --git a/include/linux/sched.h b/include/linux/sched.h index 90a0c92741d7..3af9d52fe093 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -716,8 +716,9 @@ struct task_struct { cpumask_t cpus_mask; void *migration_pending; #if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT) - int migration_disabled; + unsigned short migration_disabled; #endif + unsigned short migration_flags; #ifdef CONFIG_PREEMPT_RCU int rcu_read_lock_nesting; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 9ce2fc7d3d51..e92d7853057c 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1763,11 +1763,6 @@ void migrate_enable(void) } EXPORT_SYMBOL_GPL(migrate_enable); -static inline bool is_migration_disabled(struct task_struct *p) -{ - return p->migration_disabled; -} - static inline bool rq_has_pinned_tasks(struct rq *rq) { return rq->nr_pinned; @@ -1972,6 +1967,49 @@ out: return 0; } +int push_cpu_stop(void *arg) +{ + struct rq *lowest_rq = NULL, *rq = this_rq(); + struct task_struct *p = arg; + + raw_spin_lock_irq(&p->pi_lock); + raw_spin_lock(&rq->lock); + + if (task_rq(p) != rq) + goto out_unlock; + + if (is_migration_disabled(p)) { + p->migration_flags |= MDF_PUSH; + goto out_unlock; + } + + p->migration_flags &= ~MDF_PUSH; + + if (p->sched_class->find_lock_rq) + lowest_rq = p->sched_class->find_lock_rq(p, rq); + + if (!lowest_rq) + goto out_unlock; + + // XXX validate p is still the highest prio task + if (task_rq(p) == rq) { + deactivate_task(rq, p, 0); + set_task_cpu(p, lowest_rq->cpu); + activate_task(lowest_rq, p, 0); + resched_curr(lowest_rq); + } + + double_unlock_balance(rq, lowest_rq); + +out_unlock: + rq->push_busy = false; + raw_spin_unlock(&rq->lock); + raw_spin_unlock_irq(&p->pi_lock); + + put_task_struct(p); + return 0; +} + /* * sched_class::set_cpus_allowed must do the below, but is not required to * actually call this function. @@ -2052,6 +2090,14 @@ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flag /* Can the task run on the task's current CPU? 
If so, we're done */ if (cpumask_test_cpu(task_cpu(p), &p->cpus_mask)) { + struct task_struct *push_task = NULL; + + if ((flags & SCA_MIGRATE_ENABLE) && + (p->migration_flags & MDF_PUSH) && !rq->push_busy) { + rq->push_busy = true; + push_task = get_task_struct(p); + } + pending = p->migration_pending; if (pending) { refcount_inc(&pending->refs); @@ -2060,6 +2106,11 @@ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flag } task_rq_unlock(rq, p, rf); + if (push_task) { + stop_one_cpu_nowait(rq->cpu, push_cpu_stop, + p, &rq->push_work); + } + if (complete) goto do_complete; @@ -2098,6 +2149,7 @@ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flag if (flags & SCA_MIGRATE_ENABLE) { refcount_inc(&pending->refs); /* pending->{arg,stop_work} */ + p->migration_flags &= ~MDF_PUSH; task_rq_unlock(rq, p, rf); pending->arg = (struct migration_arg) { @@ -2716,11 +2768,6 @@ static inline int __set_cpus_allowed_ptr(struct task_struct *p, static inline void migrate_disable_switch(struct rq *rq, struct task_struct *p) { } -static inline bool is_migration_disabled(struct task_struct *p) -{ - return false; -} - static inline bool rq_has_pinned_tasks(struct rq *rq) { return false; diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 3d3fd8370447..eed2e449b313 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -2129,6 +2129,9 @@ static int push_dl_task(struct rq *rq) return 0; retry: + if (is_migration_disabled(next_task)) + return 0; + if (WARN_ON(next_task == rq->curr)) return 0; @@ -2206,7 +2209,7 @@ static void push_dl_tasks(struct rq *rq) static void pull_dl_task(struct rq *this_rq) { int this_cpu = this_rq->cpu, cpu; - struct task_struct *p; + struct task_struct *p, *push_task; bool resched = false; struct rq *src_rq; u64 dmin = LONG_MAX; @@ -2236,6 +2239,7 @@ static void pull_dl_task(struct rq *this_rq) continue; /* Might drop this_rq->lock */ + push_task = NULL; double_lock_balance(this_rq, src_rq); /* @@ -2267,17 +2271,27 @@ static void pull_dl_task(struct rq *this_rq) src_rq->curr->dl.deadline)) goto skip; - resched = true; - - deactivate_task(src_rq, p, 0); - set_task_cpu(p, this_cpu); - activate_task(this_rq, p, 0); - dmin = p->dl.deadline; + if (is_migration_disabled(p)) { + push_task = get_push_task(src_rq); + } else { + deactivate_task(src_rq, p, 0); + set_task_cpu(p, this_cpu); + activate_task(this_rq, p, 0); + dmin = p->dl.deadline; + resched = true; + } /* Is there any other task even earlier? */ } skip: double_unlock_balance(this_rq, src_rq); + + if (push_task) { + raw_spin_unlock(&this_rq->lock); + stop_one_cpu_nowait(src_rq->cpu, push_cpu_stop, + push_task, &src_rq->push_work); + raw_spin_lock(&this_rq->lock); + } } if (resched) @@ -2524,6 +2538,7 @@ const struct sched_class dl_sched_class .rq_online = rq_online_dl, .rq_offline = rq_offline_dl, .task_woken = task_woken_dl, + .find_lock_rq = find_lock_later_rq, #endif .task_tick = task_tick_dl, diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index cf63346a07e4..c592e47cafed 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -1859,7 +1859,7 @@ static struct task_struct *pick_next_pushable_task(struct rq *rq) * running task can migrate over to a CPU that is running a task * of lesser priority. 
*/ -static int push_rt_task(struct rq *rq) +static int push_rt_task(struct rq *rq, bool pull) { struct task_struct *next_task; struct rq *lowest_rq; @@ -1873,6 +1873,34 @@ static int push_rt_task(struct rq *rq) return 0; retry: + if (is_migration_disabled(next_task)) { + struct task_struct *push_task = NULL; + int cpu; + + if (!pull || rq->push_busy) + return 0; + + cpu = find_lowest_rq(rq->curr); + if (cpu == -1 || cpu == rq->cpu) + return 0; + + /* + * Given we found a CPU with lower priority than @next_task, + * therefore it should be running. However we cannot migrate it + * to this other CPU, instead attempt to push the current + * running task on this CPU away. + */ + push_task = get_push_task(rq); + if (push_task) { + raw_spin_unlock(&rq->lock); + stop_one_cpu_nowait(rq->cpu, push_cpu_stop, + push_task, &rq->push_work); + raw_spin_lock(&rq->lock); + } + + return 0; + } + if (WARN_ON(next_task == rq->curr)) return 0; @@ -1927,12 +1955,10 @@ retry: deactivate_task(rq, next_task, 0); set_task_cpu(next_task, lowest_rq->cpu); activate_task(lowest_rq, next_task, 0); - ret = 1; - resched_curr(lowest_rq); + ret = 1; double_unlock_balance(rq, lowest_rq); - out: put_task_struct(next_task); @@ -1942,7 +1968,7 @@ out: static void push_rt_tasks(struct rq *rq) { /* push_rt_task will return true if it moved an RT */ - while (push_rt_task(rq)) + while (push_rt_task(rq, false)) ; } @@ -2095,7 +2121,8 @@ void rto_push_irq_work_func(struct irq_work *work) */ if (has_pushable_tasks(rq)) { raw_spin_lock(&rq->lock); - push_rt_tasks(rq); + while (push_rt_task(rq, true)) + ; raw_spin_unlock(&rq->lock); } @@ -2120,7 +2147,7 @@ static void pull_rt_task(struct rq *this_rq) { int this_cpu = this_rq->cpu, cpu; bool resched = false; - struct task_struct *p; + struct task_struct *p, *push_task; struct rq *src_rq; int rt_overload_count = rt_overloaded(this_rq); @@ -2167,6 +2194,7 @@ static void pull_rt_task(struct rq *this_rq) * double_lock_balance, and another CPU could * alter this_rq */ + push_task = NULL; double_lock_balance(this_rq, src_rq); /* @@ -2194,11 +2222,14 @@ static void pull_rt_task(struct rq *this_rq) if (p->prio < src_rq->curr->prio) goto skip; - resched = true; - - deactivate_task(src_rq, p, 0); - set_task_cpu(p, this_cpu); - activate_task(this_rq, p, 0); + if (is_migration_disabled(p)) { + push_task = get_push_task(src_rq); + } else { + deactivate_task(src_rq, p, 0); + set_task_cpu(p, this_cpu); + activate_task(this_rq, p, 0); + resched = true; + } /* * We continue with the search, just in * case there's an even higher prio task @@ -2208,6 +2239,13 @@ static void pull_rt_task(struct rq *this_rq) } skip: double_unlock_balance(this_rq, src_rq); + + if (push_task) { + raw_spin_unlock(&this_rq->lock); + stop_one_cpu_nowait(src_rq->cpu, push_cpu_stop, + push_task, &src_rq->push_work); + raw_spin_lock(&this_rq->lock); + } } if (resched) @@ -2449,6 +2487,7 @@ const struct sched_class rt_sched_class .rq_offline = rq_offline_rt, .task_woken = task_woken_rt, .switched_from = switched_from_rt, + .find_lock_rq = find_lock_lowest_rq, #endif .task_tick = task_tick_rt, diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 42de1406c0dc..56992aaca48e 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1057,6 +1057,8 @@ struct rq { #if defined(CONFIG_PREEMPT_RT) && defined(CONFIG_SMP) unsigned int nr_pinned; #endif + unsigned int push_busy; + struct cpu_stop_work push_work; }; #ifdef CONFIG_FAIR_GROUP_SCHED @@ -1084,6 +1086,16 @@ static inline int cpu_of(struct rq *rq) #endif } +#define 
MDF_PUSH 0x01 + +static inline bool is_migration_disabled(struct task_struct *p) +{ +#if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT) + return p->migration_disabled; +#else + return false; +#endif +} #ifdef CONFIG_SCHED_SMT extern void __update_idle_core(struct rq *rq); @@ -1823,6 +1835,8 @@ struct sched_class { void (*rq_online)(struct rq *rq); void (*rq_offline)(struct rq *rq); + + struct rq *(*find_lock_rq)(struct task_struct *p, struct rq *rq); #endif void (*task_tick)(struct rq *rq, struct task_struct *p, int queued); @@ -1918,6 +1932,24 @@ extern void trigger_load_balance(struct rq *rq); extern void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask, u32 flags); +static inline struct task_struct *get_push_task(struct rq *rq) +{ + struct task_struct *p = rq->curr; + + lockdep_assert_held(&rq->lock); + + if (rq->push_busy) + return NULL; + + if (p->nr_cpus_allowed == 1) + return NULL; + + rq->push_busy = true; + return get_task_struct(p); +} + +extern int push_cpu_stop(void *arg); + #endif #ifdef CONFIG_CPU_IDLE -- cgit v1.2.3 From 885b3ba47aa5cc16550beb8a42181ad5e8302ceb Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Tue, 13 Oct 2020 15:01:15 +0100 Subject: sched: Deny self-issued __set_cpus_allowed_ptr() when migrate_disable() migrate_disable(); set_cpus_allowed_ptr(current, {something excluding task_cpu(current)}); affine_move_task(); <-- never returns Signed-off-by: Valentin Schneider Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20201013140116.26651-1-valentin.schneider@arm.com --- kernel/sched/core.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index e92d7853057c..88c6fcb3bb65 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2238,8 +2238,17 @@ static int __set_cpus_allowed_ptr(struct task_struct *p, goto out; } - if (!(flags & SCA_MIGRATE_ENABLE) && cpumask_equal(&p->cpus_mask, new_mask)) - goto out; + if (!(flags & SCA_MIGRATE_ENABLE)) { + if (cpumask_equal(&p->cpus_mask, new_mask)) + goto out; + + if (WARN_ON_ONCE(p == current && + is_migration_disabled(p) && + !cpumask_test_cpu(task_cpu(p), new_mask))) { + ret = -EBUSY; + goto out; + } + } /* * Picking a ~random cpu helps in cases where we are changing affinity -- cgit v1.2.3 From c777d847107e80df24dae87fc9cf4b4c0bf4dfed Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Tue, 13 Oct 2020 15:01:16 +0100 Subject: sched: Comment affine_move_task() Signed-off-by: Valentin Schneider Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20201013140116.26651-2-valentin.schneider@arm.com --- kernel/sched/core.c | 81 +++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 79 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 88c6fcb3bb65..c6409f34fa2d 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2076,7 +2076,75 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) } /* - * This function is wildly self concurrent, consider at least 3 times. + * This function is wildly self concurrent; here be dragons. + * + * + * When given a valid mask, __set_cpus_allowed_ptr() must block until the + * designated task is enqueued on an allowed CPU. If that task is currently + * running, we have to kick it out using the CPU stopper. + * + * Migrate-Disable comes along and tramples all over our nice sandcastle. 
+ * Consider: + * + * Initial conditions: P0->cpus_mask = [0, 1] + * + * P0@CPU0 P1 + * + * migrate_disable(); + * + * set_cpus_allowed_ptr(P0, [1]); + * + * P1 *cannot* return from this set_cpus_allowed_ptr() call until P0 executes + * its outermost migrate_enable() (i.e. it exits its Migrate-Disable region). + * This means we need the following scheme: + * + * P0@CPU0 P1 + * + * migrate_disable(); + * + * set_cpus_allowed_ptr(P0, [1]); + * + * + * migrate_enable(); + * __set_cpus_allowed_ptr(); + * + * `--> + * + * Now the fun stuff: there may be several P1-like tasks, i.e. multiple + * concurrent set_cpus_allowed_ptr(P0, [*]) calls. CPU affinity changes of any + * task p are serialized by p->pi_lock, which we can leverage: the one that + * should come into effect at the end of the Migrate-Disable region is the last + * one. This means we only need to track a single cpumask (i.e. p->cpus_mask), + * but we still need to properly signal those waiting tasks at the appropriate + * moment. + * + * This is implemented using struct set_affinity_pending. The first + * __set_cpus_allowed_ptr() caller within a given Migrate-Disable region will + * setup an instance of that struct and install it on the targeted task_struct. + * Any and all further callers will reuse that instance. Those then wait for + * a completion signaled at the tail of the CPU stopper callback (1), triggered + * on the end of the Migrate-Disable region (i.e. outermost migrate_enable()). + * + * + * (1) In the cases covered above. There is one more where the completion is + * signaled within affine_move_task() itself: when a subsequent affinity request + * cancels the need for an active migration. Consider: + * + * Initial conditions: P0->cpus_mask = [0, 1] + * + * P0@CPU0 P1 P2 + * + * migrate_disable(); + * + * set_cpus_allowed_ptr(P0, [1]); + * + * set_cpus_allowed_ptr(P0, [0, 1]); + * + * + * + * Note that the above is safe vs a concurrent migrate_enable(), as any + * pending affinity completion is preceded by an uninstallation of + * p->migration_pending done with p->pi_lock held. */ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flags *rf, int dest_cpu, unsigned int flags) @@ -2120,6 +2188,7 @@ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flag if (!(flags & SCA_MIGRATE_ENABLE)) { /* serialized by p->pi_lock */ if (!p->migration_pending) { + /* Install the request */ refcount_set(&my_pending.refs, 1); init_completion(&my_pending.done); p->migration_pending = &my_pending; @@ -2165,7 +2234,11 @@ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flag } if (task_running(rq, p) || p->state == TASK_WAKING) { - + /* + * Lessen races (and headaches) by delegating + * is_migration_disabled(p) checks to the stopper, which will + * run on the same CPU as said p. 
+ */ task_rq_unlock(rq, p, rf); stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); @@ -2190,6 +2263,10 @@ do_complete: if (refcount_dec_and_test(&pending->refs)) wake_up_var(&pending->refs); + /* + * Block the original owner of &pending until all subsequent callers + * have seen the completion and decremented the refcount + */ wait_var_event(&my_pending.refs, !refcount_read(&my_pending.refs)); return 0; -- cgit v1.2.3 From cdb310474dece99985e4cdd2b96b1324e39c1c9d Mon Sep 17 00:00:00 2001 From: Hui Su Date: Fri, 30 Oct 2020 22:46:21 +0800 Subject: sched/fair: Remove superfluous lock section in do_sched_cfs_slack_timer() Since ab93a4bc955b ("sched/fair: Remove distribute_running fromCFS bandwidth"), there is nothing to protect between raw_spin_lock_irqsave/store() in do_sched_cfs_slack_timer(). Signed-off-by: Hui Su Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Phil Auld Reviewed-by: Ben Segall Link: https://lkml.kernel.org/r/20201030144621.GA96974@rlk --- kernel/sched/fair.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 2755a7e0f1ce..3e5d98f861a1 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5126,9 +5126,6 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b) return; distribute_cfs_runtime(cfs_b); - - raw_spin_lock_irqsave(&cfs_b->lock, flags); - raw_spin_unlock_irqrestore(&cfs_b->lock, flags); } /* -- cgit v1.2.3 From 17770579059258c5f1eef759e941af5f1a54f482 Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Mon, 2 Nov 2020 18:45:12 +0000 Subject: sched: Add WF_TTWU, WF_EXEC wakeup flags To remove the sd_flag parameter of select_task_rq(), we need another way of encoding wakeup types. There already is a WF_FORK flag, add the missing two. With that said, we still need an easy way to turn WF_foo into SD_bar (e.g. WF_TTWU into SD_BALANCE_WAKE). As suggested by Peter, let's make our lives easier and make them match exactly, and throw in some compile-time checks for good measure. Signed-off-by: Valentin Schneider Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20201102184514.2733-2-valentin.schneider@arm.com --- kernel/sched/sched.h | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index e897d779839f..47258735a93f 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1744,13 +1744,20 @@ static inline int task_on_rq_migrating(struct task_struct *p) return READ_ONCE(p->on_rq) == TASK_ON_RQ_MIGRATING; } -/* - * wake flags - */ -#define WF_SYNC 0x01 /* Waker goes to sleep after wakeup */ -#define WF_FORK 0x02 /* Child wakeup after fork */ -#define WF_MIGRATED 0x04 /* Internal use, task got migrated */ -#define WF_ON_CPU 0x08 /* Wakee is on_cpu */ +/* Wake flags. 
The first three directly map to some SD flag value */ +#define WF_EXEC 0x02 /* Wakeup after exec; maps to SD_BALANCE_EXEC */ +#define WF_FORK 0x04 /* Wakeup after fork; maps to SD_BALANCE_FORK */ +#define WF_TTWU 0x08 /* Wakeup; maps to SD_BALANCE_WAKE */ + +#define WF_SYNC 0x10 /* Waker goes to sleep after wakeup */ +#define WF_MIGRATED 0x20 /* Internal use, task got migrated */ +#define WF_ON_CPU 0x40 /* Wakee is on_cpu */ + +#ifdef CONFIG_SMP +static_assert(WF_EXEC == SD_BALANCE_EXEC); +static_assert(WF_FORK == SD_BALANCE_FORK); +static_assert(WF_TTWU == SD_BALANCE_WAKE); +#endif /* * To aid in avoiding the subversion of "niceness" due to uneven distribution -- cgit v1.2.3 From 3aef1551e942860a3881087171ef0cd45f6ebda7 Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Mon, 2 Nov 2020 18:45:13 +0000 Subject: sched: Remove select_task_rq()'s sd_flag parameter Only select_task_rq_fair() uses that parameter to do an actual domain search, other classes only care about what kind of wakeup is happening (fork, exec, or "regular") and thus just translate the flag into a wakeup type. WF_TTWU and WF_EXEC have just been added, use these along with WF_FORK to encode the wakeup types we care about. For select_task_rq_fair(), we can simply use the shiny new WF_flag : SD_flag mapping. Signed-off-by: Valentin Schneider Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20201102184514.2733-3-valentin.schneider@arm.com --- kernel/sched/core.c | 10 +++++----- kernel/sched/deadline.c | 4 ++-- kernel/sched/fair.c | 8 +++++--- kernel/sched/idle.c | 2 +- kernel/sched/rt.c | 4 ++-- kernel/sched/sched.h | 2 +- kernel/sched/stop_task.c | 2 +- 7 files changed, 17 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 622f343413a6..a6aaf9fb3400 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2769,12 +2769,12 @@ out: * The caller (fork, wakeup) owns p->pi_lock, ->cpus_ptr is stable. 
*/ static inline -int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags) +int select_task_rq(struct task_struct *p, int cpu, int wake_flags) { lockdep_assert_held(&p->pi_lock); if (p->nr_cpus_allowed > 1 && !is_migration_disabled(p)) - cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags); + cpu = p->sched_class->select_task_rq(p, cpu, wake_flags); else cpu = cpumask_any(p->cpus_ptr); @@ -3409,7 +3409,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) */ smp_cond_load_acquire(&p->on_cpu, !VAL); - cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags); + cpu = select_task_rq(p, p->wake_cpu, wake_flags | WF_TTWU); if (task_cpu(p) != cpu) { wake_flags |= WF_MIGRATED; psi_ttwu_dequeue(p); @@ -3793,7 +3793,7 @@ void wake_up_new_task(struct task_struct *p) */ p->recent_used_cpu = task_cpu(p); rseq_migrate(p); - __set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0)); + __set_task_cpu(p, select_task_rq(p, task_cpu(p), WF_FORK)); #endif rq = __task_rq_lock(p, &rf); update_rq_clock(rq); @@ -4384,7 +4384,7 @@ void sched_exec(void) int dest_cpu; raw_spin_lock_irqsave(&p->pi_lock, flags); - dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), SD_BALANCE_EXEC, 0); + dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), WF_EXEC); if (dest_cpu == smp_processor_id()) goto unlock; diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index cc1feb79d786..2a5836f440e0 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1683,13 +1683,13 @@ static void yield_task_dl(struct rq *rq) static int find_later_rq(struct task_struct *task); static int -select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags) +select_task_rq_dl(struct task_struct *p, int cpu, int flags) { struct task_struct *curr; bool select_rq; struct rq *rq; - if (sd_flag != SD_BALANCE_WAKE) + if (!(flags & WF_TTWU)) goto out; rq = cpu_rq(cpu); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 3e5d98f861a1..b1596fa21bbe 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6684,7 +6684,7 @@ fail: /* * select_task_rq_fair: Select target runqueue for the waking task in domains - * that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE, + * that have the relevant SD flag set. In practice, this is SD_BALANCE_WAKE, * SD_BALANCE_FORK, or SD_BALANCE_EXEC. * * Balances load by selecting the idlest CPU in the idlest group, or under @@ -6695,13 +6695,15 @@ fail: * preempt must be disabled. 
*/ static int -select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags) +select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags) { + int sync = (wake_flags & WF_SYNC) && !(current->flags & PF_EXITING); struct sched_domain *tmp, *sd = NULL; int cpu = smp_processor_id(); int new_cpu = prev_cpu; int want_affine = 0; - int sync = (wake_flags & WF_SYNC) && !(current->flags & PF_EXITING); + /* SD_flags and WF_flags share the first nibble */ + int sd_flag = wake_flags & 0xF; if (sd_flag & SD_BALANCE_WAKE) { record_wakee(p); diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 9da69c4e0ee9..df91b198a74c 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -376,7 +376,7 @@ void cpu_startup_entry(enum cpuhp_state state) #ifdef CONFIG_SMP static int -select_task_rq_idle(struct task_struct *p, int cpu, int sd_flag, int flags) +select_task_rq_idle(struct task_struct *p, int cpu, int flags) { return task_cpu(p); /* IDLE tasks as never migrated */ } diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index c961a3fc0166..dbe4629cf7ba 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -1430,14 +1430,14 @@ static void yield_task_rt(struct rq *rq) static int find_lowest_rq(struct task_struct *task); static int -select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags) +select_task_rq_rt(struct task_struct *p, int cpu, int flags) { struct task_struct *curr; struct rq *rq; bool test; /* For anything but wake ups, just return the task_cpu */ - if (sd_flag != SD_BALANCE_WAKE && sd_flag != SD_BALANCE_FORK) + if (!(flags & (WF_TTWU | WF_FORK))) goto out; rq = cpu_rq(cpu); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 47258735a93f..590e6f27068c 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1833,7 +1833,7 @@ struct sched_class { #ifdef CONFIG_SMP int (*balance)(struct rq *rq, struct task_struct *prev, struct rq_flags *rf); - int (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags); + int (*select_task_rq)(struct task_struct *p, int task_cpu, int flags); void (*migrate_task_rq)(struct task_struct *p, int new_cpu); void (*task_woken)(struct rq *this_rq, struct task_struct *task); diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c index 91bb10cc070e..55f39125c0e1 100644 --- a/kernel/sched/stop_task.c +++ b/kernel/sched/stop_task.c @@ -11,7 +11,7 @@ #ifdef CONFIG_SMP static int -select_task_rq_stop(struct task_struct *p, int cpu, int sd_flag, int flags) +select_task_rq_stop(struct task_struct *p, int cpu, int flags) { return task_cpu(p); /* stop tasks as never migrate */ } -- cgit v1.2.3 From dc824eb898534cd8e34582874dae3bb7cf2fa008 Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Mon, 2 Nov 2020 18:45:14 +0000 Subject: sched/fair: Dissociate wakeup decisions from SD flag value The CFS wakeup code will only ever go through EAS / its fast path on "regular" wakeups (i.e. not on forks or execs). These are currently gated by a check against 'sd_flag', which would be SD_BALANCE_WAKE at wakeup. However, we now have a flag that explicitly tells us whether a wakeup is a "regular" one, so hinge those conditions on that flag instead. 
Signed-off-by: Valentin Schneider Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20201102184514.2733-4-valentin.schneider@arm.com --- kernel/sched/fair.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index b1596fa21bbe..6691e28fa3da 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6705,7 +6705,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags) /* SD_flags and WF_flags share the first nibble */ int sd_flag = wake_flags & 0xF; - if (sd_flag & SD_BALANCE_WAKE) { + if (wake_flags & WF_TTWU) { record_wakee(p); if (sched_energy_enabled()) { @@ -6742,9 +6742,8 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags) if (unlikely(sd)) { /* Slow path */ new_cpu = find_idlest_cpu(sd, p, cpu, prev_cpu, sd_flag); - } else if (sd_flag & SD_BALANCE_WAKE) { /* XXX always ? */ + } else if (wake_flags & WF_TTWU) { /* XXX always ? */ /* Fast path */ - new_cpu = select_idle_sibling(p, prev_cpu, new_cpu); if (want_affine) -- cgit v1.2.3 From d60cd06331a3566d3305b3c7b566e79edf4e2095 Mon Sep 17 00:00:00 2001 From: Kai-Heng Feng Date: Fri, 30 Oct 2020 15:06:57 +0800 Subject: PM: ACPI: reboot: Use S5 for reboot After reboot, it's not possible to use hotkeys to enter BIOS setup and boot menu on some HP laptops. BIOS folks identified the root cause is the missing _PTS call, and BIOS is expecting _PTS to do proper reset. Using S5 for reboot is default behavior under Windows, "A full shutdown (S5) occurs when a system restart is requested" [1], so let's do the same here. [1] https://docs.microsoft.com/en-us/windows/win32/power/system-power-states Signed-off-by: Kai-Heng Feng [ rjw: Subject edit ] Signed-off-by: Rafael J. Wysocki --- kernel/reboot.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/reboot.c b/kernel/reboot.c index e7b78d5ae1ab..7e5aa1f78693 100644 --- a/kernel/reboot.c +++ b/kernel/reboot.c @@ -244,6 +244,8 @@ void migrate_to_reboot_cpu(void) void kernel_restart(char *cmd) { kernel_restart_prepare(cmd); + if (pm_power_off_prepare) + pm_power_off_prepare(); migrate_to_reboot_cpu(); syscore_shutdown(); if (!cmd) -- cgit v1.2.3 From c250d50fe2ce627ca9805d9c8ac11cbbf922a4a6 Mon Sep 17 00:00:00 2001 From: Lukasz Luba Date: Thu, 5 Nov 2020 12:50:01 +0000 Subject: PM: EM: Add a flag indicating units of power values in Energy Model There are different platforms and devices which might use different scale for the power values. Kernel sub-systems might need to check if all Energy Model (EM) devices are using the same scale. Address that issue and store the information inside EM for each device. Thanks to that they can be easily compared and proper action triggered. Suggested-by: Daniel Lezcano Reviewed-by: Quentin Perret Signed-off-by: Lukasz Luba Signed-off-by: Rafael J. 
Wysocki --- drivers/cpufreq/scmi-cpufreq.c | 3 ++- drivers/opp/of.c | 2 +- include/linux/energy_model.h | 9 +++++++-- kernel/power/energy_model.c | 24 +++++++++++++++++++++++- 4 files changed, 33 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/drivers/cpufreq/scmi-cpufreq.c b/drivers/cpufreq/scmi-cpufreq.c index e855e8612a67..3714a4cd07fa 100644 --- a/drivers/cpufreq/scmi-cpufreq.c +++ b/drivers/cpufreq/scmi-cpufreq.c @@ -188,7 +188,8 @@ static int scmi_cpufreq_init(struct cpufreq_policy *policy) policy->fast_switch_possible = handle->perf_ops->fast_switch_possible(handle, cpu_dev); - em_dev_register_perf_domain(cpu_dev, nr_opp, &em_cb, policy->cpus); + em_dev_register_perf_domain(cpu_dev, nr_opp, &em_cb, policy->cpus, + false); return 0; diff --git a/drivers/opp/of.c b/drivers/opp/of.c index 9faeb83e4b32..16f39e2127a5 100644 --- a/drivers/opp/of.c +++ b/drivers/opp/of.c @@ -1335,7 +1335,7 @@ int dev_pm_opp_of_register_em(struct device *dev, struct cpumask *cpus) goto failed; } - ret = em_dev_register_perf_domain(dev, nr_opp, &em_cb, cpus); + ret = em_dev_register_perf_domain(dev, nr_opp, &em_cb, cpus, true); if (ret) goto failed; diff --git a/include/linux/energy_model.h b/include/linux/energy_model.h index b67a51c574b9..3a33c738d876 100644 --- a/include/linux/energy_model.h +++ b/include/linux/energy_model.h @@ -29,6 +29,8 @@ struct em_perf_state { * em_perf_domain - Performance domain * @table: List of performance states, in ascending order * @nr_perf_states: Number of performance states + * @milliwatts: Flag indicating the power values are in milli-Watts + * or some other scale. * @cpus: Cpumask covering the CPUs of the domain. It's here * for performance reasons to avoid potential cache * misses during energy calculations in the scheduler @@ -43,6 +45,7 @@ struct em_perf_state { struct em_perf_domain { struct em_perf_state *table; int nr_perf_states; + int milliwatts; unsigned long cpus[]; }; @@ -79,7 +82,8 @@ struct em_data_callback { struct em_perf_domain *em_cpu_get(int cpu); struct em_perf_domain *em_pd_get(struct device *dev); int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states, - struct em_data_callback *cb, cpumask_t *span); + struct em_data_callback *cb, cpumask_t *span, + bool milliwatts); void em_dev_unregister_perf_domain(struct device *dev); /** @@ -186,7 +190,8 @@ struct em_data_callback {}; static inline int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states, - struct em_data_callback *cb, cpumask_t *span) + struct em_data_callback *cb, cpumask_t *span, + bool milliwatts) { return -EINVAL; } diff --git a/kernel/power/energy_model.c b/kernel/power/energy_model.c index c1ff7fa030ab..efe2a595988e 100644 --- a/kernel/power/energy_model.c +++ b/kernel/power/energy_model.c @@ -52,6 +52,17 @@ static int em_debug_cpus_show(struct seq_file *s, void *unused) } DEFINE_SHOW_ATTRIBUTE(em_debug_cpus); +static int em_debug_units_show(struct seq_file *s, void *unused) +{ + struct em_perf_domain *pd = s->private; + char *units = pd->milliwatts ? 
"milliWatts" : "bogoWatts"; + + seq_printf(s, "%s\n", units); + + return 0; +} +DEFINE_SHOW_ATTRIBUTE(em_debug_units); + static void em_debug_create_pd(struct device *dev) { struct dentry *d; @@ -64,6 +75,8 @@ static void em_debug_create_pd(struct device *dev) debugfs_create_file("cpus", 0444, d, dev->em_pd->cpus, &em_debug_cpus_fops); + debugfs_create_file("units", 0444, d, dev->em_pd, &em_debug_units_fops); + /* Create a sub-directory for each performance state */ for (i = 0; i < dev->em_pd->nr_perf_states; i++) em_debug_create_ps(&dev->em_pd->table[i], d); @@ -250,17 +263,24 @@ EXPORT_SYMBOL_GPL(em_cpu_get); * @cpus : Pointer to cpumask_t, which in case of a CPU device is * obligatory. It can be taken from i.e. 'policy->cpus'. For other * type of devices this should be set to NULL. + * @milliwatts : Flag indicating that the power values are in milliWatts or + * in some other scale. It must be set properly. * * Create Energy Model tables for a performance domain using the callbacks * defined in cb. * + * The @milliwatts is important to set with correct value. Some kernel + * sub-systems might rely on this flag and check if all devices in the EM are + * using the same scale. + * * If multiple clients register the same performance domain, all but the first * registration will be ignored. * * Return 0 on success */ int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states, - struct em_data_callback *cb, cpumask_t *cpus) + struct em_data_callback *cb, cpumask_t *cpus, + bool milliwatts) { unsigned long cap, prev_cap = 0; int cpu, ret; @@ -313,6 +333,8 @@ int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states, if (ret) goto unlock; + dev->em_pd->milliwatts = milliwatts; + em_debug_create_pd(dev); dev_info(dev, "EM: created perf domain\n"); -- cgit v1.2.3 From f2c90b12e700fff6a0b5a1c32f446f05f9d0890c Mon Sep 17 00:00:00 2001 From: Lukasz Luba Date: Tue, 3 Nov 2020 09:05:59 +0000 Subject: PM: EM: update the comments related to power scale The Energy Model supports power values expressed in milli-Watts or in an 'abstract scale'. Update the related comments is the code to reflect that state. Reviewed-by: Quentin Perret Signed-off-by: Lukasz Luba Signed-off-by: Rafael J. Wysocki --- include/linux/energy_model.h | 11 +++++------ kernel/power/energy_model.c | 2 +- 2 files changed, 6 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/include/linux/energy_model.h b/include/linux/energy_model.h index 3a33c738d876..9618c0a46ef4 100644 --- a/include/linux/energy_model.h +++ b/include/linux/energy_model.h @@ -13,9 +13,8 @@ /** * em_perf_state - Performance state of a performance domain * @frequency: The frequency in KHz, for consistency with CPUFreq - * @power: The power consumed at this level, in milli-watts (by 1 CPU or - by a registered device). It can be a total power: static and - dynamic. + * @power: The power consumed at this level (by 1 CPU or by a registered + * device). It can be a total power: static and dynamic. * @cost: The cost coefficient associated with this level, used during * energy calculation. Equal to: power * max_frequency / frequency */ @@ -58,7 +57,7 @@ struct em_data_callback { /** * active_power() - Provide power at the next performance state of * a device - * @power : Active power at the performance state in mW + * @power : Active power at the performance state * (modified) * @freq : Frequency at the performance state in kHz * (modified) @@ -69,8 +68,8 @@ struct em_data_callback { * and frequency. 
* * In case of CPUs, the power is the one of a single CPU in the domain, - * expressed in milli-watts. It is expected to fit in the - * [0, EM_MAX_POWER] range. + * expressed in milli-Watts or an abstract scale. It is expected to + * fit in the [0, EM_MAX_POWER] range. * * Return 0 on success. */ diff --git a/kernel/power/energy_model.c b/kernel/power/energy_model.c index efe2a595988e..1358fa4abfa8 100644 --- a/kernel/power/energy_model.c +++ b/kernel/power/energy_model.c @@ -143,7 +143,7 @@ static int em_create_perf_table(struct device *dev, struct em_perf_domain *pd, /* * The power returned by active_state() is expected to be - * positive, in milli-watts and to fit into 16 bits. + * positive and to fit into 16 bits. */ if (!power || power > EM_MAX_POWER) { dev_err(dev, "EM: invalid power: %lu\n", -- cgit v1.2.3 From 951bb64621b8139c0cd99dcadc13e6510c08aa73 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 9 Nov 2020 17:19:28 -0800 Subject: bpf: Add in-kernel split BTF support Adjust in-kernel BTF implementation to support a split BTF mode of operation. Changes are mostly mirroring libbpf split BTF changes, with the exception of start_id being 0 for in-kernel implementation due to simpler read-only mode. Otherwise, for split BTF logic, most of the logic of jumping to base BTF, where necessary, is encapsulated in few helper functions. Type numbering and string offset in a split BTF are logically continuing where base BTF ends, so most of the high-level logic is kept without changes. Type verification and size resolution is only doing an added resolution of new split BTF types and relies on already cached size and type resolution results in the base BTF. Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20201110011932.3201430-2-andrii@kernel.org --- kernel/bpf/btf.c | 171 ++++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 119 insertions(+), 52 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 6324de8c59f7..727c1c27053f 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -203,12 +203,17 @@ struct btf { const char *strings; void *nohdr_data; struct btf_header hdr; - u32 nr_types; + u32 nr_types; /* includes VOID for base BTF */ u32 types_size; u32 data_size; refcount_t refcnt; u32 id; struct rcu_head rcu; + + /* split BTF support */ + struct btf *base_btf; + u32 start_id; /* first type ID in this BTF (0 for base BTF) */ + u32 start_str_off; /* first string offset (0 for base BTF) */ }; enum verifier_phase { @@ -449,14 +454,27 @@ static bool btf_type_is_datasec(const struct btf_type *t) return BTF_INFO_KIND(t->info) == BTF_KIND_DATASEC; } +static u32 btf_nr_types_total(const struct btf *btf) +{ + u32 total = 0; + + while (btf) { + total += btf->nr_types; + btf = btf->base_btf; + } + + return total; +} + s32 btf_find_by_name_kind(const struct btf *btf, const char *name, u8 kind) { const struct btf_type *t; const char *tname; - u32 i; + u32 i, total; - for (i = 1; i <= btf->nr_types; i++) { - t = btf->types[i]; + total = btf_nr_types_total(btf); + for (i = 1; i < total; i++) { + t = btf_type_by_id(btf, i); if (BTF_INFO_KIND(t->info) != kind) continue; @@ -599,8 +617,14 @@ static const struct btf_kind_operations *btf_type_ops(const struct btf_type *t) static bool btf_name_offset_valid(const struct btf *btf, u32 offset) { - return BTF_STR_OFFSET_VALID(offset) && - offset < btf->hdr.str_len; + if (!BTF_STR_OFFSET_VALID(offset)) + return false; + + while 
(offset < btf->start_str_off) + btf = btf->base_btf; + + offset -= btf->start_str_off; + return offset < btf->hdr.str_len; } static bool __btf_name_char_ok(char c, bool first, bool dot_ok) @@ -614,10 +638,22 @@ static bool __btf_name_char_ok(char c, bool first, bool dot_ok) return true; } +static const char *btf_str_by_offset(const struct btf *btf, u32 offset) +{ + while (offset < btf->start_str_off) + btf = btf->base_btf; + + offset -= btf->start_str_off; + if (offset < btf->hdr.str_len) + return &btf->strings[offset]; + + return NULL; +} + static bool __btf_name_valid(const struct btf *btf, u32 offset, bool dot_ok) { /* offset must be valid */ - const char *src = &btf->strings[offset]; + const char *src = btf_str_by_offset(btf, offset); const char *src_limit; if (!__btf_name_char_ok(*src, true, dot_ok)) @@ -650,27 +686,28 @@ static bool btf_name_valid_section(const struct btf *btf, u32 offset) static const char *__btf_name_by_offset(const struct btf *btf, u32 offset) { + const char *name; + if (!offset) return "(anon)"; - else if (offset < btf->hdr.str_len) - return &btf->strings[offset]; - else - return "(invalid-name-offset)"; + + name = btf_str_by_offset(btf, offset); + return name ?: "(invalid-name-offset)"; } const char *btf_name_by_offset(const struct btf *btf, u32 offset) { - if (offset < btf->hdr.str_len) - return &btf->strings[offset]; - - return NULL; + return btf_str_by_offset(btf, offset); } const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id) { - if (type_id > btf->nr_types) - return NULL; + while (type_id < btf->start_id) + btf = btf->base_btf; + type_id -= btf->start_id; + if (type_id >= btf->nr_types) + return NULL; return btf->types[type_id]; } @@ -1390,17 +1427,13 @@ static int btf_add_type(struct btf_verifier_env *env, struct btf_type *t) { struct btf *btf = env->btf; - /* < 2 because +1 for btf_void which is always in btf->types[0]. - * btf_void is not accounted in btf->nr_types because btf_void - * does not come from the BTF file. 
- */ - if (btf->types_size - btf->nr_types < 2) { + if (btf->types_size == btf->nr_types) { /* Expand 'types' array */ struct btf_type **new_types; u32 expand_by, new_size; - if (btf->types_size == BTF_MAX_TYPE) { + if (btf->start_id + btf->types_size == BTF_MAX_TYPE) { btf_verifier_log(env, "Exceeded max num of types"); return -E2BIG; } @@ -1414,18 +1447,23 @@ static int btf_add_type(struct btf_verifier_env *env, struct btf_type *t) if (!new_types) return -ENOMEM; - if (btf->nr_types == 0) - new_types[0] = &btf_void; - else + if (btf->nr_types == 0) { + if (!btf->base_btf) { + /* lazily init VOID type */ + new_types[0] = &btf_void; + btf->nr_types++; + } + } else { memcpy(new_types, btf->types, - sizeof(*btf->types) * (btf->nr_types + 1)); + sizeof(*btf->types) * btf->nr_types); + } kvfree(btf->types); btf->types = new_types; btf->types_size = new_size; } - btf->types[++(btf->nr_types)] = t; + btf->types[btf->nr_types++] = t; return 0; } @@ -1498,18 +1536,17 @@ static int env_resolve_init(struct btf_verifier_env *env) u32 *resolved_ids = NULL; u8 *visit_states = NULL; - /* +1 for btf_void */ - resolved_sizes = kvcalloc(nr_types + 1, sizeof(*resolved_sizes), + resolved_sizes = kvcalloc(nr_types, sizeof(*resolved_sizes), GFP_KERNEL | __GFP_NOWARN); if (!resolved_sizes) goto nomem; - resolved_ids = kvcalloc(nr_types + 1, sizeof(*resolved_ids), + resolved_ids = kvcalloc(nr_types, sizeof(*resolved_ids), GFP_KERNEL | __GFP_NOWARN); if (!resolved_ids) goto nomem; - visit_states = kvcalloc(nr_types + 1, sizeof(*visit_states), + visit_states = kvcalloc(nr_types, sizeof(*visit_states), GFP_KERNEL | __GFP_NOWARN); if (!visit_states) goto nomem; @@ -1561,21 +1598,27 @@ static bool env_type_is_resolve_sink(const struct btf_verifier_env *env, static bool env_type_is_resolved(const struct btf_verifier_env *env, u32 type_id) { - return env->visit_states[type_id] == RESOLVED; + /* base BTF types should be resolved by now */ + if (type_id < env->btf->start_id) + return true; + + return env->visit_states[type_id - env->btf->start_id] == RESOLVED; } static int env_stack_push(struct btf_verifier_env *env, const struct btf_type *t, u32 type_id) { + const struct btf *btf = env->btf; struct resolve_vertex *v; if (env->top_stack == MAX_RESOLVE_DEPTH) return -E2BIG; - if (env->visit_states[type_id] != NOT_VISITED) + if (type_id < btf->start_id + || env->visit_states[type_id - btf->start_id] != NOT_VISITED) return -EEXIST; - env->visit_states[type_id] = VISITED; + env->visit_states[type_id - btf->start_id] = VISITED; v = &env->stack[env->top_stack++]; v->t = t; @@ -1605,6 +1648,7 @@ static void env_stack_pop_resolved(struct btf_verifier_env *env, u32 type_id = env->stack[--(env->top_stack)].type_id; struct btf *btf = env->btf; + type_id -= btf->start_id; /* adjust to local type id */ btf->resolved_sizes[type_id] = resolved_size; btf->resolved_ids[type_id] = resolved_type_id; env->visit_states[type_id] = RESOLVED; @@ -1709,14 +1753,30 @@ btf_resolve_size(const struct btf *btf, const struct btf_type *type, return __btf_resolve_size(btf, type, type_size, NULL, NULL, NULL, NULL); } +static u32 btf_resolved_type_id(const struct btf *btf, u32 type_id) +{ + while (type_id < btf->start_id) + btf = btf->base_btf; + + return btf->resolved_ids[type_id - btf->start_id]; +} + /* The input param "type_id" must point to a needs_resolve type */ static const struct btf_type *btf_type_id_resolve(const struct btf *btf, u32 *type_id) { - *type_id = btf->resolved_ids[*type_id]; + *type_id = btf_resolved_type_id(btf, *type_id); return 
btf_type_by_id(btf, *type_id); } +static u32 btf_resolved_type_size(const struct btf *btf, u32 type_id) +{ + while (type_id < btf->start_id) + btf = btf->base_btf; + + return btf->resolved_sizes[type_id - btf->start_id]; +} + const struct btf_type *btf_type_id_size(const struct btf *btf, u32 *type_id, u32 *ret_size) { @@ -1731,7 +1791,7 @@ const struct btf_type *btf_type_id_size(const struct btf *btf, if (btf_type_has_size(size_type)) { size = size_type->size; } else if (btf_type_is_array(size_type)) { - size = btf->resolved_sizes[size_type_id]; + size = btf_resolved_type_size(btf, size_type_id); } else if (btf_type_is_ptr(size_type)) { size = sizeof(void *); } else { @@ -1739,14 +1799,14 @@ const struct btf_type *btf_type_id_size(const struct btf *btf, !btf_type_is_var(size_type))) return NULL; - size_type_id = btf->resolved_ids[size_type_id]; + size_type_id = btf_resolved_type_id(btf, size_type_id); size_type = btf_type_by_id(btf, size_type_id); if (btf_type_nosize_or_null(size_type)) return NULL; else if (btf_type_has_size(size_type)) size = size_type->size; else if (btf_type_is_array(size_type)) - size = btf->resolved_sizes[size_type_id]; + size = btf_resolved_type_size(btf, size_type_id); else if (btf_type_is_ptr(size_type)) size = sizeof(void *); else @@ -3798,7 +3858,7 @@ static int btf_check_all_metas(struct btf_verifier_env *env) cur = btf->nohdr_data + hdr->type_off; end = cur + hdr->type_len; - env->log_type_id = 1; + env->log_type_id = btf->base_btf ? btf->start_id : 1; while (cur < end) { struct btf_type *t = cur; s32 meta_size; @@ -3825,8 +3885,8 @@ static bool btf_resolve_valid(struct btf_verifier_env *env, return false; if (btf_type_is_struct(t) || btf_type_is_datasec(t)) - return !btf->resolved_ids[type_id] && - !btf->resolved_sizes[type_id]; + return !btf_resolved_type_id(btf, type_id) && + !btf_resolved_type_size(btf, type_id); if (btf_type_is_modifier(t) || btf_type_is_ptr(t) || btf_type_is_var(t)) { @@ -3846,7 +3906,7 @@ static bool btf_resolve_valid(struct btf_verifier_env *env, elem_type = btf_type_id_size(btf, &elem_type_id, &elem_size); return elem_type && !btf_type_is_modifier(elem_type) && (array->nelems * elem_size == - btf->resolved_sizes[type_id]); + btf_resolved_type_size(btf, type_id)); } return false; @@ -3888,7 +3948,8 @@ static int btf_resolve(struct btf_verifier_env *env, static int btf_check_all_types(struct btf_verifier_env *env) { struct btf *btf = env->btf; - u32 type_id; + const struct btf_type *t; + u32 type_id, i; int err; err = env_resolve_init(env); @@ -3896,8 +3957,9 @@ static int btf_check_all_types(struct btf_verifier_env *env) return err; env->phase++; - for (type_id = 1; type_id <= btf->nr_types; type_id++) { - const struct btf_type *t = btf_type_by_id(btf, type_id); + for (i = btf->base_btf ? 
0 : 1; i < btf->nr_types; i++) { + type_id = btf->start_id + i; + t = btf_type_by_id(btf, type_id); env->log_type_id = type_id; if (btf_type_needs_resolve(t) && @@ -3934,7 +3996,7 @@ static int btf_parse_type_sec(struct btf_verifier_env *env) return -EINVAL; } - if (!hdr->type_len) { + if (!env->btf->base_btf && !hdr->type_len) { btf_verifier_log(env, "No type found"); return -EINVAL; } @@ -3961,13 +4023,18 @@ static int btf_parse_str_sec(struct btf_verifier_env *env) return -EINVAL; } - if (!hdr->str_len || hdr->str_len - 1 > BTF_MAX_NAME_OFFSET || - start[0] || end[-1]) { + btf->strings = start; + + if (btf->base_btf && !hdr->str_len) + return 0; + if (!hdr->str_len || hdr->str_len - 1 > BTF_MAX_NAME_OFFSET || end[-1]) { + btf_verifier_log(env, "Invalid string section"); + return -EINVAL; + } + if (!btf->base_btf && start[0]) { btf_verifier_log(env, "Invalid string section"); return -EINVAL; } - - btf->strings = start; return 0; } @@ -4908,7 +4975,7 @@ static int __get_type_size(struct btf *btf, u32 btf_id, while (t && btf_type_is_modifier(t)) t = btf_type_by_id(btf, t->type); if (!t) { - *bad_type = btf->types[0]; + *bad_type = btf_type_by_id(btf, 0); return -EINVAL; } if (btf_type_is_ptr(t)) -- cgit v1.2.3 From 5329722057d41aebc31e391907a501feaa42f7d9 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 9 Nov 2020 17:19:29 -0800 Subject: bpf: Assign ID to vmlinux BTF and return extra info for BTF in GET_OBJ_INFO Allocate ID for vmlinux BTF. This makes it visible when iterating over all BTF objects in the system. To allow distinguishing vmlinux BTF (and later kernel module BTF) from user-provided BTFs, expose extra kernel_btf flag, as well as BTF name ("vmlinux" for vmlinux BTF, will equal to module's name for module BTF). We might want to later allow specifying BTF name for user-provided BTFs as well, if that makes sense. But currently this is reserved only for in-kernel BTFs. Having in-kernel BTFs exposed IDs will allow to extend BPF APIs that require in-kernel BTF type with ability to specify BTF types from kernel modules, not just vmlinux BTF. This will be implemented in a follow up patch set for fentry/fexit/fmod_ret/lsm/etc. 
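Illustration (not part of the patch above): a user-space consumer could query the new name and kernel_btf fields roughly as sketched below. This is a sketch only; it assumes libbpf's existing bpf_btf_get_fd_by_id() and bpf_obj_get_info_by_fd() helpers, and show_btf() is a hypothetical wrapper, not an API from this series.

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <bpf/bpf.h>	/* libbpf: bpf_btf_get_fd_by_id(), bpf_obj_get_info_by_fd() */

/* Print the name and kernel/user origin of the BTF object with the given ID. */
static int show_btf(unsigned int id)
{
	struct bpf_btf_info info;
	__u32 len = sizeof(info);
	char name[64];
	int fd, err;

	fd = bpf_btf_get_fd_by_id(id);
	if (fd < 0)
		return fd;

	memset(&info, 0, sizeof(info));
	/* Point the kernel at a buffer for the (possibly truncated) BTF name. */
	info.name = (__u64)(unsigned long)name;
	info.name_len = sizeof(name);

	err = bpf_obj_get_info_by_fd(fd, &info, &len);
	if (!err)
		printf("btf %u: %s%s\n", id, name,
		       info.kernel_btf ? " [kernel]" : "");

	close(fd);
	return err;
}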
Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20201110011932.3201430-3-andrii@kernel.org --- include/uapi/linux/bpf.h | 3 +++ kernel/bpf/btf.c | 43 +++++++++++++++++++++++++++++++++++++++--- tools/include/uapi/linux/bpf.h | 3 +++ 3 files changed, 46 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 9879d6793e90..162999b12790 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -4466,6 +4466,9 @@ struct bpf_btf_info { __aligned_u64 btf; __u32 btf_size; __u32 id; + __aligned_u64 name; + __u32 name_len; + __u32 kernel_btf; } __attribute__((aligned(8))); struct bpf_link_info { diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 727c1c27053f..856585db7aa7 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -214,6 +214,8 @@ struct btf { struct btf *base_btf; u32 start_id; /* first type ID in this BTF (0 for base BTF) */ u32 start_str_off; /* first string offset (0 for base BTF) */ + char name[MODULE_NAME_LEN]; + bool kernel_btf; }; enum verifier_phase { @@ -4429,6 +4431,8 @@ struct btf *btf_parse_vmlinux(void) btf->data = __start_BTF; btf->data_size = __stop_BTF - __start_BTF; + btf->kernel_btf = true; + snprintf(btf->name, sizeof(btf->name), "vmlinux"); err = btf_parse_hdr(env); if (err) @@ -4454,8 +4458,13 @@ struct btf *btf_parse_vmlinux(void) bpf_struct_ops_init(btf, log); - btf_verifier_env_free(env); refcount_set(&btf->refcnt, 1); + + err = btf_alloc_id(btf); + if (err) + goto errout; + + btf_verifier_env_free(env); return btf; errout: @@ -5553,7 +5562,9 @@ int btf_get_info_by_fd(const struct btf *btf, struct bpf_btf_info info; u32 info_copy, btf_copy; void __user *ubtf; - u32 uinfo_len; + char __user *uname; + u32 uinfo_len, uname_len, name_len; + int ret = 0; uinfo = u64_to_user_ptr(attr->info.info); uinfo_len = attr->info.info_len; @@ -5570,11 +5581,37 @@ int btf_get_info_by_fd(const struct btf *btf, return -EFAULT; info.btf_size = btf->data_size; + info.kernel_btf = btf->kernel_btf; + + uname = u64_to_user_ptr(info.name); + uname_len = info.name_len; + if (!uname ^ !uname_len) + return -EINVAL; + + name_len = strlen(btf->name); + info.name_len = name_len; + + if (uname) { + if (uname_len >= name_len + 1) { + if (copy_to_user(uname, btf->name, name_len + 1)) + return -EFAULT; + } else { + char zero = '\0'; + + if (copy_to_user(uname, btf->name, uname_len - 1)) + return -EFAULT; + if (put_user(zero, uname + uname_len - 1)) + return -EFAULT; + /* let user-space know about too short buffer */ + ret = -ENOSPC; + } + } + if (copy_to_user(uinfo, &info, info_copy) || put_user(info_copy, &uattr->info.info_len)) return -EFAULT; - return 0; + return ret; } int btf_get_fd_by_id(u32 id) diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 9879d6793e90..162999b12790 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -4466,6 +4466,9 @@ struct bpf_btf_info { __aligned_u64 btf; __u32 btf_size; __u32 id; + __aligned_u64 name; + __u32 name_len; + __u32 kernel_btf; } __attribute__((aligned(8))); struct bpf_link_info { -- cgit v1.2.3 From 36e68442d1afd4f720704ee1ea8486331507e834 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 9 Nov 2020 17:19:31 -0800 Subject: bpf: Load and verify kernel module BTFs Add kernel module listener that will load/validate and unload module BTF. 
Module BTFs gets ID generated for them, which makes it possible to iterate them with existing BTF iteration API. They are given their respective module's names, which will get reported through GET_OBJ_INFO API. They are also marked as in-kernel BTFs for tooling to distinguish them from user-provided BTFs. Also, similarly to vmlinux BTF, kernel module BTFs are exposed through sysfs as /sys/kernel/btf/. This is convenient for user-space tools to inspect module BTF contents and dump their types with existing tools: [vmuser@archvm bpf]$ ls -la /sys/kernel/btf total 0 drwxr-xr-x 2 root root 0 Nov 4 19:46 . drwxr-xr-x 13 root root 0 Nov 4 19:46 .. ... -r--r--r-- 1 root root 888 Nov 4 19:46 irqbypass -r--r--r-- 1 root root 100225 Nov 4 19:46 kvm -r--r--r-- 1 root root 35401 Nov 4 19:46 kvm_intel -r--r--r-- 1 root root 120 Nov 4 19:46 pcspkr -r--r--r-- 1 root root 399 Nov 4 19:46 serio_raw -r--r--r-- 1 root root 4094095 Nov 4 19:46 vmlinux Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Reviewed-by: Greg Kroah-Hartman Link: https://lore.kernel.org/bpf/20201110011932.3201430-5-andrii@kernel.org --- Documentation/ABI/testing/sysfs-kernel-btf | 8 ++ include/linux/bpf.h | 2 + include/linux/module.h | 4 + kernel/bpf/btf.c | 194 +++++++++++++++++++++++++++++ kernel/bpf/sysfs_btf.c | 2 +- kernel/module.c | 32 +++++ 6 files changed, 241 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/Documentation/ABI/testing/sysfs-kernel-btf b/Documentation/ABI/testing/sysfs-kernel-btf index 2c9744b2cd59..fe96efdc9b6c 100644 --- a/Documentation/ABI/testing/sysfs-kernel-btf +++ b/Documentation/ABI/testing/sysfs-kernel-btf @@ -15,3 +15,11 @@ Description: information with description of all internal kernel types. See Documentation/bpf/btf.rst for detailed description of format itself. + +What: /sys/kernel/btf/ +Date: Nov 2020 +KernelVersion: 5.11 +Contact: bpf@vger.kernel.org +Description: + Read-only binary attribute exposing kernel module's BTF type + information as an add-on to the kernel's BTF (/sys/kernel/btf/vmlinux). 
diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 73d5381a5d5c..581b2a2e78eb 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -36,9 +36,11 @@ struct seq_operations; struct bpf_iter_aux_info; struct bpf_local_storage; struct bpf_local_storage_map; +struct kobject; extern struct idr btf_idr; extern spinlock_t btf_idr_lock; +extern struct kobject *btf_kobj; typedef int (*bpf_iter_init_seq_priv_t)(void *private_data, struct bpf_iter_aux_info *aux); diff --git a/include/linux/module.h b/include/linux/module.h index a29187f7c360..20fce258ffba 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -475,6 +475,10 @@ struct module { unsigned int num_bpf_raw_events; struct bpf_raw_event_map *bpf_raw_events; #endif +#ifdef CONFIG_DEBUG_INFO_BTF_MODULES + unsigned int btf_data_size; + void *btf_data; +#endif #ifdef CONFIG_JUMP_LABEL struct jump_entry *jump_entries; unsigned int num_jump_entries; diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 856585db7aa7..0f1fd2669d69 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -22,6 +22,8 @@ #include #include #include +#include +#include #include /* BTF (BPF Type Format) is the meta data format which describes @@ -4476,6 +4478,75 @@ errout: return ERR_PTR(err); } +static struct btf *btf_parse_module(const char *module_name, const void *data, unsigned int data_size) +{ + struct btf_verifier_env *env = NULL; + struct bpf_verifier_log *log; + struct btf *btf = NULL, *base_btf; + int err; + + base_btf = bpf_get_btf_vmlinux(); + if (IS_ERR(base_btf)) + return base_btf; + if (!base_btf) + return ERR_PTR(-EINVAL); + + env = kzalloc(sizeof(*env), GFP_KERNEL | __GFP_NOWARN); + if (!env) + return ERR_PTR(-ENOMEM); + + log = &env->log; + log->level = BPF_LOG_KERNEL; + + btf = kzalloc(sizeof(*btf), GFP_KERNEL | __GFP_NOWARN); + if (!btf) { + err = -ENOMEM; + goto errout; + } + env->btf = btf; + + btf->base_btf = base_btf; + btf->start_id = base_btf->nr_types; + btf->start_str_off = base_btf->hdr.str_len; + btf->kernel_btf = true; + snprintf(btf->name, sizeof(btf->name), "%s", module_name); + + btf->data = kvmalloc(data_size, GFP_KERNEL | __GFP_NOWARN); + if (!btf->data) { + err = -ENOMEM; + goto errout; + } + memcpy(btf->data, data, data_size); + btf->data_size = data_size; + + err = btf_parse_hdr(env); + if (err) + goto errout; + + btf->nohdr_data = btf->data + btf->hdr.hdr_len; + + err = btf_parse_str_sec(env); + if (err) + goto errout; + + err = btf_check_all_metas(env); + if (err) + goto errout; + + btf_verifier_env_free(env); + refcount_set(&btf->refcnt, 1); + return btf; + +errout: + btf_verifier_env_free(env); + if (btf) { + kvfree(btf->data); + kvfree(btf->types); + kfree(btf); + } + return ERR_PTR(err); +} + struct btf *bpf_prog_get_target_btf(const struct bpf_prog *prog) { struct bpf_prog *tgt_prog = prog->aux->dst_prog; @@ -5651,3 +5722,126 @@ bool btf_id_set_contains(const struct btf_id_set *set, u32 id) { return bsearch(&id, set->ids, set->cnt, sizeof(u32), btf_id_cmp_func) != NULL; } + +#ifdef CONFIG_DEBUG_INFO_BTF_MODULES +struct btf_module { + struct list_head list; + struct module *module; + struct btf *btf; + struct bin_attribute *sysfs_attr; +}; + +static LIST_HEAD(btf_modules); +static DEFINE_MUTEX(btf_module_mutex); + +static ssize_t +btf_module_read(struct file *file, struct kobject *kobj, + struct bin_attribute *bin_attr, + char *buf, loff_t off, size_t len) +{ + const struct btf *btf = bin_attr->private; + + memcpy(buf, btf->data + off, len); + return len; +} + +static int btf_module_notify(struct 
notifier_block *nb, unsigned long op, + void *module) +{ + struct btf_module *btf_mod, *tmp; + struct module *mod = module; + struct btf *btf; + int err = 0; + + if (mod->btf_data_size == 0 || + (op != MODULE_STATE_COMING && op != MODULE_STATE_GOING)) + goto out; + + switch (op) { + case MODULE_STATE_COMING: + btf_mod = kzalloc(sizeof(*btf_mod), GFP_KERNEL); + if (!btf_mod) { + err = -ENOMEM; + goto out; + } + btf = btf_parse_module(mod->name, mod->btf_data, mod->btf_data_size); + if (IS_ERR(btf)) { + pr_warn("failed to validate module [%s] BTF: %ld\n", + mod->name, PTR_ERR(btf)); + kfree(btf_mod); + err = PTR_ERR(btf); + goto out; + } + err = btf_alloc_id(btf); + if (err) { + btf_free(btf); + kfree(btf_mod); + goto out; + } + + mutex_lock(&btf_module_mutex); + btf_mod->module = module; + btf_mod->btf = btf; + list_add(&btf_mod->list, &btf_modules); + mutex_unlock(&btf_module_mutex); + + if (IS_ENABLED(CONFIG_SYSFS)) { + struct bin_attribute *attr; + + attr = kzalloc(sizeof(*attr), GFP_KERNEL); + if (!attr) + goto out; + + sysfs_bin_attr_init(attr); + attr->attr.name = btf->name; + attr->attr.mode = 0444; + attr->size = btf->data_size; + attr->private = btf; + attr->read = btf_module_read; + + err = sysfs_create_bin_file(btf_kobj, attr); + if (err) { + pr_warn("failed to register module [%s] BTF in sysfs: %d\n", + mod->name, err); + kfree(attr); + err = 0; + goto out; + } + + btf_mod->sysfs_attr = attr; + } + + break; + case MODULE_STATE_GOING: + mutex_lock(&btf_module_mutex); + list_for_each_entry_safe(btf_mod, tmp, &btf_modules, list) { + if (btf_mod->module != module) + continue; + + list_del(&btf_mod->list); + if (btf_mod->sysfs_attr) + sysfs_remove_bin_file(btf_kobj, btf_mod->sysfs_attr); + btf_put(btf_mod->btf); + kfree(btf_mod->sysfs_attr); + kfree(btf_mod); + break; + } + mutex_unlock(&btf_module_mutex); + break; + } +out: + return notifier_from_errno(err); +} + +static struct notifier_block btf_module_nb = { + .notifier_call = btf_module_notify, +}; + +static int __init btf_module_init(void) +{ + register_module_notifier(&btf_module_nb); + return 0; +} + +fs_initcall(btf_module_init); +#endif /* CONFIG_DEBUG_INFO_BTF_MODULES */ diff --git a/kernel/bpf/sysfs_btf.c b/kernel/bpf/sysfs_btf.c index 11b3380887fa..ef6911aee3bb 100644 --- a/kernel/bpf/sysfs_btf.c +++ b/kernel/bpf/sysfs_btf.c @@ -26,7 +26,7 @@ static struct bin_attribute bin_attr_btf_vmlinux __ro_after_init = { .read = btf_vmlinux_read, }; -static struct kobject *btf_kobj; +struct kobject *btf_kobj; static int __init btf_vmlinux_init(void) { diff --git a/kernel/module.c b/kernel/module.c index a4fa44a652a7..f2996b02ab2e 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -380,6 +380,35 @@ static void *section_objs(const struct load_info *info, return (void *)info->sechdrs[sec].sh_addr; } +/* Find a module section: 0 means not found. Ignores SHF_ALLOC flag. */ +static unsigned int find_any_sec(const struct load_info *info, const char *name) +{ + unsigned int i; + + for (i = 1; i < info->hdr->e_shnum; i++) { + Elf_Shdr *shdr = &info->sechdrs[i]; + if (strcmp(info->secstrings + shdr->sh_name, name) == 0) + return i; + } + return 0; +} + +/* + * Find a module section, or NULL. Fill in number of "objects" in section. + * Ignores SHF_ALLOC flag. + */ +static __maybe_unused void *any_section_objs(const struct load_info *info, + const char *name, + size_t object_size, + unsigned int *num) +{ + unsigned int sec = find_any_sec(info, name); + + /* Section 0 has sh_addr 0 and sh_size 0. 
*/ + *num = info->sechdrs[sec].sh_size / object_size; + return (void *)info->sechdrs[sec].sh_addr; +} + /* Provided by the linker */ extern const struct kernel_symbol __start___ksymtab[]; extern const struct kernel_symbol __stop___ksymtab[]; @@ -3250,6 +3279,9 @@ static int find_module_sections(struct module *mod, struct load_info *info) sizeof(*mod->bpf_raw_events), &mod->num_bpf_raw_events); #endif +#ifdef CONFIG_DEBUG_INFO_BTF_MODULES + mod->btf_data = any_section_objs(info, ".BTF", 1, &mod->btf_data_size); +#endif #ifdef CONFIG_JUMP_LABEL mod->jump_entries = section_objs(info, "__jump_table", sizeof(*mod->jump_entries), -- cgit v1.2.3 From c583bcb8f5edd48c1798798e341f78afb9bf4f6f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 24 Sep 2020 15:11:55 -0700 Subject: rcu: Don't invoke try_invoke_on_locked_down_task() with irqs disabled The try_invoke_on_locked_down_task() function requires that interrupts be enabled, but it is called with interrupts disabled from rcu_print_task_stall(), resulting in an "IRQs not enabled as expected" diagnostic. This commit therefore updates rcu_print_task_stall() to accumulate a list of the first few tasks while holding the current leaf rcu_node structure's ->lock, then releases that lock and only then uses try_invoke_on_locked_down_task() to attempt to obtain per-task detailed information. Of course, as soon as ->lock is released, the task might exit, so the get_task_struct() function is used to prevent the task structure from going away in the meantime. Link: https://lore.kernel.org/lkml/000000000000903d5805ab908fc4@google.com/ Fixes: 5bef8da66a9c ("rcu: Add per-task state to RCU CPU stall warnings") Reported-by: syzbot+cb3b69ae80afd6535b0e@syzkaller.appspotmail.com Reported-by: syzbot+f04854e1c5c9e913cc27@syzkaller.appspotmail.com Tested-by: Sebastian Andrzej Siewior Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_stall.h | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index 0fde39b8daab..ca21d28a0f98 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -249,13 +249,16 @@ static bool check_slow_task(struct task_struct *t, void *arg) /* * Scan the current list of tasks blocked within RCU read-side critical - * sections, printing out the tid of each. + * sections, printing out the tid of each of the first few of them. 
*/ -static int rcu_print_task_stall(struct rcu_node *rnp) +static int rcu_print_task_stall(struct rcu_node *rnp, unsigned long flags) + __releases(rnp->lock) { + int i = 0; int ndetected = 0; struct rcu_stall_chk_rdr rscr; struct task_struct *t; + struct task_struct *ts[8]; if (!rcu_preempt_blocked_readers_cgp(rnp)) return 0; @@ -264,6 +267,14 @@ static int rcu_print_task_stall(struct rcu_node *rnp) t = list_entry(rnp->gp_tasks->prev, struct task_struct, rcu_node_entry); list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) { + get_task_struct(t); + ts[i++] = t; + if (i >= ARRAY_SIZE(ts)) + break; + } + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + for (i--; i; i--) { + t = ts[i]; if (!try_invoke_on_locked_down_task(t, check_slow_task, &rscr)) pr_cont(" P%d", t->pid); else @@ -273,6 +284,7 @@ static int rcu_print_task_stall(struct rcu_node *rnp) ".q"[rscr.rs.b.need_qs], ".e"[rscr.rs.b.exp_hint], ".l"[rscr.on_blkd_list]); + put_task_struct(t); ndetected++; } pr_cont("\n"); @@ -293,8 +305,9 @@ static void rcu_print_detail_task_stall_rnp(struct rcu_node *rnp) * Because preemptible RCU does not exist, we never have to check for * tasks blocked within RCU read-side critical sections. */ -static int rcu_print_task_stall(struct rcu_node *rnp) +static int rcu_print_task_stall(struct rcu_node *rnp, unsigned long flags) { + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); return 0; } #endif /* #else #ifdef CONFIG_PREEMPT_RCU */ @@ -472,7 +485,6 @@ static void print_other_cpu_stall(unsigned long gp_seq, unsigned long gps) pr_err("INFO: %s detected stalls on CPUs/tasks:\n", rcu_state.name); rcu_for_each_leaf_node(rnp) { raw_spin_lock_irqsave_rcu_node(rnp, flags); - ndetected += rcu_print_task_stall(rnp); if (rnp->qsmask != 0) { for_each_leaf_node_possible_cpu(rnp, cpu) if (rnp->qsmask & leaf_node_cpu_bit(rnp, cpu)) { @@ -480,7 +492,7 @@ static void print_other_cpu_stall(unsigned long gp_seq, unsigned long gps) ndetected++; } } - raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + ndetected += rcu_print_task_stall(rnp, flags); // Releases rnp->lock. } for_each_possible_cpu(cpu) -- cgit v1.2.3 From 60602cb549f1965a7edbc96026760dfb93911fab Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Wed, 28 Oct 2020 08:19:24 -0400 Subject: fgraph: Make overruns 4 bytes in graph stack structure Inspecting the data structures of the function graph tracer, I found that the overrun value is unsigned long, which is 8 bytes on a 64 bit machine, and not only that, the depth is an int (4 bytes). The overrun can be simply an unsigned int (4 bytes) and pack the ftrace_graph_ret structure better. The depth is moved up next to the func, as it is used more often with func, and improves cache locality. 
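As a worked example of the space saving (not taken from the patch itself; byte counts assume a 64-bit kernel), the packed structure now lays out as:

struct ftrace_graph_ret {
	unsigned long		func;		/* 8 bytes */
	int			depth;		/* 4 bytes, kept next to func */
	unsigned int		overrun;	/* 4 bytes, was an 8-byte unsigned long */
	unsigned long long	calltime;	/* 8 bytes */
	unsigned long long	rettime;	/* 8 bytes */
} __packed;

The old ordering needed 36 bytes (8 + 8 + 8 + 8 + 4) when packed; the new one needs 32, and the two 4-byte members now sit side by side so an unpacked build would not require padding between them either.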
Signed-off-by: Steven Rostedt (VMware) --- include/linux/ftrace.h | 4 ++-- kernel/trace/trace_entries.h | 4 ++-- kernel/trace/trace_functions_graph.c | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 806196345c3f..8dde9c17aaa5 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -864,11 +864,11 @@ struct ftrace_graph_ent { */ struct ftrace_graph_ret { unsigned long func; /* Current function */ + int depth; /* Number of functions that overran the depth limit for current task */ - unsigned long overrun; + unsigned int overrun; unsigned long long calltime; unsigned long long rettime; - int depth; } __packed; /* Type of the callback handlers for tracing function graph*/ diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h index 18c4a58aff79..ceafe2dc97e1 100644 --- a/kernel/trace/trace_entries.h +++ b/kernel/trace/trace_entries.h @@ -93,10 +93,10 @@ FTRACE_ENTRY_PACKED(funcgraph_exit, ftrace_graph_ret_entry, F_STRUCT( __field_struct( struct ftrace_graph_ret, ret ) __field_packed( unsigned long, ret, func ) - __field_packed( unsigned long, ret, overrun ) + __field_packed( int, ret, depth ) + __field_packed( unsigned int, ret, overrun ) __field_packed( unsigned long long, ret, calltime) __field_packed( unsigned long long, ret, rettime ) - __field_packed( int, ret, depth ) ), F_printk("<-- %ps (%d) (start: %llx end: %llx) over: %d", diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 60d66278aa0d..d874dec87131 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -957,7 +957,7 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s, /* Overrun */ if (flags & TRACE_GRAPH_PRINT_OVERRUN) - trace_seq_printf(s, " (Overruns: %lu)\n", + trace_seq_printf(s, " (Overruns: %u)\n", trace->overrun); print_graph_irq(iter, trace->func, TRACE_GRAPH_RET, -- cgit v1.2.3 From 28575c61ea602537a3d86fe301a53554e59452ae Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Mon, 2 Nov 2020 14:43:10 -0500 Subject: ring-buffer: Add recording of ring buffer recursion into recursed_functions Add a new config RING_BUFFER_RECORD_RECURSION that will place functions that recurse from the ring buffer into the ftrace recused_functions file. Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/Kconfig | 14 ++++++++++++++ kernel/trace/ring_buffer.c | 12 +++++++++++- 2 files changed, 25 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 9b11c096d139..6aa36ec73ccb 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -752,6 +752,20 @@ config FTRACE_RECORD_RECURSION_SIZE This file can be reset, but the limit can not change in size at runtime. +config RING_BUFFER_RECORD_RECURSION + bool "Record functions that recurse in the ring buffer" + depends on FTRACE_RECORD_RECURSION + # default y, because it is coupled with FTRACE_RECORD_RECURSION + default y + help + The ring buffer has its own internal recursion. Although when + recursion happens it wont cause harm because of the protection, + but it does cause an unwanted overhead. Enabling this option will + place where recursion was detected into the ftrace "recursed_functions" + file. + + This will add more overhead to cases that have recursion. 
+ config GCOV_PROFILE_FTRACE bool "Enable GCOV profiling on ftrace subsystem" depends on GCOV_KERNEL diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index dc83b3fa9fe7..ab68f28b8f4b 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -4,6 +4,7 @@ * * Copyright (C) 2008 Steven Rostedt */ +#include #include #include #include @@ -3006,6 +3007,13 @@ rb_wakeups(struct trace_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer) irq_work_queue(&cpu_buffer->irq_work.work); } +#ifdef CONFIG_RING_BUFFER_RECORD_RECURSION +# define do_ring_buffer_record_recursion() \ + do_ftrace_record_recursion(_THIS_IP_, _RET_IP_) +#else +# define do_ring_buffer_record_recursion() do { } while (0) +#endif + /* * The lock and unlock are done within a preempt disable section. * The current_context per_cpu variable can only be modified @@ -3088,8 +3096,10 @@ trace_recursive_lock(struct ring_buffer_per_cpu *cpu_buffer) * been updated yet. In this case, use the TRANSITION bit. */ bit = RB_CTX_TRANSITION; - if (val & (1 << (bit + cpu_buffer->nest))) + if (val & (1 << (bit + cpu_buffer->nest))) { + do_ring_buffer_record_recursion(); return 1; + } } val |= (1 << (bit + cpu_buffer->nest)); -- cgit v1.2.3 From 045e269c1eb2db5b5df9e034af617af8f4c1b35c Mon Sep 17 00:00:00 2001 From: Alex Shi Date: Fri, 6 Nov 2020 22:54:46 +0800 Subject: ftrace: Remove unused varible 'ret' MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 'ret' in 2 functions are not used. and one of them is a void function. So remove them to avoid gcc warning: kernel/trace/ftrace.c:4166:6: warning: variable ‘ret’ set but not used [-Wunused-but-set-variable] kernel/trace/ftrace.c:5571:6: warning: variable ‘ret’ set but not used [-Wunused-but-set-variable] Link: https://lkml.kernel.org/r/1604674486-52350-1-git-send-email-alex.shi@linux.alibaba.com Signed-off-by: Alex Shi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 03aad2b5cd5e..3db64fb0cce8 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -4162,7 +4162,6 @@ static void process_mod_list(struct list_head *head, struct ftrace_ops *ops, struct ftrace_hash **orig_hash, *new_hash; LIST_HEAD(process_mods); char *func; - int ret; mutex_lock(&ops->func_hash->regex_lock); @@ -4215,7 +4214,7 @@ static void process_mod_list(struct list_head *head, struct ftrace_ops *ops, mutex_lock(&ftrace_lock); - ret = ftrace_hash_move_and_update_ops(ops, orig_hash, + ftrace_hash_move_and_update_ops(ops, orig_hash, new_hash, enable); mutex_unlock(&ftrace_lock); @@ -5567,7 +5566,6 @@ int ftrace_regex_release(struct inode *inode, struct file *file) struct ftrace_hash **orig_hash; struct trace_parser *parser; int filter_hash; - int ret; if (file->f_mode & FMODE_READ) { iter = m->private; @@ -5595,7 +5593,7 @@ int ftrace_regex_release(struct inode *inode, struct file *file) orig_hash = &iter->ops->func_hash->notrace_hash; mutex_lock(&ftrace_lock); - ret = ftrace_hash_move_and_update_ops(iter->ops, orig_hash, + ftrace_hash_move_and_update_ops(iter->ops, orig_hash, iter->hash, filter_hash); mutex_unlock(&ftrace_lock); } else { -- cgit v1.2.3 From 2b5894cc33e9dea189a7010c7ed57d414786d174 Mon Sep 17 00:00:00 2001 From: Qiujun Huang Date: Thu, 29 Oct 2020 23:05:54 +0800 Subject: tracing: Fix some typos in comments s/detetector/detector/ s/enfoced/enforced/ s/writen/written/ 
s/actualy/actually/ s/bascially/basically/ s/Regarldess/Regardless/ s/zeroes/zeros/ s/followd/followed/ s/incrememented/incremented/ s/separatelly/separately/ s/accesible/accessible/ s/sythetic/synthetic/ s/enabed/enabled/ s/heurisitc/heuristic/ s/assocated/associated/ s/otherwides/otherwise/ s/specfied/specified/ s/seaching/searching/ s/hierachry/hierarchy/ s/internel/internal/ s/Thise/This/ Link: https://lkml.kernel.org/r/20201029150554.3354-1-hqjagain@gmail.com Signed-off-by: Qiujun Huang Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/blktrace.c | 4 ++-- kernel/trace/bpf_trace.c | 2 +- kernel/trace/trace.c | 2 +- kernel/trace/trace_benchmark.c | 6 +++--- kernel/trace/trace_dynevent.c | 2 +- kernel/trace/trace_dynevent.h | 6 +++--- kernel/trace/trace_entries.h | 2 +- kernel/trace/trace_events.c | 4 ++-- kernel/trace/trace_events_filter.c | 2 +- kernel/trace/trace_events_hist.c | 2 +- kernel/trace/trace_events_synth.c | 4 ++-- kernel/trace/trace_export.c | 2 +- kernel/trace/trace_hwlat.c | 4 ++-- kernel/trace/tracing_map.c | 6 +++--- kernel/trace/tracing_map.h | 2 +- 15 files changed, 25 insertions(+), 25 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index f1022945e346..1c3d0f57d763 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -1343,7 +1343,7 @@ static void blk_log_action(struct trace_iterator *iter, const char *act, * ones now use the 64bit ino as the whole ID and * no longer use generation. * - * Regarldess of the content, always output + * Regardless of the content, always output * "LOW32,HIGH32" so that FILEID_INO32_GEN fid can * be mapped back to @id on both 64 and 32bit ino * setups. See __kernfs_fh_to_dentry(). @@ -1385,7 +1385,7 @@ static void blk_log_dump_pdu(struct trace_seq *s, i == 0 ? "" : " ", pdu_buf[i]); /* - * stop when the rest is just zeroes and indicate so + * stop when the rest is just zeros and indicate so * with a ".." appended */ if (i == end && end != pdu_len - 1) { diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 4517c8b66518..f4172b870377 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -113,7 +113,7 @@ unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx) * Instead of moving rcu_read_lock/rcu_dereference/rcu_read_unlock * to all call sites, we did a bpf_prog_array_valid() there to check * whether call->prog_array is empty or not, which is - * a heurisitc to speed up execution. + * a heuristic to speed up execution. * * If bpf_prog_array_valid() fetched prog_array was * non-NULL, we go into trace_call_bpf() and do the actual diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 410cfeb16db5..6a282bbc7e7f 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3118,7 +3118,7 @@ struct trace_buffer_struct { static struct trace_buffer_struct *trace_percpu_buffer; /* - * Thise allows for lockless recording. If we're nested too deeply, then + * This allows for lockless recording. If we're nested too deeply, then * this returns NULL. */ static char *get_trace_buf(void) diff --git a/kernel/trace/trace_benchmark.c b/kernel/trace/trace_benchmark.c index 2e9a4746ea85..801c2a7f7605 100644 --- a/kernel/trace/trace_benchmark.c +++ b/kernel/trace/trace_benchmark.c @@ -31,7 +31,7 @@ static bool ok_to_run; * it simply writes "START". As the first write is cold cache and * the rest is hot, we save off that time in bm_first and it is * reported as "first", which is shown in the second write to the - * tracepoint. 
The "first" field is writen within the statics from + * tracepoint. The "first" field is written within the statics from * then on but never changes. */ static void trace_do_benchmark(void) @@ -112,7 +112,7 @@ static void trace_do_benchmark(void) int i = 0; /* * stddev is the square of standard deviation but - * we want the actualy number. Use the average + * we want the actually number. Use the average * as our seed to find the std. * * The next try is: @@ -155,7 +155,7 @@ static int benchmark_event_kthread(void *arg) /* * We don't go to sleep, but let others run as well. - * This is bascially a "yield()" to let any task that + * This is basically a "yield()" to let any task that * wants to run, schedule in, but if the CPU is idle, * we'll keep burning cycles. * diff --git a/kernel/trace/trace_dynevent.c b/kernel/trace/trace_dynevent.c index 5fa49cfd2bb6..4f967d5cd917 100644 --- a/kernel/trace/trace_dynevent.c +++ b/kernel/trace/trace_dynevent.c @@ -276,7 +276,7 @@ int dynevent_arg_add(struct dynevent_cmd *cmd, * arguments of the form 'type variable_name;' or 'x+y'. * * The lhs argument string will be appended to the current cmd string, - * followed by an operator, if applicable, followd by the rhs string, + * followed by an operator, if applicable, followed by the rhs string, * followed finally by a separator, if applicable. Before the * argument is added, the @check_arg function, if present, will be * used to check the sanity of the current arg strings. diff --git a/kernel/trace/trace_dynevent.h b/kernel/trace/trace_dynevent.h index d6857a254ede..d6f72dcb7269 100644 --- a/kernel/trace/trace_dynevent.h +++ b/kernel/trace/trace_dynevent.h @@ -29,10 +29,10 @@ struct dyn_event; * @show: Showing method. This is invoked when user reads the event definitions * via dynamic_events interface. * @is_busy: Check whether given event is busy so that it can not be deleted. - * Return true if it is busy, otherwides false. - * @free: Delete the given event. Return 0 if success, otherwides error. + * Return true if it is busy, otherwise false. + * @free: Delete the given event. Return 0 if success, otherwise error. * @match: Check whether given event and system name match this event. The argc - * and argv is used for exact match. Return true if it matches, otherwides + * and argv is used for exact match. Return true if it matches, otherwise * false. * * Except for @create, these methods are called under holding event_mutex. diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h index ceafe2dc97e1..4547ac59da61 100644 --- a/kernel/trace/trace_entries.h +++ b/kernel/trace/trace_entries.h @@ -32,7 +32,7 @@ * to be deciphered for the format file. Although these macros * may become out of sync with the internal structure, they * will create a compile error if it happens. Since the - * internel structures are just tracing helpers, this is not + * internal structures are just tracing helpers, this is not * an issue. * * When an internal structure is used, it should use: diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 244abbcd1db5..f4b459bb6d33 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -2436,7 +2436,7 @@ void trace_event_eval_update(struct trace_eval_map **map, int len) /* * Since calls are grouped by systems, the likelyhood that the * next call in the iteration belongs to the same system as the - * previous call is high. As an optimization, we skip seaching + * previous call is high. 
As an optimization, we skip searching * for a map[] that matches the call's system if the last call * was from the same system. That's what last_i is for. If the * call has the same system as the previous call, then last_i @@ -3271,7 +3271,7 @@ create_event_toplevel_files(struct dentry *parent, struct trace_array *tr) * * When a new instance is created, it needs to set up its events * directory, as well as other files associated with events. It also - * creates the event hierachry in the @parent/events directory. + * creates the event hierarchy in the @parent/events directory. * * Returns 0 on success. * diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 78a678eeb140..d0f515ac9b7c 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -1950,7 +1950,7 @@ static int __ftrace_function_set_filter(int filter, char *buf, int len, /* * The 'ip' field could have multiple filters set, separated * either by space or comma. We first cut the filter and apply - * all pieces separatelly. + * all pieces separately. */ re = ftrace_function_filter_re(buf, len, &re_cnt); if (!re) diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 96c3f86b81c5..39ebe1826fc3 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -3355,7 +3355,7 @@ trace_action_create_field_var(struct hist_trigger_data *hist_data, } else { field_var = NULL; /* - * If no explicit system.event is specfied, default to + * If no explicit system.event is specified, default to * looking for fields on the onmatch(system.event.xxx) * event. */ diff --git a/kernel/trace/trace_events_synth.c b/kernel/trace/trace_events_synth.c index 881df991742a..5a8bc0b421f1 100644 --- a/kernel/trace/trace_events_synth.c +++ b/kernel/trace/trace_events_synth.c @@ -1276,7 +1276,7 @@ static int __create_synth_event(int argc, const char *name, const char **argv) /** * synth_event_create - Create a new synthetic event - * @name: The name of the new sythetic event + * @name: The name of the new synthetic event * @fields: An array of type/name field descriptions * @n_fields: The number of field descriptions contained in the fields array * @mod: The module creating the event, NULL if not created from a module @@ -1446,7 +1446,7 @@ __synth_event_trace_init(struct trace_event_file *file, * this code to be called, etc). Because this is called * directly by the user, we don't have that but we still need * to honor not logging when disabled. For the iterated - * trace case, we save the enabed state upon start and just + * trace case, we save the enabled state upon start and just * ignore the following data calls. */ if (!(file->flags & EVENT_FILE_FL_ENABLED) || diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index 90f81d33fa3f..d960f6b11b5e 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c @@ -26,7 +26,7 @@ static int ftrace_event_register(struct trace_event_call *call, /* * The FTRACE_ENTRY_REG macro allows ftrace entry to define register - * function and thus become accesible via perf. + * function and thus become accessible via perf. 
*/ #undef FTRACE_ENTRY_REG #define FTRACE_ENTRY_REG(name, struct_name, id, tstruct, print, regfn) \ diff --git a/kernel/trace/trace_hwlat.c b/kernel/trace/trace_hwlat.c index c9ad5c6fbaad..d3ab2f4a77df 100644 --- a/kernel/trace/trace_hwlat.c +++ b/kernel/trace/trace_hwlat.c @@ -485,11 +485,11 @@ hwlat_width_write(struct file *filp, const char __user *ubuf, * @ppos: The current position in @file * * This function provides a write implementation for the "window" interface - * to the hardware latency detetector. The window is the total time + * to the hardware latency detector. The window is the total time * in us that will be considered one sample period. Conceptually, windows * occur back-to-back and contain a sample width period during which * actual sampling occurs. Can be used to write a new total window size. It - * is enfoced that any value written must be greater than the sample width + * is enforced that any value written must be greater than the sample width * size, or an error results. */ static ssize_t diff --git a/kernel/trace/tracing_map.c b/kernel/trace/tracing_map.c index 4b50fc0cb12c..d6bddb157ef2 100644 --- a/kernel/trace/tracing_map.c +++ b/kernel/trace/tracing_map.c @@ -609,7 +609,7 @@ __tracing_map_insert(struct tracing_map *map, void *key, bool lookup_only) * signal that state. There are two user-visible tracing_map * variables, 'hits' and 'drops', which are updated by this function. * Every time an element is either successfully inserted or retrieved, - * the 'hits' value is incrememented. Every time an element insertion + * the 'hits' value is incremented. Every time an element insertion * fails, the 'drops' value is incremented. * * This is a lock-free tracing map insertion function implementing a @@ -642,9 +642,9 @@ struct tracing_map_elt *tracing_map_insert(struct tracing_map *map, void *key) * tracing_map_elt. This is a lock-free lookup; see * tracing_map_insert() for details on tracing_map and how it works. * Every time an element is retrieved, the 'hits' value is - * incrememented. There is one user-visible tracing_map variable, + * incremented. There is one user-visible tracing_map variable, * 'hits', which is updated by this function. Every time an element - * is successfully retrieved, the 'hits' value is incrememented. The + * is successfully retrieved, the 'hits' value is incremented. The * 'drops' value is never updated by this function. * * Return: the tracing_map_elt pointer val associated with the key. diff --git a/kernel/trace/tracing_map.h b/kernel/trace/tracing_map.h index a6de61fc22de..2c765ee2a4d4 100644 --- a/kernel/trace/tracing_map.h +++ b/kernel/trace/tracing_map.h @@ -50,7 +50,7 @@ typedef int (*tracing_map_cmp_fn_t) (void *val_a, void *val_b); * an instance of tracing_map_elt, where 'elt' in the latter part of * that variable name is short for 'element'. The purpose of a * tracing_map_elt is to hold values specific to the particular - * 32-bit hashed key it's assocated with. Things such as the unique + * 32-bit hashed key it's associated with. Things such as the unique * set of aggregated sums associated with the 32-bit hashed key, along * with a copy of the full key associated with the entry, and which * was used to produce the 32-bit hashed key. -- cgit v1.2.3 From 7112d127984bd7b0c8ded7973b358829f16735f5 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 10 Nov 2020 20:06:45 -0800 Subject: bpf: Compile out btf_parse_module() if module BTF is not enabled Make sure btf_parse_module() is compiled out if module BTFs are not enabled. 
Fixes: 36e68442d1af ("bpf: Load and verify kernel module BTFs") Reported-by: Stephen Rothwell Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20201111040645.903494-1-andrii@kernel.org --- kernel/bpf/btf.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 0f1fd2669d69..6b2d508b33d4 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -4478,6 +4478,8 @@ errout: return ERR_PTR(err); } +#ifdef CONFIG_DEBUG_INFO_BTF_MODULES + static struct btf *btf_parse_module(const char *module_name, const void *data, unsigned int data_size) { struct btf_verifier_env *env = NULL; @@ -4547,6 +4549,8 @@ errout: return ERR_PTR(err); } +#endif /* CONFIG_DEBUG_INFO_BTF_MODULES */ + struct btf *bpf_prog_get_target_btf(const struct bpf_prog *prog) { struct bpf_prog *tgt_prog = prog->aux->dst_prog; -- cgit v1.2.3 From f16e631333a8f12ae8128826e695db4b2a528407 Mon Sep 17 00:00:00 2001 From: Kaixu Xia Date: Wed, 11 Nov 2020 13:03:46 +0800 Subject: bpf: Fix unsigned 'datasec_id' compared with zero in check_pseudo_btf_id The unsigned variable datasec_id is assigned a return value from the call to check_pseudo_btf_id(), which may return negative error code. This fixes the following coccicheck warning: ./kernel/bpf/verifier.c:9616:5-15: WARNING: Unsigned expression compared with zero: datasec_id > 0 Fixes: eaa6bcb71ef6 ("bpf: Introduce bpf_per_cpu_ptr()") Reported-by: Tosk Robot Signed-off-by: Kaixu Xia Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Acked-by: John Fastabend Cc: Hao Luo Link: https://lore.kernel.org/bpf/1605071026-25906-1-git-send-email-kaixuxia@tencent.com --- kernel/bpf/verifier.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 6200519582a6..6204ec705d80 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -9572,12 +9572,13 @@ static int check_pseudo_btf_id(struct bpf_verifier_env *env, struct bpf_insn *insn, struct bpf_insn_aux_data *aux) { - u32 datasec_id, type, id = insn->imm; const struct btf_var_secinfo *vsi; const struct btf_type *datasec; const struct btf_type *t; const char *sym_name; bool percpu = false; + u32 type, id = insn->imm; + s32 datasec_id; u64 addr; int i; -- cgit v1.2.3 From 584da076866f38ffb952efcc25af039f9551df81 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Tue, 10 Nov 2020 15:26:49 +0200 Subject: printk: ringbuffer: Reference text_data_ring directly in callees. A bunch of functions in the new ringbuffer code take both a printk_ringbuffer struct and a separate prb_data_ring. This is a relic from an earlier version of the code when a second data ring was present. Since this is no longer the case remove the extra function argument from: - data_make_reusable() - data_push_tail() - data_alloc() - data_realloc() Signed-off-by: Nikolay Borisov Reviewed-by: John Ogness Reviewed-by: Petr Mladek Signed-off-by: Petr Mladek --- kernel/printk/printk_ringbuffer.c | 32 +++++++++++++++----------------- 1 file changed, 15 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk_ringbuffer.c b/kernel/printk/printk_ringbuffer.c index 24a960a89aa8..3b24c3aa55f4 100644 --- a/kernel/printk/printk_ringbuffer.c +++ b/kernel/printk/printk_ringbuffer.c @@ -559,11 +559,12 @@ static void desc_make_reusable(struct prb_desc_ring *desc_ring, * on error the caller can re-load the tail lpos to determine the situation. 
*/ static bool data_make_reusable(struct printk_ringbuffer *rb, - struct prb_data_ring *data_ring, unsigned long lpos_begin, unsigned long lpos_end, unsigned long *lpos_out) { + + struct prb_data_ring *data_ring = &rb->text_data_ring; struct prb_desc_ring *desc_ring = &rb->desc_ring; struct prb_data_block *blk; enum desc_state d_state; @@ -625,10 +626,9 @@ static bool data_make_reusable(struct printk_ringbuffer *rb, * descriptors into the reusable state if the tail is pushed beyond * their associated data block. */ -static bool data_push_tail(struct printk_ringbuffer *rb, - struct prb_data_ring *data_ring, - unsigned long lpos) +static bool data_push_tail(struct printk_ringbuffer *rb, unsigned long lpos) { + struct prb_data_ring *data_ring = &rb->text_data_ring; unsigned long tail_lpos_new; unsigned long tail_lpos; unsigned long next_lpos; @@ -669,8 +669,7 @@ static bool data_push_tail(struct printk_ringbuffer *rb, * Make all descriptors reusable that are associated with * data blocks before @lpos. */ - if (!data_make_reusable(rb, data_ring, tail_lpos, lpos, - &next_lpos)) { + if (!data_make_reusable(rb, tail_lpos, lpos, &next_lpos)) { /* * 1. Guarantee the block ID loaded in * data_make_reusable() is performed before @@ -807,7 +806,7 @@ static bool desc_push_tail(struct printk_ringbuffer *rb, * data blocks once their associated descriptor is gone. */ - if (!data_push_tail(rb, &rb->text_data_ring, desc.text_blk_lpos.next)) + if (!data_push_tail(rb, desc.text_blk_lpos.next)) return false; /* @@ -1021,10 +1020,10 @@ static unsigned long get_next_lpos(struct prb_data_ring *data_ring, * if necessary. This function also associates the data block with * a specified descriptor. */ -static char *data_alloc(struct printk_ringbuffer *rb, - struct prb_data_ring *data_ring, unsigned int size, +static char *data_alloc(struct printk_ringbuffer *rb, unsigned int size, struct prb_data_blk_lpos *blk_lpos, unsigned long id) { + struct prb_data_ring *data_ring = &rb->text_data_ring; struct prb_data_block *blk; unsigned long begin_lpos; unsigned long next_lpos; @@ -1043,7 +1042,7 @@ static char *data_alloc(struct printk_ringbuffer *rb, do { next_lpos = get_next_lpos(data_ring, begin_lpos, size); - if (!data_push_tail(rb, data_ring, next_lpos - DATA_SIZE(data_ring))) { + if (!data_push_tail(rb, next_lpos - DATA_SIZE(data_ring))) { /* Failed to allocate, specify a data-less block. */ blk_lpos->begin = FAILED_LPOS; blk_lpos->next = FAILED_LPOS; @@ -1102,10 +1101,10 @@ static char *data_alloc(struct printk_ringbuffer *rb, * Return a pointer to the beginning of the entire data buffer or NULL on * failure. */ -static char *data_realloc(struct printk_ringbuffer *rb, - struct prb_data_ring *data_ring, unsigned int size, +static char *data_realloc(struct printk_ringbuffer *rb, unsigned int size, struct prb_data_blk_lpos *blk_lpos, unsigned long id) { + struct prb_data_ring *data_ring = &rb->text_data_ring; struct prb_data_block *blk; unsigned long head_lpos; unsigned long next_lpos; @@ -1132,7 +1131,7 @@ static char *data_realloc(struct printk_ringbuffer *rb, return &blk->data[0]; } - if (!data_push_tail(rb, data_ring, next_lpos - DATA_SIZE(data_ring))) + if (!data_push_tail(rb, next_lpos - DATA_SIZE(data_ring))) return NULL; /* The memory barrier involvement is the same as data_alloc:A. 
*/ @@ -1397,7 +1396,7 @@ bool prb_reserve_in_last(struct prb_reserved_entry *e, struct printk_ringbuffer if (r->text_buf_size > max_size) goto fail; - r->text_buf = data_alloc(rb, &rb->text_data_ring, r->text_buf_size, + r->text_buf = data_alloc(rb, r->text_buf_size, &d->text_blk_lpos, id); } else { if (!get_data(&rb->text_data_ring, &d->text_blk_lpos, &data_size)) @@ -1421,7 +1420,7 @@ bool prb_reserve_in_last(struct prb_reserved_entry *e, struct printk_ringbuffer if (r->text_buf_size > max_size) goto fail; - r->text_buf = data_realloc(rb, &rb->text_data_ring, r->text_buf_size, + r->text_buf = data_realloc(rb, r->text_buf_size, &d->text_blk_lpos, id); } if (r->text_buf_size && !r->text_buf) @@ -1549,8 +1548,7 @@ bool prb_reserve(struct prb_reserved_entry *e, struct printk_ringbuffer *rb, if (info->seq > 0) desc_make_final(desc_ring, DESC_ID(id - 1)); - r->text_buf = data_alloc(rb, &rb->text_data_ring, r->text_buf_size, - &d->text_blk_lpos, id); + r->text_buf = data_alloc(rb, r->text_buf_size, &d->text_blk_lpos, id); /* If text data allocation fails, a data-less record is committed. */ if (r->text_buf_size && !r->text_buf) { prb_commit(e); -- cgit v1.2.3 From 09a3dac7b579e57e7ef2d875b9216c845ae8a0e5 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 11 Nov 2020 16:19:19 -0800 Subject: bpf: Fix NULL dereference in bpf_task_storage In bpf_pid_task_storage_update_elem(), it missed to test the !task_storage_ptr(task) which then could trigger a NULL pointer exception in bpf_local_storage_update(). Fixes: 4cf1bc1f1045 ("bpf: Implement task local storage") Signed-off-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov Tested-by: Roman Gushchin Acked-by: KP Singh Link: https://lore.kernel.org/bpf/20201112001919.2028357-1-kafai@fb.com --- kernel/bpf/bpf_task_storage.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/bpf_task_storage.c b/kernel/bpf/bpf_task_storage.c index 39a45fba4fb0..4ef1959a78f2 100644 --- a/kernel/bpf/bpf_task_storage.c +++ b/kernel/bpf/bpf_task_storage.c @@ -150,7 +150,7 @@ static int bpf_pid_task_storage_update_elem(struct bpf_map *map, void *key, */ WARN_ON_ONCE(!rcu_read_lock_held()); task = pid_task(pid, PIDTYPE_PID); - if (!task) { + if (!task || !task_storage_ptr(task)) { err = -ENOENT; goto out; } -- cgit v1.2.3 From 6d94e741a8ff818e5518da8257f5ca0aaed1f269 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 10 Nov 2020 19:12:11 -0800 Subject: bpf: Support for pointers beyond pkt_end. This patch adds the verifier support to recognize inlined branch conditions. The LLVM knows that the branch evaluates to the same value, but the verifier couldn't track it. Hence causing valid programs to be rejected. The potential LLVM workaround: https://reviews.llvm.org/D87428 can have undesired side effects, since LLVM doesn't know that skb->data/data_end are being compared. LLVM has to introduce extra boolean variable and use inline_asm trick to force easier for the verifier assembly. Instead teach the verifier to recognize that r1 = skb->data; r1 += 10; r2 = skb->data_end; if (r1 > r2) { here r1 points beyond packet_end and subsequent if (r1 > r2) // always evaluates to "true". 
} Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Tested-by: Jiri Olsa Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20201111031213.25109-2-alexei.starovoitov@gmail.com --- include/linux/bpf_verifier.h | 2 +- kernel/bpf/verifier.c | 129 +++++++++++++++++++++++++++++++++++-------- 2 files changed, 108 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index e83ef6f6bf43..306869d4743b 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -45,7 +45,7 @@ struct bpf_reg_state { enum bpf_reg_type type; union { /* valid when type == PTR_TO_PACKET */ - u16 range; + int range; /* valid when type == CONST_PTR_TO_MAP | PTR_TO_MAP_VALUE | * PTR_TO_MAP_VALUE_OR_NULL diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 10da26e55130..7b1f85aa9741 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2739,7 +2739,9 @@ static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off, regno); return -EACCES; } - err = __check_mem_access(env, regno, off, size, reg->range, + + err = reg->range < 0 ? -EINVAL : + __check_mem_access(env, regno, off, size, reg->range, zero_size_allowed); if (err) { verbose(env, "R%d offset is outside of the packet\n", regno); @@ -4697,6 +4699,32 @@ static void clear_all_pkt_pointers(struct bpf_verifier_env *env) __clear_all_pkt_pointers(env, vstate->frame[i]); } +enum { + AT_PKT_END = -1, + BEYOND_PKT_END = -2, +}; + +static void mark_pkt_end(struct bpf_verifier_state *vstate, int regn, bool range_open) +{ + struct bpf_func_state *state = vstate->frame[vstate->curframe]; + struct bpf_reg_state *reg = &state->regs[regn]; + + if (reg->type != PTR_TO_PACKET) + /* PTR_TO_PACKET_META is not supported yet */ + return; + + /* The 'reg' is pkt > pkt_end or pkt >= pkt_end. + * How far beyond pkt_end it goes is unknown. + * if (!range_open) it's the case of pkt >= pkt_end + * if (range_open) it's the case of pkt > pkt_end + * hence this pointer is at least 1 byte bigger than pkt_end + */ + if (range_open) + reg->range = BEYOND_PKT_END; + else + reg->range = AT_PKT_END; +} + static void release_reg_references(struct bpf_verifier_env *env, struct bpf_func_state *state, int ref_obj_id) @@ -6708,7 +6736,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) static void __find_good_pkt_pointers(struct bpf_func_state *state, struct bpf_reg_state *dst_reg, - enum bpf_reg_type type, u16 new_range) + enum bpf_reg_type type, int new_range) { struct bpf_reg_state *reg; int i; @@ -6733,8 +6761,7 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *vstate, enum bpf_reg_type type, bool range_right_open) { - u16 new_range; - int i; + int new_range, i; if (dst_reg->off < 0 || (dst_reg->off == 0 && range_right_open)) @@ -6985,6 +7012,67 @@ static int is_branch_taken(struct bpf_reg_state *reg, u64 val, u8 opcode, return is_branch64_taken(reg, val, opcode); } +static int flip_opcode(u32 opcode) +{ + /* How can we transform "a b" into "b a"? 
*/ + static const u8 opcode_flip[16] = { + /* these stay the same */ + [BPF_JEQ >> 4] = BPF_JEQ, + [BPF_JNE >> 4] = BPF_JNE, + [BPF_JSET >> 4] = BPF_JSET, + /* these swap "lesser" and "greater" (L and G in the opcodes) */ + [BPF_JGE >> 4] = BPF_JLE, + [BPF_JGT >> 4] = BPF_JLT, + [BPF_JLE >> 4] = BPF_JGE, + [BPF_JLT >> 4] = BPF_JGT, + [BPF_JSGE >> 4] = BPF_JSLE, + [BPF_JSGT >> 4] = BPF_JSLT, + [BPF_JSLE >> 4] = BPF_JSGE, + [BPF_JSLT >> 4] = BPF_JSGT + }; + return opcode_flip[opcode >> 4]; +} + +static int is_pkt_ptr_branch_taken(struct bpf_reg_state *dst_reg, + struct bpf_reg_state *src_reg, + u8 opcode) +{ + struct bpf_reg_state *pkt; + + if (src_reg->type == PTR_TO_PACKET_END) { + pkt = dst_reg; + } else if (dst_reg->type == PTR_TO_PACKET_END) { + pkt = src_reg; + opcode = flip_opcode(opcode); + } else { + return -1; + } + + if (pkt->range >= 0) + return -1; + + switch (opcode) { + case BPF_JLE: + /* pkt <= pkt_end */ + fallthrough; + case BPF_JGT: + /* pkt > pkt_end */ + if (pkt->range == BEYOND_PKT_END) + /* pkt has at last one extra byte beyond pkt_end */ + return opcode == BPF_JGT; + break; + case BPF_JLT: + /* pkt < pkt_end */ + fallthrough; + case BPF_JGE: + /* pkt >= pkt_end */ + if (pkt->range == BEYOND_PKT_END || pkt->range == AT_PKT_END) + return opcode == BPF_JGE; + break; + } + return -1; +} + /* Adjusts the register min/max values in the case that the dst_reg is the * variable register that we are working on, and src_reg is a constant or we're * simply doing a BPF_K check. @@ -7148,23 +7236,7 @@ static void reg_set_min_max_inv(struct bpf_reg_state *true_reg, u64 val, u32 val32, u8 opcode, bool is_jmp32) { - /* How can we transform "a b" into "b a"? */ - static const u8 opcode_flip[16] = { - /* these stay the same */ - [BPF_JEQ >> 4] = BPF_JEQ, - [BPF_JNE >> 4] = BPF_JNE, - [BPF_JSET >> 4] = BPF_JSET, - /* these swap "lesser" and "greater" (L and G in the opcodes) */ - [BPF_JGE >> 4] = BPF_JLE, - [BPF_JGT >> 4] = BPF_JLT, - [BPF_JLE >> 4] = BPF_JGE, - [BPF_JLT >> 4] = BPF_JGT, - [BPF_JSGE >> 4] = BPF_JSLE, - [BPF_JSGT >> 4] = BPF_JSLT, - [BPF_JSLE >> 4] = BPF_JSGE, - [BPF_JSLT >> 4] = BPF_JSGT - }; - opcode = opcode_flip[opcode >> 4]; + opcode = flip_opcode(opcode); /* This uses zero as "not present in table"; luckily the zero opcode, * BPF_JA, can't get here. 
*/ @@ -7346,6 +7418,7 @@ static bool try_match_pkt_pointers(const struct bpf_insn *insn, /* pkt_data' > pkt_end, pkt_meta' > pkt_data */ find_good_pkt_pointers(this_branch, dst_reg, dst_reg->type, false); + mark_pkt_end(other_branch, insn->dst_reg, true); } else if ((dst_reg->type == PTR_TO_PACKET_END && src_reg->type == PTR_TO_PACKET) || (reg_is_init_pkt_pointer(dst_reg, PTR_TO_PACKET) && @@ -7353,6 +7426,7 @@ static bool try_match_pkt_pointers(const struct bpf_insn *insn, /* pkt_end > pkt_data', pkt_data > pkt_meta' */ find_good_pkt_pointers(other_branch, src_reg, src_reg->type, true); + mark_pkt_end(this_branch, insn->src_reg, false); } else { return false; } @@ -7365,6 +7439,7 @@ static bool try_match_pkt_pointers(const struct bpf_insn *insn, /* pkt_data' < pkt_end, pkt_meta' < pkt_data */ find_good_pkt_pointers(other_branch, dst_reg, dst_reg->type, true); + mark_pkt_end(this_branch, insn->dst_reg, false); } else if ((dst_reg->type == PTR_TO_PACKET_END && src_reg->type == PTR_TO_PACKET) || (reg_is_init_pkt_pointer(dst_reg, PTR_TO_PACKET) && @@ -7372,6 +7447,7 @@ static bool try_match_pkt_pointers(const struct bpf_insn *insn, /* pkt_end < pkt_data', pkt_data > pkt_meta' */ find_good_pkt_pointers(this_branch, src_reg, src_reg->type, false); + mark_pkt_end(other_branch, insn->src_reg, true); } else { return false; } @@ -7384,6 +7460,7 @@ static bool try_match_pkt_pointers(const struct bpf_insn *insn, /* pkt_data' >= pkt_end, pkt_meta' >= pkt_data */ find_good_pkt_pointers(this_branch, dst_reg, dst_reg->type, true); + mark_pkt_end(other_branch, insn->dst_reg, false); } else if ((dst_reg->type == PTR_TO_PACKET_END && src_reg->type == PTR_TO_PACKET) || (reg_is_init_pkt_pointer(dst_reg, PTR_TO_PACKET) && @@ -7391,6 +7468,7 @@ static bool try_match_pkt_pointers(const struct bpf_insn *insn, /* pkt_end >= pkt_data', pkt_data >= pkt_meta' */ find_good_pkt_pointers(other_branch, src_reg, src_reg->type, false); + mark_pkt_end(this_branch, insn->src_reg, true); } else { return false; } @@ -7403,6 +7481,7 @@ static bool try_match_pkt_pointers(const struct bpf_insn *insn, /* pkt_data' <= pkt_end, pkt_meta' <= pkt_data */ find_good_pkt_pointers(other_branch, dst_reg, dst_reg->type, false); + mark_pkt_end(this_branch, insn->dst_reg, true); } else if ((dst_reg->type == PTR_TO_PACKET_END && src_reg->type == PTR_TO_PACKET) || (reg_is_init_pkt_pointer(dst_reg, PTR_TO_PACKET) && @@ -7410,6 +7489,7 @@ static bool try_match_pkt_pointers(const struct bpf_insn *insn, /* pkt_end <= pkt_data', pkt_data <= pkt_meta' */ find_good_pkt_pointers(this_branch, src_reg, src_reg->type, true); + mark_pkt_end(other_branch, insn->src_reg, false); } else { return false; } @@ -7509,6 +7589,10 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, src_reg->var_off.value, opcode, is_jmp32); + } else if (reg_is_pkt_pointer_any(dst_reg) && + reg_is_pkt_pointer_any(src_reg) && + !is_jmp32) { + pred = is_pkt_ptr_branch_taken(dst_reg, src_reg, opcode); } if (pred >= 0) { @@ -7517,7 +7601,8 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, */ if (!__is_pointer_value(false, dst_reg)) err = mark_chain_precision(env, insn->dst_reg); - if (BPF_SRC(insn->code) == BPF_X && !err) + if (BPF_SRC(insn->code) == BPF_X && !err && + !__is_pointer_value(false, src_reg)) err = mark_chain_precision(env, insn->src_reg); if (err) return err; -- cgit v1.2.3 From 8e4597c627fb48f361e2a5b012202cb1b6cbcd5e Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Thu, 12 Nov 2020 13:13:13 -0800 Subject: bpf: Allow using bpf_sk_storage in 
FENTRY/FEXIT/RAW_TP This patch enables the FENTRY/FEXIT/RAW_TP tracing program to use the bpf_sk_storage_(get|delete) helper, so those tracing programs can access the sk's bpf_local_storage and the later selftest will show some examples. The bpf_sk_storage is currently used in bpf-tcp-cc, tc, cg sockops...etc which is running either in softirq or task context. This patch adds bpf_sk_storage_get_tracing_proto and bpf_sk_storage_delete_tracing_proto. They will check in runtime that the helpers can only be called when serving softirq or running in a task context. That should enable most common tracing use cases on sk. During the load time, the new tracing_allowed() function will ensure the tracing prog using the bpf_sk_storage_(get|delete) helper is not tracing any bpf_sk_storage*() function itself. The sk is passed as "void *" when calling into bpf_local_storage. This patch only allows tracing a kernel function. Signed-off-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20201112211313.2587383-1-kafai@fb.com --- include/net/bpf_sk_storage.h | 2 ++ kernel/trace/bpf_trace.c | 5 +++ net/core/bpf_sk_storage.c | 74 ++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 81 insertions(+) (limited to 'kernel') diff --git a/include/net/bpf_sk_storage.h b/include/net/bpf_sk_storage.h index 3c516dd07caf..0e85713f56df 100644 --- a/include/net/bpf_sk_storage.h +++ b/include/net/bpf_sk_storage.h @@ -20,6 +20,8 @@ void bpf_sk_storage_free(struct sock *sk); extern const struct bpf_func_proto bpf_sk_storage_get_proto; extern const struct bpf_func_proto bpf_sk_storage_delete_proto; +extern const struct bpf_func_proto bpf_sk_storage_get_tracing_proto; +extern const struct bpf_func_proto bpf_sk_storage_delete_tracing_proto; struct bpf_local_storage_elem; struct bpf_sk_storage_diag; diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index e4515b0f62a8..cfce60ad1cb5 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include @@ -1735,6 +1736,10 @@ tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_skc_to_tcp_request_sock_proto; case BPF_FUNC_skc_to_udp6_sock: return &bpf_skc_to_udp6_sock_proto; + case BPF_FUNC_sk_storage_get: + return &bpf_sk_storage_get_tracing_proto; + case BPF_FUNC_sk_storage_delete: + return &bpf_sk_storage_delete_tracing_proto; #endif case BPF_FUNC_seq_printf: return prog->expected_attach_type == BPF_TRACE_ITER ? diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c index fd416678f236..359908a7d3c1 100644 --- a/net/core/bpf_sk_storage.c +++ b/net/core/bpf_sk_storage.c @@ -6,6 +6,7 @@ #include #include #include +#include #include #include #include @@ -378,6 +379,79 @@ const struct bpf_func_proto bpf_sk_storage_delete_proto = { .arg2_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, }; +static bool bpf_sk_storage_tracing_allowed(const struct bpf_prog *prog) +{ + const struct btf *btf_vmlinux; + const struct btf_type *t; + const char *tname; + u32 btf_id; + + if (prog->aux->dst_prog) + return false; + + /* Ensure the tracing program is not tracing + * any bpf_sk_storage*() function and also + * use the bpf_sk_storage_(get|delete) helper. 
+ */ + switch (prog->expected_attach_type) { + case BPF_TRACE_RAW_TP: + /* bpf_sk_storage has no trace point */ + return true; + case BPF_TRACE_FENTRY: + case BPF_TRACE_FEXIT: + btf_vmlinux = bpf_get_btf_vmlinux(); + btf_id = prog->aux->attach_btf_id; + t = btf_type_by_id(btf_vmlinux, btf_id); + tname = btf_name_by_offset(btf_vmlinux, t->name_off); + return !!strncmp(tname, "bpf_sk_storage", + strlen("bpf_sk_storage")); + default: + return false; + } + + return false; +} + +BPF_CALL_4(bpf_sk_storage_get_tracing, struct bpf_map *, map, struct sock *, sk, + void *, value, u64, flags) +{ + if (!in_serving_softirq() && !in_task()) + return (unsigned long)NULL; + + return (unsigned long)____bpf_sk_storage_get(map, sk, value, flags); +} + +BPF_CALL_2(bpf_sk_storage_delete_tracing, struct bpf_map *, map, + struct sock *, sk) +{ + if (!in_serving_softirq() && !in_task()) + return -EPERM; + + return ____bpf_sk_storage_delete(map, sk); +} + +const struct bpf_func_proto bpf_sk_storage_get_tracing_proto = { + .func = bpf_sk_storage_get_tracing, + .gpl_only = false, + .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_PTR_TO_BTF_ID, + .arg2_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON], + .arg3_type = ARG_PTR_TO_MAP_VALUE_OR_NULL, + .arg4_type = ARG_ANYTHING, + .allowed = bpf_sk_storage_tracing_allowed, +}; + +const struct bpf_func_proto bpf_sk_storage_delete_tracing_proto = { + .func = bpf_sk_storage_delete_tracing, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_PTR_TO_BTF_ID, + .arg2_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_SOCK_COMMON], + .allowed = bpf_sk_storage_tracing_allowed, +}; + struct bpf_sk_storage_diag { u32 nr_maps; struct bpf_map *maps[]; -- cgit v1.2.3 From 423f16108c9d832bd96059d5c882c8ef6d76eb96 Mon Sep 17 00:00:00 2001 From: KP Singh Date: Fri, 13 Nov 2020 00:59:29 +0000 Subject: bpf: Augment the set of sleepable LSM hooks Update the set of sleepable hooks with the ones that do not trigger a warning with might_fault() when exercised with the correct kernel config options enabled, i.e. DEBUG_ATOMIC_SLEEP=y LOCKDEP=y PROVE_LOCKING=y This means that a sleepable LSM eBPF program can be attached to these LSM hooks. 
A new helper method bpf_lsm_is_sleepable_hook is added and the set is maintained locally in bpf_lsm.c Signed-off-by: KP Singh Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20201113005930.541956-2-kpsingh@chromium.org --- include/linux/bpf_lsm.h | 7 +++++ kernel/bpf/bpf_lsm.c | 81 +++++++++++++++++++++++++++++++++++++++++++++++++ kernel/bpf/verifier.c | 16 +--------- 3 files changed, 89 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf_lsm.h b/include/linux/bpf_lsm.h index 73226181b744..0d1c33ace398 100644 --- a/include/linux/bpf_lsm.h +++ b/include/linux/bpf_lsm.h @@ -27,6 +27,8 @@ extern struct lsm_blob_sizes bpf_lsm_blob_sizes; int bpf_lsm_verify_prog(struct bpf_verifier_log *vlog, const struct bpf_prog *prog); +bool bpf_lsm_is_sleepable_hook(u32 btf_id); + static inline struct bpf_storage_blob *bpf_inode( const struct inode *inode) { @@ -54,6 +56,11 @@ void bpf_task_storage_free(struct task_struct *task); #else /* !CONFIG_BPF_LSM */ +static inline bool bpf_lsm_is_sleepable_hook(u32 btf_id) +{ + return false; +} + static inline int bpf_lsm_verify_prog(struct bpf_verifier_log *vlog, const struct bpf_prog *prog) { diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c index e92c51bebb47..aed74b853415 100644 --- a/kernel/bpf/bpf_lsm.c +++ b/kernel/bpf/bpf_lsm.c @@ -13,6 +13,7 @@ #include #include #include +#include /* For every LSM hook that allows attachment of BPF programs, declare a nop * function where a BPF program can be attached. @@ -72,6 +73,86 @@ bpf_lsm_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) } } +/* The set of hooks which are called without pagefaults disabled and are allowed + * to "sleep" and thus can be used for sleeable BPF programs. + */ +BTF_SET_START(sleepable_lsm_hooks) +BTF_ID(func, bpf_lsm_bpf) +BTF_ID(func, bpf_lsm_bpf_map) +BTF_ID(func, bpf_lsm_bpf_map_alloc_security) +BTF_ID(func, bpf_lsm_bpf_map_free_security) +BTF_ID(func, bpf_lsm_bpf_prog) +BTF_ID(func, bpf_lsm_bprm_check_security) +BTF_ID(func, bpf_lsm_bprm_committed_creds) +BTF_ID(func, bpf_lsm_bprm_committing_creds) +BTF_ID(func, bpf_lsm_bprm_creds_for_exec) +BTF_ID(func, bpf_lsm_bprm_creds_from_file) +BTF_ID(func, bpf_lsm_capget) +BTF_ID(func, bpf_lsm_capset) +BTF_ID(func, bpf_lsm_cred_prepare) +BTF_ID(func, bpf_lsm_file_ioctl) +BTF_ID(func, bpf_lsm_file_lock) +BTF_ID(func, bpf_lsm_file_open) +BTF_ID(func, bpf_lsm_file_receive) +BTF_ID(func, bpf_lsm_inet_conn_established) +BTF_ID(func, bpf_lsm_inode_create) +BTF_ID(func, bpf_lsm_inode_free_security) +BTF_ID(func, bpf_lsm_inode_getattr) +BTF_ID(func, bpf_lsm_inode_getxattr) +BTF_ID(func, bpf_lsm_inode_mknod) +BTF_ID(func, bpf_lsm_inode_need_killpriv) +BTF_ID(func, bpf_lsm_inode_post_setxattr) +BTF_ID(func, bpf_lsm_inode_readlink) +BTF_ID(func, bpf_lsm_inode_rename) +BTF_ID(func, bpf_lsm_inode_rmdir) +BTF_ID(func, bpf_lsm_inode_setattr) +BTF_ID(func, bpf_lsm_inode_setxattr) +BTF_ID(func, bpf_lsm_inode_symlink) +BTF_ID(func, bpf_lsm_inode_unlink) +BTF_ID(func, bpf_lsm_kernel_module_request) +BTF_ID(func, bpf_lsm_kernfs_init_security) +BTF_ID(func, bpf_lsm_key_free) +BTF_ID(func, bpf_lsm_mmap_file) +BTF_ID(func, bpf_lsm_netlink_send) +BTF_ID(func, bpf_lsm_path_notify) +BTF_ID(func, bpf_lsm_release_secctx) +BTF_ID(func, bpf_lsm_sb_alloc_security) +BTF_ID(func, bpf_lsm_sb_eat_lsm_opts) +BTF_ID(func, bpf_lsm_sb_kern_mount) +BTF_ID(func, bpf_lsm_sb_mount) +BTF_ID(func, bpf_lsm_sb_remount) +BTF_ID(func, bpf_lsm_sb_set_mnt_opts) +BTF_ID(func, bpf_lsm_sb_show_options) +BTF_ID(func, 
bpf_lsm_sb_statfs) +BTF_ID(func, bpf_lsm_sb_umount) +BTF_ID(func, bpf_lsm_settime) +BTF_ID(func, bpf_lsm_socket_accept) +BTF_ID(func, bpf_lsm_socket_bind) +BTF_ID(func, bpf_lsm_socket_connect) +BTF_ID(func, bpf_lsm_socket_create) +BTF_ID(func, bpf_lsm_socket_getpeername) +BTF_ID(func, bpf_lsm_socket_getpeersec_dgram) +BTF_ID(func, bpf_lsm_socket_getsockname) +BTF_ID(func, bpf_lsm_socket_getsockopt) +BTF_ID(func, bpf_lsm_socket_listen) +BTF_ID(func, bpf_lsm_socket_post_create) +BTF_ID(func, bpf_lsm_socket_recvmsg) +BTF_ID(func, bpf_lsm_socket_sendmsg) +BTF_ID(func, bpf_lsm_socket_shutdown) +BTF_ID(func, bpf_lsm_socket_socketpair) +BTF_ID(func, bpf_lsm_syslog) +BTF_ID(func, bpf_lsm_task_alloc) +BTF_ID(func, bpf_lsm_task_getsecid) +BTF_ID(func, bpf_lsm_task_prctl) +BTF_ID(func, bpf_lsm_task_setscheduler) +BTF_ID(func, bpf_lsm_task_to_inode) +BTF_SET_END(sleepable_lsm_hooks) + +bool bpf_lsm_is_sleepable_hook(u32 btf_id) +{ + return btf_id_set_contains(&sleepable_lsm_hooks, btf_id); +} + const struct bpf_prog_ops lsm_prog_ops = { }; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 7b1f85aa9741..fb2943ea715d 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -11562,20 +11562,6 @@ static int check_attach_modify_return(unsigned long addr, const char *func_name) return -EINVAL; } -/* non exhaustive list of sleepable bpf_lsm_*() functions */ -BTF_SET_START(btf_sleepable_lsm_hooks) -#ifdef CONFIG_BPF_LSM -BTF_ID(func, bpf_lsm_bprm_committed_creds) -#else -BTF_ID_UNUSED -#endif -BTF_SET_END(btf_sleepable_lsm_hooks) - -static int check_sleepable_lsm_hook(u32 btf_id) -{ - return btf_id_set_contains(&btf_sleepable_lsm_hooks, btf_id); -} - /* list of non-sleepable functions that are otherwise on * ALLOW_ERROR_INJECTION list */ @@ -11797,7 +11783,7 @@ int bpf_check_attach_target(struct bpf_verifier_log *log, /* LSM progs check that they are attached to bpf_lsm_*() funcs. * Only some of them are sleepable. */ - if (check_sleepable_lsm_hook(btf_id)) + if (bpf_lsm_is_sleepable_hook(btf_id)) ret = 0; break; default: -- cgit v1.2.3 From 6f100640ca5b2a2ff67b001c9fd3de21f7b12cf2 Mon Sep 17 00:00:00 2001 From: KP Singh Date: Fri, 13 Nov 2020 00:59:30 +0000 Subject: bpf: Expose bpf_d_path helper to sleepable LSM hooks Sleepable hooks are never called from an NMI/interrupt context, so it is safe to use the bpf_d_path helper in LSM programs attaching to these hooks. The helper is not restricted to sleepable programs and merely uses the list of sleepable hooks as the initial subset of LSM hooks where it can be used. 
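As a rough sketch of what this enables (not part of the patch; the program and buffer names are made up), a sleepable LSM program attached to one of the hooks from the previous patch can now resolve a path directly:

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

char _license[] SEC("license") = "GPL";

/* "lsm.s/" marks the program sleepable; file_open is in the sleepable hook set. */
SEC("lsm.s/file_open")
int BPF_PROG(log_file_open, struct file *file)
{
	char path[256];

	/* Safe here: sleepable hooks are never called from NMI/interrupt context. */
	if (bpf_d_path(&file->f_path, path, sizeof(path)) > 0)
		bpf_printk("open: %s", path);

	return 0;
}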
Signed-off-by: KP Singh Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20201113005930.541956-3-kpsingh@chromium.org --- kernel/trace/bpf_trace.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index cfce60ad1cb5..02986c7b90eb 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -16,6 +16,8 @@ #include #include #include +#include + #include #include @@ -1179,7 +1181,11 @@ BTF_SET_END(btf_allowlist_d_path) static bool bpf_d_path_allowed(const struct bpf_prog *prog) { - return btf_id_set_contains(&btf_allowlist_d_path, prog->aux->attach_btf_id); + if (prog->type == BPF_PROG_TYPE_LSM) + return bpf_lsm_is_sleepable_hook(prog->aux->attach_btf_id); + + return btf_id_set_contains(&btf_allowlist_d_path, + prog->aux->attach_btf_id); } BTF_ID_LIST_SINGLE(bpf_d_path_btf_ids, struct, path) -- cgit v1.2.3 From d19ad0775dcd64b49eecf4fa79c17959ebfbd26b Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Wed, 28 Oct 2020 17:42:17 -0400 Subject: ftrace: Have the callbacks receive a struct ftrace_regs instead of pt_regs In preparation to have arguments of a function passed to callbacks attached to functions as default, change the default callback prototype to receive a struct ftrace_regs as the forth parameter instead of a pt_regs. For callbacks that set the FL_SAVE_REGS flag in their ftrace_ops flags, they will now need to get the pt_regs via a ftrace_get_regs() helper call. If this is called by a callback that their ftrace_ops did not have a FL_SAVE_REGS flag set, it that helper function will return NULL. This will allow the ftrace_regs to hold enough just to get the parameters and stack pointer, but without the worry that callbacks may have a pt_regs that is not completely filled. 
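For illustration, a callback converted to the new prototype follows this pattern (the function name is hypothetical; the kprobe handlers changed below do exactly this):

static void my_callback(unsigned long ip, unsigned long parent_ip,
			struct ftrace_ops *op, struct ftrace_regs *fregs)
{
	/* Returns NULL when the ops did not request full registers
	 * via FTRACE_OPS_FL_SAVE_REGS.
	 */
	struct pt_regs *regs = ftrace_get_regs(fregs);

	if (!regs)
		return;

	/* ... inspect regs as before ... */
}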
Acked-by: Peter Zijlstra (Intel) Reviewed-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- arch/csky/kernel/probes/ftrace.c | 4 +++- arch/nds32/kernel/ftrace.c | 4 ++-- arch/parisc/kernel/ftrace.c | 8 +++++--- arch/powerpc/kernel/kprobes-ftrace.c | 4 +++- arch/s390/kernel/ftrace.c | 4 +++- arch/x86/kernel/kprobes/ftrace.c | 3 ++- fs/pstore/ftrace.c | 2 +- include/linux/ftrace.h | 16 ++++++++++++++-- include/linux/kprobes.h | 2 +- kernel/livepatch/patch.c | 3 ++- kernel/trace/ftrace.c | 27 +++++++++++++++------------ kernel/trace/trace_event_perf.c | 2 +- kernel/trace/trace_events.c | 2 +- kernel/trace/trace_functions.c | 9 ++++----- kernel/trace/trace_irqsoff.c | 2 +- kernel/trace/trace_sched_wakeup.c | 2 +- kernel/trace/trace_selftest.c | 20 +++++++++++--------- kernel/trace/trace_stack.c | 2 +- 18 files changed, 71 insertions(+), 45 deletions(-) (limited to 'kernel') diff --git a/arch/csky/kernel/probes/ftrace.c b/arch/csky/kernel/probes/ftrace.c index f30b179924ef..ae2b1c7b3b5c 100644 --- a/arch/csky/kernel/probes/ftrace.c +++ b/arch/csky/kernel/probes/ftrace.c @@ -11,17 +11,19 @@ int arch_check_ftrace_location(struct kprobe *p) /* Ftrace callback handler for kprobes -- called under preepmt disabed */ void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *ops, struct pt_regs *regs) + struct ftrace_ops *ops, struct ftrace_regs *fregs) { int bit; bool lr_saver = false; struct kprobe *p; struct kprobe_ctlblk *kcb; + struct pt_regs *regs; bit = ftrace_test_recursion_trylock(ip, parent_ip); if (bit < 0) return; + regs = ftrace_get_regs(fregs); preempt_disable_notrace(); p = get_kprobe((kprobe_opcode_t *)ip); if (!p) { diff --git a/arch/nds32/kernel/ftrace.c b/arch/nds32/kernel/ftrace.c index 3763b3f8c3db..414f8a780cc3 100644 --- a/arch/nds32/kernel/ftrace.c +++ b/arch/nds32/kernel/ftrace.c @@ -10,7 +10,7 @@ extern void (*ftrace_trace_function)(unsigned long, unsigned long, extern void ftrace_graph_caller(void); noinline void __naked ftrace_stub(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *op, struct pt_regs *regs) + struct ftrace_ops *op, struct ftrace_regs *fregs) { __asm__ (""); /* avoid to optimize as pure function */ } @@ -38,7 +38,7 @@ EXPORT_SYMBOL(_mcount); #else /* CONFIG_DYNAMIC_FTRACE */ noinline void __naked ftrace_stub(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *op, struct pt_regs *regs) + struct ftrace_ops *op, struct ftrace_regs *fregs) { __asm__ (""); /* avoid to optimize as pure function */ } diff --git a/arch/parisc/kernel/ftrace.c b/arch/parisc/kernel/ftrace.c index 1c5d3732bda2..0a1e75af5382 100644 --- a/arch/parisc/kernel/ftrace.c +++ b/arch/parisc/kernel/ftrace.c @@ -51,7 +51,7 @@ static void __hot prepare_ftrace_return(unsigned long *parent, void notrace __hot ftrace_function_trampoline(unsigned long parent, unsigned long self_addr, unsigned long org_sp_gr3, - struct pt_regs *regs) + struct ftrace_regs *fregs) { #ifndef CONFIG_DYNAMIC_FTRACE extern ftrace_func_t ftrace_trace_function; @@ -61,7 +61,7 @@ void notrace __hot ftrace_function_trampoline(unsigned long parent, if (function_trace_op->flags & FTRACE_OPS_FL_ENABLED && ftrace_trace_function != ftrace_stub) ftrace_trace_function(self_addr, parent, - function_trace_op, regs); + function_trace_op, fregs); #ifdef CONFIG_FUNCTION_GRAPH_TRACER if (dereference_function_descriptor(ftrace_graph_return) != @@ -204,9 +204,10 @@ int ftrace_make_nop(struct module *mod, struct dyn_ftrace *rec, #ifdef CONFIG_KPROBES_ON_FTRACE void 
kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *ops, struct pt_regs *regs) + struct ftrace_ops *ops, struct ftrace_regs *fregs) { struct kprobe_ctlblk *kcb; + struct pt_regs *regs; struct kprobe *p; int bit; @@ -214,6 +215,7 @@ void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, if (bit < 0) return; + regs = ftrace_get_regs(fregs); preempt_disable_notrace(); p = get_kprobe((kprobe_opcode_t *)ip); if (unlikely(!p) || kprobe_disabled(p)) diff --git a/arch/powerpc/kernel/kprobes-ftrace.c b/arch/powerpc/kernel/kprobes-ftrace.c index fdfee39938ea..660138f6c4b2 100644 --- a/arch/powerpc/kernel/kprobes-ftrace.c +++ b/arch/powerpc/kernel/kprobes-ftrace.c @@ -14,16 +14,18 @@ /* Ftrace callback handler for kprobes */ void kprobe_ftrace_handler(unsigned long nip, unsigned long parent_nip, - struct ftrace_ops *ops, struct pt_regs *regs) + struct ftrace_ops *ops, struct ftrace_regs *fregs) { struct kprobe *p; struct kprobe_ctlblk *kcb; + struct pt_regs *regs; int bit; bit = ftrace_test_recursion_trylock(nip, parent_nip); if (bit < 0) return; + regs = ftrace_get_regs(fregs); preempt_disable_notrace(); p = get_kprobe((kprobe_opcode_t *)nip); if (unlikely(!p) || kprobe_disabled(p)) diff --git a/arch/s390/kernel/ftrace.c b/arch/s390/kernel/ftrace.c index 657c1ab45408..67b80f4412f9 100644 --- a/arch/s390/kernel/ftrace.c +++ b/arch/s390/kernel/ftrace.c @@ -198,9 +198,10 @@ int ftrace_disable_ftrace_graph_caller(void) #ifdef CONFIG_KPROBES_ON_FTRACE void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *ops, struct pt_regs *regs) + struct ftrace_ops *ops, struct ftrace_regs *fregs) { struct kprobe_ctlblk *kcb; + struct pt_regs *regs; struct kprobe *p; int bit; @@ -208,6 +209,7 @@ void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, if (bit < 0) return; + regs = ftrace_get_regs(fregs); preempt_disable_notrace(); p = get_kprobe((kprobe_opcode_t *)ip); if (unlikely(!p) || kprobe_disabled(p)) diff --git a/arch/x86/kernel/kprobes/ftrace.c b/arch/x86/kernel/kprobes/ftrace.c index 954d930a7127..373e5fa3ce1f 100644 --- a/arch/x86/kernel/kprobes/ftrace.c +++ b/arch/x86/kernel/kprobes/ftrace.c @@ -14,8 +14,9 @@ /* Ftrace callback handler for kprobes -- called under preepmt disabed */ void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *ops, struct pt_regs *regs) + struct ftrace_ops *ops, struct ftrace_regs *fregs) { + struct pt_regs *regs = ftrace_get_regs(fregs); struct kprobe *p; struct kprobe_ctlblk *kcb; int bit; diff --git a/fs/pstore/ftrace.c b/fs/pstore/ftrace.c index adb0935eb062..5939595f0115 100644 --- a/fs/pstore/ftrace.c +++ b/fs/pstore/ftrace.c @@ -26,7 +26,7 @@ static u64 pstore_ftrace_stamp; static void notrace pstore_ftrace_call(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *op, - struct pt_regs *regs) + struct ftrace_regs *fregs) { int bit; unsigned long flags; diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 8dde9c17aaa5..24e1fa52337d 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -90,8 +90,20 @@ ftrace_enable_sysctl(struct ctl_table *table, int write, struct ftrace_ops; +struct ftrace_regs { + struct pt_regs regs; +}; + +static __always_inline struct pt_regs *ftrace_get_regs(struct ftrace_regs *fregs) +{ + if (!fregs) + return NULL; + + return &fregs->regs; +} + typedef void (*ftrace_func_t)(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *op, struct pt_regs *regs); + struct ftrace_ops 
*op, struct ftrace_regs *fregs); ftrace_func_t ftrace_ops_get_func(struct ftrace_ops *ops); @@ -259,7 +271,7 @@ int register_ftrace_function(struct ftrace_ops *ops); int unregister_ftrace_function(struct ftrace_ops *ops); extern void ftrace_stub(unsigned long a0, unsigned long a1, - struct ftrace_ops *op, struct pt_regs *regs); + struct ftrace_ops *op, struct ftrace_regs *fregs); #else /* !CONFIG_FUNCTION_TRACER */ /* diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h index 629abaf25681..be73350955e4 100644 --- a/include/linux/kprobes.h +++ b/include/linux/kprobes.h @@ -345,7 +345,7 @@ static inline void wait_for_kprobe_optimizer(void) { } #endif /* CONFIG_OPTPROBES */ #ifdef CONFIG_KPROBES_ON_FTRACE extern void kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *ops, struct pt_regs *regs); + struct ftrace_ops *ops, struct ftrace_regs *fregs); extern int arch_prepare_kprobe_ftrace(struct kprobe *p); #endif diff --git a/kernel/livepatch/patch.c b/kernel/livepatch/patch.c index 875c5dbbdd33..f89f9e7e9b07 100644 --- a/kernel/livepatch/patch.c +++ b/kernel/livepatch/patch.c @@ -40,8 +40,9 @@ struct klp_ops *klp_find_ops(void *old_func) static void notrace klp_ftrace_handler(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *fops, - struct pt_regs *regs) + struct ftrace_regs *fregs) { + struct pt_regs *regs = ftrace_get_regs(fregs); struct klp_ops *ops; struct klp_func *func; int patch_state; diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 3db64fb0cce8..67888311784e 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -121,7 +121,7 @@ struct ftrace_ops global_ops; #if ARCH_SUPPORTS_FTRACE_OPS static void ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *op, struct pt_regs *regs); + struct ftrace_ops *op, struct ftrace_regs *fregs); #else /* See comment below, where ftrace_ops_list_func is defined */ static void ftrace_ops_no_ops(unsigned long ip, unsigned long parent_ip); @@ -140,7 +140,7 @@ static inline void ftrace_ops_init(struct ftrace_ops *ops) } static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *op, struct pt_regs *regs) + struct ftrace_ops *op, struct ftrace_regs *fregs) { struct trace_array *tr = op->private; int pid; @@ -154,7 +154,7 @@ static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip, return; } - op->saved_func(ip, parent_ip, op, regs); + op->saved_func(ip, parent_ip, op, fregs); } static void ftrace_sync_ipi(void *data) @@ -754,7 +754,7 @@ ftrace_profile_alloc(struct ftrace_profile_stat *stat, unsigned long ip) static void function_profile_call(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *ops, struct pt_regs *regs) + struct ftrace_ops *ops, struct ftrace_regs *fregs) { struct ftrace_profile_stat *stat; struct ftrace_profile *rec; @@ -2143,6 +2143,7 @@ static int ftrace_check_record(struct dyn_ftrace *rec, bool enable, bool update) else rec->flags &= ~FTRACE_FL_TRAMP_EN; } + if (flag & FTRACE_FL_DIRECT) { /* * If there's only one user (direct_ops helper) @@ -2368,8 +2369,9 @@ unsigned long ftrace_find_rec_direct(unsigned long ip) } static void call_direct_funcs(unsigned long ip, unsigned long pip, - struct ftrace_ops *ops, struct pt_regs *regs) + struct ftrace_ops *ops, struct ftrace_regs *fregs) { + struct pt_regs *regs = ftrace_get_regs(fregs); unsigned long addr; addr = ftrace_find_rec_direct(ip); @@ -4292,7 +4294,7 @@ static int __init ftrace_mod_cmd_init(void) 
core_initcall(ftrace_mod_cmd_init); static void function_trace_probe_call(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *op, struct pt_regs *pt_regs) + struct ftrace_ops *op, struct ftrace_regs *fregs) { struct ftrace_probe_ops *probe_ops; struct ftrace_func_probe *probe; @@ -6911,8 +6913,9 @@ void ftrace_reset_array_ops(struct trace_array *tr) static nokprobe_inline void __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *ignored, struct pt_regs *regs) + struct ftrace_ops *ignored, struct ftrace_regs *fregs) { + struct pt_regs *regs = ftrace_get_regs(fregs); struct ftrace_ops *op; int bit; @@ -6945,7 +6948,7 @@ __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, pr_warn("op=%p %pS\n", op, op); goto out; } - op->func(ip, parent_ip, op, regs); + op->func(ip, parent_ip, op, fregs); } } while_for_each_ftrace_op(op); out: @@ -6968,9 +6971,9 @@ out: */ #if ARCH_SUPPORTS_FTRACE_OPS static void ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *op, struct pt_regs *regs) + struct ftrace_ops *op, struct ftrace_regs *fregs) { - __ftrace_ops_list_func(ip, parent_ip, NULL, regs); + __ftrace_ops_list_func(ip, parent_ip, NULL, fregs); } NOKPROBE_SYMBOL(ftrace_ops_list_func); #else @@ -6987,7 +6990,7 @@ NOKPROBE_SYMBOL(ftrace_ops_no_ops); * this function will be called by the mcount trampoline. */ static void ftrace_ops_assist_func(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *op, struct pt_regs *regs) + struct ftrace_ops *op, struct ftrace_regs *fregs) { int bit; @@ -6998,7 +7001,7 @@ static void ftrace_ops_assist_func(unsigned long ip, unsigned long parent_ip, preempt_disable_notrace(); if (!(op->flags & FTRACE_OPS_FL_RCU) || rcu_is_watching()) - op->func(ip, parent_ip, op, regs); + op->func(ip, parent_ip, op, fregs); preempt_enable_notrace(); trace_clear_recursion(bit); diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index 1b202e28dfaa..a71181655958 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -432,7 +432,7 @@ NOKPROBE_SYMBOL(perf_trace_buf_update); #ifdef CONFIG_FUNCTION_TRACER static void perf_ftrace_function_call(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *ops, struct pt_regs *pt_regs) + struct ftrace_ops *ops, struct ftrace_regs *fregs) { struct ftrace_entry *entry; struct perf_event *event; diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index f4b459bb6d33..98d194d8460e 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -3673,7 +3673,7 @@ static struct trace_event_file event_trace_file __initdata; static void __init function_test_events_call(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *op, struct pt_regs *pt_regs) + struct ftrace_ops *op, struct ftrace_regs *regs) { struct trace_buffer *buffer; struct ring_buffer_event *event; diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index 646eda6c44a5..c5095dd28e20 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -23,10 +23,10 @@ static void tracing_start_function_trace(struct trace_array *tr); static void tracing_stop_function_trace(struct trace_array *tr); static void function_trace_call(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *op, struct pt_regs *pt_regs); + struct ftrace_ops *op, struct ftrace_regs *fregs); static void function_stack_trace_call(unsigned long ip, unsigned long parent_ip, - 
struct ftrace_ops *op, struct pt_regs *pt_regs); + struct ftrace_ops *op, struct ftrace_regs *fregs); static struct tracer_flags func_flags; /* Our option */ @@ -89,7 +89,6 @@ void ftrace_destroy_function_files(struct trace_array *tr) static int function_trace_init(struct trace_array *tr) { ftrace_func_t func; - /* * Instance trace_arrays get their ops allocated * at instance creation. Unless it failed @@ -129,7 +128,7 @@ static void function_trace_start(struct trace_array *tr) static void function_trace_call(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *op, struct pt_regs *pt_regs) + struct ftrace_ops *op, struct ftrace_regs *fregs) { struct trace_array *tr = op->private; struct trace_array_cpu *data; @@ -178,7 +177,7 @@ function_trace_call(unsigned long ip, unsigned long parent_ip, static void function_stack_trace_call(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *op, struct pt_regs *pt_regs) + struct ftrace_ops *op, struct ftrace_regs *fregs) { struct trace_array *tr = op->private; struct trace_array_cpu *data; diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index 10bbb0f381d5..d06aab4dcbb8 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -138,7 +138,7 @@ static int func_prolog_dec(struct trace_array *tr, */ static void irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *op, struct pt_regs *pt_regs) + struct ftrace_ops *op, struct ftrace_regs *fregs) { struct trace_array *tr = irqsoff_trace; struct trace_array_cpu *data; diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 97b10bb31a1f..c0181066dbe9 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -212,7 +212,7 @@ static void wakeup_print_header(struct seq_file *s) */ static void wakeup_tracer_call(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *op, struct pt_regs *pt_regs) + struct ftrace_ops *op, struct ftrace_regs *fregs) { struct trace_array *tr = wakeup_trace; struct trace_array_cpu *data; diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 8ee3c0bb5d8a..5ed081c6471c 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -107,7 +107,7 @@ static int trace_selftest_test_probe1_cnt; static void trace_selftest_test_probe1_func(unsigned long ip, unsigned long pip, struct ftrace_ops *op, - struct pt_regs *pt_regs) + struct ftrace_regs *fregs) { trace_selftest_test_probe1_cnt++; } @@ -116,7 +116,7 @@ static int trace_selftest_test_probe2_cnt; static void trace_selftest_test_probe2_func(unsigned long ip, unsigned long pip, struct ftrace_ops *op, - struct pt_regs *pt_regs) + struct ftrace_regs *fregs) { trace_selftest_test_probe2_cnt++; } @@ -125,7 +125,7 @@ static int trace_selftest_test_probe3_cnt; static void trace_selftest_test_probe3_func(unsigned long ip, unsigned long pip, struct ftrace_ops *op, - struct pt_regs *pt_regs) + struct ftrace_regs *fregs) { trace_selftest_test_probe3_cnt++; } @@ -134,7 +134,7 @@ static int trace_selftest_test_global_cnt; static void trace_selftest_test_global_func(unsigned long ip, unsigned long pip, struct ftrace_ops *op, - struct pt_regs *pt_regs) + struct ftrace_regs *fregs) { trace_selftest_test_global_cnt++; } @@ -143,7 +143,7 @@ static int trace_selftest_test_dyn_cnt; static void trace_selftest_test_dyn_func(unsigned long ip, unsigned long pip, struct ftrace_ops *op, - struct pt_regs *pt_regs) + struct ftrace_regs *fregs) { 
trace_selftest_test_dyn_cnt++; } @@ -414,7 +414,7 @@ static int trace_selftest_recursion_cnt; static void trace_selftest_test_recursion_func(unsigned long ip, unsigned long pip, struct ftrace_ops *op, - struct pt_regs *pt_regs) + struct ftrace_regs *fregs) { /* * This function is registered without the recursion safe flag. @@ -429,7 +429,7 @@ static void trace_selftest_test_recursion_func(unsigned long ip, static void trace_selftest_test_recursion_safe_func(unsigned long ip, unsigned long pip, struct ftrace_ops *op, - struct pt_regs *pt_regs) + struct ftrace_regs *fregs) { /* * We said we would provide our own recursion. By calling @@ -548,9 +548,11 @@ static enum { static void trace_selftest_test_regs_func(unsigned long ip, unsigned long pip, struct ftrace_ops *op, - struct pt_regs *pt_regs) + struct ftrace_regs *fregs) { - if (pt_regs) + struct pt_regs *regs = ftrace_get_regs(fregs); + + if (regs) trace_selftest_regs_stat = TRACE_SELFTEST_REGS_FOUND; else trace_selftest_regs_stat = TRACE_SELFTEST_REGS_NOT_FOUND; diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 969db526a563..63c285042051 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -290,7 +290,7 @@ static void check_stack(unsigned long ip, unsigned long *stack) static void stack_trace_call(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *op, struct pt_regs *pt_regs) + struct ftrace_ops *op, struct ftrace_regs *fregs) { unsigned long stack; -- cgit v1.2.3 From 02a474ca266a47ea8f4d5a11f4ffa120f83730ad Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Tue, 27 Oct 2020 10:55:55 -0400 Subject: ftrace/x86: Allow for arguments to be passed in to ftrace_regs by default Currently, the only way to get access to the registers of a function via a ftrace callback is to set the "FL_SAVE_REGS" bit in the ftrace_ops. But as this saves all regs as if a breakpoint were to trigger (for use with kprobes), it is expensive. The regs are already saved on the stack for the default ftrace callbacks, as that is required otherwise a function being traced will get the wrong arguments and possibly crash. And on x86, the arguments are already stored where they would be on a pt_regs structure to use that code for both the regs version of a callback, it makes sense to pass that information always to all functions. If an architecture does this (as x86_64 now does), it is to set HAVE_DYNAMIC_FTRACE_WITH_ARGS, and this will let the generic code that it could have access to arguments without having to set the flags. This also includes having the stack pointer being saved, which could be used for accessing arguments on the stack, as well as having the function graph tracer not require its own trampoline! 
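[ Editorial sketch, not part of the patch: the kind of argument access the new Kconfig help text below refers to. my_arg_probe is a made-up callback, hooked up through a ftrace_ops exactly as in the earlier sketch; regs_get_kernel_argument() and kernel_stack_pointer() are the existing x86 pt_regs helpers. ftrace_get_regs() still returns a non-NULL pt_regs only when the ops sets FTRACE_OPS_FL_SAVE_REGS, which is what the CS check in the patch below implements. ]

#include <linux/ftrace.h>
#include <asm/ptrace.h>

static void notrace my_arg_probe(unsigned long ip, unsigned long parent_ip,
                                 struct ftrace_ops *op, struct ftrace_regs *fregs)
{
        struct pt_regs *regs = ftrace_get_regs(fregs);
        unsigned long arg0, sp;

        if (!regs)
                return;

        arg0 = regs_get_kernel_argument(regs, 0);  /* first argument (%rdi on x86-64) */
        sp   = kernel_stack_pointer(regs);         /* stack pointer at function entry */

        trace_printk("%ps(0x%lx) sp=0x%lx\n", (void *)ip, arg0, sp);
}

The difference after this series is that on x86-64 the argument registers and the stack pointer are saved even for callbacks that do not request FTRACE_OPS_FL_SAVE_REGS, which is what lets livepatch drop that flag in a later patch; a callback that wants the complete pt_regs above must still set it.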
Acked-by: Peter Zijlstra (Intel) Signed-off-by: Steven Rostedt (VMware) --- arch/x86/Kconfig | 1 + arch/x86/include/asm/ftrace.h | 15 +++++++++++++++ arch/x86/kernel/ftrace_64.S | 11 +++++++++-- include/linux/ftrace.h | 7 ++++++- kernel/trace/Kconfig | 9 +++++++++ 5 files changed, 40 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index f6946b81f74a..478526aabe5d 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -167,6 +167,7 @@ config X86 select HAVE_DMA_CONTIGUOUS select HAVE_DYNAMIC_FTRACE select HAVE_DYNAMIC_FTRACE_WITH_REGS + select HAVE_DYNAMIC_FTRACE_WITH_ARGS if X86_64 select HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS select HAVE_EBPF_JIT select HAVE_EFFICIENT_UNALIGNED_ACCESS diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h index 84b9449be080..e00fe88146e0 100644 --- a/arch/x86/include/asm/ftrace.h +++ b/arch/x86/include/asm/ftrace.h @@ -41,6 +41,21 @@ static inline void arch_ftrace_set_direct_caller(struct pt_regs *regs, unsigned regs->orig_ax = addr; } +#ifdef CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS +struct ftrace_regs { + struct pt_regs regs; +}; + +static __always_inline struct pt_regs * +arch_ftrace_get_regs(struct ftrace_regs *fregs) +{ + /* Only when FL_SAVE_REGS is set, cs will be non zero */ + if (!fregs->regs.cs) + return NULL; + return &fregs->regs; +} +#endif + #ifdef CONFIG_DYNAMIC_FTRACE struct dyn_arch_ftrace { diff --git a/arch/x86/kernel/ftrace_64.S b/arch/x86/kernel/ftrace_64.S index ac3d5f22fe64..60e3b64f5ea6 100644 --- a/arch/x86/kernel/ftrace_64.S +++ b/arch/x86/kernel/ftrace_64.S @@ -140,12 +140,19 @@ SYM_FUNC_START(ftrace_caller) /* save_mcount_regs fills in first two parameters */ save_mcount_regs + /* Stack - skipping return address of ftrace_caller */ + leaq MCOUNT_REG_SIZE+8(%rsp), %rcx + movq %rcx, RSP(%rsp) + SYM_INNER_LABEL(ftrace_caller_op_ptr, SYM_L_GLOBAL) /* Load the ftrace_ops into the 3rd parameter */ movq function_trace_op(%rip), %rdx - /* regs go into 4th parameter (but make it NULL) */ - movq $0, %rcx + /* regs go into 4th parameter */ + leaq (%rsp), %rcx + + /* Only ops with REGS flag set should have CS register set */ + movq $0, CS(%rsp) SYM_INNER_LABEL(ftrace_call, SYM_L_GLOBAL) call ftrace_stub diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 24e1fa52337d..588ea7023a7a 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -90,16 +90,21 @@ ftrace_enable_sysctl(struct ctl_table *table, int write, struct ftrace_ops; +#ifndef CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS + struct ftrace_regs { struct pt_regs regs; }; +#define arch_ftrace_get_regs(fregs) (&(fregs)->regs) + +#endif /* CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS */ static __always_inline struct pt_regs *ftrace_get_regs(struct ftrace_regs *fregs) { if (!fregs) return NULL; - return &fregs->regs; + return arch_ftrace_get_regs(fregs); } typedef void (*ftrace_func_t)(unsigned long ip, unsigned long parent_ip, diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 6aa36ec73ccb..c9b64dea1216 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -31,6 +31,15 @@ config HAVE_DYNAMIC_FTRACE_WITH_REGS config HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS bool +config HAVE_DYNAMIC_FTRACE_WITH_ARGS + bool + help + If this is set, then arguments and stack can be found from + the pt_regs passed into the function callback regs parameter + by default, even without setting the REGS flag in the ftrace_ops. + This allows for use of regs_get_kernel_argument() and + kernel_stack_pointer(). 
+ config HAVE_FTRACE_MCOUNT_RECORD bool help -- cgit v1.2.3 From 2860cd8a235375df3c8ec8039d9fe5eb2f658b86 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Wed, 28 Oct 2020 17:15:27 -0400 Subject: livepatch: Use the default ftrace_ops instead of REGS when ARGS is available When CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS is available, the ftrace call will be able to set the ip of the calling function. This will improve the performance of live kernel patching where it does not need all the regs to be stored just to change the instruction pointer. If all archs that support live kernel patching also support HAVE_DYNAMIC_FTRACE_WITH_ARGS, then the architecture specific function klp_arch_set_pc() could be made generic. It is possible that an arch can support HAVE_DYNAMIC_FTRACE_WITH_ARGS but not HAVE_DYNAMIC_FTRACE_WITH_REGS and then have access to live patching. Cc: Josh Poimboeuf Cc: Jiri Kosina Cc: live-patching@vger.kernel.org Acked-by: Peter Zijlstra (Intel) Acked-by: Miroslav Benes Signed-off-by: Steven Rostedt (VMware) --- arch/powerpc/include/asm/livepatch.h | 4 +++- arch/s390/include/asm/livepatch.h | 5 ++++- arch/x86/include/asm/ftrace.h | 3 +++ arch/x86/include/asm/livepatch.h | 4 ++-- arch/x86/kernel/ftrace_64.S | 4 ++++ include/linux/ftrace.h | 7 +++++++ kernel/livepatch/Kconfig | 2 +- kernel/livepatch/patch.c | 9 +++++---- 8 files changed, 29 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/arch/powerpc/include/asm/livepatch.h b/arch/powerpc/include/asm/livepatch.h index 4a3d5d25fed5..ae25e6e72997 100644 --- a/arch/powerpc/include/asm/livepatch.h +++ b/arch/powerpc/include/asm/livepatch.h @@ -12,8 +12,10 @@ #include #ifdef CONFIG_LIVEPATCH -static inline void klp_arch_set_pc(struct pt_regs *regs, unsigned long ip) +static inline void klp_arch_set_pc(struct ftrace_regs *fregs, unsigned long ip) { + struct pt_regs *regs = ftrace_get_regs(fregs); + regs->nip = ip; } diff --git a/arch/s390/include/asm/livepatch.h b/arch/s390/include/asm/livepatch.h index 818612b784cd..d578a8c76676 100644 --- a/arch/s390/include/asm/livepatch.h +++ b/arch/s390/include/asm/livepatch.h @@ -11,10 +11,13 @@ #ifndef ASM_LIVEPATCH_H #define ASM_LIVEPATCH_H +#include #include -static inline void klp_arch_set_pc(struct pt_regs *regs, unsigned long ip) +static inline void klp_arch_set_pc(struct ftrace_regs *fregs, unsigned long ip) { + struct pt_regs *regs = ftrace_get_regs(fregs); + regs->psw.addr = ip; } diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h index e00fe88146e0..9f3130f40807 100644 --- a/arch/x86/include/asm/ftrace.h +++ b/arch/x86/include/asm/ftrace.h @@ -54,6 +54,9 @@ arch_ftrace_get_regs(struct ftrace_regs *fregs) return NULL; return &fregs->regs; } + +#define ftrace_instruction_pointer_set(fregs, _ip) \ + do { (fregs)->regs.ip = (_ip); } while (0) #endif #ifdef CONFIG_DYNAMIC_FTRACE diff --git a/arch/x86/include/asm/livepatch.h b/arch/x86/include/asm/livepatch.h index 1fde1ab6559e..7c5cc6660e4b 100644 --- a/arch/x86/include/asm/livepatch.h +++ b/arch/x86/include/asm/livepatch.h @@ -12,9 +12,9 @@ #include #include -static inline void klp_arch_set_pc(struct pt_regs *regs, unsigned long ip) +static inline void klp_arch_set_pc(struct ftrace_regs *fregs, unsigned long ip) { - regs->ip = ip; + ftrace_instruction_pointer_set(fregs, ip); } #endif /* _ASM_X86_LIVEPATCH_H */ diff --git a/arch/x86/kernel/ftrace_64.S b/arch/x86/kernel/ftrace_64.S index 60e3b64f5ea6..0d54099c2a3a 100644 --- a/arch/x86/kernel/ftrace_64.S +++ b/arch/x86/kernel/ftrace_64.S @@ 
-157,6 +157,10 @@ SYM_INNER_LABEL(ftrace_caller_op_ptr, SYM_L_GLOBAL) SYM_INNER_LABEL(ftrace_call, SYM_L_GLOBAL) call ftrace_stub + /* Handlers can change the RIP */ + movq RIP(%rsp), %rax + movq %rax, MCOUNT_REG_SIZE(%rsp) + restore_mcount_regs /* diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 588ea7023a7a..9a8ce28e4485 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -97,6 +97,13 @@ struct ftrace_regs { }; #define arch_ftrace_get_regs(fregs) (&(fregs)->regs) +/* + * ftrace_instruction_pointer_set() is to be defined by the architecture + * if to allow setting of the instruction pointer from the ftrace_regs + * when HAVE_DYNAMIC_FTRACE_WITH_ARGS is set and it supports + * live kernel patching. + */ +#define ftrace_instruction_pointer_set(fregs, ip) do { } while (0) #endif /* CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS */ static __always_inline struct pt_regs *ftrace_get_regs(struct ftrace_regs *fregs) diff --git a/kernel/livepatch/Kconfig b/kernel/livepatch/Kconfig index 54102deb50ba..53d51ed619a3 100644 --- a/kernel/livepatch/Kconfig +++ b/kernel/livepatch/Kconfig @@ -6,7 +6,7 @@ config HAVE_LIVEPATCH config LIVEPATCH bool "Kernel Live Patching" - depends on DYNAMIC_FTRACE_WITH_REGS + depends on DYNAMIC_FTRACE_WITH_REGS || DYNAMIC_FTRACE_WITH_ARGS depends on MODULES depends on SYSFS depends on KALLSYMS_ALL diff --git a/kernel/livepatch/patch.c b/kernel/livepatch/patch.c index f89f9e7e9b07..e8029aea67f1 100644 --- a/kernel/livepatch/patch.c +++ b/kernel/livepatch/patch.c @@ -42,7 +42,6 @@ static void notrace klp_ftrace_handler(unsigned long ip, struct ftrace_ops *fops, struct ftrace_regs *fregs) { - struct pt_regs *regs = ftrace_get_regs(fregs); struct klp_ops *ops; struct klp_func *func; int patch_state; @@ -118,7 +117,7 @@ static void notrace klp_ftrace_handler(unsigned long ip, if (func->nop) goto unlock; - klp_arch_set_pc(regs, (unsigned long)func->new_func); + klp_arch_set_pc(fregs, (unsigned long)func->new_func); unlock: preempt_enable_notrace(); @@ -200,8 +199,10 @@ static int klp_patch_func(struct klp_func *func) return -ENOMEM; ops->fops.func = klp_ftrace_handler; - ops->fops.flags = FTRACE_OPS_FL_SAVE_REGS | - FTRACE_OPS_FL_DYNAMIC | + ops->fops.flags = FTRACE_OPS_FL_DYNAMIC | +#ifndef CONFIG_HAVE_DYNAMIC_FTRACE_WITH_ARGS + FTRACE_OPS_FL_SAVE_REGS | +#endif FTRACE_OPS_FL_IPMODIFY | FTRACE_OPS_FL_PERMANENT; -- cgit v1.2.3 From b111545d26c0d66dd9aae668d9373669e752b075 Mon Sep 17 00:00:00 2001 From: Kaixu Xia Date: Sat, 14 Nov 2020 00:02:40 +0800 Subject: tracing: Remove the useless value assignment in test_create_synth_event() The value of variable ret is overwritten on the delete branch in the test_create_synth_event() and we care more about the above error than this delete portion. Remove it. 
Link: https://lkml.kernel.org/r/1605283360-6804-1-git-send-email-kaixuxia@tencent.com Reported-by: Tosk Robot Signed-off-by: Kaixu Xia Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/synth_event_gen_test.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/synth_event_gen_test.c b/kernel/trace/synth_event_gen_test.c index edd912cd14aa..a4b4bbf8c3bf 100644 --- a/kernel/trace/synth_event_gen_test.c +++ b/kernel/trace/synth_event_gen_test.c @@ -307,7 +307,7 @@ static int __init test_create_synth_event(void) return ret; delete: /* We got an error after creating the event, delete it */ - ret = synth_event_delete("create_synth_test"); + synth_event_delete("create_synth_test"); goto out; } -- cgit v1.2.3 From 932f8c64d38bb08f69c8c26a2216ba0c36c6daa8 Mon Sep 17 00:00:00 2001 From: Lukas Bulwahn Date: Fri, 13 Nov 2020 18:20:12 +0100 Subject: futex: Remove unused empty compat_exit_robust_list() Commit ba31c1a48538 ("futex: Move futex exit handling into futex code") introduced compat_exit_robust_list() with a full-fledged implementation for CONFIG_COMPAT, and an empty-body function for !CONFIG_COMPAT. However, compat_exit_robust_list() is only used in futex_mm_release() under #ifdef CONFIG_COMPAT. Hence for !CONFIG_COMPAT, make CC=clang W=1 warns: kernel/futex.c:314:20: warning: unused function 'compat_exit_robust_list' [-Wunused-function] There is no need to declare the unused empty function for !CONFIG_COMPAT. Simply remove it. Signed-off-by: Lukas Bulwahn Signed-off-by: Thomas Gleixner Reviewed-by: Nathan Chancellor Link: https://lore.kernel.org/r/20201113172012.27221-1-lukas.bulwahn@gmail.com --- kernel/futex.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index ac328874f6e5..aee6ce294d84 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -310,8 +310,6 @@ static inline bool should_fail_futex(bool fshared) #ifdef CONFIG_COMPAT static void compat_exit_robust_list(struct task_struct *curr); -#else -static inline void compat_exit_robust_list(struct task_struct *curr) { } #endif /* -- cgit v1.2.3 From f782e2c300a717233b64697affda3ea7aac00b2b Mon Sep 17 00:00:00 2001 From: Dmitrii Banshchikov Date: Fri, 13 Nov 2020 17:17:56 +0000 Subject: bpf: Relax return code check for subprograms Currently verifier enforces return code checks for subprograms in the same manner as it does for program entry points. This prevents returning arbitrary scalar values from subprograms. Scalar type of returned values is checked by btf_prepare_func_args() and hence it should be safe to allow only scalars for now. Relax return code checks for subprograms and allow any correct scalar values. 
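[ Editorial sketch, not part of the patch: in addition to the test_global_func8.c selftest added below, the relaxed check also lets a global subprogram propagate error-style scalars that the old 0/1 range check rejected. Names are illustrative. ]

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

__noinline int classify(struct __sk_buff *skb)
{
        if (skb->len < 14)
                return -22;      /* arbitrary scalar, rejected before this change */
        return skb->len;         /* also an arbitrary scalar */
}

SEC("cgroup_skb/ingress")
int test_cls2(struct __sk_buff *skb)
{
        /* the program entry point still has to honour its own return range */
        return classify(skb) > 0 ? 1 : 0;
}

char LICENSE[] SEC("license") = "GPL";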
Fixes: 51c39bb1d5d10 (bpf: Introduce function-by-function verification) Signed-off-by: Dmitrii Banshchikov Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20201113171756.90594-1-me@ubique.spb.ru --- kernel/bpf/verifier.c | 15 +++++++++++++-- .../selftests/bpf/prog_tests/test_global_funcs.c | 1 + tools/testing/selftests/bpf/progs/test_global_func8.c | 19 +++++++++++++++++++ 3 files changed, 33 insertions(+), 2 deletions(-) create mode 100644 tools/testing/selftests/bpf/progs/test_global_func8.c (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 6204ec705d80..1388bf733071 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -7786,9 +7786,11 @@ static int check_return_code(struct bpf_verifier_env *env) struct tnum range = tnum_range(0, 1); enum bpf_prog_type prog_type = resolve_prog_type(env->prog); int err; + const bool is_subprog = env->cur_state->frame[0]->subprogno; /* LSM and struct_ops func-ptr's return type could be "void" */ - if ((prog_type == BPF_PROG_TYPE_STRUCT_OPS || + if (!is_subprog && + (prog_type == BPF_PROG_TYPE_STRUCT_OPS || prog_type == BPF_PROG_TYPE_LSM) && !prog->aux->attach_func_proto->type) return 0; @@ -7808,6 +7810,16 @@ static int check_return_code(struct bpf_verifier_env *env) return -EACCES; } + reg = cur_regs(env) + BPF_REG_0; + if (is_subprog) { + if (reg->type != SCALAR_VALUE) { + verbose(env, "At subprogram exit the register R0 is not a scalar value (%s)\n", + reg_type_str[reg->type]); + return -EINVAL; + } + return 0; + } + switch (prog_type) { case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: if (env->prog->expected_attach_type == BPF_CGROUP_UDP4_RECVMSG || @@ -7861,7 +7873,6 @@ static int check_return_code(struct bpf_verifier_env *env) return 0; } - reg = cur_regs(env) + BPF_REG_0; if (reg->type != SCALAR_VALUE) { verbose(env, "At program exit the register R0 is not a known value (%s)\n", reg_type_str[reg->type]); diff --git a/tools/testing/selftests/bpf/prog_tests/test_global_funcs.c b/tools/testing/selftests/bpf/prog_tests/test_global_funcs.c index 193002b14d7f..32e4348b714b 100644 --- a/tools/testing/selftests/bpf/prog_tests/test_global_funcs.c +++ b/tools/testing/selftests/bpf/prog_tests/test_global_funcs.c @@ -60,6 +60,7 @@ void test_test_global_funcs(void) { "test_global_func5.o" , "expected pointer to ctx, but got PTR" }, { "test_global_func6.o" , "modified ctx ptr R2" }, { "test_global_func7.o" , "foo() doesn't return scalar" }, + { "test_global_func8.o" }, }; libbpf_print_fn_t old_print_fn = NULL; int err, i, duration = 0; diff --git a/tools/testing/selftests/bpf/progs/test_global_func8.c b/tools/testing/selftests/bpf/progs/test_global_func8.c new file mode 100644 index 000000000000..d55a6544b1ab --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_global_func8.c @@ -0,0 +1,19 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (c) 2020 Facebook */ +#include +#include +#include + +__noinline int foo(struct __sk_buff *skb) +{ + return bpf_get_prandom_u32(); +} + +SEC("cgroup_skb/ingress") +int test_cls(struct __sk_buff *skb) +{ + if (!foo(skb)) + return 0; + + return 1; +} -- cgit v1.2.3 From 8b92c4ff4423aa9900cf838d3294fcade4dbda35 Mon Sep 17 00:00:00 2001 From: Matteo Croce Date: Fri, 13 Nov 2020 22:52:02 -0800 Subject: Revert "kernel/reboot.c: convert simple_strtoul to kstrtoint" Patch series "fix parsing of reboot= cmdline", v3. 
The parsing of the reboot= cmdline has two major errors: - a missing bound check can crash the system on reboot - parsing of the cpu number only works if specified last Fix both. This patch (of 2): This reverts commit 616feab753972b97. kstrtoint() and simple_strtoul() have a subtle difference which makes them non interchangeable: if a non digit character is found amid the parsing, the former will return an error, while the latter will just stop parsing, e.g. simple_strtoul("123xyx") = 123. The kernel cmdline reboot= argument allows to specify the CPU used for rebooting, with the syntax `s####` among the other flags, e.g. "reboot=warm,s31,force", so if this flag is not the last given, it's silently ignored as well as the subsequent ones. Fixes: 616feab75397 ("kernel/reboot.c: convert simple_strtoul to kstrtoint") Signed-off-by: Matteo Croce Signed-off-by: Andrew Morton Cc: Guenter Roeck Cc: Petr Mladek Cc: Arnd Bergmann Cc: Mike Rapoport Cc: Kees Cook Cc: Pavel Tatashin Cc: Robin Holt Cc: Fabian Frederick Cc: Greg Kroah-Hartman Cc: Link: https://lkml.kernel.org/r/20201103214025.116799-2-mcroce@linux.microsoft.com Signed-off-by: Linus Torvalds --- kernel/reboot.c | 21 +++++++-------------- 1 file changed, 7 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/reboot.c b/kernel/reboot.c index e7b78d5ae1ab..8fbba433725e 100644 --- a/kernel/reboot.c +++ b/kernel/reboot.c @@ -551,22 +551,15 @@ static int __init reboot_setup(char *str) break; case 's': - { - int rc; - - if (isdigit(*(str+1))) { - rc = kstrtoint(str+1, 0, &reboot_cpu); - if (rc) - return rc; - } else if (str[1] == 'm' && str[2] == 'p' && - isdigit(*(str+3))) { - rc = kstrtoint(str+3, 0, &reboot_cpu); - if (rc) - return rc; - } else + if (isdigit(*(str+1))) + reboot_cpu = simple_strtoul(str+1, NULL, 0); + else if (str[1] == 'm' && str[2] == 'p' && + isdigit(*(str+3))) + reboot_cpu = simple_strtoul(str+3, NULL, 0); + else *mode = REBOOT_SOFT; break; - } + case 'g': *mode = REBOOT_GPIO; break; -- cgit v1.2.3 From df5b0ab3e08a156701b537809914b339b0daa526 Mon Sep 17 00:00:00 2001 From: Matteo Croce Date: Fri, 13 Nov 2020 22:52:07 -0800 Subject: reboot: fix overflow parsing reboot cpu number Limit the CPU number to num_possible_cpus(), because setting it to a value lower than INT_MAX but higher than NR_CPUS produces the following error on reboot and shutdown: BUG: unable to handle page fault for address: ffffffff90ab1bb0 #PF: supervisor read access in kernel mode #PF: error_code(0x0000) - not-present page PGD 1c09067 P4D 1c09067 PUD 1c0a063 PMD 0 Oops: 0000 [#1] SMP CPU: 1 PID: 1 Comm: systemd-shutdow Not tainted 5.9.0-rc8-kvm #110 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.13.0-2.fc32 04/01/2014 RIP: 0010:migrate_to_reboot_cpu+0xe/0x60 Code: ea ea 00 48 89 fa 48 c7 c7 30 57 f1 81 e9 fa ef ff ff 66 2e 0f 1f 84 00 00 00 00 00 53 8b 1d d5 ea ea 00 e8 14 33 fe ff 89 da <48> 0f a3 15 ea fc bd 00 48 89 d0 73 29 89 c2 c1 e8 06 65 48 8b 3c RSP: 0018:ffffc90000013e08 EFLAGS: 00010246 RAX: ffff88801f0a0000 RBX: 0000000077359400 RCX: 0000000000000000 RDX: 0000000077359400 RSI: 0000000000000002 RDI: ffffffff81c199e0 RBP: ffffffff81c1e3c0 R08: ffff88801f41f000 R09: ffffffff81c1e348 R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000000 R13: 00007f32bedf8830 R14: 00000000fee1dead R15: 0000000000000000 FS: 00007f32bedf8980(0000) GS:ffff88801f480000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: ffffffff90ab1bb0 CR3: 000000001d057000 CR4: 00000000000006a0 DR0: 
0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: __do_sys_reboot.cold+0x34/0x5b do_syscall_64+0x2d/0x40 Fixes: 1b3a5d02ee07 ("reboot: move arch/x86 reboot= handling to generic kernel") Signed-off-by: Matteo Croce Signed-off-by: Andrew Morton Cc: Arnd Bergmann Cc: Fabian Frederick Cc: Greg Kroah-Hartman Cc: Guenter Roeck Cc: Kees Cook Cc: Mike Rapoport Cc: Pavel Tatashin Cc: Petr Mladek Cc: Robin Holt Cc: Link: https://lkml.kernel.org/r/20201103214025.116799-3-mcroce@linux.microsoft.com Signed-off-by: Linus Torvalds --- kernel/reboot.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/reboot.c b/kernel/reboot.c index 8fbba433725e..af6f23d8bea1 100644 --- a/kernel/reboot.c +++ b/kernel/reboot.c @@ -558,6 +558,13 @@ static int __init reboot_setup(char *str) reboot_cpu = simple_strtoul(str+3, NULL, 0); else *mode = REBOOT_SOFT; + if (reboot_cpu >= num_possible_cpus()) { + pr_err("Ignoring the CPU number in reboot= option. " + "CPU %d exceeds possible cpu number %d\n", + reboot_cpu, num_possible_cpus()); + reboot_cpu = 0; + break; + } break; case 'g': -- cgit v1.2.3 From e7e046155af04cdca5e1157f28b07e1651eb317b Mon Sep 17 00:00:00 2001 From: Santosh Sivaraj Date: Fri, 13 Nov 2020 22:52:10 -0800 Subject: kernel/watchdog: fix watchdog_allowed_mask not used warning Define watchdog_allowed_mask only when SOFTLOCKUP_DETECTOR is enabled. Fixes: 7feeb9cd4f5b ("watchdog/sysctl: Clean up sysctl variable name space") Signed-off-by: Santosh Sivaraj Signed-off-by: Andrew Morton Reviewed-by: Petr Mladek Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/20201106015025.1281561-1-santosh@fossix.org Signed-off-by: Linus Torvalds --- kernel/watchdog.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 5abb5b22ad13..71109065bd8e 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -44,8 +44,6 @@ int __read_mostly soft_watchdog_user_enabled = 1; int __read_mostly watchdog_thresh = 10; static int __read_mostly nmi_watchdog_available; -static struct cpumask watchdog_allowed_mask __read_mostly; - struct cpumask watchdog_cpumask __read_mostly; unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask); @@ -162,6 +160,8 @@ static void lockup_detector_update_enable(void) int __read_mostly sysctl_softlockup_all_cpu_backtrace; #endif +static struct cpumask watchdog_allowed_mask __read_mostly; + /* Global variables, exported for sysctl */ unsigned int __read_mostly softlockup_panic = CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE; -- cgit v1.2.3 From 2f31ad64a9cce8b2409d2d4563482adfb8664082 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 13 Nov 2020 22:52:20 -0800 Subject: panic: don't dump stack twice on warn Before commit 3f388f28639f ("panic: dump registers on panic_on_warn"), __warn() was calling show_regs() when regs was not NULL, and show_stack() otherwise. 
After that commit, show_stack() is called regardless of whether show_regs() has been called or not, leading to duplicated Call Trace: ------------[ cut here ]------------ WARNING: CPU: 0 PID: 1 at arch/powerpc/mm/nohash/8xx.c:186 mmu_mark_initmem_nx+0x24/0x94 CPU: 0 PID: 1 Comm: swapper Not tainted 5.10.0-rc2-s3k-dev-01375-gf46ec0d3ecbd-dirty #4092 NIP: c00128b4 LR: c0010228 CTR: 00000000 REGS: c9023e40 TRAP: 0700 Not tainted (5.10.0-rc2-s3k-dev-01375-gf46ec0d3ecbd-dirty) MSR: 00029032 CR: 24000424 XER: 00000000 GPR00: c0010228 c9023ef8 c2100000 0074c000 ffffffff 00000000 c2151000 c07b3880 GPR08: ff000900 0074c000 c8000000 c33b53a8 24000822 00000000 c0003a20 00000000 GPR16: 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000 GPR24: 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00800000 NIP [c00128b4] mmu_mark_initmem_nx+0x24/0x94 LR [c0010228] free_initmem+0x20/0x58 Call Trace: free_initmem+0x20/0x58 kernel_init+0x1c/0x114 ret_from_kernel_thread+0x14/0x1c Instruction dump: 7d291850 7d234b78 4e800020 9421ffe0 7c0802a6 bfc10018 3fe0c060 3bff0000 3fff4080 3bffffff 90010024 57ff0010 <0fe00000> 392001cd 7c3e0b78 953e0008 CPU: 0 PID: 1 Comm: swapper Not tainted 5.10.0-rc2-s3k-dev-01375-gf46ec0d3ecbd-dirty #4092 Call Trace: __warn+0x8c/0xd8 (unreliable) report_bug+0x11c/0x154 program_check_exception+0x1dc/0x6e0 ret_from_except_full+0x0/0x4 --- interrupt: 700 at mmu_mark_initmem_nx+0x24/0x94 LR = free_initmem+0x20/0x58 free_initmem+0x20/0x58 kernel_init+0x1c/0x114 ret_from_kernel_thread+0x14/0x1c ---[ end trace 31702cd2a9570752 ]--- Only call show_stack() when regs is NULL. Fixes: 3f388f28639f ("panic: dump registers on panic_on_warn") Signed-off-by: Christophe Leroy Signed-off-by: Andrew Morton Cc: Alexey Kardashevskiy Cc: Kefeng Wang Link: https://lkml.kernel.org/r/e8c055458b080707f1bc1a98ff8bea79d0cec445.1604748361.git.christophe.leroy@csgroup.eu Signed-off-by: Linus Torvalds --- kernel/panic.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/panic.c b/kernel/panic.c index 396142ee43fd..332736a72a58 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -605,7 +605,8 @@ void __warn(const char *file, int line, void *caller, unsigned taint, panic("panic_on_warn set ...\n"); } - dump_stack(); + if (!regs) + dump_stack(); print_irqtrace_events(current); -- cgit v1.2.3 From f296dcd629aa412a80a53215e46087f53af87f08 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 14 Nov 2020 22:01:45 +0100 Subject: genirq: Remove GENERIC_IRQ_LEGACY_ALLOC_HWIRQ Commit bb9d812643d8 ("arch: remove tile port") removed the last user of this cruft two years ago... 
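[ Editorial sketch, not part of the patch: the interface removed below was already flagged as "legacy support ... use irq domains instead". A minimal sketch of the replacement path, with made-up names and a NULL firmware node purely for brevity: ]

#include <linux/irq.h>
#include <linux/irqdomain.h>

static struct irq_domain *my_domain;

static int my_setup(void)
{
        /* 32 linearly mapped interrupts, no firmware node in this sketch */
        my_domain = irq_domain_add_linear(NULL, 32, &irq_domain_simple_ops, NULL);
        if (!my_domain)
                return -ENOMEM;

        /* map hwirq 5 to a Linux virq; replaces irq_alloc_hwirqs()/arch_setup_hwirq() */
        return irq_create_mapping(my_domain, 5) ? 0 : -EINVAL;
}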
Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/87eekvac06.fsf@nanos.tec.linutronix.de --- include/linux/irq.h | 15 --------------- kernel/irq/Kconfig | 5 ----- kernel/irq/irqdesc.c | 51 --------------------------------------------------- 3 files changed, 71 deletions(-) (limited to 'kernel') diff --git a/include/linux/irq.h b/include/linux/irq.h index c54365309e97..79ce314a603b 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -954,21 +954,6 @@ static inline void irq_free_desc(unsigned int irq) irq_free_descs(irq, 1); } -#ifdef CONFIG_GENERIC_IRQ_LEGACY_ALLOC_HWIRQ -unsigned int irq_alloc_hwirqs(int cnt, int node); -static inline unsigned int irq_alloc_hwirq(int node) -{ - return irq_alloc_hwirqs(1, node); -} -void irq_free_hwirqs(unsigned int from, int cnt); -static inline void irq_free_hwirq(unsigned int irq) -{ - return irq_free_hwirqs(irq, 1); -} -int arch_setup_hwirq(unsigned int irq, int node); -void arch_teardown_hwirq(unsigned int irq); -#endif - #ifdef CONFIG_GENERIC_IRQ_LEGACY void irq_init_desc(unsigned int irq); #endif diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index 10a5aff4eecc..f2cda6b0057f 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig @@ -26,11 +26,6 @@ config GENERIC_IRQ_SHOW_LEVEL config GENERIC_IRQ_EFFECTIVE_AFF_MASK bool -# Facility to allocate a hardware interrupt. This is legacy support -# and should not be used in new code. Use irq domains instead. -config GENERIC_IRQ_LEGACY_ALLOC_HWIRQ - bool - # Support for delayed migration from interrupt context config GENERIC_PENDING_IRQ bool diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 1a7723604399..e810eb9906ea 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -810,57 +810,6 @@ unlock: } EXPORT_SYMBOL_GPL(__irq_alloc_descs); -#ifdef CONFIG_GENERIC_IRQ_LEGACY_ALLOC_HWIRQ -/** - * irq_alloc_hwirqs - Allocate an irq descriptor and initialize the hardware - * @cnt: number of interrupts to allocate - * @node: node on which to allocate - * - * Returns an interrupt number > 0 or 0, if the allocation fails. - */ -unsigned int irq_alloc_hwirqs(int cnt, int node) -{ - int i, irq = __irq_alloc_descs(-1, 0, cnt, node, NULL, NULL); - - if (irq < 0) - return 0; - - for (i = irq; cnt > 0; i++, cnt--) { - if (arch_setup_hwirq(i, node)) - goto err; - irq_clear_status_flags(i, _IRQ_NOREQUEST); - } - return irq; - -err: - for (i--; i >= irq; i--) { - irq_set_status_flags(i, _IRQ_NOREQUEST | _IRQ_NOPROBE); - arch_teardown_hwirq(i); - } - irq_free_descs(irq, cnt); - return 0; -} -EXPORT_SYMBOL_GPL(irq_alloc_hwirqs); - -/** - * irq_free_hwirqs - Free irq descriptor and cleanup the hardware - * @from: Free from irq number - * @cnt: number of interrupts to free - * - */ -void irq_free_hwirqs(unsigned int from, int cnt) -{ - int i, j; - - for (i = from, j = cnt; j > 0; i++, j--) { - irq_set_status_flags(i, _IRQ_NOREQUEST | _IRQ_NOPROBE); - arch_teardown_hwirq(i); - } - irq_free_descs(from, cnt); -} -EXPORT_SYMBOL_GPL(irq_free_hwirqs); -#endif - /** * irq_get_next_irq - get next allocated irq number * @offset: where to start the search -- cgit v1.2.3 From e906a546bd8653ed2e7a14cb300fd17952d7f862 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 14 Nov 2020 23:36:28 +0100 Subject: genirq/irqdomain: Make irq_domain_disassociate() static No users outside of the core code. 
Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/87a6vja7mb.fsf@nanos.tec.linutronix.de --- include/linux/irqdomain.h | 2 -- kernel/irq/irqdomain.c | 2 +- 2 files changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel') diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h index 77bf7d84c673..5701a8b01726 100644 --- a/include/linux/irqdomain.h +++ b/include/linux/irqdomain.h @@ -387,8 +387,6 @@ extern int irq_domain_associate(struct irq_domain *domain, unsigned int irq, extern void irq_domain_associate_many(struct irq_domain *domain, unsigned int irq_base, irq_hw_number_t hwirq_base, int count); -extern void irq_domain_disassociate(struct irq_domain *domain, - unsigned int irq); extern unsigned int irq_create_mapping(struct irq_domain *host, irq_hw_number_t hwirq); diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 9c9cb8829f7a..3d7463fd6453 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -496,7 +496,7 @@ static void irq_domain_set_mapping(struct irq_domain *domain, } } -void irq_domain_disassociate(struct irq_domain *domain, unsigned int irq) +static void irq_domain_disassociate(struct irq_domain *domain, unsigned int irq) { struct irq_data *irq_data = irq_get_irq_data(irq); irq_hw_number_t hwirq; -- cgit v1.2.3 From c4d51a52c67a1e3a0fa3006e5ec21cdc07649cd6 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Tue, 27 Oct 2020 14:39:43 +0000 Subject: sched/wait: Add add_wait_queue_priority() This allows an exclusive wait_queue_entry to be added at the head of the queue, instead of the tail as normal. Thus, it gets to consume events first without allowing non-exclusive waiters to be woken at all. The (first) intended use is for KVM IRQFD, which currently has inconsistent behaviour depending on whether posted interrupts are available or not. If they are, KVM will bypass the eventfd completely and deliver interrupts directly to the appropriate vCPU. If not, events are delivered through the eventfd and userspace will receive them when polling on the eventfd. By using add_wait_queue_priority(), KVM will be able to consistently consume events within the kernel without accidentally exposing them to userspace when they're supposed to be bypassed. This, in turn, means that userspace doesn't have to jump through hoops to avoid listening on the erroneously noisy eventfd and injecting duplicate interrupts. 
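[ Editorial sketch, not part of the patch: how an in-kernel consumer along the lines of the irqfd case described above could use the new primitive. Names are made up; what makes this exclusive-and-first is the WQ_FLAG_PRIORITY queueing plus a non-zero return from the wake function. ]

#include <linux/wait.h>

static struct wait_queue_entry my_wait;

static int my_priority_wake(struct wait_queue_entry *wq_entry, unsigned int mode,
                            int sync, void *key)
{
        /* consume the event entirely inside the kernel ... */

        /*
         * The entry was queued with WQ_FLAG_EXCLUSIVE | WQ_FLAG_PRIORITY, so
         * with the usual nr_exclusive of one a non-zero return ends the walk
         * before any non-exclusive waiters are woken.
         */
        return 1;
}

static void my_attach(struct wait_queue_head *wqh)
{
        init_waitqueue_func_entry(&my_wait, my_priority_wake);
        /* queued at the head of the list, ahead of every other waiter */
        add_wait_queue_priority(wqh, &my_wait);
}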
Signed-off-by: David Woodhouse Message-Id: <20201027143944.648769-2-dwmw2@infradead.org> Signed-off-by: Paolo Bonzini --- include/linux/wait.h | 12 +++++++++++- kernel/sched/wait.c | 17 ++++++++++++++++- 2 files changed, 27 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/include/linux/wait.h b/include/linux/wait.h index 27fb99cfeb02..fe10e8570a52 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -22,6 +22,7 @@ int default_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, int #define WQ_FLAG_BOOKMARK 0x04 #define WQ_FLAG_CUSTOM 0x08 #define WQ_FLAG_DONE 0x10 +#define WQ_FLAG_PRIORITY 0x20 /* * A single wait-queue entry structure: @@ -164,11 +165,20 @@ static inline bool wq_has_sleeper(struct wait_queue_head *wq_head) extern void add_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry); extern void add_wait_queue_exclusive(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry); +extern void add_wait_queue_priority(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry); extern void remove_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry); static inline void __add_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry) { - list_add(&wq_entry->entry, &wq_head->head); + struct list_head *head = &wq_head->head; + struct wait_queue_entry *wq; + + list_for_each_entry(wq, &wq_head->head, entry) { + if (!(wq->flags & WQ_FLAG_PRIORITY)) + break; + head = &wq->entry; + } + list_add(&wq_entry->entry, head); } /* diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index 01f5d3020589..183cc6ae68a6 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c @@ -37,6 +37,17 @@ void add_wait_queue_exclusive(struct wait_queue_head *wq_head, struct wait_queue } EXPORT_SYMBOL(add_wait_queue_exclusive); +void add_wait_queue_priority(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry) +{ + unsigned long flags; + + wq_entry->flags |= WQ_FLAG_EXCLUSIVE | WQ_FLAG_PRIORITY; + spin_lock_irqsave(&wq_head->lock, flags); + __add_wait_queue(wq_head, wq_entry); + spin_unlock_irqrestore(&wq_head->lock, flags); +} +EXPORT_SYMBOL_GPL(add_wait_queue_priority); + void remove_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq_entry) { unsigned long flags; @@ -57,7 +68,11 @@ EXPORT_SYMBOL(remove_wait_queue); /* * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve - * number) then we wake all the non-exclusive tasks and one exclusive task. + * number) then we wake that number of exclusive tasks, and potentially all + * the non-exclusive tasks. Normally, exclusive tasks will be at the end of + * the list and any non-exclusive tasks will be woken first. A priority task + * may be at the head of the list, and can consume the event without any other + * tasks being woken. * * There are circumstances in which we can try to wake a task which has already * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns -- cgit v1.2.3 From da88f9b3113620dcd30fc203236aa53d5430ee98 Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Wed, 4 Nov 2020 17:34:01 +0100 Subject: timer_list: Use printk format instead of open-coded symbol lookup Use the "%ps" printk format string to resolve symbol names. 
This works on all platforms, including ia64, ppc64 and parisc64 on which one needs to dereference pointers to function descriptors instead of function pointers. Signed-off-by: Helge Deller Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20201104163401.GA3984@ls3530.fritz.box --- kernel/time/timer_list.c | 66 ++++++++++++++---------------------------------- 1 file changed, 19 insertions(+), 47 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index acb326f5f50a..6939140ab7c5 100644 --- a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c @@ -42,24 +42,11 @@ static void SEQ_printf(struct seq_file *m, const char *fmt, ...) va_end(args); } -static void print_name_offset(struct seq_file *m, void *sym) -{ - char symname[KSYM_NAME_LEN]; - - if (lookup_symbol_name((unsigned long)sym, symname) < 0) - SEQ_printf(m, "<%pK>", sym); - else - SEQ_printf(m, "%s", symname); -} - static void print_timer(struct seq_file *m, struct hrtimer *taddr, struct hrtimer *timer, int idx, u64 now) { - SEQ_printf(m, " #%d: ", idx); - print_name_offset(m, taddr); - SEQ_printf(m, ", "); - print_name_offset(m, timer->function); + SEQ_printf(m, " #%d: <%pK>, %ps", idx, taddr, timer->function); SEQ_printf(m, ", S:%02x", timer->state); SEQ_printf(m, "\n"); SEQ_printf(m, " # expires at %Lu-%Lu nsecs [in %Ld to %Ld nsecs]\n", @@ -116,9 +103,7 @@ print_base(struct seq_file *m, struct hrtimer_clock_base *base, u64 now) SEQ_printf(m, " .resolution: %u nsecs\n", hrtimer_resolution); - SEQ_printf(m, " .get_time: "); - print_name_offset(m, base->get_time); - SEQ_printf(m, "\n"); + SEQ_printf(m, " .get_time: %ps\n", base->get_time); #ifdef CONFIG_HIGH_RES_TIMERS SEQ_printf(m, " .offset: %Lu nsecs\n", (unsigned long long) ktime_to_ns(base->offset)); @@ -218,42 +203,29 @@ print_tickdevice(struct seq_file *m, struct tick_device *td, int cpu) SEQ_printf(m, " next_event: %Ld nsecs\n", (unsigned long long) ktime_to_ns(dev->next_event)); - SEQ_printf(m, " set_next_event: "); - print_name_offset(m, dev->set_next_event); - SEQ_printf(m, "\n"); + SEQ_printf(m, " set_next_event: %ps\n", dev->set_next_event); - if (dev->set_state_shutdown) { - SEQ_printf(m, " shutdown: "); - print_name_offset(m, dev->set_state_shutdown); - SEQ_printf(m, "\n"); - } + if (dev->set_state_shutdown) + SEQ_printf(m, " shutdown: %ps\n", + dev->set_state_shutdown); - if (dev->set_state_periodic) { - SEQ_printf(m, " periodic: "); - print_name_offset(m, dev->set_state_periodic); - SEQ_printf(m, "\n"); - } + if (dev->set_state_periodic) + SEQ_printf(m, " periodic: %ps\n", + dev->set_state_periodic); - if (dev->set_state_oneshot) { - SEQ_printf(m, " oneshot: "); - print_name_offset(m, dev->set_state_oneshot); - SEQ_printf(m, "\n"); - } + if (dev->set_state_oneshot) + SEQ_printf(m, " oneshot: %ps\n", + dev->set_state_oneshot); - if (dev->set_state_oneshot_stopped) { - SEQ_printf(m, " oneshot stopped: "); - print_name_offset(m, dev->set_state_oneshot_stopped); - SEQ_printf(m, "\n"); - } + if (dev->set_state_oneshot_stopped) + SEQ_printf(m, " oneshot stopped: %ps\n", + dev->set_state_oneshot_stopped); - if (dev->tick_resume) { - SEQ_printf(m, " resume: "); - print_name_offset(m, dev->tick_resume); - SEQ_printf(m, "\n"); - } + if (dev->tick_resume) + SEQ_printf(m, " resume: %ps\n", + dev->tick_resume); - SEQ_printf(m, " event_handler: "); - print_name_offset(m, dev->event_handler); + SEQ_printf(m, " event_handler: %ps\n", dev->event_handler); SEQ_printf(m, "\n"); SEQ_printf(m, " retries: %lu\n", 
dev->retries); SEQ_printf(m, "\n"); -- cgit v1.2.3 From c725dafc95f1b37027840aaeaa8b7e4e9cd20516 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Tue, 3 Nov 2020 20:09:37 +0100 Subject: timers: Don't block on ->expiry_lock for TIMER_IRQSAFE timers PREEMPT_RT does not spin and wait until a running timer completes its callback but instead it blocks on a sleeping lock to prevent a livelock in the case that the task waiting for the callback completion preempted the callback. This cannot be done for timers flagged with TIMER_IRQSAFE. These timers can be canceled from an interrupt disabled context even on RT kernels. The expiry callback of such timers is invoked with interrupts disabled so there is no need to use the expiry lock mechanism because obviously the callback cannot be preempted even on RT kernels. Do not use the timer_base::expiry_lock mechanism when waiting for a running callback to complete if the timer is flagged with TIMER_IRQSAFE. Also add a lockdep assertion for RT kernels to validate that the expiry lock mechanism is always invoked in preemptible context. Reported-by: Mike Galbraith Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20201103190937.hga67rqhvknki3tp@linutronix.de --- kernel/time/timer.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/timer.c b/kernel/time/timer.c index de37e33a868d..af9ddfbd447d 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -1288,7 +1288,7 @@ static void del_timer_wait_running(struct timer_list *timer) u32 tf; tf = READ_ONCE(timer->flags); - if (!(tf & TIMER_MIGRATING)) { + if (!(tf & (TIMER_MIGRATING | TIMER_IRQSAFE))) { struct timer_base *base = get_timer_base(tf); /* @@ -1372,6 +1372,13 @@ int del_timer_sync(struct timer_list *timer) */ WARN_ON(in_irq() && !(timer->flags & TIMER_IRQSAFE)); + /* + * Must be able to sleep on PREEMPT_RT because of the slowpath in + * del_timer_wait_running(). + */ + if (IS_ENABLED(CONFIG_PREEMPT_RT) && !(timer->flags & TIMER_IRQSAFE)) + lockdep_assert_preemption_enabled(); + do { ret = try_to_del_timer_sync(timer); -- cgit v1.2.3 From a0f5a65fa5faeef708d022698d5fcba290a35856 Mon Sep 17 00:00:00 2001 From: Alex Shi Date: Fri, 13 Nov 2020 15:24:30 +0800 Subject: time: Add missing colons for parameter documentation of time64_to_tm() Address these kernel-doc warnings: kernel/time/timeconv.c:79: warning: Function parameter or member 'totalsecs' not described in 'time64_to_tm' kernel/time/timeconv.c:79: warning: Function parameter or member 'offset' not described in 'time64_to_tm' kernel/time/timeconv.c:79: warning: Function parameter or member 'result' not described in 'time64_to_tm' The parameters are described but lack colons after the parameter name. 
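For reference, kernel-doc only recognises a parameter description when the parameter name is followed by a colon; without it the parser emits the "Function parameter or member ... not described" warnings quoted above. A minimal sketch of the expected comment shape, using a hypothetical function name:

        /**
         * example_convert - illustrative kernel-doc block (hypothetical function)
         * @totalsecs: seconds elapsed since 00:00:00 on January 1, 1970 (UTC)
         * @offset:    seconds added to @totalsecs before conversion
         * @result:    pointer to the struct tm receiving the broken-down time
         */
        void example_convert(time64_t totalsecs, int offset, struct tm *result);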
[ tglx: Massaged changelog ] Signed-off-by: Alex Shi Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/1605252275-63652-1-git-send-email-alex.shi@linux.alibaba.com --- kernel/time/timeconv.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timeconv.c b/kernel/time/timeconv.c index 589e0a552129..62e3b46717a6 100644 --- a/kernel/time/timeconv.c +++ b/kernel/time/timeconv.c @@ -70,10 +70,10 @@ static const unsigned short __mon_yday[2][13] = { /** * time64_to_tm - converts the calendar time to local broken-down time * - * @totalsecs the number of seconds elapsed since 00:00:00 on January 1, 1970, + * @totalsecs: the number of seconds elapsed since 00:00:00 on January 1, 1970, * Coordinated Universal Time (UTC). - * @offset offset seconds adding to totalsecs. - * @result pointer to struct tm variable to receive broken-down time + * @offset: offset seconds adding to totalsecs. + * @result: pointer to struct tm variable to receive broken-down time */ void time64_to_tm(time64_t totalsecs, int offset, struct tm *result) { -- cgit v1.2.3 From 199d280c884de44c3b0daeb77438db43f6db01a2 Mon Sep 17 00:00:00 2001 From: Alex Shi Date: Fri, 13 Nov 2020 15:24:33 +0800 Subject: timekeeping: Remove static functions from kernel-doc markup Various static functions in the timekeeping code have function comments which pretend to be kernel-doc, but are incomplete and trigger parser warnings. As these functions are local to the timekeeping core code there is no need to expose them via kernel-doc. Remove the double star kernel-doc marker and remove excess newlines. [ tglx: Massaged changelog and removed excess newlines ] Signed-off-by: Alex Shi Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/1605252275-63652-4-git-send-email-alex.shi@linux.alibaba.com --- kernel/time/timekeeping.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 6858a31364b6..570fc500d263 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -1415,9 +1415,8 @@ void timekeeping_warp_clock(void) } } -/** +/* * __timekeeping_set_tai_offset - Sets the TAI offset from UTC and monotonic - * */ static void __timekeeping_set_tai_offset(struct timekeeper *tk, s32 tai_offset) { @@ -1425,7 +1424,7 @@ static void __timekeeping_set_tai_offset(struct timekeeper *tk, s32 tai_offset) tk->offs_tai = ktime_add(tk->offs_real, ktime_set(tai_offset, 0)); } -/** +/* * change_clocksource - Swaps clocksources if a new one is available * * Accumulates current time interval and initializes new clocksource @@ -2023,13 +2022,12 @@ static void timekeeping_adjust(struct timekeeper *tk, s64 offset) } } -/** +/* * accumulate_nsecs_to_secs - Accumulates nsecs into secs * * Helper function that accumulates the nsecs greater than a second * from the xtime_nsec field to the xtime_secs field. * It also calls into the NTP code to handle leapsecond processing. 
- * */ static inline unsigned int accumulate_nsecs_to_secs(struct timekeeper *tk) { @@ -2071,7 +2069,7 @@ static inline unsigned int accumulate_nsecs_to_secs(struct timekeeper *tk) return clock_set; } -/** +/* * logarithmic_accumulation - shifted accumulation of cycles * * This functions accumulates a shifted interval of cycles into @@ -2314,7 +2312,7 @@ ktime_t ktime_get_update_offsets_now(unsigned int *cwsseq, ktime_t *offs_real, return base; } -/** +/* * timekeeping_validate_timex - Ensures the timex is ok for use in do_adjtimex */ static int timekeeping_validate_timex(const struct __kernel_timex *txc) -- cgit v1.2.3 From e025b03113d27139ce2b28b82599018e4d8fa5f6 Mon Sep 17 00:00:00 2001 From: Alex Shi Date: Fri, 13 Nov 2020 15:24:31 +0800 Subject: timekeeping: Add missing parameter documentation for update_fast_timekeeper() Address the following warning: kernel/time/timekeeping.c:415: warning: Function parameter or member 'tkf' not described in 'update_fast_timekeeper' [ tglx: Remove the bogus ktime_get_mono_fast_ns() part ] Signed-off-by: Alex Shi Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/1605252275-63652-2-git-send-email-alex.shi@linux.alibaba.com --- kernel/time/timekeeping.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 570fc500d263..a823703c905e 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -407,6 +407,7 @@ static inline u64 timekeeping_cycles_to_ns(const struct tk_read_base *tkr, u64 c /** * update_fast_timekeeper - Update the fast and NMI safe monotonic timekeeper. * @tkr: Timekeeping readout base from which we take the update + * @tkf: Pointer to NMI safe timekeeper * * We want to use this from any context including NMI and tracing / * instrumenting the timekeeping code itself. -- cgit v1.2.3 From c1ce406e80fb15fa52b2b48dfd48fad6f3d2a32f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 15 Nov 2020 21:09:31 +0100 Subject: timekeeping: Fix up function documentation for the NMI safe accessors Alex reported the following warning: kernel/time/timekeeping.c:464: warning: Function parameter or member 'tkf' not described in '__ktime_get_fast_ns' which is not entirely correct because the documented function is ktime_get_mono_fast_ns() which does not have a parameter, but the kernel-doc parser looks at the function declaration which follows the comment and complains about the missing parameter documentation. Aside of that the documentation for the rest of the NMI safe accessors is either incomplete or missing. 
- Move the function documentation to the right place - Fixup the references and inconsistencies - Add the missing documentation for ktime_get_raw_fast_ns() Reported-by: Alex Shi Signed-off-by: Thomas Gleixner --- kernel/time/timekeeping.c | 58 +++++++++++++++++++++++++++-------------------- 1 file changed, 33 insertions(+), 25 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index a823703c905e..ab4b83186331 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -437,6 +437,27 @@ static void update_fast_timekeeper(const struct tk_read_base *tkr, memcpy(base + 1, base, sizeof(*base)); } +static __always_inline u64 __ktime_get_fast_ns(struct tk_fast *tkf) +{ + struct tk_read_base *tkr; + unsigned int seq; + u64 now; + + do { + seq = raw_read_seqcount_latch(&tkf->seq); + tkr = tkf->base + (seq & 0x01); + now = ktime_to_ns(tkr->base); + + now += timekeeping_delta_to_ns(tkr, + clocksource_delta( + tk_clock_read(tkr), + tkr->cycle_last, + tkr->mask)); + } while (read_seqcount_latch_retry(&tkf->seq, seq)); + + return now; +} + /** * ktime_get_mono_fast_ns - Fast NMI safe access to clock monotonic * @@ -463,39 +484,24 @@ static void update_fast_timekeeper(const struct tk_read_base *tkr, * * So reader 6 will observe time going backwards versus reader 5. * - * While other CPUs are likely to be able observe that, the only way + * While other CPUs are likely to be able to observe that, the only way * for a CPU local observation is when an NMI hits in the middle of * the update. Timestamps taken from that NMI context might be ahead * of the following timestamps. Callers need to be aware of that and * deal with it. */ -static __always_inline u64 __ktime_get_fast_ns(struct tk_fast *tkf) -{ - struct tk_read_base *tkr; - unsigned int seq; - u64 now; - - do { - seq = raw_read_seqcount_latch(&tkf->seq); - tkr = tkf->base + (seq & 0x01); - now = ktime_to_ns(tkr->base); - - now += timekeeping_delta_to_ns(tkr, - clocksource_delta( - tk_clock_read(tkr), - tkr->cycle_last, - tkr->mask)); - } while (read_seqcount_latch_retry(&tkf->seq, seq)); - - return now; -} - u64 ktime_get_mono_fast_ns(void) { return __ktime_get_fast_ns(&tk_fast_mono); } EXPORT_SYMBOL_GPL(ktime_get_mono_fast_ns); +/** + * ktime_get_raw_fast_ns - Fast NMI safe access to clock monotonic raw + * + * Contrary to ktime_get_mono_fast_ns() this is always correct because the + * conversion factor is not affected by NTP/PTP correction. + */ u64 ktime_get_raw_fast_ns(void) { return __ktime_get_fast_ns(&tk_fast_raw); @@ -522,6 +528,9 @@ EXPORT_SYMBOL_GPL(ktime_get_raw_fast_ns); * (2) On 32-bit systems, the 64-bit boot offset (tk->offs_boot) may be * partially updated. Since the tk->offs_boot update is a rare event, this * should be a rare occurrence which postprocessing should be able to handle. + * + * The caveats vs. timestamp ordering as documented for ktime_get_fast_ns() + * apply as well. */ u64 notrace ktime_get_boot_fast_ns(void) { @@ -531,9 +540,6 @@ u64 notrace ktime_get_boot_fast_ns(void) } EXPORT_SYMBOL_GPL(ktime_get_boot_fast_ns); -/* - * See comment for __ktime_get_fast_ns() vs. timestamp ordering - */ static __always_inline u64 __ktime_get_real_fast(struct tk_fast *tkf, u64 *mono) { struct tk_read_base *tkr; @@ -558,6 +564,8 @@ static __always_inline u64 __ktime_get_real_fast(struct tk_fast *tkf, u64 *mono) /** * ktime_get_real_fast_ns: - NMI safe and fast access to clock realtime. + * + * See ktime_get_fast_ns() for documentation of the time stamp ordering. 
*/ u64 ktime_get_real_fast_ns(void) { -- cgit v1.2.3 From f27f7c3f100e74a7f451a63a15788f50c52f7cce Mon Sep 17 00:00:00 2001 From: Alex Shi Date: Fri, 13 Nov 2020 15:24:32 +0800 Subject: timekeeping: Add missing parameter docs for pvclock_gtod_[un]register_notifier() The kernel-doc parser complains about: kernel/time/timekeeping.c:651: warning: Function parameter or member 'nb' not described in 'pvclock_gtod_register_notifier' kernel/time/timekeeping.c:670: warning: Function parameter or member 'nb' not described in 'pvclock_gtod_unregister_notifier' Add the missing parameter explanations. [ tglx: Massaged changelog ] Signed-off-by: Alex Shi Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/1605252275-63652-3-git-send-email-alex.shi@linux.alibaba.com --- kernel/time/timekeeping.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index ab4b83186331..9c9392360ade 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -663,6 +663,7 @@ static void update_pvclock_gtod(struct timekeeper *tk, bool was_set) /** * pvclock_gtod_register_notifier - register a pvclock timedata update listener + * @nb: Pointer to the notifier block to register */ int pvclock_gtod_register_notifier(struct notifier_block *nb) { @@ -682,6 +683,7 @@ EXPORT_SYMBOL_GPL(pvclock_gtod_register_notifier); /** * pvclock_gtod_unregister_notifier - unregister a pvclock * timedata update listener + * @nb: Pointer to the notifier block to unregister */ int pvclock_gtod_unregister_notifier(struct notifier_block *nb) { -- cgit v1.2.3 From 29efc4612ac1b888e65da408b41dafa4dd00842f Mon Sep 17 00:00:00 2001 From: Alex Shi Date: Fri, 13 Nov 2020 15:24:35 +0800 Subject: timekeeping: Fix parameter docs of read_persistent_wall_and_boot_offset() Address the following kernel-doc markup warnings: kernel/time/timekeeping.c:1563: warning: Function parameter or member 'wall_time' not described in 'read_persistent_wall_and_boot_offset' kernel/time/timekeeping.c:1563: warning: Function parameter or member 'boot_offset' not described in 'read_persistent_wall_and_boot_offset' The parameters are described but miss the leading '@' and the colon after the parameter names. [ tglx: Massaged changelog ] Signed-off-by: Alex Shi Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/1605252275-63652-6-git-send-email-alex.shi@linux.alibaba.com --- kernel/time/timekeeping.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 9c9392360ade..75cba958ae29 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -1576,8 +1576,9 @@ void __weak read_persistent_clock64(struct timespec64 *ts) * from the boot. * * Weak dummy function for arches that do not yet support it. - * wall_time - current time as returned by persistent clock - * boot_offset - offset that is defined as wall_time - boot_time + * @wall_time: - current time as returned by persistent clock + * @boot_offset: - offset that is defined as wall_time - boot_time + * * The default function calculates offset based on the current value of * local_clock(). 
This way architectures that support sched_clock() but don't * support dedicated boot time clock will provide the best estimate of the -- cgit v1.2.3 From 6e5a91901c2dff3a0f2eb9f10e427dce2b0488fc Mon Sep 17 00:00:00 2001 From: Alex Shi Date: Fri, 13 Nov 2020 15:24:34 +0800 Subject: timekeeping: Address parameter documentation issues for various functions The kernel-doc parser complains: kernel/time/timekeeping.c:1543: warning: Function parameter or member 'ts' not described in 'read_persistent_clock64' kernel/time/timekeeping.c:764: warning: Function parameter or member 'tk' not described in 'timekeeping_forward_now' kernel/time/timekeeping.c:1331: warning: Function parameter or member 'ts' not described in 'timekeeping_inject_offset' kernel/time/timekeeping.c:1331: warning: Excess function parameter 'tv' description in 'timekeeping_inject_offset' Add the missing parameter documentations and rename the 'tv' parameter of timekeeping_inject_offset() to 'ts' so it matches the implemention. [ tglx: Reworded a few docs and massaged changelog ] Signed-off-by: Alex Shi Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/1605252275-63652-5-git-send-email-alex.shi@linux.alibaba.com --- kernel/time/timekeeping.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 75cba958ae29..74503c0151e5 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -774,6 +774,7 @@ static void timekeeping_update(struct timekeeper *tk, unsigned int action) /** * timekeeping_forward_now - update clock to the current time + * @tk: Pointer to the timekeeper to update * * Forward the current clock to update its state since the last call to * update_wall_time(). This is useful before significant clock changes, @@ -1350,7 +1351,7 @@ EXPORT_SYMBOL(do_settimeofday64); /** * timekeeping_inject_offset - Adds or subtracts from the current time. - * @tv: pointer to the timespec variable containing the offset + * @ts: Pointer to the timespec variable containing the offset * * Adds or subtracts an offset value from the current time. */ @@ -1558,6 +1559,7 @@ u64 timekeeping_max_deferment(void) /** * read_persistent_clock64 - Return time from the persistent clock. + * @ts: Pointer to the storage for the readout value * * Weak dummy function for arches that do not yet support it. * Reads the time from the battery backed persistent clock. @@ -1663,7 +1665,8 @@ static struct timespec64 timekeeping_suspend_time; /** * __timekeeping_inject_sleeptime - Internal function to add sleep interval - * @delta: pointer to a timespec delta value + * @tk: Pointer to the timekeeper to be updated + * @delta: Pointer to the delta value in timespec64 format * * Takes a timespec offset measuring a suspend interval and properly * adds the sleep offset to the timekeeping variables. 
-- cgit v1.2.3 From 78a56e0494ad29feccd4c54c2b5682721f8cb988 Mon Sep 17 00:00:00 2001 From: Ira Weiny Date: Wed, 4 Nov 2020 15:01:57 -0800 Subject: entry: Fix spelling/typo errors in irq entry code s/reguired/required/ s/Interupts/Interrupts/ s/quiescient/quiescent/ s/assemenbly/assembly/ Signed-off-by: Ira Weiny Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20201104230157.3378023-1-ira.weiny@intel.com --- include/linux/entry-common.h | 4 ++-- kernel/entry/common.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h index 1a128baf3628..aab549026ab8 100644 --- a/include/linux/entry-common.h +++ b/include/linux/entry-common.h @@ -415,7 +415,7 @@ void irqentry_exit_cond_resched(void); * @state: Return value from matching call to irqentry_enter() * * Depending on the return target (kernel/user) this runs the necessary - * preemption and work checks if possible and reguired and returns to + * preemption and work checks if possible and required and returns to * the caller with interrupts disabled and no further work pending. * * This is the last action before returning to the low level ASM code which @@ -438,7 +438,7 @@ irqentry_state_t noinstr irqentry_nmi_enter(struct pt_regs *regs); * @regs: Pointer to pt_regs (NMI entry regs) * @irq_state: Return value from matching call to irqentry_nmi_enter() * - * Last action before returning to the low level assmenbly code. + * Last action before returning to the low level assembly code. * * Counterpart to irqentry_nmi_enter(). */ diff --git a/kernel/entry/common.c b/kernel/entry/common.c index bc75c114c1b3..fa17baadf63e 100644 --- a/kernel/entry/common.c +++ b/kernel/entry/common.c @@ -304,7 +304,7 @@ noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs) * If this entry hit the idle task invoke rcu_irq_enter() whether * RCU is watching or not. * - * Interupts can nest when the first interrupt invokes softirq + * Interrupts can nest when the first interrupt invokes softirq * processing on return which enables interrupts. * * Scheduler ticks in the idle task can mark quiescent state and @@ -315,7 +315,7 @@ noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs) * interrupt to invoke rcu_irq_enter(). If that nested interrupt is * the tick then rcu_flavor_sched_clock_irq() would wrongfully * assume that it is the first interupt and eventually claim - * quiescient state and end grace periods prematurely. + * quiescent state and end grace periods prematurely. * * Unconditionally invoke rcu_irq_enter() so RCU state stays * consistent. -- cgit v1.2.3 From cc947f2b9c04113d84eeef67cc7c6326e1982019 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 16 Nov 2020 10:53:38 +0100 Subject: timers: Make run_local_timers() static No users outside of the timer code. Move the caller below this function to avoid a pointless forward declaration. 
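As a general C aside (a hedged illustration, not code from this patch): once a function is static and only called from later in the same file, defining it above its caller removes the need for a forward declaration:

        static void run_work(void)              /* hypothetical helper */
        {
                /* ... */
        }

        void tick_handler(void)                 /* hypothetical caller, defined below the helper */
        {
                run_work();                     /* no "static void run_work(void);" line required */
        }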
Signed-off-by: Thomas Gleixner --- include/linux/timer.h | 1 - kernel/time/timer.c | 48 ++++++++++++++++++++++++------------------------ 2 files changed, 24 insertions(+), 25 deletions(-) (limited to 'kernel') diff --git a/include/linux/timer.h b/include/linux/timer.h index d10bc7e73b41..fda13c9d1256 100644 --- a/include/linux/timer.h +++ b/include/linux/timer.h @@ -193,7 +193,6 @@ extern int try_to_del_timer_sync(struct timer_list *timer); #define del_singleshot_timer_sync(t) del_timer_sync(t) extern void init_timers(void); -extern void run_local_timers(void); struct hrtimer; extern enum hrtimer_restart it_real_fn(struct hrtimer *); diff --git a/kernel/time/timer.c b/kernel/time/timer.c index af9ddfbd447d..ebf3b26d2501 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -1705,29 +1705,6 @@ void timer_clear_idle(void) } #endif -/* - * Called from the timer interrupt handler to charge one tick to the current - * process. user_tick is 1 if the tick is user time, 0 for system. - */ -void update_process_times(int user_tick) -{ - struct task_struct *p = current; - - PRANDOM_ADD_NOISE(jiffies, user_tick, p, 0); - - /* Note: this timer irq context must be accounted for as well. */ - account_process_tick(p, user_tick); - run_local_timers(); - rcu_sched_clock_irq(user_tick); -#ifdef CONFIG_IRQ_WORK - if (in_irq()) - irq_work_tick(); -#endif - scheduler_tick(); - if (IS_ENABLED(CONFIG_POSIX_TIMERS)) - run_posix_cpu_timers(); -} - /** * __run_timers - run all expired timers (if any) on this CPU. * @base: the timer vector to be processed. @@ -1777,7 +1754,7 @@ static __latent_entropy void run_timer_softirq(struct softirq_action *h) /* * Called by the local, per-CPU timer interrupt on SMP. */ -void run_local_timers(void) +static void run_local_timers(void) { struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]); @@ -1794,6 +1771,29 @@ void run_local_timers(void) raise_softirq(TIMER_SOFTIRQ); } +/* + * Called from the timer interrupt handler to charge one tick to the current + * process. user_tick is 1 if the tick is user time, 0 for system. + */ +void update_process_times(int user_tick) +{ + struct task_struct *p = current; + + PRANDOM_ADD_NOISE(jiffies, user_tick, p, 0); + + /* Note: this timer irq context must be accounted for as well. */ + account_process_tick(p, user_tick); + run_local_timers(); + rcu_sched_clock_irq(user_tick); +#ifdef CONFIG_IRQ_WORK + if (in_irq()) + irq_work_tick(); +#endif + scheduler_tick(); + if (IS_ENABLED(CONFIG_POSIX_TIMERS)) + run_posix_cpu_timers(); +} + /* * Since schedule_timeout()'s timer is defined on the stack, it must store * the target task on the stack as well. -- cgit v1.2.3 From 66981c37b3199d293c58f84cf2366e86a06e1a3d Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Mon, 16 Nov 2020 11:18:14 +0100 Subject: hrtimer: Fix kernel-doc markups The hrtimer_get_remaining() markup is documenting, instead, __hrtimer_get_remaining(), as it is placed at the C file. In order to properly document it, a kernel-doc markup is needed together with the function prototype. So, add a new one, while preserving the existing one, just fixing the function name. The hrtimer_is_queued prototype has a typo: it is using '=' instead of '-' to split: identifier - description as required by kernel-doc markup. 
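Two kernel-doc conventions are in play here, sketched below with hypothetical names: the summary line separates identifier and description with a dash, and a static inline wrapper in a header carries its own kernel-doc block next to its prototype instead of borrowing the comment of the internal function it wraps:

        extern int __example_query(const struct example *e, bool adjust);

        /**
         * example_query - query the (hypothetical) example object
         * @e: the object to query
         *
         * Returns: the current value of @e
         */
        static inline int example_query(const struct example *e)
        {
                return __example_query(e, false);
        }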
Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/9dc87808c2fd07b7e050bafcd033c5ef05808fea.1605521731.git.mchehab+huawei@kernel.org --- include/linux/hrtimer.h | 6 +++++- kernel/time/hrtimer.c | 2 +- 2 files changed, 6 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index 107cedd7019a..bb5e7b0a4274 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -447,6 +447,10 @@ static inline void hrtimer_restart(struct hrtimer *timer) /* Query timers: */ extern ktime_t __hrtimer_get_remaining(const struct hrtimer *timer, bool adjust); +/** + * hrtimer_get_remaining - get remaining time for the timer + * @timer: the timer to read + */ static inline ktime_t hrtimer_get_remaining(const struct hrtimer *timer) { return __hrtimer_get_remaining(timer, false); @@ -458,7 +462,7 @@ extern u64 hrtimer_next_event_without(const struct hrtimer *exclude); extern bool hrtimer_active(const struct hrtimer *timer); /** - * hrtimer_is_queued = check, whether the timer is on one of the queues + * hrtimer_is_queued - check, whether the timer is on one of the queues * @timer: Timer to check * * Returns: True if the timer is queued, false otherwise diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 3624b9b5835d..61c39ff68439 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -1289,7 +1289,7 @@ int hrtimer_cancel(struct hrtimer *timer) EXPORT_SYMBOL_GPL(hrtimer_cancel); /** - * hrtimer_get_remaining - get remaining time for the timer + * __hrtimer_get_remaining - get remaining time for the timer * @timer: the timer to read * @adjust: adjust relative timers when CONFIG_TIME_LOW_RES=y */ -- cgit v1.2.3 From 8c67d247dcad67fbdd07c8bab9818d0b8d9240bf Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Mon, 16 Nov 2020 11:18:15 +0100 Subject: genirq: Fix kernel-doc markups Some identifiers have different names between their prototypes and the kernel-doc markup. 
Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/13a44f4f0c3135e14b16ae8fcce4af1eab27cb5f.1605521731.git.mchehab+huawei@kernel.org --- kernel/irq/chip.c | 2 +- kernel/irq/generic-chip.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index b9b9618e1aca..df75c3573dcb 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -61,7 +61,7 @@ int irq_set_chip(unsigned int irq, struct irq_chip *chip) EXPORT_SYMBOL(irq_set_chip); /** - * irq_set_type - set the irq trigger type for an irq + * irq_set_irq_type - set the irq trigger type for an irq * @irq: irq number * @type: IRQ_TYPE_{LEVEL,EDGE}_* value - see include/linux/irq.h */ diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c index e2999a070a99..a23ac2bbf433 100644 --- a/kernel/irq/generic-chip.c +++ b/kernel/irq/generic-chip.c @@ -269,7 +269,7 @@ irq_gc_init_mask_cache(struct irq_chip_generic *gc, enum irq_gc_flags flags) } /** - * __irq_alloc_domain_generic_chip - Allocate generic chips for an irq domain + * __irq_alloc_domain_generic_chips - Allocate generic chips for an irq domain * @d: irq domain for which to allocate chips * @irqs_per_chip: Number of interrupts each chip handles (max 32) * @num_ct: Number of irq_chip_type instances associated with this -- cgit v1.2.3 From 872f690341948b502c93318f806d821c56772c42 Mon Sep 17 00:00:00 2001 From: Francis Laniel Date: Sun, 15 Nov 2020 18:08:06 +0100 Subject: treewide: rename nla_strlcpy to nla_strscpy. Calls to nla_strlcpy are now replaced by calls to nla_strscpy which is the new name of this function. Signed-off-by: Francis Laniel Reviewed-by: Kees Cook Signed-off-by: Jakub Kicinski --- drivers/infiniband/core/nldev.c | 10 +++++----- drivers/net/can/vxcan.c | 4 ++-- drivers/net/veth.c | 4 ++-- include/linux/genl_magic_struct.h | 2 +- include/net/netlink.h | 4 ++-- include/net/pkt_cls.h | 2 +- kernel/taskstats.c | 2 +- lib/nlattr.c | 6 +++--- net/core/fib_rules.c | 4 ++-- net/core/rtnetlink.c | 12 ++++++------ net/decnet/dn_dev.c | 2 +- net/ieee802154/nl-mac.c | 2 +- net/ipv4/devinet.c | 2 +- net/ipv4/fib_semantics.c | 2 +- net/ipv4/metrics.c | 2 +- net/netfilter/ipset/ip_set_hash_netiface.c | 4 ++-- net/netfilter/nf_tables_api.c | 6 +++--- net/netfilter/nfnetlink_acct.c | 2 +- net/netfilter/nfnetlink_cthelper.c | 4 ++-- net/netfilter/nft_ct.c | 2 +- net/netfilter/nft_log.c | 2 +- net/netlabel/netlabel_mgmt.c | 2 +- net/nfc/netlink.c | 2 +- net/sched/act_api.c | 2 +- net/sched/act_ipt.c | 2 +- net/sched/act_simple.c | 4 ++-- net/sched/cls_api.c | 2 +- net/sched/sch_api.c | 2 +- net/tipc/netlink_compat.c | 2 +- 29 files changed, 49 insertions(+), 49 deletions(-) (limited to 'kernel') diff --git a/drivers/infiniband/core/nldev.c b/drivers/infiniband/core/nldev.c index 12d29d54a081..08366e254b1d 100644 --- a/drivers/infiniband/core/nldev.c +++ b/drivers/infiniband/core/nldev.c @@ -932,7 +932,7 @@ static int nldev_set_doit(struct sk_buff *skb, struct nlmsghdr *nlh, if (tb[RDMA_NLDEV_ATTR_DEV_NAME]) { char name[IB_DEVICE_NAME_MAX] = {}; - nla_strlcpy(name, tb[RDMA_NLDEV_ATTR_DEV_NAME], + nla_strscpy(name, tb[RDMA_NLDEV_ATTR_DEV_NAME], IB_DEVICE_NAME_MAX); if (strlen(name) == 0) { err = -EINVAL; @@ -1529,13 +1529,13 @@ static int nldev_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, !tb[RDMA_NLDEV_ATTR_LINK_TYPE] || !tb[RDMA_NLDEV_ATTR_NDEV_NAME]) return -EINVAL; - nla_strlcpy(ibdev_name, tb[RDMA_NLDEV_ATTR_DEV_NAME], + nla_strscpy(ibdev_name, 
tb[RDMA_NLDEV_ATTR_DEV_NAME], sizeof(ibdev_name)); if (strchr(ibdev_name, '%') || strlen(ibdev_name) == 0) return -EINVAL; - nla_strlcpy(type, tb[RDMA_NLDEV_ATTR_LINK_TYPE], sizeof(type)); - nla_strlcpy(ndev_name, tb[RDMA_NLDEV_ATTR_NDEV_NAME], + nla_strscpy(type, tb[RDMA_NLDEV_ATTR_LINK_TYPE], sizeof(type)); + nla_strscpy(ndev_name, tb[RDMA_NLDEV_ATTR_NDEV_NAME], sizeof(ndev_name)); ndev = dev_get_by_name(sock_net(skb->sk), ndev_name); @@ -1602,7 +1602,7 @@ static int nldev_get_chardev(struct sk_buff *skb, struct nlmsghdr *nlh, if (err || !tb[RDMA_NLDEV_ATTR_CHARDEV_TYPE]) return -EINVAL; - nla_strlcpy(client_name, tb[RDMA_NLDEV_ATTR_CHARDEV_TYPE], + nla_strscpy(client_name, tb[RDMA_NLDEV_ATTR_CHARDEV_TYPE], sizeof(client_name)); if (tb[RDMA_NLDEV_ATTR_DEV_INDEX]) { diff --git a/drivers/net/can/vxcan.c b/drivers/net/can/vxcan.c index d6ba9426be4d..fa47bab510bb 100644 --- a/drivers/net/can/vxcan.c +++ b/drivers/net/can/vxcan.c @@ -186,7 +186,7 @@ static int vxcan_newlink(struct net *net, struct net_device *dev, } if (ifmp && tbp[IFLA_IFNAME]) { - nla_strlcpy(ifname, tbp[IFLA_IFNAME], IFNAMSIZ); + nla_strscpy(ifname, tbp[IFLA_IFNAME], IFNAMSIZ); name_assign_type = NET_NAME_USER; } else { snprintf(ifname, IFNAMSIZ, DRV_NAME "%%d"); @@ -223,7 +223,7 @@ static int vxcan_newlink(struct net *net, struct net_device *dev, /* register first device */ if (tb[IFLA_IFNAME]) - nla_strlcpy(dev->name, tb[IFLA_IFNAME], IFNAMSIZ); + nla_strscpy(dev->name, tb[IFLA_IFNAME], IFNAMSIZ); else snprintf(dev->name, IFNAMSIZ, DRV_NAME "%%d"); diff --git a/drivers/net/veth.c b/drivers/net/veth.c index 8c737668008a..359d3ab33c4d 100644 --- a/drivers/net/veth.c +++ b/drivers/net/veth.c @@ -1329,7 +1329,7 @@ static int veth_newlink(struct net *src_net, struct net_device *dev, } if (ifmp && tbp[IFLA_IFNAME]) { - nla_strlcpy(ifname, tbp[IFLA_IFNAME], IFNAMSIZ); + nla_strscpy(ifname, tbp[IFLA_IFNAME], IFNAMSIZ); name_assign_type = NET_NAME_USER; } else { snprintf(ifname, IFNAMSIZ, DRV_NAME "%%d"); @@ -1379,7 +1379,7 @@ static int veth_newlink(struct net *src_net, struct net_device *dev, eth_hw_addr_random(dev); if (tb[IFLA_IFNAME]) - nla_strlcpy(dev->name, tb[IFLA_IFNAME], IFNAMSIZ); + nla_strscpy(dev->name, tb[IFLA_IFNAME], IFNAMSIZ); else snprintf(dev->name, IFNAMSIZ, DRV_NAME "%%d"); diff --git a/include/linux/genl_magic_struct.h b/include/linux/genl_magic_struct.h index eeae59d3ceb7..35d21fddaf2d 100644 --- a/include/linux/genl_magic_struct.h +++ b/include/linux/genl_magic_struct.h @@ -89,7 +89,7 @@ static inline int nla_put_u64_0pad(struct sk_buff *skb, int attrtype, u64 value) nla_get_u64, nla_put_u64_0pad, false) #define __str_field(attr_nr, attr_flag, name, maxlen) \ __array(attr_nr, attr_flag, name, NLA_NUL_STRING, char, maxlen, \ - nla_strlcpy, nla_put, false) + nla_strscpy, nla_put, false) #define __bin_field(attr_nr, attr_flag, name, maxlen) \ __array(attr_nr, attr_flag, name, NLA_BINARY, char, maxlen, \ nla_memcpy, nla_put, false) diff --git a/include/net/netlink.h b/include/net/netlink.h index 446ca182e13d..1ceec518ab49 100644 --- a/include/net/netlink.h +++ b/include/net/netlink.h @@ -142,7 +142,7 @@ * Attribute Misc: * nla_memcpy(dest, nla, count) copy attribute into memory * nla_memcmp(nla, data, size) compare attribute with memory area - * nla_strlcpy(dst, nla, size) copy attribute to a sized string + * nla_strscpy(dst, nla, size) copy attribute to a sized string * nla_strcmp(nla, str) compare attribute with string * * Attribute Parsing: @@ -506,7 +506,7 @@ int __nla_parse(struct nlattr **tb, int 
maxtype, const struct nlattr *head, struct netlink_ext_ack *extack); int nla_policy_len(const struct nla_policy *, int); struct nlattr *nla_find(const struct nlattr *head, int len, int attrtype); -ssize_t nla_strlcpy(char *dst, const struct nlattr *nla, size_t dstsize); +ssize_t nla_strscpy(char *dst, const struct nlattr *nla, size_t dstsize); char *nla_strdup(const struct nlattr *nla, gfp_t flags); int nla_memcpy(void *dest, const struct nlattr *src, int count); int nla_memcmp(const struct nlattr *nla, const void *data, size_t size); diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h index db9a828f4f4f..133f9ad4d4f9 100644 --- a/include/net/pkt_cls.h +++ b/include/net/pkt_cls.h @@ -512,7 +512,7 @@ tcf_change_indev(struct net *net, struct nlattr *indev_tlv, char indev[IFNAMSIZ]; struct net_device *dev; - if (nla_strlcpy(indev, indev_tlv, IFNAMSIZ) < 0) { + if (nla_strscpy(indev, indev_tlv, IFNAMSIZ) < 0) { NL_SET_ERR_MSG_ATTR(extack, indev_tlv, "Interface name too long"); return -EINVAL; diff --git a/kernel/taskstats.c b/kernel/taskstats.c index a2802b6ff4bb..2b4898b4752e 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -346,7 +346,7 @@ static int parse(struct nlattr *na, struct cpumask *mask) data = kmalloc(len, GFP_KERNEL); if (!data) return -ENOMEM; - nla_strlcpy(data, na, len); + nla_strscpy(data, na, len); ret = cpulist_parse(data, mask); kfree(data); return ret; diff --git a/lib/nlattr.c b/lib/nlattr.c index 447182543c03..09aa181569e0 100644 --- a/lib/nlattr.c +++ b/lib/nlattr.c @@ -709,7 +709,7 @@ struct nlattr *nla_find(const struct nlattr *head, int len, int attrtype) EXPORT_SYMBOL(nla_find); /** - * nla_strlcpy - Copy string attribute payload into a sized buffer + * nla_strscpy - Copy string attribute payload into a sized buffer * @dst: Where to copy the string to. * @nla: Attribute to copy the string from. * @dstsize: Size of destination buffer. @@ -722,7 +722,7 @@ EXPORT_SYMBOL(nla_find); * * -E2BIG - If @dstsize is 0 or greater than U16_MAX or @nla length greater * than @dstsize. 
*/ -ssize_t nla_strlcpy(char *dst, const struct nlattr *nla, size_t dstsize) +ssize_t nla_strscpy(char *dst, const struct nlattr *nla, size_t dstsize) { size_t srclen = nla_len(nla); char *src = nla_data(nla); @@ -749,7 +749,7 @@ ssize_t nla_strlcpy(char *dst, const struct nlattr *nla, size_t dstsize) return ret; } -EXPORT_SYMBOL(nla_strlcpy); +EXPORT_SYMBOL(nla_strscpy); /** * nla_strdup - Copy string attribute payload into a newly allocated buffer diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index 7bcfb16854cb..cd80ffed6d26 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -563,7 +563,7 @@ static int fib_nl2rule(struct sk_buff *skb, struct nlmsghdr *nlh, struct net_device *dev; nlrule->iifindex = -1; - nla_strlcpy(nlrule->iifname, tb[FRA_IIFNAME], IFNAMSIZ); + nla_strscpy(nlrule->iifname, tb[FRA_IIFNAME], IFNAMSIZ); dev = __dev_get_by_name(net, nlrule->iifname); if (dev) nlrule->iifindex = dev->ifindex; @@ -573,7 +573,7 @@ static int fib_nl2rule(struct sk_buff *skb, struct nlmsghdr *nlh, struct net_device *dev; nlrule->oifindex = -1; - nla_strlcpy(nlrule->oifname, tb[FRA_OIFNAME], IFNAMSIZ); + nla_strscpy(nlrule->oifname, tb[FRA_OIFNAME], IFNAMSIZ); dev = __dev_get_by_name(net, nlrule->oifname); if (dev) nlrule->oifindex = dev->ifindex; diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 7d7223691783..60917ff4a00b 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -1939,7 +1939,7 @@ static const struct rtnl_link_ops *linkinfo_to_kind_ops(const struct nlattr *nla if (linfo[IFLA_INFO_KIND]) { char kind[MODULE_NAME_LEN]; - nla_strlcpy(kind, linfo[IFLA_INFO_KIND], sizeof(kind)); + nla_strscpy(kind, linfo[IFLA_INFO_KIND], sizeof(kind)); ops = rtnl_link_ops_get(kind); } @@ -2953,9 +2953,9 @@ static struct net_device *rtnl_dev_get(struct net *net, if (!ifname) { ifname = buffer; if (ifname_attr) - nla_strlcpy(ifname, ifname_attr, IFNAMSIZ); + nla_strscpy(ifname, ifname_attr, IFNAMSIZ); else if (altifname_attr) - nla_strlcpy(ifname, altifname_attr, ALTIFNAMSIZ); + nla_strscpy(ifname, altifname_attr, ALTIFNAMSIZ); else return NULL; } @@ -2983,7 +2983,7 @@ static int rtnl_setlink(struct sk_buff *skb, struct nlmsghdr *nlh, goto errout; if (tb[IFLA_IFNAME]) - nla_strlcpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ); + nla_strscpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ); else ifname[0] = '\0'; @@ -3264,7 +3264,7 @@ replay: return err; if (tb[IFLA_IFNAME]) - nla_strlcpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ); + nla_strscpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ); else ifname[0] = '\0'; @@ -3296,7 +3296,7 @@ replay: memset(linkinfo, 0, sizeof(linkinfo)); if (linkinfo[IFLA_INFO_KIND]) { - nla_strlcpy(kind, linkinfo[IFLA_INFO_KIND], sizeof(kind)); + nla_strscpy(kind, linkinfo[IFLA_INFO_KIND], sizeof(kind)); ops = rtnl_link_ops_get(kind); } else { kind[0] = '\0'; diff --git a/net/decnet/dn_dev.c b/net/decnet/dn_dev.c index 15d42353f1a3..d1c50a48614b 100644 --- a/net/decnet/dn_dev.c +++ b/net/decnet/dn_dev.c @@ -658,7 +658,7 @@ static int dn_nl_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, ifa->ifa_dev = dn_db; if (tb[IFA_LABEL]) - nla_strlcpy(ifa->ifa_label, tb[IFA_LABEL], IFNAMSIZ); + nla_strscpy(ifa->ifa_label, tb[IFA_LABEL], IFNAMSIZ); else memcpy(ifa->ifa_label, dev->name, IFNAMSIZ); diff --git a/net/ieee802154/nl-mac.c b/net/ieee802154/nl-mac.c index 6d091e419d3e..9c640d670ffe 100644 --- a/net/ieee802154/nl-mac.c +++ b/net/ieee802154/nl-mac.c @@ -149,7 +149,7 @@ static struct net_device *ieee802154_nl_get_dev(struct genl_info *info) if 
(info->attrs[IEEE802154_ATTR_DEV_NAME]) { char name[IFNAMSIZ + 1]; - nla_strlcpy(name, info->attrs[IEEE802154_ATTR_DEV_NAME], + nla_strscpy(name, info->attrs[IEEE802154_ATTR_DEV_NAME], sizeof(name)); dev = dev_get_by_name(&init_net, name); } else if (info->attrs[IEEE802154_ATTR_DEV_INDEX]) { diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 43e04382c593..75f67994fc85 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -880,7 +880,7 @@ static struct in_ifaddr *rtm_to_ifaddr(struct net *net, struct nlmsghdr *nlh, ifa->ifa_broadcast = nla_get_in_addr(tb[IFA_BROADCAST]); if (tb[IFA_LABEL]) - nla_strlcpy(ifa->ifa_label, tb[IFA_LABEL], IFNAMSIZ); + nla_strscpy(ifa->ifa_label, tb[IFA_LABEL], IFNAMSIZ); else memcpy(ifa->ifa_label, dev->name, IFNAMSIZ); diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 7612ff6111a7..b5400cec4f69 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -973,7 +973,7 @@ bool fib_metrics_match(struct fib_config *cfg, struct fib_info *fi) char tmp[TCP_CA_NAME_MAX]; bool ecn_ca = false; - nla_strlcpy(tmp, nla, sizeof(tmp)); + nla_strscpy(tmp, nla, sizeof(tmp)); val = tcp_ca_get_key_by_name(fi->fib_net, tmp, &ecn_ca); } else { if (nla_len(nla) != sizeof(u32)) diff --git a/net/ipv4/metrics.c b/net/ipv4/metrics.c index 3205d5f7c8c9..25ea6ac44db9 100644 --- a/net/ipv4/metrics.c +++ b/net/ipv4/metrics.c @@ -31,7 +31,7 @@ static int ip_metrics_convert(struct net *net, struct nlattr *fc_mx, if (type == RTAX_CC_ALGO) { char tmp[TCP_CA_NAME_MAX]; - nla_strlcpy(tmp, nla, sizeof(tmp)); + nla_strscpy(tmp, nla, sizeof(tmp)); val = tcp_ca_get_key_by_name(net, tmp, &ecn_ca); if (val == TCP_CA_UNSPEC) { NL_SET_ERR_MSG(extack, "Unknown tcp congestion algorithm"); diff --git a/net/netfilter/ipset/ip_set_hash_netiface.c b/net/netfilter/ipset/ip_set_hash_netiface.c index 3d74169b794c..ddd51c2e1cb3 100644 --- a/net/netfilter/ipset/ip_set_hash_netiface.c +++ b/net/netfilter/ipset/ip_set_hash_netiface.c @@ -226,7 +226,7 @@ hash_netiface4_uadt(struct ip_set *set, struct nlattr *tb[], if (e.cidr > HOST_MASK) return -IPSET_ERR_INVALID_CIDR; } - nla_strlcpy(e.iface, tb[IPSET_ATTR_IFACE], IFNAMSIZ); + nla_strscpy(e.iface, tb[IPSET_ATTR_IFACE], IFNAMSIZ); if (tb[IPSET_ATTR_CADT_FLAGS]) { u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]); @@ -443,7 +443,7 @@ hash_netiface6_uadt(struct ip_set *set, struct nlattr *tb[], ip6_netmask(&e.ip, e.cidr); - nla_strlcpy(e.iface, tb[IPSET_ATTR_IFACE], IFNAMSIZ); + nla_strscpy(e.iface, tb[IPSET_ATTR_IFACE], IFNAMSIZ); if (tb[IPSET_ATTR_CADT_FLAGS]) { u32 cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]); diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index bd0e12bf5770..65aa98fc5eb6 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -1282,7 +1282,7 @@ static struct nft_chain *nft_chain_lookup(struct net *net, if (nla == NULL) return ERR_PTR(-EINVAL); - nla_strlcpy(search, nla, sizeof(search)); + nla_strscpy(search, nla, sizeof(search)); WARN_ON(!rcu_read_lock_held() && !lockdep_commit_lock_is_held(net)); @@ -1722,7 +1722,7 @@ static struct nft_hook *nft_netdev_hook_alloc(struct net *net, goto err_hook_alloc; } - nla_strlcpy(ifname, attr, IFNAMSIZ); + nla_strscpy(ifname, attr, IFNAMSIZ); dev = __dev_get_by_name(net, ifname); if (!dev) { err = -ENOENT; @@ -5735,7 +5735,7 @@ struct nft_object *nft_obj_lookup(const struct net *net, struct rhlist_head *tmp, *list; struct nft_object *obj; - nla_strlcpy(search, nla, sizeof(search)); + 
nla_strscpy(search, nla, sizeof(search)); k.name = search; WARN_ON_ONCE(!rcu_read_lock_held() && diff --git a/net/netfilter/nfnetlink_acct.c b/net/netfilter/nfnetlink_acct.c index 5bfec829c12f..5e511df8d709 100644 --- a/net/netfilter/nfnetlink_acct.c +++ b/net/netfilter/nfnetlink_acct.c @@ -112,7 +112,7 @@ static int nfnl_acct_new(struct net *net, struct sock *nfnl, nfacct->flags = flags; } - nla_strlcpy(nfacct->name, tb[NFACCT_NAME], NFACCT_NAME_MAX); + nla_strscpy(nfacct->name, tb[NFACCT_NAME], NFACCT_NAME_MAX); if (tb[NFACCT_BYTES]) { atomic64_set(&nfacct->bytes, diff --git a/net/netfilter/nfnetlink_cthelper.c b/net/netfilter/nfnetlink_cthelper.c index 5b0d0a77379c..0f94fce1d3ed 100644 --- a/net/netfilter/nfnetlink_cthelper.c +++ b/net/netfilter/nfnetlink_cthelper.c @@ -146,7 +146,7 @@ nfnl_cthelper_expect_policy(struct nf_conntrack_expect_policy *expect_policy, !tb[NFCTH_POLICY_EXPECT_TIMEOUT]) return -EINVAL; - nla_strlcpy(expect_policy->name, + nla_strscpy(expect_policy->name, tb[NFCTH_POLICY_NAME], NF_CT_HELPER_NAME_LEN); expect_policy->max_expected = ntohl(nla_get_be32(tb[NFCTH_POLICY_EXPECT_MAX])); @@ -233,7 +233,7 @@ nfnl_cthelper_create(const struct nlattr * const tb[], if (ret < 0) goto err1; - nla_strlcpy(helper->name, + nla_strscpy(helper->name, tb[NFCTH_NAME], NF_CT_HELPER_NAME_LEN); size = ntohl(nla_get_be32(tb[NFCTH_PRIV_DATA_LEN])); if (size > sizeof_field(struct nf_conn_help, data)) { diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c index 322bd674963e..a8c4d442231c 100644 --- a/net/netfilter/nft_ct.c +++ b/net/netfilter/nft_ct.c @@ -990,7 +990,7 @@ static int nft_ct_helper_obj_init(const struct nft_ctx *ctx, if (!priv->l4proto) return -ENOENT; - nla_strlcpy(name, tb[NFTA_CT_HELPER_NAME], sizeof(name)); + nla_strscpy(name, tb[NFTA_CT_HELPER_NAME], sizeof(name)); if (tb[NFTA_CT_HELPER_L3PROTO]) family = ntohs(nla_get_be16(tb[NFTA_CT_HELPER_L3PROTO])); diff --git a/net/netfilter/nft_log.c b/net/netfilter/nft_log.c index 57899454a530..a06a46b039c5 100644 --- a/net/netfilter/nft_log.c +++ b/net/netfilter/nft_log.c @@ -152,7 +152,7 @@ static int nft_log_init(const struct nft_ctx *ctx, priv->prefix = kmalloc(nla_len(nla) + 1, GFP_KERNEL); if (priv->prefix == NULL) return -ENOMEM; - nla_strlcpy(priv->prefix, nla, nla_len(nla) + 1); + nla_strscpy(priv->prefix, nla, nla_len(nla) + 1); } else { priv->prefix = (char *)nft_log_null_prefix; } diff --git a/net/netlabel/netlabel_mgmt.c b/net/netlabel/netlabel_mgmt.c index eb1d66d20afb..df1b41ed73fd 100644 --- a/net/netlabel/netlabel_mgmt.c +++ b/net/netlabel/netlabel_mgmt.c @@ -95,7 +95,7 @@ static int netlbl_mgmt_add_common(struct genl_info *info, ret_val = -ENOMEM; goto add_free_entry; } - nla_strlcpy(entry->domain, + nla_strscpy(entry->domain, info->attrs[NLBL_MGMT_A_DOMAIN], tmp_size); } diff --git a/net/nfc/netlink.c b/net/nfc/netlink.c index 8709f3d4e7c4..573b38ad2f8e 100644 --- a/net/nfc/netlink.c +++ b/net/nfc/netlink.c @@ -1226,7 +1226,7 @@ static int nfc_genl_fw_download(struct sk_buff *skb, struct genl_info *info) if (!dev) return -ENODEV; - nla_strlcpy(firmware_name, info->attrs[NFC_ATTR_FIRMWARE_NAME], + nla_strscpy(firmware_name, info->attrs[NFC_ATTR_FIRMWARE_NAME], sizeof(firmware_name)); rc = nfc_fw_download(dev, firmware_name); diff --git a/net/sched/act_api.c b/net/sched/act_api.c index fe540a89b16c..fc23f46a315c 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -939,7 +939,7 @@ struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp, NL_SET_ERR_MSG(extack, "TC action kind must 
be specified"); goto err_out; } - if (nla_strlcpy(act_name, kind, IFNAMSIZ) < 0) { + if (nla_strscpy(act_name, kind, IFNAMSIZ) < 0) { NL_SET_ERR_MSG(extack, "TC action name too long"); goto err_out; } diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c index 8dc3bec0d325..ac7297f42355 100644 --- a/net/sched/act_ipt.c +++ b/net/sched/act_ipt.c @@ -166,7 +166,7 @@ static int __tcf_ipt_init(struct net *net, unsigned int id, struct nlattr *nla, if (unlikely(!tname)) goto err1; if (tb[TCA_IPT_TABLE] == NULL || - nla_strlcpy(tname, tb[TCA_IPT_TABLE], IFNAMSIZ) >= IFNAMSIZ) + nla_strscpy(tname, tb[TCA_IPT_TABLE], IFNAMSIZ) >= IFNAMSIZ) strcpy(tname, "mangle"); t = kmemdup(td, td->u.target_size, GFP_KERNEL); diff --git a/net/sched/act_simple.c b/net/sched/act_simple.c index a4f3d0f0daa9..726cc956d06f 100644 --- a/net/sched/act_simple.c +++ b/net/sched/act_simple.c @@ -52,7 +52,7 @@ static int alloc_defdata(struct tcf_defact *d, const struct nlattr *defdata) d->tcfd_defdata = kzalloc(SIMP_MAX_DATA, GFP_KERNEL); if (unlikely(!d->tcfd_defdata)) return -ENOMEM; - nla_strlcpy(d->tcfd_defdata, defdata, SIMP_MAX_DATA); + nla_strscpy(d->tcfd_defdata, defdata, SIMP_MAX_DATA); return 0; } @@ -71,7 +71,7 @@ static int reset_policy(struct tc_action *a, const struct nlattr *defdata, spin_lock_bh(&d->tcf_lock); goto_ch = tcf_action_set_ctrlact(a, p->action, goto_ch); memset(d->tcfd_defdata, 0, SIMP_MAX_DATA); - nla_strlcpy(d->tcfd_defdata, defdata, SIMP_MAX_DATA); + nla_strscpy(d->tcfd_defdata, defdata, SIMP_MAX_DATA); spin_unlock_bh(&d->tcf_lock); if (goto_ch) tcf_chain_put_by_act(goto_ch); diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index c2e9661e20d3..ff3e943febaa 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -223,7 +223,7 @@ static inline u32 tcf_auto_prio(struct tcf_proto *tp) static bool tcf_proto_check_kind(struct nlattr *kind, char *name) { if (kind) - return nla_strlcpy(name, kind, IFNAMSIZ) < 0; + return nla_strscpy(name, kind, IFNAMSIZ) < 0; memset(name, 0, IFNAMSIZ); return false; } diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index 05449286d889..1a2d2471b078 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -1170,7 +1170,7 @@ static struct Qdisc *qdisc_create(struct net_device *dev, #ifdef CONFIG_MODULES if (ops == NULL && kind != NULL) { char name[IFNAMSIZ]; - if (nla_strlcpy(name, kind, IFNAMSIZ) >= 0) { + if (nla_strscpy(name, kind, IFNAMSIZ) >= 0) { /* We dropped the RTNL semaphore in order to * perform the module load. So, even if we * succeeded in loading the module we have to diff --git a/net/tipc/netlink_compat.c b/net/tipc/netlink_compat.c index 5c6206c9d6a8..82f154989418 100644 --- a/net/tipc/netlink_compat.c +++ b/net/tipc/netlink_compat.c @@ -696,7 +696,7 @@ static int tipc_nl_compat_link_dump(struct tipc_nl_compat_msg *msg, link_info.dest = nla_get_flag(link[TIPC_NLA_LINK_DEST]); link_info.up = htonl(nla_get_flag(link[TIPC_NLA_LINK_UP])); - nla_strlcpy(link_info.str, link[TIPC_NLA_LINK_NAME], + nla_strscpy(link_info.str, link[TIPC_NLA_LINK_NAME], TIPC_MAX_LINK_NAME); return tipc_add_tlv(msg->rep, TIPC_TLV_LINK_INFO, -- cgit v1.2.3 From 76980f5fa06d505879ba936b1b5066a056991de0 Mon Sep 17 00:00:00 2001 From: Lukas Bulwahn Date: Sun, 15 Nov 2020 16:53:36 +0100 Subject: tracing: Clean up after filter logic rewriting The functions event_{set,clear,}_no_set_filter_flag were only used in replace_system_preds() [now, renamed to process_system_preds()]. 
Commit 80765597bc58 ("tracing: Rewrite filter logic to be simpler and faster") removed the use of those functions in replace_system_preds(). Since then, the functions event_{set,clear,}_no_set_filter_flag were unused. Fortunately, make CC=clang W=1 indicates this with -Wunused-function warnings on those three functions. So, clean up these obsolete unused functions. Link: https://lkml.kernel.org/r/20201115155336.20248-1-lukas.bulwahn@gmail.com Signed-off-by: Lukas Bulwahn Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events_filter.c | 21 --------------------- 1 file changed, 21 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index d0f515ac9b7c..e91259f6a722 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -1561,27 +1561,6 @@ static inline void event_clear_filter(struct trace_event_file *file) RCU_INIT_POINTER(file->filter, NULL); } -static inline void -event_set_no_set_filter_flag(struct trace_event_file *file) -{ - file->flags |= EVENT_FILE_FL_NO_SET_FILTER; -} - -static inline void -event_clear_no_set_filter_flag(struct trace_event_file *file) -{ - file->flags &= ~EVENT_FILE_FL_NO_SET_FILTER; -} - -static inline bool -event_no_set_filter_flag(struct trace_event_file *file) -{ - if (file->flags & EVENT_FILE_FL_NO_SET_FILTER) - return true; - - return false; -} - struct filter_list { struct list_head list; struct event_filter *filter; -- cgit v1.2.3 From b86678cf0f1d76062aa964c5f0c6c89fe5a6dcfd Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Mon, 16 Nov 2020 12:41:59 -0500 Subject: entry: Wire up syscall_work in common entry code Prepare the common entry code to use the SYSCALL_WORK flags. They will be defined in subsequent patches for each type of syscall work. SYSCALL_WORK_ENTRY/EXIT are defined for the transition, as they will replace the TIF_ equivalent defines. 
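For orientation, architectures using the generic entry code carry the SYSCALL_WORK_* bits in a dedicated word next to the TIF flags; the common entry code reads it via READ_ONCE(current_thread_info()->syscall_work). A hedged sketch of the layout (exact members differ per architecture):

        /* illustrative thread_info on a generic-entry architecture */
        struct thread_info {
                unsigned long           flags;          /* TIF_* work bits */
                unsigned long           syscall_work;   /* SYSCALL_WORK_* bits */
                /* ... architecture specific fields ... */
        };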
Signed-off-by: Gabriel Krisman Bertazi Signed-off-by: Thomas Gleixner Reviewed-by: Andy Lutomirski Link: https://lore.kernel.org/r/20201116174206.2639648-4-krisman@collabora.com --- include/linux/entry-common.h | 3 +++ kernel/entry/common.c | 15 +++++++++------ 2 files changed, 12 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h index aab549026ab8..3fe8f868f15e 100644 --- a/include/linux/entry-common.h +++ b/include/linux/entry-common.h @@ -64,6 +64,9 @@ (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | \ _TIF_SYSCALL_TRACEPOINT | ARCH_SYSCALL_EXIT_WORK) +#define SYSCALL_WORK_ENTER (0) +#define SYSCALL_WORK_EXIT (0) + /* * TIF flags handled in exit_to_user_mode_loop() */ diff --git a/kernel/entry/common.c b/kernel/entry/common.c index fa17baadf63e..e7a11e38daba 100644 --- a/kernel/entry/common.c +++ b/kernel/entry/common.c @@ -42,7 +42,7 @@ static inline void syscall_enter_audit(struct pt_regs *regs, long syscall) } static long syscall_trace_enter(struct pt_regs *regs, long syscall, - unsigned long ti_work) + unsigned long ti_work, unsigned long work) { long ret = 0; @@ -74,11 +74,12 @@ static long syscall_trace_enter(struct pt_regs *regs, long syscall, static __always_inline long __syscall_enter_from_user_work(struct pt_regs *regs, long syscall) { + unsigned long work = READ_ONCE(current_thread_info()->syscall_work); unsigned long ti_work; ti_work = READ_ONCE(current_thread_info()->flags); - if (ti_work & SYSCALL_ENTER_WORK) - syscall = syscall_trace_enter(regs, syscall, ti_work); + if (work & SYSCALL_WORK_ENTER || ti_work & SYSCALL_ENTER_WORK) + syscall = syscall_trace_enter(regs, syscall, ti_work, work); return syscall; } @@ -225,7 +226,8 @@ static inline bool report_single_step(unsigned long ti_work) } #endif -static void syscall_exit_work(struct pt_regs *regs, unsigned long ti_work) +static void syscall_exit_work(struct pt_regs *regs, unsigned long ti_work, + unsigned long work) { bool step; @@ -245,6 +247,7 @@ static void syscall_exit_work(struct pt_regs *regs, unsigned long ti_work) */ static void syscall_exit_to_user_mode_prepare(struct pt_regs *regs) { + unsigned long work = READ_ONCE(current_thread_info()->syscall_work); u32 cached_flags = READ_ONCE(current_thread_info()->flags); unsigned long nr = syscall_get_nr(current, regs); @@ -262,8 +265,8 @@ static void syscall_exit_to_user_mode_prepare(struct pt_regs *regs) * enabled, we want to run them exactly once per syscall exit with * interrupts enabled. */ - if (unlikely(cached_flags & SYSCALL_EXIT_WORK)) - syscall_exit_work(regs, cached_flags); + if (unlikely(work & SYSCALL_WORK_EXIT || cached_flags & SYSCALL_EXIT_WORK)) + syscall_exit_work(regs, cached_flags, work); } __visible noinstr void syscall_exit_to_user_mode(struct pt_regs *regs) -- cgit v1.2.3 From 23d67a54857a768acdb0804cdd6037c324a50ecd Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Mon, 16 Nov 2020 12:42:00 -0500 Subject: seccomp: Migrate to use SYSCALL_WORK flag On architectures using the generic syscall entry code the architecture independent syscall work is moved to flags in thread_info::syscall_work. This removes architecture dependencies and frees up TIF bits. Define SYSCALL_WORK_SECCOMP, use it in the generic entry code and convert the code which uses the TIF specific helper functions to use the new *_syscall_work() helpers which either resolve to the new mode for users of the generic entry code or to the TIF based functions for the other architectures. 
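The *_syscall_work() helpers used below are not part of this hunk; a hedged sketch of the dual-mode pattern the changelog describes (not the verbatim kernel definitions):

        #ifdef CONFIG_GENERIC_ENTRY
        #define set_task_syscall_work(t, fl) \
                set_bit(SYSCALL_WORK_BIT_##fl, &task_thread_info(t)->syscall_work)
        #define test_syscall_work(fl) \
                test_bit(SYSCALL_WORK_BIT_##fl, &current_thread_info()->syscall_work)
        #else
        /* other architectures keep using the TIF_* flags */
        #define set_task_syscall_work(t, fl) \
                set_tsk_thread_flag(t, TIF_##fl)
        #define test_syscall_work(fl) \
                test_thread_flag(TIF_##fl)
        #endif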
Signed-off-by: Gabriel Krisman Bertazi Signed-off-by: Thomas Gleixner Reviewed-by: Andy Lutomirski Link: https://lore.kernel.org/r/20201116174206.2639648-5-krisman@collabora.com --- include/asm-generic/syscall.h | 2 +- include/linux/entry-common.h | 8 ++------ include/linux/seccomp.h | 2 +- include/linux/thread_info.h | 6 ++++++ kernel/entry/common.c | 2 +- kernel/fork.c | 2 +- kernel/seccomp.c | 6 +++--- 7 files changed, 15 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/include/asm-generic/syscall.h b/include/asm-generic/syscall.h index f3135e734387..524d8e68ff5e 100644 --- a/include/asm-generic/syscall.h +++ b/include/asm-generic/syscall.h @@ -135,7 +135,7 @@ void syscall_set_arguments(struct task_struct *task, struct pt_regs *regs, * Returns the AUDIT_ARCH_* based on the system call convention in use. * * It's only valid to call this when @task is stopped on entry to a system - * call, due to %TIF_SYSCALL_TRACE, %TIF_SYSCALL_AUDIT, or %TIF_SECCOMP. + * call, due to %TIF_SYSCALL_TRACE, %TIF_SYSCALL_AUDIT, or %SYSCALL_WORK_SECCOMP. * * Architectures which permit CONFIG_HAVE_ARCH_SECCOMP_FILTER must * provide an implementation of this. diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h index 3fe8f868f15e..fa3cdb102dbf 100644 --- a/include/linux/entry-common.h +++ b/include/linux/entry-common.h @@ -21,10 +21,6 @@ # define _TIF_SYSCALL_TRACEPOINT (0) #endif -#ifndef _TIF_SECCOMP -# define _TIF_SECCOMP (0) -#endif - #ifndef _TIF_SYSCALL_AUDIT # define _TIF_SYSCALL_AUDIT (0) #endif @@ -49,7 +45,7 @@ #endif #define SYSCALL_ENTER_WORK \ - (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | _TIF_SECCOMP | \ + (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | \ _TIF_SYSCALL_TRACEPOINT | _TIF_SYSCALL_EMU | \ ARCH_SYSCALL_ENTER_WORK) @@ -64,7 +60,7 @@ (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | \ _TIF_SYSCALL_TRACEPOINT | ARCH_SYSCALL_EXIT_WORK) -#define SYSCALL_WORK_ENTER (0) +#define SYSCALL_WORK_ENTER (SYSCALL_WORK_SECCOMP) #define SYSCALL_WORK_EXIT (0) /* diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h index 02aef2844c38..47763f3999f7 100644 --- a/include/linux/seccomp.h +++ b/include/linux/seccomp.h @@ -42,7 +42,7 @@ struct seccomp { extern int __secure_computing(const struct seccomp_data *sd); static inline int secure_computing(void) { - if (unlikely(test_thread_flag(TIF_SECCOMP))) + if (unlikely(test_syscall_work(SECCOMP))) return __secure_computing(NULL); return 0; } diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h index 0e9fb15d6b42..a308ba4ef07b 100644 --- a/include/linux/thread_info.h +++ b/include/linux/thread_info.h @@ -35,6 +35,12 @@ enum { GOOD_STACK, }; +enum syscall_work_bit { + SYSCALL_WORK_BIT_SECCOMP, +}; + +#define SYSCALL_WORK_SECCOMP BIT(SYSCALL_WORK_BIT_SECCOMP) + #include #ifdef __KERNEL__ diff --git a/kernel/entry/common.c b/kernel/entry/common.c index e7a11e38daba..5747a6eb2c48 100644 --- a/kernel/entry/common.c +++ b/kernel/entry/common.c @@ -54,7 +54,7 @@ static long syscall_trace_enter(struct pt_regs *regs, long syscall, } /* Do seccomp after ptrace, to catch any tracer changes. */ - if (ti_work & _TIF_SECCOMP) { + if (work & SYSCALL_WORK_SECCOMP) { ret = __secure_computing(NULL); if (ret == -1L) return ret; diff --git a/kernel/fork.c b/kernel/fork.c index 32083db7a2a2..bc5b1090f415 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1625,7 +1625,7 @@ static void copy_seccomp(struct task_struct *p) * to manually enable the seccomp thread flag here. 
*/ if (p->seccomp.mode != SECCOMP_MODE_DISABLED) - set_tsk_thread_flag(p, TIF_SECCOMP); + set_task_syscall_work(p, SECCOMP); #endif } diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 8ad7a293255a..f67e92d11ad7 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -356,14 +356,14 @@ static inline void seccomp_assign_mode(struct task_struct *task, task->seccomp.mode = seccomp_mode; /* - * Make sure TIF_SECCOMP cannot be set before the mode (and + * Make sure SYSCALL_WORK_SECCOMP cannot be set before the mode (and * filter) is set. */ smp_mb__before_atomic(); /* Assume default seccomp processes want spec flaw mitigation. */ if ((flags & SECCOMP_FILTER_FLAG_SPEC_ALLOW) == 0) arch_seccomp_spec_mitigate(task); - set_tsk_thread_flag(task, TIF_SECCOMP); + set_task_syscall_work(task, SECCOMP); } #ifdef CONFIG_SECCOMP_FILTER @@ -929,7 +929,7 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd, /* * Make sure that any changes to mode from another thread have - * been seen after TIF_SECCOMP was seen. + * been seen after SYSCALL_WORK_SECCOMP was seen. */ rmb(); -- cgit v1.2.3 From 524666cb5de7c38a1925e7401a6e59d68682dd8c Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Mon, 16 Nov 2020 12:42:01 -0500 Subject: tracepoints: Migrate to use SYSCALL_WORK flag On architectures using the generic syscall entry code the architecture independent syscall work is moved to flags in thread_info::syscall_work. This removes architecture dependencies and frees up TIF bits. Define SYSCALL_WORK_SYSCALL_TRACEPOINT, use it in the generic entry code and convert the code which uses the TIF specific helper functions to use the new *_syscall_work() helpers which either resolve to the new mode for users of the generic entry code or to the TIF based functions for the other architectures. 
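The benefit of collecting this work in one word shows up on the syscall entry fast path: a single load and mask test of syscall_work decides whether any of the slow-path handlers run at all. Below is a compressed userspace sketch of that dispatch shape; the printf calls merely stand in for the real seccomp and tracepoint hooks, and the flag values and function bodies are illustrative assumptions, not the kernel code.

#include <stdio.h>

#define BIT(nr) (1UL << (nr))

/* Illustrative subset of the SYSCALL_WORK_* bits used in this series. */
#define SYSCALL_WORK_SECCOMP            BIT(0)
#define SYSCALL_WORK_SYSCALL_TRACEPOINT BIT(1)

/* Composite entry mask, shaped like SYSCALL_WORK_ENTER in the hunks below. */
#define SYSCALL_WORK_ENTER \
        (SYSCALL_WORK_SECCOMP | SYSCALL_WORK_SYSCALL_TRACEPOINT)

/* Slow path: only reached when at least one work bit is set. */
static long syscall_trace_enter(long syscall, unsigned long work)
{
        if (work & SYSCALL_WORK_SECCOMP)
                printf("seccomp filter would run for syscall %ld\n", syscall);
        if (work & SYSCALL_WORK_SYSCALL_TRACEPOINT)
                printf("sys_enter tracepoint would fire for syscall %ld\n", syscall);
        return syscall;
}

static long syscall_enter(long syscall, unsigned long work)
{
        /* Fast path: one mask test gates all of the entry work. */
        if (work & SYSCALL_WORK_ENTER)
                syscall = syscall_trace_enter(syscall, work);
        return syscall;
}

int main(void)
{
        syscall_enter(1, 0);                                    /* no work */
        syscall_enter(2, SYSCALL_WORK_SYSCALL_TRACEPOINT);      /* tracing only */
        return 0;
}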
Signed-off-by: Gabriel Krisman Bertazi Signed-off-by: Thomas Gleixner Reviewed-by: Andy Lutomirski Link: https://lore.kernel.org/r/20201116174206.2639648-6-krisman@collabora.com --- include/linux/entry-common.h | 13 +++++-------- include/linux/thread_info.h | 2 ++ include/trace/syscall.h | 6 +++--- kernel/entry/common.c | 4 ++-- kernel/trace/trace_events.c | 8 ++++---- kernel/tracepoint.c | 4 ++-- 6 files changed, 18 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h index fa3cdb102dbf..2a01eee2dbba 100644 --- a/include/linux/entry-common.h +++ b/include/linux/entry-common.h @@ -17,10 +17,6 @@ # define _TIF_SYSCALL_EMU (0) #endif -#ifndef _TIF_SYSCALL_TRACEPOINT -# define _TIF_SYSCALL_TRACEPOINT (0) -#endif - #ifndef _TIF_SYSCALL_AUDIT # define _TIF_SYSCALL_AUDIT (0) #endif @@ -46,7 +42,7 @@ #define SYSCALL_ENTER_WORK \ (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | \ - _TIF_SYSCALL_TRACEPOINT | _TIF_SYSCALL_EMU | \ + _TIF_SYSCALL_EMU | \ ARCH_SYSCALL_ENTER_WORK) /* @@ -58,10 +54,11 @@ #define SYSCALL_EXIT_WORK \ (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | \ - _TIF_SYSCALL_TRACEPOINT | ARCH_SYSCALL_EXIT_WORK) + ARCH_SYSCALL_EXIT_WORK) -#define SYSCALL_WORK_ENTER (SYSCALL_WORK_SECCOMP) -#define SYSCALL_WORK_EXIT (0) +#define SYSCALL_WORK_ENTER (SYSCALL_WORK_SECCOMP | \ + SYSCALL_WORK_SYSCALL_TRACEPOINT) +#define SYSCALL_WORK_EXIT (SYSCALL_WORK_SYSCALL_TRACEPOINT) /* * TIF flags handled in exit_to_user_mode_loop() diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h index a308ba4ef07b..c232043c12d3 100644 --- a/include/linux/thread_info.h +++ b/include/linux/thread_info.h @@ -37,9 +37,11 @@ enum { enum syscall_work_bit { SYSCALL_WORK_BIT_SECCOMP, + SYSCALL_WORK_BIT_SYSCALL_TRACEPOINT, }; #define SYSCALL_WORK_SECCOMP BIT(SYSCALL_WORK_BIT_SECCOMP) +#define SYSCALL_WORK_SYSCALL_TRACEPOINT BIT(SYSCALL_WORK_BIT_SYSCALL_TRACEPOINT) #include diff --git a/include/trace/syscall.h b/include/trace/syscall.h index dc8ac27d27c1..8e193f3a33b3 100644 --- a/include/trace/syscall.h +++ b/include/trace/syscall.h @@ -37,10 +37,10 @@ struct syscall_metadata { #if defined(CONFIG_TRACEPOINTS) && defined(CONFIG_HAVE_SYSCALL_TRACEPOINTS) static inline void syscall_tracepoint_update(struct task_struct *p) { - if (test_thread_flag(TIF_SYSCALL_TRACEPOINT)) - set_tsk_thread_flag(p, TIF_SYSCALL_TRACEPOINT); + if (test_syscall_work(SYSCALL_TRACEPOINT)) + set_task_syscall_work(p, SYSCALL_TRACEPOINT); else - clear_tsk_thread_flag(p, TIF_SYSCALL_TRACEPOINT); + clear_task_syscall_work(p, SYSCALL_TRACEPOINT); } #else static inline void syscall_tracepoint_update(struct task_struct *p) diff --git a/kernel/entry/common.c b/kernel/entry/common.c index 5747a6eb2c48..f651967847ec 100644 --- a/kernel/entry/common.c +++ b/kernel/entry/common.c @@ -63,7 +63,7 @@ static long syscall_trace_enter(struct pt_regs *regs, long syscall, /* Either of the above might have changed the syscall number */ syscall = syscall_get_nr(current, regs); - if (unlikely(ti_work & _TIF_SYSCALL_TRACEPOINT)) + if (unlikely(work & SYSCALL_WORK_SYSCALL_TRACEPOINT)) trace_sys_enter(regs, syscall); syscall_enter_audit(regs, syscall); @@ -233,7 +233,7 @@ static void syscall_exit_work(struct pt_regs *regs, unsigned long ti_work, audit_syscall_exit(regs); - if (ti_work & _TIF_SYSCALL_TRACEPOINT) + if (work & SYSCALL_WORK_SYSCALL_TRACEPOINT) trace_sys_exit(regs, syscall_get_return_value(current, regs)); step = report_single_step(ti_work); diff --git a/kernel/trace/trace_events.c 
b/kernel/trace/trace_events.c index 47a71f96e5bc..adf65b502453 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -3428,10 +3428,10 @@ static __init int event_trace_enable(void) * initialize events and perhaps start any events that are on the * command line. Unfortunately, there are some events that will not * start this early, like the system call tracepoints that need - * to set the TIF_SYSCALL_TRACEPOINT flag of pid 1. But event_trace_enable() - * is called before pid 1 starts, and this flag is never set, making - * the syscall tracepoint never get reached, but the event is enabled - * regardless (and not doing anything). + * to set the %SYSCALL_WORK_SYSCALL_TRACEPOINT flag of pid 1. But + * event_trace_enable() is called before pid 1 starts, and this flag + * is never set, making the syscall tracepoint never get reached, but + * the event is enabled regardless (and not doing anything). */ static __init int event_trace_enable_again(void) { diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index 3f659f855074..7261fa0f5e3c 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -594,7 +594,7 @@ int syscall_regfunc(void) if (!sys_tracepoint_refcount) { read_lock(&tasklist_lock); for_each_process_thread(p, t) { - set_tsk_thread_flag(t, TIF_SYSCALL_TRACEPOINT); + set_task_syscall_work(t, SYSCALL_TRACEPOINT); } read_unlock(&tasklist_lock); } @@ -611,7 +611,7 @@ void syscall_unregfunc(void) if (!sys_tracepoint_refcount) { read_lock(&tasklist_lock); for_each_process_thread(p, t) { - clear_tsk_thread_flag(t, TIF_SYSCALL_TRACEPOINT); + clear_task_syscall_work(t, SYSCALL_TRACEPOINT); } read_unlock(&tasklist_lock); } -- cgit v1.2.3 From 64c19ba29b66e98af9306b4a7525fb22c895d252 Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Mon, 16 Nov 2020 12:42:02 -0500 Subject: ptrace: Migrate to use SYSCALL_TRACE flag On architectures using the generic syscall entry code the architecture independent syscall work is moved to flags in thread_info::syscall_work. This removes architecture dependencies and frees up TIF bits. Define SYSCALL_WORK_SYSCALL_TRACE, use it in the generic entry code and convert the code which uses the TIF specific helper functions to use the new *_syscall_work() helpers which either resolve to the new mode for users of the generic entry code or to the TIF based functions for the other architectures. Signed-off-by: Gabriel Krisman Bertazi Signed-off-by: Thomas Gleixner Reviewed-by: Andy Lutomirski Link: https://lore.kernel.org/r/20201116174206.2639648-7-krisman@collabora.com --- include/asm-generic/syscall.h | 15 ++++++++------- include/linux/entry-common.h | 10 ++++++---- include/linux/thread_info.h | 2 ++ include/linux/tracehook.h | 17 +++++++++-------- kernel/entry/common.c | 4 ++-- kernel/fork.c | 2 +- kernel/ptrace.c | 6 +++--- 7 files changed, 31 insertions(+), 25 deletions(-) (limited to 'kernel') diff --git a/include/asm-generic/syscall.h b/include/asm-generic/syscall.h index 524d8e68ff5e..ed94e5658d0c 100644 --- a/include/asm-generic/syscall.h +++ b/include/asm-generic/syscall.h @@ -43,7 +43,7 @@ int syscall_get_nr(struct task_struct *task, struct pt_regs *regs); * @regs: task_pt_regs() of @task * * It's only valid to call this when @task is stopped for system - * call exit tracing (due to TIF_SYSCALL_TRACE or TIF_SYSCALL_AUDIT), + * call exit tracing (due to %SYSCALL_WORK_SYSCALL_TRACE or TIF_SYSCALL_AUDIT), * after tracehook_report_syscall_entry() returned nonzero to prevent * the system call from taking place. 
* @@ -63,7 +63,7 @@ void syscall_rollback(struct task_struct *task, struct pt_regs *regs); * Returns 0 if the system call succeeded, or -ERRORCODE if it failed. * * It's only valid to call this when @task is stopped for tracing on exit - * from a system call, due to %TIF_SYSCALL_TRACE or %TIF_SYSCALL_AUDIT. + * from a system call, due to %SYSCALL_WORK_SYSCALL_TRACE or %TIF_SYSCALL_AUDIT. */ long syscall_get_error(struct task_struct *task, struct pt_regs *regs); @@ -76,7 +76,7 @@ long syscall_get_error(struct task_struct *task, struct pt_regs *regs); * This value is meaningless if syscall_get_error() returned nonzero. * * It's only valid to call this when @task is stopped for tracing on exit - * from a system call, due to %TIF_SYSCALL_TRACE or %TIF_SYSCALL_AUDIT. + * from a system call, due to %SYSCALL_WORK_SYSCALL_TRACE or %TIF_SYSCALL_AUDIT. */ long syscall_get_return_value(struct task_struct *task, struct pt_regs *regs); @@ -93,7 +93,7 @@ long syscall_get_return_value(struct task_struct *task, struct pt_regs *regs); * code; the user sees a failed system call with this errno code. * * It's only valid to call this when @task is stopped for tracing on exit - * from a system call, due to %TIF_SYSCALL_TRACE or %TIF_SYSCALL_AUDIT. + * from a system call, due to %SYSCALL_WORK_SYSCALL_TRACE or %TIF_SYSCALL_AUDIT. */ void syscall_set_return_value(struct task_struct *task, struct pt_regs *regs, int error, long val); @@ -108,7 +108,7 @@ void syscall_set_return_value(struct task_struct *task, struct pt_regs *regs, * @args[0], and so on. * * It's only valid to call this when @task is stopped for tracing on - * entry to a system call, due to %TIF_SYSCALL_TRACE or %TIF_SYSCALL_AUDIT. + * entry to a system call, due to %SYSCALL_WORK_SYSCALL_TRACE or %TIF_SYSCALL_AUDIT. */ void syscall_get_arguments(struct task_struct *task, struct pt_regs *regs, unsigned long *args); @@ -123,7 +123,7 @@ void syscall_get_arguments(struct task_struct *task, struct pt_regs *regs, * The first argument gets value @args[0], and so on. * * It's only valid to call this when @task is stopped for tracing on - * entry to a system call, due to %TIF_SYSCALL_TRACE or %TIF_SYSCALL_AUDIT. + * entry to a system call, due to %SYSCALL_WORK_SYSCALL_TRACE or %TIF_SYSCALL_AUDIT. */ void syscall_set_arguments(struct task_struct *task, struct pt_regs *regs, const unsigned long *args); @@ -135,7 +135,8 @@ void syscall_set_arguments(struct task_struct *task, struct pt_regs *regs, * Returns the AUDIT_ARCH_* based on the system call convention in use. * * It's only valid to call this when @task is stopped on entry to a system - * call, due to %TIF_SYSCALL_TRACE, %TIF_SYSCALL_AUDIT, or %SYSCALL_WORK_SECCOMP. + * call, due to %SYSCALL_WORK_SYSCALL_TRACE, %TIF_SYSCALL_AUDIT, or + * %SYSCALL_WORK_SECCOMP. * * Architectures which permit CONFIG_HAVE_ARCH_SECCOMP_FILTER must * provide an implementation of this. 
diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h index 2a01eee2dbba..ae426ab9c372 100644 --- a/include/linux/entry-common.h +++ b/include/linux/entry-common.h @@ -41,7 +41,7 @@ #endif #define SYSCALL_ENTER_WORK \ - (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | \ + (_TIF_SYSCALL_AUDIT | \ _TIF_SYSCALL_EMU | \ ARCH_SYSCALL_ENTER_WORK) @@ -53,12 +53,14 @@ #endif #define SYSCALL_EXIT_WORK \ - (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | \ + (_TIF_SYSCALL_AUDIT | \ ARCH_SYSCALL_EXIT_WORK) #define SYSCALL_WORK_ENTER (SYSCALL_WORK_SECCOMP | \ - SYSCALL_WORK_SYSCALL_TRACEPOINT) -#define SYSCALL_WORK_EXIT (SYSCALL_WORK_SYSCALL_TRACEPOINT) + SYSCALL_WORK_SYSCALL_TRACEPOINT | \ + SYSCALL_WORK_SYSCALL_TRACE) +#define SYSCALL_WORK_EXIT (SYSCALL_WORK_SYSCALL_TRACEPOINT | \ + SYSCALL_WORK_SYSCALL_TRACE) /* * TIF flags handled in exit_to_user_mode_loop() diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h index c232043c12d3..761a4590d554 100644 --- a/include/linux/thread_info.h +++ b/include/linux/thread_info.h @@ -38,10 +38,12 @@ enum { enum syscall_work_bit { SYSCALL_WORK_BIT_SECCOMP, SYSCALL_WORK_BIT_SYSCALL_TRACEPOINT, + SYSCALL_WORK_BIT_SYSCALL_TRACE, }; #define SYSCALL_WORK_SECCOMP BIT(SYSCALL_WORK_BIT_SECCOMP) #define SYSCALL_WORK_SYSCALL_TRACEPOINT BIT(SYSCALL_WORK_BIT_SYSCALL_TRACEPOINT) +#define SYSCALL_WORK_SYSCALL_TRACE BIT(SYSCALL_WORK_BIT_SYSCALL_TRACE) #include diff --git a/include/linux/tracehook.h b/include/linux/tracehook.h index f7d82e4fafd6..3f20368afe9e 100644 --- a/include/linux/tracehook.h +++ b/include/linux/tracehook.h @@ -83,11 +83,12 @@ static inline int ptrace_report_syscall(struct pt_regs *regs, * tracehook_report_syscall_entry - task is about to attempt a system call * @regs: user register state of current task * - * This will be called if %TIF_SYSCALL_TRACE or %TIF_SYSCALL_EMU have been set, - * when the current task has just entered the kernel for a system call. - * Full user register state is available here. Changing the values - * in @regs can affect the system call number and arguments to be tried. - * It is safe to block here, preventing the system call from beginning. + * This will be called if %SYSCALL_WORK_SYSCALL_TRACE or + * %TIF_SYSCALL_EMU have been set, when the current task has just + * entered the kernel for a system call. Full user register state is + * available here. Changing the values in @regs can affect the system + * call number and arguments to be tried. It is safe to block here, + * preventing the system call from beginning. * * Returns zero normally, or nonzero if the calling arch code should abort * the system call. That must prevent normal entry so no system call is @@ -109,15 +110,15 @@ static inline __must_check int tracehook_report_syscall_entry( * @regs: user register state of current task * @step: nonzero if simulating single-step or block-step * - * This will be called if %TIF_SYSCALL_TRACE has been set, when the - * current task has just finished an attempted system call. Full + * This will be called if %SYSCALL_WORK_SYSCALL_TRACE has been set, when + * the current task has just finished an attempted system call. Full * user register state is available here. It is safe to block here, * preventing signals from being processed. * * If @step is nonzero, this report is also in lieu of the normal * trap that would follow the system call instruction because * user_enable_block_step() or user_enable_single_step() was used. - * In this case, %TIF_SYSCALL_TRACE might not be set. 
+ * In this case, %SYSCALL_WORK_SYSCALL_TRACE might not be set. * * Called without locks, just before checking for pending signals. */ diff --git a/kernel/entry/common.c b/kernel/entry/common.c index f651967847ec..917328a9edaa 100644 --- a/kernel/entry/common.c +++ b/kernel/entry/common.c @@ -47,7 +47,7 @@ static long syscall_trace_enter(struct pt_regs *regs, long syscall, long ret = 0; /* Handle ptrace */ - if (ti_work & (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_EMU)) { + if (work & SYSCALL_WORK_SYSCALL_TRACE || ti_work & _TIF_SYSCALL_EMU) { ret = arch_syscall_enter_tracehook(regs); if (ret || (ti_work & _TIF_SYSCALL_EMU)) return -1L; @@ -237,7 +237,7 @@ static void syscall_exit_work(struct pt_regs *regs, unsigned long ti_work, trace_sys_exit(regs, syscall_get_return_value(current, regs)); step = report_single_step(ti_work); - if (step || ti_work & _TIF_SYSCALL_TRACE) + if (step || work & SYSCALL_WORK_SYSCALL_TRACE) arch_syscall_exit_tracehook(regs, step); } diff --git a/kernel/fork.c b/kernel/fork.c index bc5b1090f415..99f68c20f2ff 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2158,7 +2158,7 @@ static __latent_entropy struct task_struct *copy_process( * child regardless of CLONE_PTRACE. */ user_disable_single_step(p); - clear_tsk_thread_flag(p, TIF_SYSCALL_TRACE); + clear_task_syscall_work(p, SYSCALL_TRACE); #ifdef TIF_SYSCALL_EMU clear_tsk_thread_flag(p, TIF_SYSCALL_EMU); #endif diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 43d6179508d6..55a2bc3186a7 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -117,7 +117,7 @@ void __ptrace_unlink(struct task_struct *child) const struct cred *old_cred; BUG_ON(!child->ptrace); - clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE); + clear_task_syscall_work(child, SYSCALL_TRACE); #ifdef TIF_SYSCALL_EMU clear_tsk_thread_flag(child, TIF_SYSCALL_EMU); #endif @@ -812,9 +812,9 @@ static int ptrace_resume(struct task_struct *child, long request, return -EIO; if (request == PTRACE_SYSCALL) - set_tsk_thread_flag(child, TIF_SYSCALL_TRACE); + set_task_syscall_work(child, SYSCALL_TRACE); else - clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE); + clear_task_syscall_work(child, SYSCALL_TRACE); #ifdef TIF_SYSCALL_EMU if (request == PTRACE_SYSEMU || request == PTRACE_SYSEMU_SINGLESTEP) -- cgit v1.2.3 From 64eb35f701f04b30706e21d1b02636b5d31a37d2 Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Mon, 16 Nov 2020 12:42:03 -0500 Subject: ptrace: Migrate TIF_SYSCALL_EMU to use SYSCALL_WORK flag On architectures using the generic syscall entry code the architecture independent syscall work is moved to flags in thread_info::syscall_work. This removes architecture dependencies and frees up TIF bits. Define SYSCALL_WORK_SYSCALL_EMU, use it in the generic entry code and convert the code which uses the TIF specific helper functions to use the new *_syscall_work() helpers which either resolve to the new mode for users of the generic entry code or to the TIF based functions for the other architectures. 
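The subtle piece of this patch is deciding, at syscall exit, whether a single-step report is still owed to the tracer once emulation is controlled by SYSCALL_WORK_SYSCALL_EMU. The sketch below is a toy version of the new report_single_step() helper visible in the kernel/entry/common.c hunk further down; the flag values and the explicit two-argument signature are assumptions made for a standalone example, whereas the real helper reads current_thread_info()->flags itself.

#include <stdbool.h>
#include <stdio.h>

#define BIT(nr) (1UL << (nr))

/* Illustrative values: EMU lives in syscall_work, SINGLESTEP remains a TIF bit. */
#define SYSCALL_WORK_SYSCALL_EMU BIT(3)
#define _TIF_SINGLESTEP          BIT(8)

/*
 * With syscall emulation active the entry stop was already reported, so the
 * exit path reports again only when single-stepping was requested as well
 * (PTRACE_SYSEMU_SINGLESTEP).
 */
static bool report_single_step(unsigned long work, unsigned long ti_flags)
{
        if (!(work & SYSCALL_WORK_SYSCALL_EMU))
                return false;

        return !!(ti_flags & _TIF_SINGLESTEP);
}

int main(void)
{
        printf("SYSEMU alone        -> report: %d\n",
               report_single_step(SYSCALL_WORK_SYSCALL_EMU, 0));
        printf("SYSEMU + SINGLESTEP -> report: %d\n",
               report_single_step(SYSCALL_WORK_SYSCALL_EMU, _TIF_SINGLESTEP));
        return 0;
}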
Signed-off-by: Gabriel Krisman Bertazi Signed-off-by: Thomas Gleixner Reviewed-by: Andy Lutomirski Link: https://lore.kernel.org/r/20201116174206.2639648-8-krisman@collabora.com --- include/linux/entry-common.h | 8 ++------ include/linux/thread_info.h | 2 ++ include/linux/tracehook.h | 2 +- kernel/entry/common.c | 19 ++++++++++--------- kernel/fork.c | 4 ++-- kernel/ptrace.c | 10 +++++----- 6 files changed, 22 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h index ae426ab9c372..b30f82bed92b 100644 --- a/include/linux/entry-common.h +++ b/include/linux/entry-common.h @@ -13,10 +13,6 @@ * Define dummy _TIF work flags if not defined by the architecture or for * disabled functionality. */ -#ifndef _TIF_SYSCALL_EMU -# define _TIF_SYSCALL_EMU (0) -#endif - #ifndef _TIF_SYSCALL_AUDIT # define _TIF_SYSCALL_AUDIT (0) #endif @@ -42,7 +38,6 @@ #define SYSCALL_ENTER_WORK \ (_TIF_SYSCALL_AUDIT | \ - _TIF_SYSCALL_EMU | \ ARCH_SYSCALL_ENTER_WORK) /* @@ -58,7 +53,8 @@ #define SYSCALL_WORK_ENTER (SYSCALL_WORK_SECCOMP | \ SYSCALL_WORK_SYSCALL_TRACEPOINT | \ - SYSCALL_WORK_SYSCALL_TRACE) + SYSCALL_WORK_SYSCALL_TRACE | \ + SYSCALL_WORK_SYSCALL_EMU) #define SYSCALL_WORK_EXIT (SYSCALL_WORK_SYSCALL_TRACEPOINT | \ SYSCALL_WORK_SYSCALL_TRACE) diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h index 761a4590d554..85b8a4216168 100644 --- a/include/linux/thread_info.h +++ b/include/linux/thread_info.h @@ -39,11 +39,13 @@ enum syscall_work_bit { SYSCALL_WORK_BIT_SECCOMP, SYSCALL_WORK_BIT_SYSCALL_TRACEPOINT, SYSCALL_WORK_BIT_SYSCALL_TRACE, + SYSCALL_WORK_BIT_SYSCALL_EMU, }; #define SYSCALL_WORK_SECCOMP BIT(SYSCALL_WORK_BIT_SECCOMP) #define SYSCALL_WORK_SYSCALL_TRACEPOINT BIT(SYSCALL_WORK_BIT_SYSCALL_TRACEPOINT) #define SYSCALL_WORK_SYSCALL_TRACE BIT(SYSCALL_WORK_BIT_SYSCALL_TRACE) +#define SYSCALL_WORK_SYSCALL_EMU BIT(SYSCALL_WORK_BIT_SYSCALL_EMU) #include diff --git a/include/linux/tracehook.h b/include/linux/tracehook.h index 3f20368afe9e..54b925224a13 100644 --- a/include/linux/tracehook.h +++ b/include/linux/tracehook.h @@ -84,7 +84,7 @@ static inline int ptrace_report_syscall(struct pt_regs *regs, * @regs: user register state of current task * * This will be called if %SYSCALL_WORK_SYSCALL_TRACE or - * %TIF_SYSCALL_EMU have been set, when the current task has just + * %SYSCALL_WORK_SYSCALL_EMU have been set, when the current task has just * entered the kernel for a system call. Full user register state is * available here. Changing the values in @regs can affect the system * call number and arguments to be tried. 
It is safe to block here, diff --git a/kernel/entry/common.c b/kernel/entry/common.c index 917328a9edaa..90533f34ea99 100644 --- a/kernel/entry/common.c +++ b/kernel/entry/common.c @@ -47,9 +47,9 @@ static long syscall_trace_enter(struct pt_regs *regs, long syscall, long ret = 0; /* Handle ptrace */ - if (work & SYSCALL_WORK_SYSCALL_TRACE || ti_work & _TIF_SYSCALL_EMU) { + if (work & (SYSCALL_WORK_SYSCALL_TRACE | SYSCALL_WORK_SYSCALL_EMU)) { ret = arch_syscall_enter_tracehook(regs); - if (ret || (ti_work & _TIF_SYSCALL_EMU)) + if (ret || (work & SYSCALL_WORK_SYSCALL_EMU)) return -1L; } @@ -208,21 +208,22 @@ static void exit_to_user_mode_prepare(struct pt_regs *regs) } #ifndef _TIF_SINGLESTEP -static inline bool report_single_step(unsigned long ti_work) +static inline bool report_single_step(unsigned long work) { return false; } #else /* - * If TIF_SYSCALL_EMU is set, then the only reason to report is when + * If SYSCALL_EMU is set, then the only reason to report is when * TIF_SINGLESTEP is set (i.e. PTRACE_SYSEMU_SINGLESTEP). This syscall * instruction has been already reported in syscall_enter_from_user_mode(). */ -#define SYSEMU_STEP (_TIF_SINGLESTEP | _TIF_SYSCALL_EMU) - -static inline bool report_single_step(unsigned long ti_work) +static inline bool report_single_step(unsigned long work) { - return (ti_work & SYSEMU_STEP) == _TIF_SINGLESTEP; + if (!(work & SYSCALL_WORK_SYSCALL_EMU)) + return false; + + return !!(current_thread_info()->flags & _TIF_SINGLESTEP); } #endif @@ -236,7 +237,7 @@ static void syscall_exit_work(struct pt_regs *regs, unsigned long ti_work, if (work & SYSCALL_WORK_SYSCALL_TRACEPOINT) trace_sys_exit(regs, syscall_get_return_value(current, regs)); - step = report_single_step(ti_work); + step = report_single_step(work); if (step || work & SYSCALL_WORK_SYSCALL_TRACE) arch_syscall_exit_tracehook(regs, step); } diff --git a/kernel/fork.c b/kernel/fork.c index 99f68c20f2ff..02b689a23457 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2159,8 +2159,8 @@ static __latent_entropy struct task_struct *copy_process( */ user_disable_single_step(p); clear_task_syscall_work(p, SYSCALL_TRACE); -#ifdef TIF_SYSCALL_EMU - clear_tsk_thread_flag(p, TIF_SYSCALL_EMU); +#if defined(CONFIG_GENERIC_ENTRY) || defined(TIF_SYSCALL_EMU) + clear_task_syscall_work(p, SYSCALL_EMU); #endif clear_tsk_latency_tracing(p); diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 55a2bc3186a7..237bcd6d255c 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -118,8 +118,8 @@ void __ptrace_unlink(struct task_struct *child) BUG_ON(!child->ptrace); clear_task_syscall_work(child, SYSCALL_TRACE); -#ifdef TIF_SYSCALL_EMU - clear_tsk_thread_flag(child, TIF_SYSCALL_EMU); +#if defined(CONFIG_GENERIC_ENTRY) || defined(TIF_SYSCALL_EMU) + clear_task_syscall_work(child, SYSCALL_EMU); #endif child->parent = child->real_parent; @@ -816,11 +816,11 @@ static int ptrace_resume(struct task_struct *child, long request, else clear_task_syscall_work(child, SYSCALL_TRACE); -#ifdef TIF_SYSCALL_EMU +#if defined(CONFIG_GENERIC_ENTRY) || defined(TIF_SYSCALL_EMU) if (request == PTRACE_SYSEMU || request == PTRACE_SYSEMU_SINGLESTEP) - set_tsk_thread_flag(child, TIF_SYSCALL_EMU); + set_task_syscall_work(child, SYSCALL_EMU); else - clear_tsk_thread_flag(child, TIF_SYSCALL_EMU); + clear_task_syscall_work(child, SYSCALL_EMU); #endif if (is_singleblock(request)) { -- cgit v1.2.3 From 785dc4eb7fd74e3b7f4eac468457b633117e1aea Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Mon, 16 Nov 2020 12:42:04 -0500 Subject: audit: Migrate 
to use SYSCALL_WORK flag On architectures using the generic syscall entry code the architecture independent syscall work is moved to flags in thread_info::syscall_work. This removes architecture dependencies and frees up TIF bits. Define SYSCALL_WORK_SYSCALL_AUDIT, use it in the generic entry code and convert the code which uses the TIF specific helper functions to use the new *_syscall_work() helpers which either resolve to the new mode for users of the generic entry code or to the TIF based functions for the other architectures. Signed-off-by: Gabriel Krisman Bertazi Signed-off-by: Thomas Gleixner Reviewed-by: Andy Lutomirski Link: https://lore.kernel.org/r/20201116174206.2639648-9-krisman@collabora.com --- include/asm-generic/syscall.h | 23 ++++++++++++++--------- include/linux/entry-common.h | 18 ++++++------------ include/linux/thread_info.h | 2 ++ kernel/auditsc.c | 4 ++-- 4 files changed, 24 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/include/asm-generic/syscall.h b/include/asm-generic/syscall.h index ed94e5658d0c..524218ae3825 100644 --- a/include/asm-generic/syscall.h +++ b/include/asm-generic/syscall.h @@ -43,9 +43,9 @@ int syscall_get_nr(struct task_struct *task, struct pt_regs *regs); * @regs: task_pt_regs() of @task * * It's only valid to call this when @task is stopped for system - * call exit tracing (due to %SYSCALL_WORK_SYSCALL_TRACE or TIF_SYSCALL_AUDIT), - * after tracehook_report_syscall_entry() returned nonzero to prevent - * the system call from taking place. + * call exit tracing (due to %SYSCALL_WORK_SYSCALL_TRACE or + * %SYSCALL_WORK_SYSCALL_AUDIT), after tracehook_report_syscall_entry() + * returned nonzero to prevent the system call from taking place. * * This rolls back the register state in @regs so it's as if the * system call instruction was a no-op. The registers containing @@ -63,7 +63,8 @@ void syscall_rollback(struct task_struct *task, struct pt_regs *regs); * Returns 0 if the system call succeeded, or -ERRORCODE if it failed. * * It's only valid to call this when @task is stopped for tracing on exit - * from a system call, due to %SYSCALL_WORK_SYSCALL_TRACE or %TIF_SYSCALL_AUDIT. + * from a system call, due to %SYSCALL_WORK_SYSCALL_TRACE or + * %SYSCALL_WORK_SYSCALL_AUDIT. */ long syscall_get_error(struct task_struct *task, struct pt_regs *regs); @@ -76,7 +77,8 @@ long syscall_get_error(struct task_struct *task, struct pt_regs *regs); * This value is meaningless if syscall_get_error() returned nonzero. * * It's only valid to call this when @task is stopped for tracing on exit - * from a system call, due to %SYSCALL_WORK_SYSCALL_TRACE or %TIF_SYSCALL_AUDIT. + * from a system call, due to %SYSCALL_WORK_SYSCALL_TRACE or + * %SYSCALL_WORK_SYSCALL_AUDIT. */ long syscall_get_return_value(struct task_struct *task, struct pt_regs *regs); @@ -93,7 +95,8 @@ long syscall_get_return_value(struct task_struct *task, struct pt_regs *regs); * code; the user sees a failed system call with this errno code. * * It's only valid to call this when @task is stopped for tracing on exit - * from a system call, due to %SYSCALL_WORK_SYSCALL_TRACE or %TIF_SYSCALL_AUDIT. + * from a system call, due to %SYSCALL_WORK_SYSCALL_TRACE or + * %SYSCALL_WORK_SYSCALL_AUDIT. */ void syscall_set_return_value(struct task_struct *task, struct pt_regs *regs, int error, long val); @@ -108,7 +111,8 @@ void syscall_set_return_value(struct task_struct *task, struct pt_regs *regs, * @args[0], and so on. 
* * It's only valid to call this when @task is stopped for tracing on - * entry to a system call, due to %SYSCALL_WORK_SYSCALL_TRACE or %TIF_SYSCALL_AUDIT. + * entry to a system call, due to %SYSCALL_WORK_SYSCALL_TRACE or + * %SYSCALL_WORK_SYSCALL_AUDIT. */ void syscall_get_arguments(struct task_struct *task, struct pt_regs *regs, unsigned long *args); @@ -123,7 +127,8 @@ void syscall_get_arguments(struct task_struct *task, struct pt_regs *regs, * The first argument gets value @args[0], and so on. * * It's only valid to call this when @task is stopped for tracing on - * entry to a system call, due to %SYSCALL_WORK_SYSCALL_TRACE or %TIF_SYSCALL_AUDIT. + * entry to a system call, due to %SYSCALL_WORK_SYSCALL_TRACE or + * %SYSCALL_WORK_SYSCALL_AUDIT. */ void syscall_set_arguments(struct task_struct *task, struct pt_regs *regs, const unsigned long *args); @@ -135,7 +140,7 @@ void syscall_set_arguments(struct task_struct *task, struct pt_regs *regs, * Returns the AUDIT_ARCH_* based on the system call convention in use. * * It's only valid to call this when @task is stopped on entry to a system - * call, due to %SYSCALL_WORK_SYSCALL_TRACE, %TIF_SYSCALL_AUDIT, or + * call, due to %SYSCALL_WORK_SYSCALL_TRACE, %SYSCALL_WORK_SYSCALL_AUDIT, or * %SYSCALL_WORK_SECCOMP. * * Architectures which permit CONFIG_HAVE_ARCH_SECCOMP_FILTER must diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h index b30f82bed92b..d7b96f42817f 100644 --- a/include/linux/entry-common.h +++ b/include/linux/entry-common.h @@ -13,10 +13,6 @@ * Define dummy _TIF work flags if not defined by the architecture or for * disabled functionality. */ -#ifndef _TIF_SYSCALL_AUDIT -# define _TIF_SYSCALL_AUDIT (0) -#endif - #ifndef _TIF_PATCH_PENDING # define _TIF_PATCH_PENDING (0) #endif @@ -36,9 +32,7 @@ # define ARCH_SYSCALL_ENTER_WORK (0) #endif -#define SYSCALL_ENTER_WORK \ - (_TIF_SYSCALL_AUDIT | \ - ARCH_SYSCALL_ENTER_WORK) +#define SYSCALL_ENTER_WORK ARCH_SYSCALL_ENTER_WORK /* * TIF flags handled in syscall_exit_to_user_mode() @@ -47,16 +41,16 @@ # define ARCH_SYSCALL_EXIT_WORK (0) #endif -#define SYSCALL_EXIT_WORK \ - (_TIF_SYSCALL_AUDIT | \ - ARCH_SYSCALL_EXIT_WORK) +#define SYSCALL_EXIT_WORK ARCH_SYSCALL_EXIT_WORK #define SYSCALL_WORK_ENTER (SYSCALL_WORK_SECCOMP | \ SYSCALL_WORK_SYSCALL_TRACEPOINT | \ SYSCALL_WORK_SYSCALL_TRACE | \ - SYSCALL_WORK_SYSCALL_EMU) + SYSCALL_WORK_SYSCALL_EMU | \ + SYSCALL_WORK_SYSCALL_AUDIT) #define SYSCALL_WORK_EXIT (SYSCALL_WORK_SYSCALL_TRACEPOINT | \ - SYSCALL_WORK_SYSCALL_TRACE) + SYSCALL_WORK_SYSCALL_TRACE | \ + SYSCALL_WORK_SYSCALL_AUDIT) /* * TIF flags handled in exit_to_user_mode_loop() diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h index 85b8a4216168..317363212ae9 100644 --- a/include/linux/thread_info.h +++ b/include/linux/thread_info.h @@ -40,12 +40,14 @@ enum syscall_work_bit { SYSCALL_WORK_BIT_SYSCALL_TRACEPOINT, SYSCALL_WORK_BIT_SYSCALL_TRACE, SYSCALL_WORK_BIT_SYSCALL_EMU, + SYSCALL_WORK_BIT_SYSCALL_AUDIT, }; #define SYSCALL_WORK_SECCOMP BIT(SYSCALL_WORK_BIT_SECCOMP) #define SYSCALL_WORK_SYSCALL_TRACEPOINT BIT(SYSCALL_WORK_BIT_SYSCALL_TRACEPOINT) #define SYSCALL_WORK_SYSCALL_TRACE BIT(SYSCALL_WORK_BIT_SYSCALL_TRACE) #define SYSCALL_WORK_SYSCALL_EMU BIT(SYSCALL_WORK_BIT_SYSCALL_EMU) +#define SYSCALL_WORK_SYSCALL_AUDIT BIT(SYSCALL_WORK_BIT_SYSCALL_AUDIT) #include diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 8dba8f0983b5..c00aa5837965 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -952,7 +952,7 @@ int audit_alloc(struct 
task_struct *tsk) state = audit_filter_task(tsk, &key); if (state == AUDIT_DISABLED) { - clear_tsk_thread_flag(tsk, TIF_SYSCALL_AUDIT); + clear_task_syscall_work(tsk, SYSCALL_AUDIT); return 0; } @@ -964,7 +964,7 @@ int audit_alloc(struct task_struct *tsk) context->filterkey = key; audit_set_context(tsk, context); - set_tsk_thread_flag(tsk, TIF_SYSCALL_AUDIT); + set_task_syscall_work(tsk, SYSCALL_AUDIT); return 0; } -- cgit v1.2.3 From 2991552447707d791d9d81a5dc161f9e9e90b163 Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Mon, 16 Nov 2020 12:42:05 -0500 Subject: entry: Drop usage of TIF flags in the generic syscall code Now that the flags migration in the common syscall entry code is complete and the code relies exclusively on thread_info::syscall_work, clean up the accesses to TI flags in that path. Signed-off-by: Gabriel Krisman Bertazi Signed-off-by: Thomas Gleixner Reviewed-by: Andy Lutomirski Link: https://lore.kernel.org/r/20201116174206.2639648-10-krisman@collabora.com --- include/linux/entry-common.h | 26 ++++++++++++-------------- kernel/entry/common.c | 17 +++++++---------- 2 files changed, 19 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h index d7b96f42817f..49b26b216e4e 100644 --- a/include/linux/entry-common.h +++ b/include/linux/entry-common.h @@ -26,31 +26,29 @@ #endif /* - * TIF flags handled in syscall_enter_from_user_mode() + * SYSCALL_WORK flags handled in syscall_enter_from_user_mode() */ -#ifndef ARCH_SYSCALL_ENTER_WORK -# define ARCH_SYSCALL_ENTER_WORK (0) +#ifndef ARCH_SYSCALL_WORK_ENTER +# define ARCH_SYSCALL_WORK_ENTER (0) #endif -#define SYSCALL_ENTER_WORK ARCH_SYSCALL_ENTER_WORK - /* - * TIF flags handled in syscall_exit_to_user_mode() + * SYSCALL_WORK flags handled in syscall_exit_to_user_mode() */ -#ifndef ARCH_SYSCALL_EXIT_WORK -# define ARCH_SYSCALL_EXIT_WORK (0) +#ifndef ARCH_SYSCALL_WORK_EXIT +# define ARCH_SYSCALL_WORK_EXIT (0) #endif -#define SYSCALL_EXIT_WORK ARCH_SYSCALL_EXIT_WORK - #define SYSCALL_WORK_ENTER (SYSCALL_WORK_SECCOMP | \ SYSCALL_WORK_SYSCALL_TRACEPOINT | \ SYSCALL_WORK_SYSCALL_TRACE | \ SYSCALL_WORK_SYSCALL_EMU | \ - SYSCALL_WORK_SYSCALL_AUDIT) + SYSCALL_WORK_SYSCALL_AUDIT | \ + ARCH_SYSCALL_WORK_ENTER) #define SYSCALL_WORK_EXIT (SYSCALL_WORK_SYSCALL_TRACEPOINT | \ SYSCALL_WORK_SYSCALL_TRACE | \ - SYSCALL_WORK_SYSCALL_AUDIT) + SYSCALL_WORK_SYSCALL_AUDIT | \ + ARCH_SYSCALL_WORK_EXIT) /* * TIF flags handled in exit_to_user_mode_loop() @@ -136,8 +134,8 @@ void syscall_enter_from_user_mode_prepare(struct pt_regs *regs); * * It handles the following work items: * - * 1) TIF flag dependent invocations of arch_syscall_enter_tracehook(), - * __secure_computing(), trace_sys_enter() + * 1) syscall_work flag dependent invocations of + * arch_syscall_enter_tracehook(), __secure_computing(), trace_sys_enter() * 2) Invocation of audit_syscall_entry() */ long syscall_enter_from_user_mode_work(struct pt_regs *regs, long syscall); diff --git a/kernel/entry/common.c b/kernel/entry/common.c index 90533f34ea99..91e8fd50adf4 100644 --- a/kernel/entry/common.c +++ b/kernel/entry/common.c @@ -42,7 +42,7 @@ static inline void syscall_enter_audit(struct pt_regs *regs, long syscall) } static long syscall_trace_enter(struct pt_regs *regs, long syscall, - unsigned long ti_work, unsigned long work) + unsigned long work) { long ret = 0; @@ -75,11 +75,9 @@ static __always_inline long __syscall_enter_from_user_work(struct pt_regs *regs, long syscall) { unsigned long work = 
READ_ONCE(current_thread_info()->syscall_work); - unsigned long ti_work; - ti_work = READ_ONCE(current_thread_info()->flags); - if (work & SYSCALL_WORK_ENTER || ti_work & SYSCALL_ENTER_WORK) - syscall = syscall_trace_enter(regs, syscall, ti_work, work); + if (work & SYSCALL_WORK_ENTER) + syscall = syscall_trace_enter(regs, syscall, work); return syscall; } @@ -227,8 +225,8 @@ static inline bool report_single_step(unsigned long work) } #endif -static void syscall_exit_work(struct pt_regs *regs, unsigned long ti_work, - unsigned long work) + +static void syscall_exit_work(struct pt_regs *regs, unsigned long work) { bool step; @@ -249,7 +247,6 @@ static void syscall_exit_work(struct pt_regs *regs, unsigned long ti_work, static void syscall_exit_to_user_mode_prepare(struct pt_regs *regs) { unsigned long work = READ_ONCE(current_thread_info()->syscall_work); - u32 cached_flags = READ_ONCE(current_thread_info()->flags); unsigned long nr = syscall_get_nr(current, regs); CT_WARN_ON(ct_state() != CONTEXT_KERNEL); @@ -266,8 +263,8 @@ static void syscall_exit_to_user_mode_prepare(struct pt_regs *regs) * enabled, we want to run them exactly once per syscall exit with * interrupts enabled. */ - if (unlikely(work & SYSCALL_WORK_EXIT || cached_flags & SYSCALL_EXIT_WORK)) - syscall_exit_work(regs, cached_flags, work); + if (unlikely(work & SYSCALL_WORK_EXIT)) + syscall_exit_work(regs, work); } __visible noinstr void syscall_exit_to_user_mode(struct pt_regs *regs) -- cgit v1.2.3 From 8e1ac4299a6e8726de42310d9c1379f188140c71 Mon Sep 17 00:00:00 2001 From: Quentin Perret Date: Thu, 12 Nov 2020 11:12:01 +0000 Subject: sched/fair: Fix overutilized update in enqueue_task_fair() enqueue_task_fair() attempts to skip the overutilized update for new tasks as their util_avg is not accurate yet. However, the flag we check to do so is overwritten earlier on in the function, which makes the condition pretty much a nop. Fix this by saving the flag early on. Fixes: 2802bf3cd936 ("sched/fair: Add over-utilization/tipping point indicator") Reported-by: Rick Yiu Signed-off-by: Quentin Perret Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Vincent Guittot Reviewed-by: Valentin Schneider Link: https://lkml.kernel.org/r/20201112111201.2081902-1-qperret@google.com --- kernel/sched/fair.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 8e563cf2b5e7..56a8ca93a971 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5477,6 +5477,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) struct cfs_rq *cfs_rq; struct sched_entity *se = &p->se; int idle_h_nr_running = task_has_idle_policy(p); + int task_new = !(flags & ENQUEUE_WAKEUP); /* * The code below (indirectly) updates schedutil which looks at @@ -5549,7 +5550,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) * into account, but that is not straightforward to implement, * and the following generally works well enough in practice. */ - if (flags & ENQUEUE_WAKEUP) + if (!task_new) update_overutilized_status(rq); enqueue_throttle: -- cgit v1.2.3 From ec618b84f6e15281cc3660664d34cd0dd2f2579e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 24 Sep 2020 13:50:42 +0200 Subject: sched: Fix rq->nr_iowait ordering schedule() ttwu() deactivate_task(); if (p->on_rq && ...) 
// false atomic_dec(&task_rq(p)->nr_iowait); if (prev->in_iowait) atomic_inc(&rq->nr_iowait); Allows nr_iowait to be decremented before it gets incremented, resulting in more dodgy IO-wait numbers than usual. Note that because we can now do ttwu_queue_wakelist() before p->on_cpu==0, we lose the natural ordering and have to further delay the decrement. Fixes: c6e7bd7afaeb ("sched/core: Optimize ttwu() spinning on p->on_cpu") Reported-by: Tejun Heo Signed-off-by: Peter Zijlstra (Intel) Acked-by: Mel Gorman Link: https://lkml.kernel.org/r/20201117093829.GD3121429@hirez.programming.kicks-ass.net --- kernel/sched/core.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index d2003a7d5ab5..9f0ebfb0d17b 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2501,7 +2501,12 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags, #ifdef CONFIG_SMP if (wake_flags & WF_MIGRATED) en_flags |= ENQUEUE_MIGRATED; + else #endif + if (p->in_iowait) { + delayacct_blkio_end(p); + atomic_dec(&task_rq(p)->nr_iowait); + } activate_task(rq, p, en_flags); ttwu_do_wakeup(rq, p, wake_flags, rf); @@ -2888,11 +2893,6 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) if (READ_ONCE(p->on_rq) && ttwu_runnable(p, wake_flags)) goto unlock; - if (p->in_iowait) { - delayacct_blkio_end(p); - atomic_dec(&task_rq(p)->nr_iowait); - } - #ifdef CONFIG_SMP /* * Ensure we load p->on_cpu _after_ p->on_rq, otherwise it would be @@ -2963,6 +2963,11 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags); if (task_cpu(p) != cpu) { + if (p->in_iowait) { + delayacct_blkio_end(p); + atomic_dec(&task_rq(p)->nr_iowait); + } + wake_flags |= WF_MIGRATED; psi_ttwu_dequeue(p); set_task_cpu(p, cpu); -- cgit v1.2.3 From 2279f540ea7d05f22d2f0c4224319330228586bc Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Tue, 17 Nov 2020 07:14:32 +0100 Subject: sched/deadline: Fix priority inheritance with multiple scheduling classes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Glenn reported that "an application [he developed produces] a BUG in deadline.c when a SCHED_DEADLINE task contends with CFS tasks on nested PTHREAD_PRIO_INHERIT mutexes. I believe the bug is triggered when a CFS task that was boosted by a SCHED_DEADLINE task boosts another CFS task (nested priority inheritance). ------------[ cut here ]------------ kernel BUG at kernel/sched/deadline.c:1462! invalid opcode: 0000 [#1] PREEMPT SMP CPU: 12 PID: 19171 Comm: dl_boost_bug Tainted: ... Hardware name: ... RIP: 0010:enqueue_task_dl+0x335/0x910 Code: ... RSP: 0018:ffffc9000c2bbc68 EFLAGS: 00010002 RAX: 0000000000000009 RBX: ffff888c0af94c00 RCX: ffffffff81e12500 RDX: 000000000000002e RSI: ffff888c0af94c00 RDI: ffff888c10b22600 RBP: ffffc9000c2bbd08 R08: 0000000000000009 R09: 0000000000000078 R10: ffffffff81e12440 R11: ffffffff81e1236c R12: ffff888bc8932600 R13: ffff888c0af94eb8 R14: ffff888c10b22600 R15: ffff888bc8932600 FS: 00007fa58ac55700(0000) GS:ffff888c10b00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007fa58b523230 CR3: 0000000bf44ab003 CR4: 00000000007606e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 PKRU: 55555554 Call Trace: ? 
intel_pstate_update_util_hwp+0x13/0x170 rt_mutex_setprio+0x1cc/0x4b0 task_blocks_on_rt_mutex+0x225/0x260 rt_spin_lock_slowlock_locked+0xab/0x2d0 rt_spin_lock_slowlock+0x50/0x80 hrtimer_grab_expiry_lock+0x20/0x30 hrtimer_cancel+0x13/0x30 do_nanosleep+0xa0/0x150 hrtimer_nanosleep+0xe1/0x230 ? __hrtimer_init_sleeper+0x60/0x60 __x64_sys_nanosleep+0x8d/0xa0 do_syscall_64+0x4a/0x100 entry_SYSCALL_64_after_hwframe+0x49/0xbe RIP: 0033:0x7fa58b52330d ... ---[ end trace 0000000000000002 ]— He also provided a simple reproducer creating the situation below: So the execution order of locking steps are the following (N1 and N2 are non-deadline tasks. D1 is a deadline task. M1 and M2 are mutexes that are enabled * with priority inheritance.) Time moves forward as this timeline goes down: N1 N2 D1 | | | | | | Lock(M1) | | | | | | Lock(M2) | | | | | | Lock(M2) | | | | Lock(M1) | | (!!bug triggered!) | Daniel reported a similar situation as well, by just letting ksoftirqd run with DEADLINE (and eventually block on a mutex). Problem is that boosted entities (Priority Inheritance) use static DEADLINE parameters of the top priority waiter. However, there might be cases where top waiter could be a non-DEADLINE entity that is currently boosted by a DEADLINE entity from a different lock chain (i.e., nested priority chains involving entities of non-DEADLINE classes). In this case, top waiter static DEADLINE parameters could be null (initialized to 0 at fork()) and replenish_dl_entity() would hit a BUG(). Fix this by keeping track of the original donor and using its parameters when a task is boosted. Reported-by: Glenn Elliott Reported-by: Daniel Bristot de Oliveira Signed-off-by: Juri Lelli Signed-off-by: Peter Zijlstra (Intel) Tested-by: Daniel Bristot de Oliveira Link: https://lkml.kernel.org/r/20201117061432.517340-1-juri.lelli@redhat.com --- include/linux/sched.h | 10 ++++- kernel/sched/core.c | 11 +++--- kernel/sched/deadline.c | 97 +++++++++++++++++++++++++++---------------------- 3 files changed, 68 insertions(+), 50 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index 0e91b451d2a2..095fdec07b38 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -551,7 +551,6 @@ struct sched_dl_entity { * overruns. */ unsigned int dl_throttled : 1; - unsigned int dl_boosted : 1; unsigned int dl_yielded : 1; unsigned int dl_non_contending : 1; unsigned int dl_overrun : 1; @@ -570,6 +569,15 @@ struct sched_dl_entity { * time. */ struct hrtimer inactive_timer; + +#ifdef CONFIG_RT_MUTEXES + /* + * Priority Inheritance. When a DEADLINE scheduling entity is boosted + * pi_se points to the donor, otherwise points to the dl_se it belongs + * to (the original one/itself). 
+ */ + struct sched_dl_entity *pi_se; +#endif }; #ifdef CONFIG_UCLAMP_TASK diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 9f0ebfb0d17b..e7e453492cff 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4912,20 +4912,21 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task) if (!dl_prio(p->normal_prio) || (pi_task && dl_prio(pi_task->prio) && dl_entity_preempt(&pi_task->dl, &p->dl))) { - p->dl.dl_boosted = 1; + p->dl.pi_se = pi_task->dl.pi_se; queue_flag |= ENQUEUE_REPLENISH; - } else - p->dl.dl_boosted = 0; + } else { + p->dl.pi_se = &p->dl; + } p->sched_class = &dl_sched_class; } else if (rt_prio(prio)) { if (dl_prio(oldprio)) - p->dl.dl_boosted = 0; + p->dl.pi_se = &p->dl; if (oldprio < prio) queue_flag |= ENQUEUE_HEAD; p->sched_class = &rt_sched_class; } else { if (dl_prio(oldprio)) - p->dl.dl_boosted = 0; + p->dl.pi_se = &p->dl; if (rt_prio(oldprio)) p->rt.timeout = 0; p->sched_class = &fair_sched_class; diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 6d93f4518734..949bc5c083c1 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -43,6 +43,28 @@ static inline int on_dl_rq(struct sched_dl_entity *dl_se) return !RB_EMPTY_NODE(&dl_se->rb_node); } +#ifdef CONFIG_RT_MUTEXES +static inline struct sched_dl_entity *pi_of(struct sched_dl_entity *dl_se) +{ + return dl_se->pi_se; +} + +static inline bool is_dl_boosted(struct sched_dl_entity *dl_se) +{ + return pi_of(dl_se) != dl_se; +} +#else +static inline struct sched_dl_entity *pi_of(struct sched_dl_entity *dl_se) +{ + return dl_se; +} + +static inline bool is_dl_boosted(struct sched_dl_entity *dl_se) +{ + return false; +} +#endif + #ifdef CONFIG_SMP static inline struct dl_bw *dl_bw_of(int i) { @@ -698,7 +720,7 @@ static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se) struct dl_rq *dl_rq = dl_rq_of_se(dl_se); struct rq *rq = rq_of_dl_rq(dl_rq); - WARN_ON(dl_se->dl_boosted); + WARN_ON(is_dl_boosted(dl_se)); WARN_ON(dl_time_before(rq_clock(rq), dl_se->deadline)); /* @@ -736,21 +758,20 @@ static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se) * could happen are, typically, a entity voluntarily trying to overcome its * runtime, or it just underestimated it during sched_setattr(). */ -static void replenish_dl_entity(struct sched_dl_entity *dl_se, - struct sched_dl_entity *pi_se) +static void replenish_dl_entity(struct sched_dl_entity *dl_se) { struct dl_rq *dl_rq = dl_rq_of_se(dl_se); struct rq *rq = rq_of_dl_rq(dl_rq); - BUG_ON(pi_se->dl_runtime <= 0); + BUG_ON(pi_of(dl_se)->dl_runtime <= 0); /* * This could be the case for a !-dl task that is boosted. * Just go with full inherited parameters. */ if (dl_se->dl_deadline == 0) { - dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline; - dl_se->runtime = pi_se->dl_runtime; + dl_se->deadline = rq_clock(rq) + pi_of(dl_se)->dl_deadline; + dl_se->runtime = pi_of(dl_se)->dl_runtime; } if (dl_se->dl_yielded && dl_se->runtime > 0) @@ -763,8 +784,8 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se, * arbitrary large. 
*/ while (dl_se->runtime <= 0) { - dl_se->deadline += pi_se->dl_period; - dl_se->runtime += pi_se->dl_runtime; + dl_se->deadline += pi_of(dl_se)->dl_period; + dl_se->runtime += pi_of(dl_se)->dl_runtime; } /* @@ -778,8 +799,8 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se, */ if (dl_time_before(dl_se->deadline, rq_clock(rq))) { printk_deferred_once("sched: DL replenish lagged too much\n"); - dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline; - dl_se->runtime = pi_se->dl_runtime; + dl_se->deadline = rq_clock(rq) + pi_of(dl_se)->dl_deadline; + dl_se->runtime = pi_of(dl_se)->dl_runtime; } if (dl_se->dl_yielded) @@ -812,8 +833,7 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se, * task with deadline equal to period this is the same of using * dl_period instead of dl_deadline in the equation above. */ -static bool dl_entity_overflow(struct sched_dl_entity *dl_se, - struct sched_dl_entity *pi_se, u64 t) +static bool dl_entity_overflow(struct sched_dl_entity *dl_se, u64 t) { u64 left, right; @@ -835,9 +855,9 @@ static bool dl_entity_overflow(struct sched_dl_entity *dl_se, * of anything below microseconds resolution is actually fiction * (but still we want to give the user that illusion >;). */ - left = (pi_se->dl_deadline >> DL_SCALE) * (dl_se->runtime >> DL_SCALE); + left = (pi_of(dl_se)->dl_deadline >> DL_SCALE) * (dl_se->runtime >> DL_SCALE); right = ((dl_se->deadline - t) >> DL_SCALE) * - (pi_se->dl_runtime >> DL_SCALE); + (pi_of(dl_se)->dl_runtime >> DL_SCALE); return dl_time_before(right, left); } @@ -922,24 +942,23 @@ static inline bool dl_is_implicit(struct sched_dl_entity *dl_se) * Please refer to the comments update_dl_revised_wakeup() function to find * more about the Revised CBS rule. */ -static void update_dl_entity(struct sched_dl_entity *dl_se, - struct sched_dl_entity *pi_se) +static void update_dl_entity(struct sched_dl_entity *dl_se) { struct dl_rq *dl_rq = dl_rq_of_se(dl_se); struct rq *rq = rq_of_dl_rq(dl_rq); if (dl_time_before(dl_se->deadline, rq_clock(rq)) || - dl_entity_overflow(dl_se, pi_se, rq_clock(rq))) { + dl_entity_overflow(dl_se, rq_clock(rq))) { if (unlikely(!dl_is_implicit(dl_se) && !dl_time_before(dl_se->deadline, rq_clock(rq)) && - !dl_se->dl_boosted)){ + !is_dl_boosted(dl_se))) { update_dl_revised_wakeup(dl_se, rq); return; } - dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline; - dl_se->runtime = pi_se->dl_runtime; + dl_se->deadline = rq_clock(rq) + pi_of(dl_se)->dl_deadline; + dl_se->runtime = pi_of(dl_se)->dl_runtime; } } @@ -1038,7 +1057,7 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) * The task might have been boosted by someone else and might be in the * boosting/deboosting path, its not throttled. */ - if (dl_se->dl_boosted) + if (is_dl_boosted(dl_se)) goto unlock; /* @@ -1066,7 +1085,7 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) * but do not enqueue -- wait for our wakeup to do that. 
*/ if (!task_on_rq_queued(p)) { - replenish_dl_entity(dl_se, dl_se); + replenish_dl_entity(dl_se); goto unlock; } @@ -1156,7 +1175,7 @@ static inline void dl_check_constrained_dl(struct sched_dl_entity *dl_se) if (dl_time_before(dl_se->deadline, rq_clock(rq)) && dl_time_before(rq_clock(rq), dl_next_period(dl_se))) { - if (unlikely(dl_se->dl_boosted || !start_dl_timer(p))) + if (unlikely(is_dl_boosted(dl_se) || !start_dl_timer(p))) return; dl_se->dl_throttled = 1; if (dl_se->runtime > 0) @@ -1287,7 +1306,7 @@ throttle: dl_se->dl_overrun = 1; __dequeue_task_dl(rq, curr, 0); - if (unlikely(dl_se->dl_boosted || !start_dl_timer(curr))) + if (unlikely(is_dl_boosted(dl_se) || !start_dl_timer(curr))) enqueue_task_dl(rq, curr, ENQUEUE_REPLENISH); if (!is_leftmost(curr, &rq->dl)) @@ -1481,8 +1500,7 @@ static void __dequeue_dl_entity(struct sched_dl_entity *dl_se) } static void -enqueue_dl_entity(struct sched_dl_entity *dl_se, - struct sched_dl_entity *pi_se, int flags) +enqueue_dl_entity(struct sched_dl_entity *dl_se, int flags) { BUG_ON(on_dl_rq(dl_se)); @@ -1493,9 +1511,9 @@ enqueue_dl_entity(struct sched_dl_entity *dl_se, */ if (flags & ENQUEUE_WAKEUP) { task_contending(dl_se, flags); - update_dl_entity(dl_se, pi_se); + update_dl_entity(dl_se); } else if (flags & ENQUEUE_REPLENISH) { - replenish_dl_entity(dl_se, pi_se); + replenish_dl_entity(dl_se); } else if ((flags & ENQUEUE_RESTORE) && dl_time_before(dl_se->deadline, rq_clock(rq_of_dl_rq(dl_rq_of_se(dl_se))))) { @@ -1512,19 +1530,7 @@ static void dequeue_dl_entity(struct sched_dl_entity *dl_se) static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) { - struct task_struct *pi_task = rt_mutex_get_top_task(p); - struct sched_dl_entity *pi_se = &p->dl; - - /* - * Use the scheduling parameters of the top pi-waiter task if: - * - we have a top pi-waiter which is a SCHED_DEADLINE task AND - * - our dl_boosted is set (i.e. the pi-waiter's (absolute) deadline is - * smaller than our deadline OR we are a !SCHED_DEADLINE task getting - * boosted due to a SCHED_DEADLINE pi-waiter). - * Otherwise we keep our runtime and deadline. - */ - if (pi_task && dl_prio(pi_task->normal_prio) && p->dl.dl_boosted) { - pi_se = &pi_task->dl; + if (is_dl_boosted(&p->dl)) { /* * Because of delays in the detection of the overrun of a * thread's runtime, it might be the case that a thread @@ -1557,7 +1563,7 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) * the throttle. 
*/ p->dl.dl_throttled = 0; - BUG_ON(!p->dl.dl_boosted || flags != ENQUEUE_REPLENISH); + BUG_ON(!is_dl_boosted(&p->dl) || flags != ENQUEUE_REPLENISH); return; } @@ -1594,7 +1600,7 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) return; } - enqueue_dl_entity(&p->dl, pi_se, flags); + enqueue_dl_entity(&p->dl, flags); if (!task_current(rq, p) && p->nr_cpus_allowed > 1) enqueue_pushable_dl_task(rq, p); @@ -2787,11 +2793,14 @@ void __dl_clear_params(struct task_struct *p) dl_se->dl_bw = 0; dl_se->dl_density = 0; - dl_se->dl_boosted = 0; dl_se->dl_throttled = 0; dl_se->dl_yielded = 0; dl_se->dl_non_contending = 0; dl_se->dl_overrun = 0; + +#ifdef CONFIG_RT_MUTEXES + dl_se->pi_se = dl_se; +#endif } bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr) -- cgit v1.2.3 From 43be4388e94b915799a24f0eaf664bf95b85231f Mon Sep 17 00:00:00 2001 From: Boqun Feng Date: Fri, 13 Nov 2020 19:05:03 +0800 Subject: lockdep: Put graph lock/unlock under lock_recursion protection A warning was hit when running xfstests/generic/068 in a Hyper-V guest: [...] ------------[ cut here ]------------ [...] DEBUG_LOCKS_WARN_ON(lockdep_hardirqs_enabled()) [...] WARNING: CPU: 2 PID: 1350 at kernel/locking/lockdep.c:5280 check_flags.part.0+0x165/0x170 [...] ... [...] Workqueue: events pwq_unbound_release_workfn [...] RIP: 0010:check_flags.part.0+0x165/0x170 [...] ... [...] Call Trace: [...] lock_is_held_type+0x72/0x150 [...] ? lock_acquire+0x16e/0x4a0 [...] rcu_read_lock_sched_held+0x3f/0x80 [...] __send_ipi_one+0x14d/0x1b0 [...] hv_send_ipi+0x12/0x30 [...] __pv_queued_spin_unlock_slowpath+0xd1/0x110 [...] __raw_callee_save___pv_queued_spin_unlock_slowpath+0x11/0x20 [...] .slowpath+0x9/0xe [...] lockdep_unregister_key+0x128/0x180 [...] pwq_unbound_release_workfn+0xbb/0xf0 [...] process_one_work+0x227/0x5c0 [...] worker_thread+0x55/0x3c0 [...] ? process_one_work+0x5c0/0x5c0 [...] kthread+0x153/0x170 [...] ? __kthread_bind_mask+0x60/0x60 [...] ret_from_fork+0x1f/0x30 The cause of the problem is we have call chain lockdep_unregister_key() -> lockdep_unlock() -> arch_spin_unlock() -> __pv_queued_spin_unlock_slowpath() -> pv_kick() -> __send_ipi_one() -> trace_hyperv_send_ipi_one(). Although this particular warning is triggered because Hyper-V has a trace point in ipi sending, but in general arch_spin_unlock() may call another function having a trace point in it, so put the arch_spin_lock() and arch_spin_unlock() after lock_recursion protection to fix this problem and avoid similiar problems. 
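The fix amounts to widening the recursion guard so that it also covers the raw spinlock operations themselves, because those operations can fire tracepoints that call back into lockdep. Below is a minimal standalone sketch of that ordering; the names echo the kernel's, but the single global counter and the explicit tracepoint hook are simplifications assumed for a compilable example, whereas the kernel uses a per-CPU lockdep_recursion counter around arch_spin_lock()/arch_spin_unlock().

#include <stdio.h>

static int lockdep_recursion;   /* per-CPU in the kernel, global in this toy */
static int graph_locked;        /* stand-in for the internal graph lock */

/* Stand-in for a tracepoint fired from inside the unlock slow path. */
static void tracepoint_from_unlock(void)
{
        if (lockdep_recursion)
                return;         /* guard is up: do not re-enter the checker */
        printf("BUG: checker re-entered from unlock instrumentation\n");
}

static void graph_lock(void)
{
        lockdep_recursion++;    /* raise the guard first ...  */
        graph_locked = 1;       /* ... then take the raw lock */
}

static void graph_unlock(void)
{
        graph_locked = 0;       /* drop the raw lock: may fire instrumentation */
        tracepoint_from_unlock();
        lockdep_recursion--;    /* lower the guard only after the unlock */
}

int main(void)
{
        graph_lock();
        graph_unlock();         /* silent: the guard covered the unlock path */
        printf("graph_locked=%d recursion=%d\n", graph_locked, lockdep_recursion);
        return 0;
}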
Signed-off-by: Boqun Feng Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20201113110512.1056501-1-boqun.feng@gmail.com --- kernel/locking/lockdep.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index d9fb9e19d2ed..c1418b47f625 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -108,19 +108,21 @@ static inline void lockdep_lock(void) { DEBUG_LOCKS_WARN_ON(!irqs_disabled()); + __this_cpu_inc(lockdep_recursion); arch_spin_lock(&__lock); __owner = current; - __this_cpu_inc(lockdep_recursion); } static inline void lockdep_unlock(void) { + DEBUG_LOCKS_WARN_ON(!irqs_disabled()); + if (debug_locks && DEBUG_LOCKS_WARN_ON(__owner != current)) return; - __this_cpu_dec(lockdep_recursion); __owner = NULL; arch_spin_unlock(&__lock); + __this_cpu_dec(lockdep_recursion); } static inline bool lockdep_assert_locked(void) -- cgit v1.2.3 From 66f4fa32eb18af9a60bbda589ee239621a49bcc1 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Tue, 3 Nov 2020 22:45:04 +0200 Subject: resource: Simplify region_intersects() by reducing conditionals Now we have for 'other' and 'type' variables other type return 0 0 REGION_DISJOINT 0 x REGION_INTERSECTS x 0 REGION_DISJOINT x x REGION_MIXED Obviously it's easier to check 'type' for 0 first instead of currently checked 'other'. Signed-off-by: Andy Shevchenko Reviewed-by: Hanjun Guo Tested-by: Hanjun Guo Signed-off-by: Rafael J. Wysocki --- kernel/resource.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/resource.c b/kernel/resource.c index 3ae2f56cc79d..82df80417489 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -557,13 +557,13 @@ int region_intersects(resource_size_t start, size_t size, unsigned long flags, } read_unlock(&resource_lock); - if (other == 0) - return type ? REGION_INTERSECTS : REGION_DISJOINT; + if (type == 0) + return REGION_DISJOINT; - if (type) - return REGION_MIXED; + if (other == 0) + return REGION_INTERSECTS; - return REGION_DISJOINT; + return REGION_MIXED; } EXPORT_SYMBOL_GPL(region_intersects); -- cgit v1.2.3 From 5df38ca6afeceaf3ea911ad2f7e2101364dee48d Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Tue, 3 Nov 2020 22:45:08 +0200 Subject: resource: Add test cases for new resource API Add test cases for newly added resource APIs. Signed-off-by: Andy Shevchenko Signed-off-by: Rafael J. 
Wysocki --- kernel/Makefile | 1 + kernel/resource_kunit.c | 150 ++++++++++++++++++++++++++++++++++++++++++++++++ lib/Kconfig.debug | 11 ++++ 3 files changed, 162 insertions(+) create mode 100644 kernel/resource_kunit.c (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index af601b9bda0e..aac15aeb9d69 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -123,6 +123,7 @@ obj-$(CONFIG_HAS_IOMEM) += iomem.o obj-$(CONFIG_RSEQ) += rseq.o obj-$(CONFIG_WATCH_QUEUE) += watch_queue.o +obj-$(CONFIG_RESOURCE_KUNIT_TEST) += resource_kunit.o obj-$(CONFIG_SYSCTL_KUNIT_TEST) += sysctl-test.o CFLAGS_stackleak.o += $(DISABLE_STACKLEAK_PLUGIN) diff --git a/kernel/resource_kunit.c b/kernel/resource_kunit.c new file mode 100644 index 000000000000..9fdbca8426f1 --- /dev/null +++ b/kernel/resource_kunit.c @@ -0,0 +1,150 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * Test cases for API provided by resource.c and ioport.h + */ + +#include +#include +#include +#include + +#define R0_START 0x0000 +#define R0_END 0xffff +#define R1_START 0x1234 +#define R1_END 0x2345 +#define R2_START 0x4567 +#define R2_END 0x5678 +#define R3_START 0x6789 +#define R3_END 0x789a +#define R4_START 0x2000 +#define R4_END 0x7000 + +static struct resource r0 = { .start = R0_START, .end = R0_END }; +static struct resource r1 = { .start = R1_START, .end = R1_END }; +static struct resource r2 = { .start = R2_START, .end = R2_END }; +static struct resource r3 = { .start = R3_START, .end = R3_END }; +static struct resource r4 = { .start = R4_START, .end = R4_END }; + +struct result { + struct resource *r1; + struct resource *r2; + struct resource r; + bool ret; +}; + +static struct result results_for_union[] = { + { + .r1 = &r1, .r2 = &r0, .r.start = R0_START, .r.end = R0_END, .ret = true, + }, { + .r1 = &r2, .r2 = &r0, .r.start = R0_START, .r.end = R0_END, .ret = true, + }, { + .r1 = &r3, .r2 = &r0, .r.start = R0_START, .r.end = R0_END, .ret = true, + }, { + .r1 = &r4, .r2 = &r0, .r.start = R0_START, .r.end = R0_END, .ret = true, + }, { + .r1 = &r2, .r2 = &r1, .ret = false, + }, { + .r1 = &r3, .r2 = &r1, .ret = false, + }, { + .r1 = &r4, .r2 = &r1, .r.start = R1_START, .r.end = R4_END, .ret = true, + }, { + .r1 = &r2, .r2 = &r3, .ret = false, + }, { + .r1 = &r2, .r2 = &r4, .r.start = R4_START, .r.end = R4_END, .ret = true, + }, { + .r1 = &r3, .r2 = &r4, .r.start = R4_START, .r.end = R3_END, .ret = true, + }, +}; + +static struct result results_for_intersection[] = { + { + .r1 = &r1, .r2 = &r0, .r.start = R1_START, .r.end = R1_END, .ret = true, + }, { + .r1 = &r2, .r2 = &r0, .r.start = R2_START, .r.end = R2_END, .ret = true, + }, { + .r1 = &r3, .r2 = &r0, .r.start = R3_START, .r.end = R3_END, .ret = true, + }, { + .r1 = &r4, .r2 = &r0, .r.start = R4_START, .r.end = R4_END, .ret = true, + }, { + .r1 = &r2, .r2 = &r1, .ret = false, + }, { + .r1 = &r3, .r2 = &r1, .ret = false, + }, { + .r1 = &r4, .r2 = &r1, .r.start = R4_START, .r.end = R1_END, .ret = true, + }, { + .r1 = &r2, .r2 = &r3, .ret = false, + }, { + .r1 = &r2, .r2 = &r4, .r.start = R2_START, .r.end = R2_END, .ret = true, + }, { + .r1 = &r3, .r2 = &r4, .r.start = R3_START, .r.end = R4_END, .ret = true, + }, +}; + +static void resource_do_test(struct kunit *test, bool ret, struct resource *r, + bool exp_ret, struct resource *exp_r, + struct resource *r1, struct resource *r2) +{ + KUNIT_EXPECT_EQ_MSG(test, ret, exp_ret, "Resources %pR %pR", r1, r2); + KUNIT_EXPECT_EQ_MSG(test, r->start, exp_r->start, "Start elements are not equal"); + KUNIT_EXPECT_EQ_MSG(test, 
r->end, exp_r->end, "End elements are not equal"); +} + +static void resource_do_union_test(struct kunit *test, struct result *r) +{ + struct resource result; + bool ret; + + memset(&result, 0, sizeof(result)); + ret = resource_union(r->r1, r->r2, &result); + resource_do_test(test, ret, &result, r->ret, &r->r, r->r1, r->r2); + + memset(&result, 0, sizeof(result)); + ret = resource_union(r->r2, r->r1, &result); + resource_do_test(test, ret, &result, r->ret, &r->r, r->r2, r->r1); +} + +static void resource_test_union(struct kunit *test) +{ + struct result *r = results_for_union; + unsigned int i = 0; + + do { + resource_do_union_test(test, &r[i]); + } while (++i < ARRAY_SIZE(results_for_union)); +} + +static void resource_do_intersection_test(struct kunit *test, struct result *r) +{ + struct resource result; + bool ret; + + memset(&result, 0, sizeof(result)); + ret = resource_intersection(r->r1, r->r2, &result); + resource_do_test(test, ret, &result, r->ret, &r->r, r->r1, r->r2); + + memset(&result, 0, sizeof(result)); + ret = resource_intersection(r->r2, r->r1, &result); + resource_do_test(test, ret, &result, r->ret, &r->r, r->r2, r->r1); +} + +static void resource_test_intersection(struct kunit *test) +{ + struct result *r = results_for_intersection; + unsigned int i = 0; + + do { + resource_do_intersection_test(test, &r[i]); + } while (++i < ARRAY_SIZE(results_for_intersection)); +} + +static struct kunit_case resource_test_cases[] = { + KUNIT_CASE(resource_test_union), + KUNIT_CASE(resource_test_intersection), + {} +}; + +static struct kunit_suite resource_test_suite = { + .name = "resource", + .test_cases = resource_test_cases, +}; +kunit_test_suite(resource_test_suite); diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index c789b39ed527..64f9501a6b5c 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -2226,6 +2226,17 @@ config BITFIELD_KUNIT If unsure, say N. +config RESOURCE_KUNIT_TEST + tristate "KUnit test for resource API" + depends on KUNIT + help + This builds the resource API unit test. + Tests the logic of API provided by resource.c and ioport.h. + For more information on KUnit and unit tests in general please refer + to the KUnit documentation in Documentation/dev-tools/kunit/. + + If unsure, say N. + config SYSCTL_KUNIT_TEST tristate "KUnit test for sysctl" if !KUNIT_ALL_TESTS depends on KUNIT -- cgit v1.2.3 From 172292be01dbd6c26aba23f62e8ec090f31cdb71 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 6 Nov 2020 19:19:41 +0100 Subject: dma-mapping: remove dma_virt_ops Now that the RDMA core deals with devices that only do DMA mapping in lower layers properly, there is no user for dma_virt_ops and it can be removed. 
Link: https://lore.kernel.org/r/20201106181941.1878556-11-hch@lst.de Signed-off-by: Christoph Hellwig Signed-off-by: Jason Gunthorpe --- include/linux/dma-mapping.h | 2 -- kernel/dma/Kconfig | 5 ---- kernel/dma/Makefile | 1 - kernel/dma/virt.c | 61 --------------------------------------------- 4 files changed, 69 deletions(-) delete mode 100644 kernel/dma/virt.c (limited to 'kernel') diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h index 956151052d45..2aaed35b556d 100644 --- a/include/linux/dma-mapping.h +++ b/include/linux/dma-mapping.h @@ -565,6 +565,4 @@ static inline int dma_mmap_wc(struct device *dev, int dma_direct_set_offset(struct device *dev, phys_addr_t cpu_start, dma_addr_t dma_start, u64 size); -extern const struct dma_map_ops dma_virt_ops; - #endif /* _LINUX_DMA_MAPPING_H */ diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig index c99de4a21458..fd2db2665fc6 100644 --- a/kernel/dma/Kconfig +++ b/kernel/dma/Kconfig @@ -75,11 +75,6 @@ config ARCH_HAS_DMA_PREP_COHERENT config ARCH_HAS_FORCE_DMA_UNENCRYPTED bool -config DMA_VIRT_OPS - bool - depends on HAS_DMA - select DMA_OPS - config SWIOTLB bool select NEED_DMA_MAP_STATE diff --git a/kernel/dma/Makefile b/kernel/dma/Makefile index dc755ab68aab..cd1d86358a7a 100644 --- a/kernel/dma/Makefile +++ b/kernel/dma/Makefile @@ -5,7 +5,6 @@ obj-$(CONFIG_DMA_OPS) += ops_helpers.o obj-$(CONFIG_DMA_OPS) += dummy.o obj-$(CONFIG_DMA_CMA) += contiguous.o obj-$(CONFIG_DMA_DECLARE_COHERENT) += coherent.o -obj-$(CONFIG_DMA_VIRT_OPS) += virt.o obj-$(CONFIG_DMA_API_DEBUG) += debug.o obj-$(CONFIG_SWIOTLB) += swiotlb.o obj-$(CONFIG_DMA_COHERENT_POOL) += pool.o diff --git a/kernel/dma/virt.c b/kernel/dma/virt.c deleted file mode 100644 index 59d32317dd57..000000000000 --- a/kernel/dma/virt.c +++ /dev/null @@ -1,61 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * DMA operations that map to virtual addresses without flushing memory. 
- */ -#include -#include -#include -#include - -static void *dma_virt_alloc(struct device *dev, size_t size, - dma_addr_t *dma_handle, gfp_t gfp, - unsigned long attrs) -{ - void *ret; - - ret = (void *)__get_free_pages(gfp | __GFP_ZERO, get_order(size)); - if (ret) - *dma_handle = (uintptr_t)ret; - return ret; -} - -static void dma_virt_free(struct device *dev, size_t size, - void *cpu_addr, dma_addr_t dma_addr, - unsigned long attrs) -{ - free_pages((unsigned long)cpu_addr, get_order(size)); -} - -static dma_addr_t dma_virt_map_page(struct device *dev, struct page *page, - unsigned long offset, size_t size, - enum dma_data_direction dir, - unsigned long attrs) -{ - return (uintptr_t)(page_address(page) + offset); -} - -static int dma_virt_map_sg(struct device *dev, struct scatterlist *sgl, - int nents, enum dma_data_direction dir, - unsigned long attrs) -{ - int i; - struct scatterlist *sg; - - for_each_sg(sgl, sg, nents, i) { - BUG_ON(!sg_page(sg)); - sg_dma_address(sg) = (uintptr_t)sg_virt(sg); - sg_dma_len(sg) = sg->length; - } - - return nents; -} - -const struct dma_map_ops dma_virt_ops = { - .alloc = dma_virt_alloc, - .free = dma_virt_free, - .map_page = dma_virt_map_page, - .map_sg = dma_virt_map_sg, - .alloc_pages = dma_common_alloc_pages, - .free_pages = dma_common_free_pages, -}; -EXPORT_SYMBOL(dma_virt_ops); -- cgit v1.2.3 From cf23705244c947151179f929774fabf71e239eee Mon Sep 17 00:00:00 2001 From: Mickaël Salaün Date: Fri, 30 Oct 2020 13:38:48 +0100 Subject: ptrace: Set PF_SUPERPRIV when checking capability MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit 69f594a38967 ("ptrace: do not audit capability check when outputing /proc/pid/stat") replaced the use of ns_capable() with has_ns_capability{,_noaudit}() which doesn't set PF_SUPERPRIV. Commit 6b3ad6649a4c ("ptrace: reintroduce usage of subjective credentials in ptrace_has_cap()") replaced has_ns_capability{,_noaudit}() with security_capable(), which doesn't set PF_SUPERPRIV neither. Since commit 98f368e9e263 ("kernel: Add noaudit variant of ns_capable()"), a new ns_capable_noaudit() helper is available. Let's use it! As a result, the signature of ptrace_has_cap() is restored to its original one. Cc: Christian Brauner Cc: Eric Paris Cc: Jann Horn Cc: Kees Cook Cc: Oleg Nesterov Cc: Serge E. 
Hallyn Cc: Tyler Hicks Cc: stable@vger.kernel.org Fixes: 6b3ad6649a4c ("ptrace: reintroduce usage of subjective credentials in ptrace_has_cap()") Fixes: 69f594a38967 ("ptrace: do not audit capability check when outputing /proc/pid/stat") Signed-off-by: Mickaël Salaün Reviewed-by: Jann Horn Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/20201030123849.770769-2-mic@digikod.net --- kernel/ptrace.c | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 43d6179508d6..79de1294f8eb 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -264,17 +264,11 @@ static int ptrace_check_attach(struct task_struct *child, bool ignore_state) return ret; } -static bool ptrace_has_cap(const struct cred *cred, struct user_namespace *ns, - unsigned int mode) +static bool ptrace_has_cap(struct user_namespace *ns, unsigned int mode) { - int ret; - if (mode & PTRACE_MODE_NOAUDIT) - ret = security_capable(cred, ns, CAP_SYS_PTRACE, CAP_OPT_NOAUDIT); - else - ret = security_capable(cred, ns, CAP_SYS_PTRACE, CAP_OPT_NONE); - - return ret == 0; + return ns_capable_noaudit(ns, CAP_SYS_PTRACE); + return ns_capable(ns, CAP_SYS_PTRACE); } /* Returns 0 on success, -errno on denial. */ @@ -326,7 +320,7 @@ static int __ptrace_may_access(struct task_struct *task, unsigned int mode) gid_eq(caller_gid, tcred->sgid) && gid_eq(caller_gid, tcred->gid)) goto ok; - if (ptrace_has_cap(cred, tcred->user_ns, mode)) + if (ptrace_has_cap(tcred->user_ns, mode)) goto ok; rcu_read_unlock(); return -EPERM; @@ -345,7 +339,7 @@ ok: mm = task->mm; if (mm && ((get_dumpable(mm) != SUID_DUMP_USER) && - !ptrace_has_cap(cred, mm->user_ns, mode))) + !ptrace_has_cap(mm->user_ns, mode))) return -EPERM; return security_ptrace_access_check(task, mode); -- cgit v1.2.3 From fb14528e443646dd3fd02df4437fcf5265b66baa Mon Sep 17 00:00:00 2001 From: Mickaël Salaün Date: Fri, 30 Oct 2020 13:38:49 +0100 Subject: seccomp: Set PF_SUPERPRIV when checking capability MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace the use of security_capable(current_cred(), ...) with ns_capable_noaudit() which set PF_SUPERPRIV. Since commit 98f368e9e263 ("kernel: Add noaudit variant of ns_capable()"), a new ns_capable_noaudit() helper is available. Let's use it! Cc: Jann Horn Cc: Kees Cook Cc: Tyler Hicks Cc: Will Drewry Cc: stable@vger.kernel.org Fixes: e2cfabdfd075 ("seccomp: add system call filtering using BPF") Signed-off-by: Mickaël Salaün Reviewed-by: Jann Horn Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/20201030123849.770769-3-mic@digikod.net --- kernel/seccomp.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 8ad7a293255a..53a7d1512dd7 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -38,7 +38,7 @@ #include #include #include -#include +#include #include #include #include @@ -558,8 +558,7 @@ static struct seccomp_filter *seccomp_prepare_filter(struct sock_fprog *fprog) * behavior of privileged children. 
*/ if (!task_no_new_privs(current) && - security_capable(current_cred(), current_user_ns(), - CAP_SYS_ADMIN, CAP_OPT_NOAUDIT) != 0) + !ns_capable_noaudit(current_user_ns(), CAP_SYS_ADMIN)) return ERR_PTR(-EACCES); /* Allocate a new seccomp_filter */ -- cgit v1.2.3 From 3f6719c7b62f0327c9091e26d0da10e65668229e Mon Sep 17 00:00:00 2001 From: KP Singh Date: Tue, 17 Nov 2020 23:29:28 +0000 Subject: bpf: Add bpf_bprm_opts_set helper The helper allows modification of certain bits on the linux_binprm struct starting with the secureexec bit which can be updated using the BPF_F_BPRM_SECUREEXEC flag. secureexec can be set by the LSM for privilege gaining executions to set the AT_SECURE auxv for glibc. When set, the dynamic linker disables the use of certain environment variables (like LD_PRELOAD). Signed-off-by: KP Singh Signed-off-by: Daniel Borkmann Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20201117232929.2156341-1-kpsingh@chromium.org --- include/uapi/linux/bpf.h | 16 ++++++++++++++++ kernel/bpf/bpf_lsm.c | 26 ++++++++++++++++++++++++++ scripts/bpf_helpers_doc.py | 2 ++ tools/include/uapi/linux/bpf.h | 16 ++++++++++++++++ 4 files changed, 60 insertions(+) (limited to 'kernel') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 162999b12790..a52299b80b9d 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3787,6 +3787,16 @@ union bpf_attr { * *ARG_PTR_TO_BTF_ID* of type *task_struct*. * Return * Pointer to the current task. + * + * long bpf_bprm_opts_set(struct linux_binprm *bprm, u64 flags) + * Description + * Set or clear certain options on *bprm*: + * + * **BPF_F_BPRM_SECUREEXEC** Set the secureexec bit + * which sets the **AT_SECURE** auxv for glibc. The bit + * is cleared if the flag is not specified. + * Return + * **-EINVAL** if invalid *flags* are passed, zero otherwise. 
*/ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3948,6 +3958,7 @@ union bpf_attr { FN(task_storage_get), \ FN(task_storage_delete), \ FN(get_current_task_btf), \ + FN(bprm_opts_set), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper @@ -4119,6 +4130,11 @@ enum bpf_lwt_encap_mode { BPF_LWT_ENCAP_IP, }; +/* Flags for bpf_bprm_opts_set helper */ +enum { + BPF_F_BPRM_SECUREEXEC = (1ULL << 0), +}; + #define __bpf_md_ptr(type, name) \ union { \ type name; \ diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c index 553107f4706a..b4f27a874092 100644 --- a/kernel/bpf/bpf_lsm.c +++ b/kernel/bpf/bpf_lsm.c @@ -7,6 +7,7 @@ #include #include #include +#include #include #include #include @@ -51,6 +52,29 @@ int bpf_lsm_verify_prog(struct bpf_verifier_log *vlog, return 0; } +/* Mask for all the currently supported BPRM option flags */ +#define BPF_F_BRPM_OPTS_MASK BPF_F_BPRM_SECUREEXEC + +BPF_CALL_2(bpf_bprm_opts_set, struct linux_binprm *, bprm, u64, flags) +{ + if (flags & ~BPF_F_BRPM_OPTS_MASK) + return -EINVAL; + + bprm->secureexec = (flags & BPF_F_BPRM_SECUREEXEC); + return 0; +} + +BTF_ID_LIST_SINGLE(bpf_bprm_opts_set_btf_ids, struct, linux_binprm) + +const static struct bpf_func_proto bpf_bprm_opts_set_proto = { + .func = bpf_bprm_opts_set, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_BTF_ID, + .arg1_btf_id = &bpf_bprm_opts_set_btf_ids[0], + .arg2_type = ARG_ANYTHING, +}; + static const struct bpf_func_proto * bpf_lsm_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { @@ -71,6 +95,8 @@ bpf_lsm_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_task_storage_get_proto; case BPF_FUNC_task_storage_delete: return &bpf_task_storage_delete_proto; + case BPF_FUNC_bprm_opts_set: + return &bpf_bprm_opts_set_proto; default: return tracing_prog_func_proto(func_id, prog); } diff --git a/scripts/bpf_helpers_doc.py b/scripts/bpf_helpers_doc.py index 31484377b8b1..c5bc947a70ad 100755 --- a/scripts/bpf_helpers_doc.py +++ b/scripts/bpf_helpers_doc.py @@ -418,6 +418,7 @@ class PrinterHelpers(Printer): 'struct bpf_tcp_sock', 'struct bpf_tunnel_key', 'struct bpf_xfrm_state', + 'struct linux_binprm', 'struct pt_regs', 'struct sk_reuseport_md', 'struct sockaddr', @@ -465,6 +466,7 @@ class PrinterHelpers(Printer): 'struct bpf_tcp_sock', 'struct bpf_tunnel_key', 'struct bpf_xfrm_state', + 'struct linux_binprm', 'struct pt_regs', 'struct sk_reuseport_md', 'struct sockaddr', diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 162999b12790..a52299b80b9d 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -3787,6 +3787,16 @@ union bpf_attr { * *ARG_PTR_TO_BTF_ID* of type *task_struct*. * Return * Pointer to the current task. + * + * long bpf_bprm_opts_set(struct linux_binprm *bprm, u64 flags) + * Description + * Set or clear certain options on *bprm*: + * + * **BPF_F_BPRM_SECUREEXEC** Set the secureexec bit + * which sets the **AT_SECURE** auxv for glibc. The bit + * is cleared if the flag is not specified. + * Return + * **-EINVAL** if invalid *flags* are passed, zero otherwise. 
*/ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3948,6 +3958,7 @@ union bpf_attr { FN(task_storage_get), \ FN(task_storage_delete), \ FN(get_current_task_btf), \ + FN(bprm_opts_set), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper @@ -4119,6 +4130,11 @@ enum bpf_lwt_encap_mode { BPF_LWT_ENCAP_IP, }; +/* Flags for bpf_bprm_opts_set helper */ +enum { + BPF_F_BPRM_SECUREEXEC = (1ULL << 0), +}; + #define __bpf_md_ptr(type, name) \ union { \ type name; \ -- cgit v1.2.3 From 16fee29b07358293f135759d9fdbf1267da57ebd Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 6 Nov 2020 17:02:17 +0100 Subject: dma-mapping: remove the dma_direct_set_offset export Drop the dma_direct_set_offset export and move the declaration to dma-map-ops.h now that the Allwinner drivers have stopped calling it. Signed-off-by: Christoph Hellwig Signed-off-by: Maxime Ripard --- arch/arm/mach-keystone/keystone.c | 2 +- arch/arm/mach-omap1/usb.c | 2 +- arch/sh/drivers/pci/pcie-sh7786.c | 2 +- arch/x86/pci/sta2x11-fixup.c | 3 ++- include/linux/dma-map-ops.h | 3 +++ include/linux/dma-mapping.h | 7 ------- kernel/dma/direct.c | 1 - 7 files changed, 8 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/arch/arm/mach-keystone/keystone.c b/arch/arm/mach-keystone/keystone.c index 09a65c2dfd73..cd711bfc591f 100644 --- a/arch/arm/mach-keystone/keystone.c +++ b/arch/arm/mach-keystone/keystone.c @@ -8,7 +8,7 @@ */ #include #include -#include +#include #include #include #include diff --git a/arch/arm/mach-omap1/usb.c b/arch/arm/mach-omap1/usb.c index ba8566204ea9..86d3b3c157af 100644 --- a/arch/arm/mach-omap1/usb.c +++ b/arch/arm/mach-omap1/usb.c @@ -9,7 +9,7 @@ #include #include #include -#include +#include #include #include diff --git a/arch/sh/drivers/pci/pcie-sh7786.c b/arch/sh/drivers/pci/pcie-sh7786.c index 4468289ab2ca..4d499476c33a 100644 --- a/arch/sh/drivers/pci/pcie-sh7786.c +++ b/arch/sh/drivers/pci/pcie-sh7786.c @@ -12,7 +12,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/x86/pci/sta2x11-fixup.c b/arch/x86/pci/sta2x11-fixup.c index 5701d5ba3df4..7d2525691854 100644 --- a/arch/x86/pci/sta2x11-fixup.c +++ b/arch/x86/pci/sta2x11-fixup.c @@ -11,7 +11,8 @@ #include #include #include -#include +#include +#include #include #define STA2X11_SWIOTLB_SIZE (4*1024*1024) diff --git a/include/linux/dma-map-ops.h b/include/linux/dma-map-ops.h index a5f89fc4d6df..03925e438ec3 100644 --- a/include/linux/dma-map-ops.h +++ b/include/linux/dma-map-ops.h @@ -226,6 +226,9 @@ struct page *dma_alloc_from_pool(struct device *dev, size_t size, bool (*phys_addr_ok)(struct device *, phys_addr_t, size_t)); bool dma_free_from_pool(struct device *dev, void *start, size_t size); +int dma_direct_set_offset(struct device *dev, phys_addr_t cpu_start, + dma_addr_t dma_start, u64 size); + #ifdef CONFIG_ARCH_HAS_DMA_COHERENCE_H #include #elif defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_DEVICE) || \ diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h index 956151052d45..199d85285246 100644 --- a/include/linux/dma-mapping.h +++ b/include/linux/dma-mapping.h @@ -558,13 +558,6 @@ static inline int dma_mmap_wc(struct device *dev, #define dma_unmap_len_set(PTR, LEN_NAME, VAL) do { } while (0) #endif -/* - * Legacy interface to set up the dma offset map. Drivers really should not - * actually use it, but we have a few legacy cases left. 
- */ -int dma_direct_set_offset(struct device *dev, phys_addr_t cpu_start, - dma_addr_t dma_start, u64 size); - extern const struct dma_map_ops dma_virt_ops; #endif /* _LINUX_DMA_MAPPING_H */ diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index 06c111544f61..002268262c9a 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -547,4 +547,3 @@ int dma_direct_set_offset(struct device *dev, phys_addr_t cpu_start, dev->dma_range_map = map; return 0; } -EXPORT_SYMBOL_GPL(dma_direct_set_offset); -- cgit v1.2.3 From 5c62634fc65101d350cbd47722fb76f02693059d Mon Sep 17 00:00:00 2001 From: Hui Su Date: Wed, 18 Nov 2020 00:17:50 +0800 Subject: namespace: make timens_on_fork() return nothing timens_on_fork() always return 0, and maybe not need to judge the return value in copy_namespaces(). So make timens_on_fork() return nothing and do not judge its return val in copy_namespaces(). Signed-off-by: Hui Su Link: https://lore.kernel.org/r/20201117161750.GA45121@rlk Signed-off-by: Christian Brauner --- include/linux/time_namespace.h | 6 +++--- kernel/nsproxy.c | 7 +------ kernel/time/namespace.c | 6 ++---- 3 files changed, 6 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/include/linux/time_namespace.h b/include/linux/time_namespace.h index 68770ac9ba89..30312166e70a 100644 --- a/include/linux/time_namespace.h +++ b/include/linux/time_namespace.h @@ -45,7 +45,7 @@ struct time_namespace *copy_time_ns(unsigned long flags, struct user_namespace *user_ns, struct time_namespace *old_ns); void free_time_ns(struct kref *kref); -int timens_on_fork(struct nsproxy *nsproxy, struct task_struct *tsk); +void timens_on_fork(struct nsproxy *nsproxy, struct task_struct *tsk); struct vdso_data *arch_get_vdso_data(void *vvar_page); static inline void put_time_ns(struct time_namespace *ns) @@ -136,10 +136,10 @@ struct time_namespace *copy_time_ns(unsigned long flags, return old_ns; } -static inline int timens_on_fork(struct nsproxy *nsproxy, +static inline void timens_on_fork(struct nsproxy *nsproxy, struct task_struct *tsk) { - return 0; + return; } static inline void timens_add_monotonic(struct timespec64 *ts) { } diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index 12dd41b39a7f..e2e6c5dc433f 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -153,7 +153,6 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk) struct nsproxy *old_ns = tsk->nsproxy; struct user_namespace *user_ns = task_cred_xxx(tsk, user_ns); struct nsproxy *new_ns; - int ret; if (likely(!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC | CLONE_NEWPID | CLONE_NEWNET | @@ -180,11 +179,7 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk) if (IS_ERR(new_ns)) return PTR_ERR(new_ns); - ret = timens_on_fork(new_ns, tsk); - if (ret) { - free_nsproxy(new_ns); - return ret; - } + timens_on_fork(new_ns, tsk); tsk->nsproxy = new_ns; return 0; diff --git a/kernel/time/namespace.c b/kernel/time/namespace.c index afc65e6be33e..e0f9509b17c3 100644 --- a/kernel/time/namespace.c +++ b/kernel/time/namespace.c @@ -308,22 +308,20 @@ static int timens_install(struct nsset *nsset, struct ns_common *new) return 0; } -int timens_on_fork(struct nsproxy *nsproxy, struct task_struct *tsk) +void timens_on_fork(struct nsproxy *nsproxy, struct task_struct *tsk) { struct ns_common *nsc = &nsproxy->time_ns_for_children->ns; struct time_namespace *ns = to_time_ns(nsc); /* create_new_namespaces() already incremented the ref counter */ if (nsproxy->time_ns == nsproxy->time_ns_for_children) - return 0; + return; 
get_time_ns(ns); put_time_ns(nsproxy->time_ns); nsproxy->time_ns = ns; timens_commit(tsk, ns); - - return 0; } static struct user_namespace *timens_owner(struct ns_common *ns) -- cgit v1.2.3 From d055126180564a57fe533728a4e93d0cb53d49b3 Mon Sep 17 00:00:00 2001 From: Dmitrii Banshchikov Date: Tue, 17 Nov 2020 18:45:49 +0000 Subject: bpf: Add bpf_ktime_get_coarse_ns helper The helper uses CLOCK_MONOTONIC_COARSE source of time that is less accurate but more performant. We have a BPF CGROUP_SKB firewall that supports event logging through bpf_perf_event_output(). Each event has a timestamp and currently we use bpf_ktime_get_ns() for it. Use of bpf_ktime_get_coarse_ns() saves ~15-20 ns in time required for event logging. bpf_ktime_get_ns(): EgressLogByRemoteEndpoint 113.82ns 8.79M bpf_ktime_get_coarse_ns(): EgressLogByRemoteEndpoint 95.40ns 10.48M Signed-off-by: Dmitrii Banshchikov Signed-off-by: Daniel Borkmann Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20201117184549.257280-1-me@ubique.spb.ru --- include/linux/bpf.h | 1 + include/uapi/linux/bpf.h | 11 +++++++++++ kernel/bpf/core.c | 1 + kernel/bpf/helpers.c | 13 +++++++++++++ kernel/trace/bpf_trace.c | 2 ++ tools/include/uapi/linux/bpf.h | 11 +++++++++++ 6 files changed, 39 insertions(+) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 581b2a2e78eb..e1bcb6d7345c 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1842,6 +1842,7 @@ extern const struct bpf_func_proto bpf_copy_from_user_proto; extern const struct bpf_func_proto bpf_snprintf_btf_proto; extern const struct bpf_func_proto bpf_per_cpu_ptr_proto; extern const struct bpf_func_proto bpf_this_cpu_ptr_proto; +extern const struct bpf_func_proto bpf_ktime_get_coarse_ns_proto; const struct bpf_func_proto *bpf_tracing_func_proto( enum bpf_func_id func_id, const struct bpf_prog *prog); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index a52299b80b9d..3ca6146f001a 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3797,6 +3797,16 @@ union bpf_attr { * is cleared if the flag is not specified. * Return * **-EINVAL** if invalid *flags* are passed, zero otherwise. + * + * u64 bpf_ktime_get_coarse_ns(void) + * Description + * Return a coarse-grained version of the time elapsed since + * system boot, in nanoseconds. Does not include time the system + * was suspended. + * + * See: **clock_gettime**\ (**CLOCK_MONOTONIC_COARSE**) + * Return + * Current *ktime*. 
*/ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3959,6 +3969,7 @@ union bpf_attr { FN(task_storage_delete), \ FN(get_current_task_btf), \ FN(bprm_opts_set), \ + FN(ktime_get_coarse_ns), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 55454d2278b1..ff55cbcfbab4 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -2211,6 +2211,7 @@ const struct bpf_func_proto bpf_get_smp_processor_id_proto __weak; const struct bpf_func_proto bpf_get_numa_node_id_proto __weak; const struct bpf_func_proto bpf_ktime_get_ns_proto __weak; const struct bpf_func_proto bpf_ktime_get_boot_ns_proto __weak; +const struct bpf_func_proto bpf_ktime_get_coarse_ns_proto __weak; const struct bpf_func_proto bpf_get_current_pid_tgid_proto __weak; const struct bpf_func_proto bpf_get_current_uid_gid_proto __weak; diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 25520f5eeaf6..2c395deae279 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -167,6 +167,17 @@ const struct bpf_func_proto bpf_ktime_get_boot_ns_proto = { .ret_type = RET_INTEGER, }; +BPF_CALL_0(bpf_ktime_get_coarse_ns) +{ + return ktime_get_coarse_ns(); +} + +const struct bpf_func_proto bpf_ktime_get_coarse_ns_proto = { + .func = bpf_ktime_get_coarse_ns, + .gpl_only = false, + .ret_type = RET_INTEGER, +}; + BPF_CALL_0(bpf_get_current_pid_tgid) { struct task_struct *task = current; @@ -685,6 +696,8 @@ bpf_base_func_proto(enum bpf_func_id func_id) return &bpf_ktime_get_ns_proto; case BPF_FUNC_ktime_get_boot_ns: return &bpf_ktime_get_boot_ns_proto; + case BPF_FUNC_ktime_get_coarse_ns: + return &bpf_ktime_get_coarse_ns_proto; case BPF_FUNC_ringbuf_output: return &bpf_ringbuf_output_proto; case BPF_FUNC_ringbuf_reserve: diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 02986c7b90eb..d255bc9b2bfa 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1280,6 +1280,8 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_ktime_get_ns_proto; case BPF_FUNC_ktime_get_boot_ns: return &bpf_ktime_get_boot_ns_proto; + case BPF_FUNC_ktime_get_coarse_ns: + return &bpf_ktime_get_coarse_ns_proto; case BPF_FUNC_tail_call: return &bpf_tail_call_proto; case BPF_FUNC_get_current_pid_tgid: diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index a52299b80b9d..3ca6146f001a 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -3797,6 +3797,16 @@ union bpf_attr { * is cleared if the flag is not specified. * Return * **-EINVAL** if invalid *flags* are passed, zero otherwise. + * + * u64 bpf_ktime_get_coarse_ns(void) + * Description + * Return a coarse-grained version of the time elapsed since + * system boot, in nanoseconds. Does not include time the system + * was suspended. + * + * See: **clock_gettime**\ (**CLOCK_MONOTONIC_COARSE**) + * Return + * Current *ktime*. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3959,6 +3969,7 @@ union bpf_attr { FN(task_storage_delete), \ FN(get_current_task_btf), \ FN(bprm_opts_set), \ + FN(ktime_get_coarse_ns), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper -- cgit v1.2.3 From f73f64d5687192bc8eb7f3d9521ca6256b79f224 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 17 Nov 2020 14:19:43 +0100 Subject: tick/broadcast: Serialize access to tick_next_period tick_broadcast_setup_oneshot() accesses tick_next_period twice without any serialization. 
This is wrong in two aspects: - Reading it twice might make the broadcast data inconsistent if the variable is updated concurrently. - On 32bit systems the access might see an partial update Protect it with jiffies_lock. That's safe as none of the callchains leading up to this function can create a lock ordering violation: timer interrupt run_local_timers() hrtimer_run_queues() hrtimer_switch_to_hres() tick_init_highres() tick_switch_to_oneshot() tick_broadcast_switch_to_oneshot() or tick_check_oneshot_change() tick_nohz_switch_to_nohz() tick_switch_to_oneshot() tick_broadcast_switch_to_oneshot() Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20201117132006.061341507@linutronix.de --- kernel/time/tick-broadcast.c | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 36d7464c8962..2a47c8f80e53 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -877,6 +877,22 @@ static void tick_broadcast_init_next_event(struct cpumask *mask, } } +static inline ktime_t tick_get_next_period(void) +{ + ktime_t next; + + /* + * Protect against concurrent updates (store /load tearing on + * 32bit). It does not matter if the time is already in the + * past. The broadcast device which is about to be programmed will + * fire in any case. + */ + raw_spin_lock(&jiffies_lock); + next = tick_next_period; + raw_spin_unlock(&jiffies_lock); + return next; +} + /** * tick_broadcast_setup_oneshot - setup the broadcast device */ @@ -905,10 +921,11 @@ static void tick_broadcast_setup_oneshot(struct clock_event_device *bc) tick_broadcast_oneshot_mask, tmpmask); if (was_periodic && !cpumask_empty(tmpmask)) { + ktime_t nextevt = tick_get_next_period(); + clockevents_switch_state(bc, CLOCK_EVT_STATE_ONESHOT); - tick_broadcast_init_next_event(tmpmask, - tick_next_period); - tick_broadcast_set_event(bc, cpu, tick_next_period); + tick_broadcast_init_next_event(tmpmask, nextevt); + tick_broadcast_set_event(bc, cpu, nextevt); } else bc->next_event = KTIME_MAX; } else { -- cgit v1.2.3 From c398960cd82b233886fbff163986f998b5a5c008 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 17 Nov 2020 14:19:44 +0100 Subject: tick: Document protections for tick related data The protection rules for tick_next_period and last_jiffies_update are blury at best. Clarify this. Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20201117132006.197713794@linutronix.de --- kernel/time/tick-common.c | 4 +++- kernel/time/tick-sched.c | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index 6c9c342dd0e5..68504eb0d38b 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -27,7 +27,9 @@ */ DEFINE_PER_CPU(struct tick_device, tick_cpu_device); /* - * Tick next event: keeps track of the tick time + * Tick next event: keeps track of the tick time. It's updated by the + * CPU which handles the tick and protected by jiffies_lock. There is + * no requirement to write hold the jiffies seqcount for it. 
*/ ktime_t tick_next_period; ktime_t tick_period; diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 81632cd5e3b7..15360e652c85 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -44,7 +44,9 @@ struct tick_sched *tick_get_tick_sched(int cpu) #if defined(CONFIG_NO_HZ_COMMON) || defined(CONFIG_HIGH_RES_TIMERS) /* - * The time, when the last jiffy update happened. Protected by jiffies_lock. + * The time, when the last jiffy update happened. Write access must hold + * jiffies_lock and jiffies_seq. tick_nohz_next_event() needs to get a + * consistent view of jiffies and last_jiffies_update. */ static ktime_t last_jiffies_update; -- cgit v1.2.3 From 372acbbaa80940189593f9d69c7c069955f24f7a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 17 Nov 2020 14:19:45 +0100 Subject: tick/sched: Use tick_next_period for lockless quick check No point in doing calculations. tick_next_period = last_jiffies_update + tick_period Just check whether now is before tick_next_period to figure out whether jiffies need an update. Add a comment why the intentional data race in the quick check is safe or not so safe in a 32bit corner case and why we don't worry about it. Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20201117132006.337366695@linutronix.de --- kernel/time/tick-sched.c | 46 +++++++++++++++++++++++++++++++++------------- 1 file changed, 33 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 15360e652c85..b4b6abc81e4a 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -59,11 +59,29 @@ static void tick_do_update_jiffies64(ktime_t now) ktime_t delta; /* - * Do a quick check without holding jiffies_lock: - * The READ_ONCE() pairs with two updates done later in this function. + * Do a quick check without holding jiffies_lock. The READ_ONCE() + * pairs with the update done later in this function. + * + * This is also an intentional data race which is even safe on + * 32bit in theory. If there is a concurrent update then the check + * might give a random answer. It does not matter because if it + * returns then the concurrent update is already taking care, if it + * falls through then it will pointlessly contend on jiffies_lock. + * + * Though there is one nasty case on 32bit due to store tearing of + * the 64bit value. If the first 32bit store makes the quick check + * return on all other CPUs and the writing CPU context gets + * delayed to complete the second store (scheduled out on virt) + * then jiffies can become stale for up to ~2^32 nanoseconds + * without noticing. After that point all CPUs will wait for + * jiffies lock. + * + * OTOH, this is not any different than the situation with NOHZ=off + * where one CPU is responsible for updating jiffies and + * timekeeping. If that CPU goes out for lunch then all other CPUs + * will operate on stale jiffies until it decides to come back. */ - delta = ktime_sub(now, READ_ONCE(last_jiffies_update)); - if (delta < tick_period) + if (ktime_before(now, READ_ONCE(tick_next_period))) return; /* Reevaluate with jiffies_lock held */ @@ -74,9 +92,8 @@ static void tick_do_update_jiffies64(ktime_t now) if (delta >= tick_period) { delta = ktime_sub(delta, tick_period); - /* Pairs with the lockless read in this function. 
*/ - WRITE_ONCE(last_jiffies_update, - ktime_add(last_jiffies_update, tick_period)); + last_jiffies_update = ktime_add(last_jiffies_update, + tick_period); /* Slow path for long timeouts */ if (unlikely(delta >= tick_period)) { @@ -84,15 +101,18 @@ static void tick_do_update_jiffies64(ktime_t now) ticks = ktime_divns(delta, incr); - /* Pairs with the lockless read in this function. */ - WRITE_ONCE(last_jiffies_update, - ktime_add_ns(last_jiffies_update, - incr * ticks)); + last_jiffies_update = ktime_add_ns(last_jiffies_update, + incr * ticks); } do_timer(++ticks); - /* Keep the tick_next_period variable up to date */ - tick_next_period = ktime_add(last_jiffies_update, tick_period); + /* + * Keep the tick_next_period variable up to date. + * WRITE_ONCE() pairs with the READ_ONCE() in the lockless + * quick check above. + */ + WRITE_ONCE(tick_next_period, + ktime_add(last_jiffies_update, tick_period)); } else { write_seqcount_end(&jiffies_seq); raw_spin_unlock(&jiffies_lock); -- cgit v1.2.3 From 94ad2e3cedb82af034f6d97c58022f162b669f9b Mon Sep 17 00:00:00 2001 From: Yunfeng Ye Date: Tue, 17 Nov 2020 14:19:46 +0100 Subject: tick/sched: Reduce seqcount held scope in tick_do_update_jiffies64() If jiffies are up to date already (caller lost the race against another CPU) there is no point to change the sequence count. Doing that just forces other CPUs into the seqcount retry loop in tick_nohz_next_event() for nothing. Just bail out early. [ tglx: Rewrote most of it ] Signed-off-by: Yunfeng Ye Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20201117132006.462195901@linutronix.de --- kernel/time/tick-sched.c | 47 ++++++++++++++++++++++------------------------- 1 file changed, 22 insertions(+), 25 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index b4b6abc81e4a..ca9191ced4b5 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -86,38 +86,35 @@ static void tick_do_update_jiffies64(ktime_t now) /* Reevaluate with jiffies_lock held */ raw_spin_lock(&jiffies_lock); + if (ktime_before(now, tick_next_period)) { + raw_spin_unlock(&jiffies_lock); + return; + } + write_seqcount_begin(&jiffies_seq); - delta = ktime_sub(now, last_jiffies_update); - if (delta >= tick_period) { + last_jiffies_update = ktime_add(last_jiffies_update, tick_period); - delta = ktime_sub(delta, tick_period); - last_jiffies_update = ktime_add(last_jiffies_update, - tick_period); + delta = ktime_sub(now, tick_next_period); + if (unlikely(delta >= tick_period)) { + /* Slow path for long idle sleep times */ + s64 incr = ktime_to_ns(tick_period); - /* Slow path for long timeouts */ - if (unlikely(delta >= tick_period)) { - s64 incr = ktime_to_ns(tick_period); + ticks = ktime_divns(delta, incr); - ticks = ktime_divns(delta, incr); + last_jiffies_update = ktime_add_ns(last_jiffies_update, + incr * ticks); + } - last_jiffies_update = ktime_add_ns(last_jiffies_update, - incr * ticks); - } - do_timer(++ticks); + do_timer(++ticks); + + /* + * Keep the tick_next_period variable up to date. WRITE_ONCE() + * pairs with the READ_ONCE() in the lockless quick check above. + */ + WRITE_ONCE(tick_next_period, + ktime_add(last_jiffies_update, tick_period)); - /* - * Keep the tick_next_period variable up to date. - * WRITE_ONCE() pairs with the READ_ONCE() in the lockless - * quick check above. 
- */ - WRITE_ONCE(tick_next_period, - ktime_add(last_jiffies_update, tick_period)); - } else { - write_seqcount_end(&jiffies_seq); - raw_spin_unlock(&jiffies_lock); - return; - } write_seqcount_end(&jiffies_seq); raw_spin_unlock(&jiffies_lock); update_wall_time(); -- cgit v1.2.3 From 7a35bf2a6a871cd0252cd371d741e7d070b53af9 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 17 Nov 2020 14:19:47 +0100 Subject: tick/sched: Optimize tick_do_update_jiffies64() further Now that it's clear that there is always one tick to account, simplify the calculations some more. Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20201117132006.565663056@linutronix.de --- kernel/time/tick-sched.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index ca9191ced4b5..306adeb6ce4c 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -55,7 +55,7 @@ static ktime_t last_jiffies_update; */ static void tick_do_update_jiffies64(ktime_t now) { - unsigned long ticks = 0; + unsigned long ticks = 1; ktime_t delta; /* @@ -93,20 +93,21 @@ static void tick_do_update_jiffies64(ktime_t now) write_seqcount_begin(&jiffies_seq); - last_jiffies_update = ktime_add(last_jiffies_update, tick_period); - delta = ktime_sub(now, tick_next_period); if (unlikely(delta >= tick_period)) { /* Slow path for long idle sleep times */ s64 incr = ktime_to_ns(tick_period); - ticks = ktime_divns(delta, incr); + ticks += ktime_divns(delta, incr); last_jiffies_update = ktime_add_ns(last_jiffies_update, incr * ticks); + } else { + last_jiffies_update = ktime_add(last_jiffies_update, + tick_period); } - do_timer(++ticks); + do_timer(ticks); /* * Keep the tick_next_period variable up to date. WRITE_ONCE() -- cgit v1.2.3 From 896b969e6732b68ee3c12ae4e1aeddf5db99bc46 Mon Sep 17 00:00:00 2001 From: Yunfeng Ye Date: Tue, 17 Nov 2020 14:19:48 +0100 Subject: tick/sched: Release seqcount before invoking calc_load_global() calc_load_global() does not need the sequence count protection. [ tglx: Split it up properly and added comments ] Signed-off-by: Yunfeng Ye Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20201117132006.660902274@linutronix.de --- kernel/time/tick-sched.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 306adeb6ce4c..33c897bb88c6 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -107,7 +108,8 @@ static void tick_do_update_jiffies64(ktime_t now) tick_period); } - do_timer(ticks); + /* Advance jiffies to complete the jiffies_seq protected job */ + jiffies_64 += ticks; /* * Keep the tick_next_period variable up to date. WRITE_ONCE() @@ -116,7 +118,15 @@ static void tick_do_update_jiffies64(ktime_t now) WRITE_ONCE(tick_next_period, ktime_add(last_jiffies_update, tick_period)); + /* + * Release the sequence count. calc_global_load() below is not + * protected by it, but jiffies_lock needs to be held to prevent + * concurrent invocations. 
+ */ write_seqcount_end(&jiffies_seq); + + calc_global_load(); + raw_spin_unlock(&jiffies_lock); update_wall_time(); } -- cgit v1.2.3 From b996544916429946bf4934c1c01a306d1690972c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 17 Nov 2020 14:19:49 +0100 Subject: tick: Get rid of tick_period The variable tick_period is initialized to NSEC_PER_TICK / HZ during boot and never updated again. If NSEC_PER_TICK is not an integer multiple of HZ this computation is less accurate than TICK_NSEC which has proper rounding in place. Aside of the inaccuracy there is no reason for having this variable at all. It's just a pointless indirection and all usage sites can just use the TICK_NSEC constant. Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20201117132006.766643526@linutronix.de --- kernel/time/tick-broadcast.c | 2 +- kernel/time/tick-common.c | 8 +++----- kernel/time/tick-internal.h | 1 - kernel/time/tick-sched.c | 22 +++++++++++----------- 4 files changed, 15 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 2a47c8f80e53..5a23829372c7 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -331,7 +331,7 @@ static void tick_handle_periodic_broadcast(struct clock_event_device *dev) bc_local = tick_do_periodic_broadcast(); if (clockevent_state_oneshot(dev)) { - ktime_t next = ktime_add(dev->next_event, tick_period); + ktime_t next = ktime_add_ns(dev->next_event, TICK_NSEC); clockevents_program_event(dev, next, true); } diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index 68504eb0d38b..a03764df5366 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -32,7 +32,6 @@ DEFINE_PER_CPU(struct tick_device, tick_cpu_device); * no requirement to write hold the jiffies seqcount for it. 
*/ ktime_t tick_next_period; -ktime_t tick_period; /* * tick_do_timer_cpu is a timer core internal variable which holds the CPU NR @@ -90,7 +89,7 @@ static void tick_periodic(int cpu) write_seqcount_begin(&jiffies_seq); /* Keep track of the next tick event */ - tick_next_period = ktime_add(tick_next_period, tick_period); + tick_next_period = ktime_add_ns(tick_next_period, TICK_NSEC); do_timer(1); write_seqcount_end(&jiffies_seq); @@ -129,7 +128,7 @@ void tick_handle_periodic(struct clock_event_device *dev) * Setup the next period for devices, which do not have * periodic mode: */ - next = ktime_add(next, tick_period); + next = ktime_add_ns(next, TICK_NSEC); if (!clockevents_program_event(dev, next, false)) return; @@ -175,7 +174,7 @@ void tick_setup_periodic(struct clock_event_device *dev, int broadcast) for (;;) { if (!clockevents_program_event(dev, next, false)) return; - next = ktime_add(next, tick_period); + next = ktime_add_ns(next, TICK_NSEC); } } } @@ -222,7 +221,6 @@ static void tick_setup_device(struct tick_device *td, tick_do_timer_cpu = cpu; tick_next_period = ktime_get(); - tick_period = NSEC_PER_SEC / HZ; #ifdef CONFIG_NO_HZ_FULL /* * The boot CPU may be nohz_full, in which case set diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index 7b2496136729..7a981c9e87a4 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -15,7 +15,6 @@ DECLARE_PER_CPU(struct tick_device, tick_cpu_device); extern ktime_t tick_next_period; -extern ktime_t tick_period; extern int tick_do_timer_cpu __read_mostly; extern void tick_setup_periodic(struct clock_event_device *dev, int broadcast); diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 33c897bb88c6..cc7cba20382e 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -95,17 +95,17 @@ static void tick_do_update_jiffies64(ktime_t now) write_seqcount_begin(&jiffies_seq); delta = ktime_sub(now, tick_next_period); - if (unlikely(delta >= tick_period)) { + if (unlikely(delta >= TICK_NSEC)) { /* Slow path for long idle sleep times */ - s64 incr = ktime_to_ns(tick_period); + s64 incr = TICK_NSEC; ticks += ktime_divns(delta, incr); last_jiffies_update = ktime_add_ns(last_jiffies_update, incr * ticks); } else { - last_jiffies_update = ktime_add(last_jiffies_update, - tick_period); + last_jiffies_update = ktime_add_ns(last_jiffies_update, + TICK_NSEC); } /* Advance jiffies to complete the jiffies_seq protected job */ @@ -116,7 +116,7 @@ static void tick_do_update_jiffies64(ktime_t now) * pairs with the READ_ONCE() in the lockless quick check above. */ WRITE_ONCE(tick_next_period, - ktime_add(last_jiffies_update, tick_period)); + ktime_add_ns(last_jiffies_update, TICK_NSEC)); /* * Release the sequence count. 
calc_global_load() below is not @@ -691,7 +691,7 @@ static void tick_nohz_restart(struct tick_sched *ts, ktime_t now) hrtimer_set_expires(&ts->sched_timer, ts->last_tick); /* Forward the time to expire in the future */ - hrtimer_forward(&ts->sched_timer, now, tick_period); + hrtimer_forward(&ts->sched_timer, now, TICK_NSEC); if (ts->nohz_mode == NOHZ_MODE_HIGHRES) { hrtimer_start_expires(&ts->sched_timer, @@ -1260,7 +1260,7 @@ static void tick_nohz_handler(struct clock_event_device *dev) if (unlikely(ts->tick_stopped)) return; - hrtimer_forward(&ts->sched_timer, now, tick_period); + hrtimer_forward(&ts->sched_timer, now, TICK_NSEC); tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1); } @@ -1297,7 +1297,7 @@ static void tick_nohz_switch_to_nohz(void) next = tick_init_jiffy_update(); hrtimer_set_expires(&ts->sched_timer, next); - hrtimer_forward_now(&ts->sched_timer, tick_period); + hrtimer_forward_now(&ts->sched_timer, TICK_NSEC); tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1); tick_nohz_activate(ts, NOHZ_MODE_LOWRES); } @@ -1363,7 +1363,7 @@ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer) if (unlikely(ts->tick_stopped)) return HRTIMER_NORESTART; - hrtimer_forward(timer, now, tick_period); + hrtimer_forward(timer, now, TICK_NSEC); return HRTIMER_RESTART; } @@ -1397,13 +1397,13 @@ void tick_setup_sched_timer(void) /* Offset the tick to avert jiffies_lock contention. */ if (sched_skew_tick) { - u64 offset = ktime_to_ns(tick_period) >> 1; + u64 offset = TICK_NSEC >> 1; do_div(offset, num_possible_cpus()); offset *= smp_processor_id(); hrtimer_add_expires_ns(&ts->sched_timer, offset); } - hrtimer_forward(&ts->sched_timer, now, tick_period); + hrtimer_forward(&ts->sched_timer, now, TICK_NSEC); hrtimer_start_expires(&ts->sched_timer, HRTIMER_MODE_ABS_PINNED_HARD); tick_nohz_activate(ts, NOHZ_MODE_HIGHRES); } -- cgit v1.2.3 From aabe19b8279340c43294688b4d9527a893c60463 Mon Sep 17 00:00:00 2001 From: Hui Su Date: Mon, 16 Nov 2020 02:00:54 +0800 Subject: nsproxy: use put_nsproxy() in switch_task_namespaces() We already have a dedicated helper that handles reference count checking so stop open-coding the reference count check in switch_task_namespaces() and use the dedicated put_nsproxy() helper instead. Take the change to fix a whitespace issue too. Signed-off-by: Hui Su [christian.brauner@ubuntu.com: expand commit message] Acked-by: Christian Brauner Link: https://lore.kernel.org/r/20201115180054.GA371317@rlk Signed-off-by: Christian Brauner --- kernel/nsproxy.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index 12dd41b39a7f..3ebfd090398a 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -173,7 +173,7 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk) * it along with CLONE_NEWIPC. 
*/ if ((flags & (CLONE_NEWIPC | CLONE_SYSVSEM)) == - (CLONE_NEWIPC | CLONE_SYSVSEM)) + (CLONE_NEWIPC | CLONE_SYSVSEM)) return -EINVAL; new_ns = create_new_namespaces(flags, tsk, user_ns, tsk->fs); @@ -250,8 +250,8 @@ void switch_task_namespaces(struct task_struct *p, struct nsproxy *new) p->nsproxy = new; task_unlock(p); - if (ns && atomic_dec_and_test(&ns->count)) - free_nsproxy(ns); + if (ns) + put_nsproxy(ns); } void exit_task_namespaces(struct task_struct *p) -- cgit v1.2.3 From 9f68b5b74c48761bcbd7d90cf1426049bdbaabb7 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 17 Nov 2020 16:16:35 +0100 Subject: sched: Detect call to schedule from critical entry code Detect calls to schedule() between user_enter() and user_exit(). Those are symptoms of early entry code that either forgot to protect a call to schedule() inside exception_enter()/exception_exit() or, in the case of HAVE_CONTEXT_TRACKING_OFFSTACK, enabled interrupts or preemption in a wrong spot. Signed-off-by: Frederic Weisbecker Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20201117151637.259084-4-frederic@kernel.org --- kernel/sched/core.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index d2003a7d5ab5..c23d7cb5aee3 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4291,6 +4291,7 @@ static inline void schedule_debug(struct task_struct *prev, bool preempt) preempt_count_set(PREEMPT_DISABLED); } rcu_sleep_check(); + SCHED_WARN_ON(ct_state() == CONTEXT_USER); profile_hit(SCHED_PROFILING, __builtin_return_address(0)); -- cgit v1.2.3 From 6775de4984ea83ce39f19a40c09f8813d7423831 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 17 Nov 2020 16:16:36 +0100 Subject: context_tracking: Only define schedule_user() on !HAVE_CONTEXT_TRACKING_OFFSTACK archs schedule_user() was traditionally used by the entry code's tail to preempt userspace after the call to user_enter(). Indeed the call to user_enter() used to be performed upon syscall exit slow path which was right before the last opportunity to schedule() while resuming to userspace. The context tracking state had to be saved on the task stack and set back to CONTEXT_KERNEL temporarily in order to safely switch to another task. Only a few archs use it now (namely sparc64 and powerpc64) and those implementing HAVE_CONTEXT_TRACKING_OFFSTACK definetly can't rely on it. Signed-off-by: Frederic Weisbecker Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20201117151637.259084-5-frederic@kernel.org --- kernel/sched/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index c23d7cb5aee3..44426e5acde2 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4631,7 +4631,7 @@ void __sched schedule_idle(void) } while (need_resched()); } -#ifdef CONFIG_CONTEXT_TRACKING +#if defined(CONFIG_CONTEXT_TRACKING) && !defined(CONFIG_HAVE_CONTEXT_TRACKING_OFFSTACK) asmlinkage __visible void __sched schedule_user(void) { /* -- cgit v1.2.3 From d707faa64d03d26b529cc4aea59dab1b016d4d33 Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Fri, 13 Nov 2020 11:24:14 +0000 Subject: sched/core: Add missing completion for affine_move_task() waiters Qian reported that some fuzzer issuing sched_setaffinity() ends up stuck on a wait_for_completion(). 
The problematic pattern seems to be: affine_move_task() // task_running() case stop_one_cpu(); wait_for_completion(&pending->done); Combined with, on the stopper side: migration_cpu_stop() // Task moved between unlocks and scheduling the stopper task_rq(p) != rq && // task_running() case dest_cpu >= 0 => no complete_all() This can happen with both PREEMPT and !PREEMPT, although !PREEMPT should be more likely to see this given the targeted task has a much bigger window to block and be woken up elsewhere before the stopper runs. Make migration_cpu_stop() always look at pending affinity requests; signal their completion if the stopper hits a rq mismatch but the task is still within its allowed mask. When Migrate-Disable isn't involved, this matches the previous set_cpus_allowed_ptr() vs migration_cpu_stop() behaviour. Fixes: 6d337eab041d ("sched: Fix migrate_disable() vs set_cpus_allowed_ptr()") Reported-by: Qian Cai Signed-off-by: Valentin Schneider Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/lkml/8b62fd1ad1b18def27f18e2ee2df3ff5b36d0762.camel@redhat.com --- kernel/sched/core.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index a6aaf9fb3400..4d1fd4b783ee 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1923,7 +1923,7 @@ static int migration_cpu_stop(void *data) else p->wake_cpu = dest_cpu; - } else if (dest_cpu < 0) { + } else if (dest_cpu < 0 || pending) { /* * This happens when we get migrated between migrate_enable()'s * preempt_enable() and scheduling the stopper task. At that @@ -1933,6 +1933,17 @@ static int migration_cpu_stop(void *data) * more likely. */ + /* + * The task moved before the stopper got to run. We're holding + * ->pi_lock, so the allowed mask is stable - if it got + * somewhere allowed, we're done. + */ + if (pending && cpumask_test_cpu(task_cpu(p), p->cpus_ptr)) { + p->migration_pending = NULL; + complete = true; + goto out; + } + /* * When this was migrate_enable() but we no longer have an * @pending, a concurrent SCA 'fixed' things and we should be -- cgit v1.2.3 From 1293771e4353c148d5f6908fb32d1c1cfd653e47 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 17 Nov 2020 12:14:51 +0100 Subject: sched: Fix migration_cpu_stop() WARN Oleksandr reported hitting the WARN in the 'task_rq(p) != rq' branch of migration_cpu_stop(). Valentin noted that using cpu_of(rq) in that case is just plain wrong to begin with, since per the earlier branch that isn't the actual CPU of the task. Replace both instances of is_cpu_allowed() by a direct p->cpus_mask test using task_cpu(). Reported-by: Oleksandr Natalenko Debugged-by: Valentin Schneider Signed-off-by: Peter Zijlstra (Intel) --- kernel/sched/core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 4d1fd4b783ee..28d541a3c74d 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1911,7 +1911,7 @@ static int migration_cpu_stop(void *data) * and we should be valid again. Nothing to do. */ if (!pending) { - WARN_ON_ONCE(!is_cpu_allowed(p, cpu_of(rq))); + WARN_ON_ONCE(!cpumask_test_cpu(task_cpu(p), &p->cpus_mask)); goto out; } @@ -1950,7 +1950,7 @@ static int migration_cpu_stop(void *data) * valid again. Nothing to do. 
*/ if (!pending) { - WARN_ON_ONCE(!is_cpu_allowed(p, cpu_of(rq))); + WARN_ON_ONCE(!cpumask_test_cpu(task_cpu(p), &p->cpus_mask)); goto out; } -- cgit v1.2.3 From 406100f3da08066c00105165db8520bbc7694a36 Mon Sep 17 00:00:00 2001 From: Daniel Jordan Date: Thu, 12 Nov 2020 12:17:11 -0500 Subject: cpuset: fix race between hotplug work and later CPU offline One of our machines keeled over trying to rebuild the scheduler domains. Mainline produces the same splat: BUG: unable to handle page fault for address: 0000607f820054db CPU: 2 PID: 149 Comm: kworker/1:1 Not tainted 5.10.0-rc1-master+ #6 Workqueue: events cpuset_hotplug_workfn RIP: build_sched_domains Call Trace: partition_sched_domains_locked rebuild_sched_domains_locked cpuset_hotplug_workfn It happens with cgroup2 and exclusive cpusets only. This reproducer triggers it on an 8-cpu vm and works most effectively with no preexisting child cgroups: cd $UNIFIED_ROOT mkdir cg1 echo 4-7 > cg1/cpuset.cpus echo root > cg1/cpuset.cpus.partition # with smt/control reading 'on', echo off > /sys/devices/system/cpu/smt/control RIP maps to sd->shared = *per_cpu_ptr(sdd->sds, sd_id); from sd_init(). sd_id is calculated earlier in the same function: cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu)); sd_id = cpumask_first(sched_domain_span(sd)); tl->mask(cpu), which reads cpu_sibling_map on x86, returns an empty mask and so cpumask_first() returns >= nr_cpu_ids, which leads to the bogus value from per_cpu_ptr() above. The problem is a race between cpuset_hotplug_workfn() and a later offline of CPU N. cpuset_hotplug_workfn() updates the effective masks when N is still online, the offline clears N from cpu_sibling_map, and then the worker uses the stale effective masks that still have N to generate the scheduling domains, leading the worker to read N's empty cpu_sibling_map in sd_init(). rebuild_sched_domains_locked() prevented the race during the cgroup2 cpuset series up until the Fixes commit changed its check. Make the check more robust so that it can detect an offline CPU in any exclusive cpuset's effective mask, not just the top one. Fixes: 0ccea8feb980 ("cpuset: Make generate_sched_domains() work with partition") Signed-off-by: Daniel Jordan Signed-off-by: Peter Zijlstra (Intel) Acked-by: Tejun Heo Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20201112171711.639541-1-daniel.m.jordan@oracle.com --- kernel/cgroup/cpuset.c | 33 ++++++++++++++++++++++++++++----- 1 file changed, 28 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 57b5b5d0a5fd..53c70c470a38 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -983,25 +983,48 @@ partition_and_rebuild_sched_domains(int ndoms_new, cpumask_var_t doms_new[], */ static void rebuild_sched_domains_locked(void) { + struct cgroup_subsys_state *pos_css; struct sched_domain_attr *attr; cpumask_var_t *doms; + struct cpuset *cs; int ndoms; lockdep_assert_cpus_held(); percpu_rwsem_assert_held(&cpuset_rwsem); /* - * We have raced with CPU hotplug. Don't do anything to avoid + * If we have raced with CPU hotplug, return early to avoid * passing doms with offlined cpu to partition_sched_domains(). - * Anyways, hotplug work item will rebuild sched domains. + * Anyways, cpuset_hotplug_workfn() will rebuild sched domains. + * + * With no CPUs in any subpartitions, top_cpuset's effective CPUs + * should be the same as the active CPUs, so checking only top_cpuset + * is enough to detect racing CPU offlines. 
*/ if (!top_cpuset.nr_subparts_cpus && !cpumask_equal(top_cpuset.effective_cpus, cpu_active_mask)) return; - if (top_cpuset.nr_subparts_cpus && - !cpumask_subset(top_cpuset.effective_cpus, cpu_active_mask)) - return; + /* + * With subpartition CPUs, however, the effective CPUs of a partition + * root should be only a subset of the active CPUs. Since a CPU in any + * partition root could be offlined, all must be checked. + */ + if (top_cpuset.nr_subparts_cpus) { + rcu_read_lock(); + cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) { + if (!is_partition_root(cs)) { + pos_css = css_rightmost_descendant(pos_css); + continue; + } + if (!cpumask_subset(cs->effective_cpus, + cpu_active_mask)) { + rcu_read_unlock(); + return; + } + } + rcu_read_unlock(); + } /* Generate domain masks and attrs */ ndoms = generate_sched_domains(&doms, &attr); -- cgit v1.2.3 From b5b217346de85ed1b03fdecd5c5076b34fbb2f0b Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Tue, 10 Nov 2020 18:43:00 +0000 Subject: sched/topology: Warn when NUMA diameter > 2 NUMA topologies where the shortest path between some two nodes requires three or more hops (i.e. diameter > 2) end up being misrepresented in the scheduler topology structures. This is currently detected when booting a kernel with CONFIG_SCHED_DEBUG=y + sched_debug on the cmdline, although this will only yield a warning about sched_group spans not matching sched_domain spans: ERROR: groups don't span domain->span Add an explicit warning for that case, triggered regardless of CONFIG_SCHED_DEBUG, and decorate it with an appropriate comment. The topology described in the comment can be booted up on QEMU by appending the following to your usual QEMU incantation: -smp cores=4 \ -numa node,cpus=0,nodeid=0 -numa node,cpus=1,nodeid=1, \ -numa node,cpus=2,nodeid=2, -numa node,cpus=3,nodeid=3, \ -numa dist,src=0,dst=1,val=20, -numa dist,src=0,dst=2,val=30, \ -numa dist,src=0,dst=3,val=40, -numa dist,src=1,dst=2,val=20, \ -numa dist,src=1,dst=3,val=30, -numa dist,src=2,dst=3,val=20 A somewhat more realistic topology (6-node mesh) with the same affliction can be conjured with: -smp cores=6 \ -numa node,cpus=0,nodeid=0 -numa node,cpus=1,nodeid=1, \ -numa node,cpus=2,nodeid=2, -numa node,cpus=3,nodeid=3, \ -numa node,cpus=4,nodeid=4, -numa node,cpus=5,nodeid=5, \ -numa dist,src=0,dst=1,val=20, -numa dist,src=0,dst=2,val=30, \ -numa dist,src=0,dst=3,val=40, -numa dist,src=0,dst=4,val=30, \ -numa dist,src=0,dst=5,val=20, \ -numa dist,src=1,dst=2,val=20, -numa dist,src=1,dst=3,val=30, \ -numa dist,src=1,dst=4,val=20, -numa dist,src=1,dst=5,val=30, \ -numa dist,src=2,dst=3,val=20, -numa dist,src=2,dst=4,val=30, \ -numa dist,src=2,dst=5,val=40, \ -numa dist,src=3,dst=4,val=20, -numa dist,src=3,dst=5,val=30, \ -numa dist,src=4,dst=5,val=20 Signed-off-by: Valentin Schneider Signed-off-by: Peter Zijlstra (Intel) Acked-by: Mel Gorman Link: https://lore.kernel.org/lkml/jhjtux5edo2.mognet@arm.com --- kernel/sched/topology.c | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 90f3e5558fa2..b296c1c6b961 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -675,6 +675,7 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) { struct rq *rq = cpu_rq(cpu); struct sched_domain *tmp; + int numa_distance = 0; /* Remove the sched domains which do not contribute to scheduling. 
*/ for (tmp = sd; tmp; ) { @@ -706,6 +707,38 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) sd->child = NULL; } + for (tmp = sd; tmp; tmp = tmp->parent) + numa_distance += !!(tmp->flags & SD_NUMA); + + /* + * FIXME: Diameter >=3 is misrepresented. + * + * Smallest diameter=3 topology is: + * + * node 0 1 2 3 + * 0: 10 20 30 40 + * 1: 20 10 20 30 + * 2: 30 20 10 20 + * 3: 40 30 20 10 + * + * 0 --- 1 --- 2 --- 3 + * + * NUMA-3 0-3 N/A N/A 0-3 + * groups: {0-2},{1-3} {1-3},{0-2} + * + * NUMA-2 0-2 0-3 0-3 1-3 + * groups: {0-1},{1-3} {0-2},{2-3} {1-3},{0-1} {2-3},{0-2} + * + * NUMA-1 0-1 0-2 1-3 2-3 + * groups: {0},{1} {1},{2},{0} {2},{3},{1} {3},{2} + * + * NUMA-0 0 1 2 3 + * + * The NUMA-2 groups for nodes 0 and 3 are obviously buggered, as the + * group span isn't a subset of the domain span. + */ + WARN_ONCE(numa_distance > 2, "Shortest NUMA path spans too many nodes\n"); + sched_domain_debug(sd, cpu); rq_attach_root(rq, rd); -- cgit v1.2.3 From b19a888c1e9bdf12e0d8dd9aeb887ca7de91c8a5 Mon Sep 17 00:00:00 2001 From: Tal Zussman Date: Thu, 12 Nov 2020 19:51:56 -0500 Subject: sched/core: Fix typos in comments Signed-off-by: Tal Zussman Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20201113005156.GA8408@charmander --- kernel/sched/core.c | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 28d541a3c74d..a9e6d630eb83 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -97,7 +97,7 @@ int sysctl_sched_rt_runtime = 950000; * * Normal scheduling state is serialized by rq->lock. __schedule() takes the * local CPU's rq->lock, it optionally removes the task from the runqueue and - * always looks at the local rq data structures to find the most elegible task + * always looks at the local rq data structures to find the most eligible task * to run next. * * Task enqueue is also under rq->lock, possibly taken from another CPU. @@ -518,7 +518,7 @@ static bool __wake_q_add(struct wake_q_head *head, struct task_struct *task) /* * Atomically grab the task, if ->wake_q is !nil already it means - * its already queued (either by us or someone else) and will get the + * it's already queued (either by us or someone else) and will get the * wakeup due to that. * * In order to ensure that a pending wakeup will observe our pending @@ -769,7 +769,7 @@ bool sched_can_stop_tick(struct rq *rq) return false; /* - * If there are more than one RR tasks, we need the tick to effect the + * If there are more than one RR tasks, we need the tick to affect the * actual RR behaviour. */ if (rq->rt.rr_nr_running) { @@ -1187,14 +1187,14 @@ static inline void uclamp_rq_dec_id(struct rq *rq, struct task_struct *p, * accounting was performed at enqueue time and we can just return * here. 
* - * Need to be careful of the following enqeueue/dequeue ordering + * Need to be careful of the following enqueue/dequeue ordering * problem too * * enqueue(taskA) * // sched_uclamp_used gets enabled * enqueue(taskB) * dequeue(taskA) - * // Must not decrement bukcet->tasks here + * // Must not decrement bucket->tasks here * dequeue(taskB) * * where we could end up with stale data in uc_se and @@ -2924,7 +2924,7 @@ static void ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags, #ifdef CONFIG_SMP if (p->sched_class->task_woken) { /* - * Our task @p is fully woken up and running; so its safe to + * Our task @p is fully woken up and running; so it's safe to * drop the rq->lock, hereafter rq is only used for statistics. */ rq_unpin_lock(rq, rf); @@ -3411,7 +3411,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) /* * If the owning (remote) CPU is still in the middle of schedule() with - * this task as prev, wait until its done referencing the task. + * this task as prev, wait until it's done referencing the task. * * Pairs with the smp_store_release() in finish_task(). * @@ -3816,7 +3816,7 @@ void wake_up_new_task(struct task_struct *p) #ifdef CONFIG_SMP if (p->sched_class->task_woken) { /* - * Nothing relies on rq->lock after this, so its fine to + * Nothing relies on rq->lock after this, so it's fine to * drop it. */ rq_unpin_lock(rq, &rf); @@ -4343,7 +4343,7 @@ unsigned long nr_iowait_cpu(int cpu) } /* - * IO-wait accounting, and how its mostly bollocks (on SMP). + * IO-wait accounting, and how it's mostly bollocks (on SMP). * * The idea behind IO-wait account is to account the idle time that we could * have spend running if it were not for IO. That is, if we were to improve the @@ -4838,7 +4838,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) /* * Optimization: we know that if all tasks are in the fair class we can * call that function directly, but only if the @prev task wasn't of a - * higher scheduling class, because otherwise those loose the + * higher scheduling class, because otherwise those lose the * opportunity to pull in more work from other CPUs. */ if (likely(prev->sched_class <= &fair_sched_class && @@ -5361,7 +5361,7 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task) * right. rt_mutex_slowunlock()+rt_mutex_postunlock() work together to * ensure a task is de-boosted (pi_task is set to NULL) before the * task is allowed to run again (and can exit). This ensures the pointer - * points to a blocked task -- which guaratees the task is present. + * points to a blocked task -- which guarantees the task is present. */ p->pi_top_task = pi_task; @@ -5479,7 +5479,7 @@ void set_user_nice(struct task_struct *p, long nice) /* * The RT priorities are set via sched_setscheduler(), but we still * allow the 'normal' nice value to be set - but as expected - * it wont have any effect on scheduling until the task is + * it won't have any effect on scheduling until the task is * SCHED_DEADLINE, SCHED_FIFO or SCHED_RR: */ if (task_has_dl_policy(p) || task_has_rt_policy(p)) { @@ -6668,7 +6668,7 @@ EXPORT_SYMBOL(__cond_resched_lock); * * The scheduler is at all times free to pick the calling task as the most * eligible task to run, if removing the yield() call from your code breaks - * it, its already broken. + * it, it's already broken. 
* * Typical broken usage is: * @@ -7042,7 +7042,7 @@ void init_idle(struct task_struct *idle, int cpu) #ifdef CONFIG_SMP /* - * Its possible that init_idle() gets called multiple times on a task, + * It's possible that init_idle() gets called multiple times on a task, * in that case do_set_cpus_allowed() will not do the right thing. * * And since this is boot we can forgo the serialization. @@ -8225,7 +8225,7 @@ static int cpu_cgroup_can_attach(struct cgroup_taskset *tset) return -EINVAL; #endif /* - * Serialize against wake_up_new_task() such that if its + * Serialize against wake_up_new_task() such that if it's * running, we're sure to observe its full state. */ raw_spin_lock_irq(&task->pi_lock); -- cgit v1.2.3 From 480a6ca2dc6ed82c783faf7e4a9644769b8397d8 Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Fri, 13 Nov 2020 12:34:54 +0100 Subject: sched/uclamp: Allow to reset a task uclamp constraint value In case the user wants to stop controlling a uclamp constraint value for a task, use the magic value -1 in sched_util_{min,max} with the appropriate sched_flags (SCHED_FLAG_UTIL_CLAMP_{MIN,MAX}) to indicate the reset. The advantage over the 'additional flag' approach (i.e. introducing SCHED_FLAG_UTIL_CLAMP_RESET) is that no additional flag has to be exported via uapi. This avoids the need to document how this new flag has be used in conjunction with the existing uclamp related flags. The following subtle issue is fixed as well. When a uclamp constraint value is set on a !user_defined uclamp_se it is currently first reset and then set. Fix this by AND'ing !user_defined with !SCHED_FLAG_UTIL_CLAMP which stands for the 'sched class change' case. The related condition 'if (uc_se->user_defined)' moved from __setscheduler_uclamp() into uclamp_reset(). Signed-off-by: Dietmar Eggemann Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Yun Hsiang Link: https://lkml.kernel.org/r/20201113113454.25868-1-dietmar.eggemann@arm.com --- include/uapi/linux/sched/types.h | 2 ++ kernel/sched/core.c | 70 +++++++++++++++++++++++++++++----------- 2 files changed, 53 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/include/uapi/linux/sched/types.h b/include/uapi/linux/sched/types.h index c852153ddb0d..f2c4589d4dbf 100644 --- a/include/uapi/linux/sched/types.h +++ b/include/uapi/linux/sched/types.h @@ -96,6 +96,8 @@ struct sched_param { * on a CPU with a capacity big enough to fit the specified value. * A task with a max utilization value smaller than 1024 is more likely * scheduled on a CPU with no more capacity than the specified value. + * + * A task utilization boundary can be reset by setting the attribute to -1. 
*/ struct sched_attr { __u32 size; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index a9e6d630eb83..e6473ecaab3c 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1413,17 +1413,24 @@ done: static int uclamp_validate(struct task_struct *p, const struct sched_attr *attr) { - unsigned int lower_bound = p->uclamp_req[UCLAMP_MIN].value; - unsigned int upper_bound = p->uclamp_req[UCLAMP_MAX].value; + int util_min = p->uclamp_req[UCLAMP_MIN].value; + int util_max = p->uclamp_req[UCLAMP_MAX].value; - if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN) - lower_bound = attr->sched_util_min; - if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX) - upper_bound = attr->sched_util_max; + if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN) { + util_min = attr->sched_util_min; - if (lower_bound > upper_bound) - return -EINVAL; - if (upper_bound > SCHED_CAPACITY_SCALE) + if (util_min + 1 > SCHED_CAPACITY_SCALE + 1) + return -EINVAL; + } + + if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX) { + util_max = attr->sched_util_max; + + if (util_max + 1 > SCHED_CAPACITY_SCALE + 1) + return -EINVAL; + } + + if (util_min != -1 && util_max != -1 && util_min > util_max) return -EINVAL; /* @@ -1438,20 +1445,41 @@ static int uclamp_validate(struct task_struct *p, return 0; } +static bool uclamp_reset(const struct sched_attr *attr, + enum uclamp_id clamp_id, + struct uclamp_se *uc_se) +{ + /* Reset on sched class change for a non user-defined clamp value. */ + if (likely(!(attr->sched_flags & SCHED_FLAG_UTIL_CLAMP)) && + !uc_se->user_defined) + return true; + + /* Reset on sched_util_{min,max} == -1. */ + if (clamp_id == UCLAMP_MIN && + attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN && + attr->sched_util_min == -1) { + return true; + } + + if (clamp_id == UCLAMP_MAX && + attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX && + attr->sched_util_max == -1) { + return true; + } + + return false; +} + static void __setscheduler_uclamp(struct task_struct *p, const struct sched_attr *attr) { enum uclamp_id clamp_id; - /* - * On scheduling class change, reset to default clamps for tasks - * without a task-specific value. - */ for_each_clamp_id(clamp_id) { struct uclamp_se *uc_se = &p->uclamp_req[clamp_id]; + unsigned int value; - /* Keep using defined clamps across class changes */ - if (uc_se->user_defined) + if (!uclamp_reset(attr, clamp_id, uc_se)) continue; /* @@ -1459,21 +1487,25 @@ static void __setscheduler_uclamp(struct task_struct *p, * at runtime. 
*/ if (unlikely(rt_task(p) && clamp_id == UCLAMP_MIN)) - __uclamp_update_util_min_rt_default(p); + value = sysctl_sched_uclamp_util_min_rt_default; else - uclamp_se_set(uc_se, uclamp_none(clamp_id), false); + value = uclamp_none(clamp_id); + + uclamp_se_set(uc_se, value, false); } if (likely(!(attr->sched_flags & SCHED_FLAG_UTIL_CLAMP))) return; - if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN) { + if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN && + attr->sched_util_min != -1) { uclamp_se_set(&p->uclamp_req[UCLAMP_MIN], attr->sched_util_min, true); } - if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX) { + if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX && + attr->sched_util_max != -1) { uclamp_se_set(&p->uclamp_req[UCLAMP_MAX], attr->sched_util_max, true); } -- cgit v1.2.3 From 31f6a8c0a471be7d7d05c93eac50fcb729e79b9d Mon Sep 17 00:00:00 2001 From: Ionela Voinescu Date: Tue, 27 Oct 2020 18:07:11 +0000 Subject: sched/topology,schedutil: Wrap sched domains rebuild Add the rebuild_sched_domains_energy() function to wrap the functionality that rebuilds the scheduling domains if any of the Energy Aware Scheduling (EAS) initialisation conditions change. This functionality is used when schedutil is added or removed or when EAS is enabled or disabled through the sched_energy_aware sysctl. Therefore, create a single function that is used in both these cases and that can be later reused. Signed-off-by: Ionela Voinescu Signed-off-by: Peter Zijlstra (Intel) Acked-by: Quentin Perret Acked-by: Rafael J. Wysocki Link: https://lkml.kernel.org/r/20201027180713.7642-2-ionela.voinescu@arm.com --- include/linux/sched/topology.h | 8 ++++++++ kernel/sched/cpufreq_schedutil.c | 9 +-------- kernel/sched/topology.c | 18 +++++++++++------- 3 files changed, 20 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h index 9ef7bf686a9f..8f0f778b7c91 100644 --- a/include/linux/sched/topology.h +++ b/include/linux/sched/topology.h @@ -225,6 +225,14 @@ static inline bool cpus_share_cache(int this_cpu, int that_cpu) #endif /* !CONFIG_SMP */ +#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) +extern void rebuild_sched_domains_energy(void); +#else +static inline void rebuild_sched_domains_energy(void) +{ +} +#endif + #ifndef arch_scale_cpu_capacity /** * arch_scale_cpu_capacity - get the capacity scale factor of a given CPU. 
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index e254745a82cb..37b303890336 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -899,16 +899,9 @@ struct cpufreq_governor *cpufreq_default_governor(void) cpufreq_governor_init(schedutil_gov); #ifdef CONFIG_ENERGY_MODEL -extern bool sched_energy_update; -extern struct mutex sched_energy_mutex; - static void rebuild_sd_workfn(struct work_struct *work) { - mutex_lock(&sched_energy_mutex); - sched_energy_update = true; - rebuild_sched_domains(); - sched_energy_update = false; - mutex_unlock(&sched_energy_mutex); + rebuild_sched_domains_energy(); } static DECLARE_WORK(rebuild_sd_work, rebuild_sd_workfn); diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index b296c1c6b961..04d9ebfaedd6 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -211,6 +211,15 @@ unsigned int sysctl_sched_energy_aware = 1; DEFINE_MUTEX(sched_energy_mutex); bool sched_energy_update; +void rebuild_sched_domains_energy(void) +{ + mutex_lock(&sched_energy_mutex); + sched_energy_update = true; + rebuild_sched_domains(); + sched_energy_update = false; + mutex_unlock(&sched_energy_mutex); +} + #ifdef CONFIG_PROC_SYSCTL int sched_energy_aware_handler(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) @@ -223,13 +232,8 @@ int sched_energy_aware_handler(struct ctl_table *table, int write, ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); if (!ret && write) { state = static_branch_unlikely(&sched_energy_present); - if (state != sysctl_sched_energy_aware) { - mutex_lock(&sched_energy_mutex); - sched_energy_update = 1; - rebuild_sched_domains(); - sched_energy_update = 0; - mutex_unlock(&sched_energy_mutex); - } + if (state != sysctl_sched_energy_aware) + rebuild_sched_domains_energy(); } return ret; -- cgit v1.2.3 From fa50e2b452c60cff9f4000de5b372a61d6695c26 Mon Sep 17 00:00:00 2001 From: Ionela Voinescu Date: Tue, 27 Oct 2020 18:07:13 +0000 Subject: sched/topology: Condition EAS enablement on FIE support In order to make accurate predictions across CPUs and for all performance states, Energy Aware Scheduling (EAS) needs frequency-invariant load tracking signals. EAS task placement aims to minimize energy consumption, and does so in part by limiting the search space to only CPUs with the highest spare capacity (CPU capacity - CPU utilization) in their performance domain. Those candidates are the placement choices that will keep frequency at its lowest possible and therefore save the most energy. But without frequency invariance, a CPU's utilization is relative to the CPU's current performance level, and not relative to its maximum performance level, which determines its capacity. As a result, it will fail to correctly indicate any potential spare capacity obtained by an increase in a CPU's performance level. Therefore, a non-invariant utilization signal would render the EAS task placement logic invalid. Now that we properly report support for the Frequency Invariance Engine (FIE) through arch_scale_freq_invariant() for arm and arm64 systems, while also ensuring a re-evaluation of the EAS use conditions for possible invariance status change, we can assert this is the case when initializing EAS. Warn and bail out otherwise. 
Suggested-by: Quentin Perret Signed-off-by: Ionela Voinescu Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20201027180713.7642-4-ionela.voinescu@arm.com --- kernel/sched/topology.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 04d9ebfaedd6..5d3675c7a76b 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -328,6 +328,7 @@ static void sched_energy_set(bool has_eas) * 3. no SMT is detected. * 4. the EM complexity is low enough to keep scheduling overheads low; * 5. schedutil is driving the frequency of all CPUs of the rd; + * 6. frequency invariance support is present; * * The complexity of the Energy Model is defined as: * @@ -376,6 +377,14 @@ static bool build_perf_domains(const struct cpumask *cpu_map) goto free; } + if (!arch_scale_freq_invariant()) { + if (sched_debug()) { + pr_warn("rd %*pbl: Disabling EAS: frequency-invariant load tracking not yet supported", + cpumask_pr_args(cpu_map)); + } + goto free; + } + for_each_cpu(i, cpu_map) { /* Skip already covered CPUs. */ if (find_pd(pd, i)) -- cgit v1.2.3 From 6fa6d28051e9fcaa1570e69648ea13a353a5d218 Mon Sep 17 00:00:00 2001 From: Daniel Xu Date: Tue, 17 Nov 2020 12:05:45 -0800 Subject: lib/strncpy_from_user.c: Mask out bytes after NUL terminator. do_strncpy_from_user() may copy some extra bytes after the NUL terminator into the destination buffer. This usually does not matter for normal string operations. However, when BPF programs key BPF maps with strings, this matters a lot. A BPF program may read strings from user memory by calling the bpf_probe_read_user_str() helper which eventually calls do_strncpy_from_user(). The program can then key a map with the destination buffer. BPF map keys are fixed-width and string-agnostic, meaning that map keys are treated as a set of bytes. The issue is when do_strncpy_from_user() overcopies bytes after the NUL terminator, it can result in seemingly identical strings occupying multiple slots in a BPF map. This behavior is subtle and totally unexpected by the user. This commit masks out the bytes following the NUL while preserving long-sized stride in the fast path. Fixes: 6ae08ae3dea2 ("bpf: Add probe_read_{user, kernel} and probe_read_{user, kernel}_str helpers") Signed-off-by: Daniel Xu Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/21efc982b3e9f2f7b0379eed642294caaa0c27a7.1605642949.git.dxu@dxuuu.xyz --- kernel/trace/bpf_trace.c | 10 ++++++++++ lib/strncpy_from_user.c | 19 +++++++++++++++++-- 2 files changed, 27 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 5113fd423cdf..048c655315f1 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -181,6 +181,16 @@ bpf_probe_read_user_str_common(void *dst, u32 size, { int ret; + /* + * NB: We rely on strncpy_from_user() not copying junk past the NUL + * terminator into `dst`. + * + * strncpy_from_user() does long-sized strides in the fast path. If the + * strncpy does not mask out the bytes after the NUL in `unsafe_ptr`, + * then there could be junk after the NUL in `dst`. If user takes `dst` + * and keys a hash map with it, then semantically identical strings can + * occupy multiple entries in the map. 
+ */ ret = strncpy_from_user_nofault(dst, unsafe_ptr, size); if (unlikely(ret < 0)) memset(dst, 0, size); diff --git a/lib/strncpy_from_user.c b/lib/strncpy_from_user.c index e6d5fcc2cdf3..122d8d0e253c 100644 --- a/lib/strncpy_from_user.c +++ b/lib/strncpy_from_user.c @@ -35,17 +35,32 @@ static inline long do_strncpy_from_user(char *dst, const char __user *src, goto byte_at_a_time; while (max >= sizeof(unsigned long)) { - unsigned long c, data; + unsigned long c, data, mask; /* Fall back to byte-at-a-time if we get a page fault */ unsafe_get_user(c, (unsigned long __user *)(src+res), byte_at_a_time); - *(unsigned long *)(dst+res) = c; + /* + * Note that we mask out the bytes following the NUL. This is + * important to do because string oblivious code may read past + * the NUL. For those routines, we don't want to give them + * potentially random bytes after the NUL in `src`. + * + * One example of such code is BPF map keys. BPF treats map keys + * as an opaque set of bytes. Without the post-NUL mask, any BPF + * maps keyed by strings returned from strncpy_from_user() may + * have multiple entries for semantically identical strings. + */ if (has_zero(c, &data, &constants)) { data = prep_zero_mask(c, data, &constants); data = create_zero_mask(data); + mask = zero_bytemask(data); + *(unsigned long *)(dst+res) = c & mask; return res + find_zero(data); } + + *(unsigned long *)(dst+res) = c; + res += sizeof(unsigned long); max -= sizeof(unsigned long); } -- cgit v1.2.3 From 2801a5da5b25b7af9dd2addd19b2315c02d17b64 Mon Sep 17 00:00:00 2001 From: Luo Meng Date: Wed, 18 Nov 2020 22:49:31 +0900 Subject: fail_function: Remove a redundant mutex unlock Fix a mutex_unlock() issue where before copy_from_user() is not called mutex_locked. Fixes: 4b1a29a7f542 ("error-injection: Support fault injection framework") Reported-by: Hulk Robot Signed-off-by: Luo Meng Signed-off-by: Masami Hiramatsu Signed-off-by: Alexei Starovoitov Acked-by: Masami Hiramatsu Link: https://lore.kernel.org/bpf/160570737118.263807.8358435412898356284.stgit@devnote2 --- kernel/fail_function.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/fail_function.c b/kernel/fail_function.c index 63b349168da7..b0b1ad93fa95 100644 --- a/kernel/fail_function.c +++ b/kernel/fail_function.c @@ -253,7 +253,7 @@ static ssize_t fei_write(struct file *file, const char __user *buffer, if (copy_from_user(buf, buffer, count)) { ret = -EFAULT; - goto out; + goto out_free; } buf[count] = '\0'; sym = strstrip(buf); @@ -307,8 +307,9 @@ static ssize_t fei_write(struct file *file, const char __user *buffer, ret = count; } out: - kfree(buf); mutex_unlock(&fei_lock); +out_free: + kfree(buf); return ret; } -- cgit v1.2.3 From 6dbce04d8417ae706596366e16841d77c454ba52 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 16 Nov 2020 13:10:12 +0100 Subject: rcu: Allow rcu_irq_enter_check_tick() from NMI MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Eugenio managed to tickle #PF from NMI context which resulted in hitting a WARN in RCU through irqentry_enter() -> __rcu_irq_enter_check_tick(). However, this situation is perfectly sane and does not warrant an WARN. The #PF will (necessarily) be atomic and not require messing with the tick state, so early return is correct. This commit therefore removes the WARN. 
Fixes: aaf2bc50df1f ("rcu: Abstract out rcu_irq_enter_check_tick() from rcu_nmi_enter()") Reported-by: "Eugenio Pérez" Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Andy Lutomirski Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 06895ef85d69..93e1808ac3fc 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -928,8 +928,8 @@ void __rcu_irq_enter_check_tick(void) { struct rcu_data *rdp = this_cpu_ptr(&rcu_data); - // Enabling the tick is unsafe in NMI handlers. - if (WARN_ON_ONCE(in_nmi())) + // If we're here from NMI there's nothing to do. + if (in_nmi()) return; RCU_LOCKDEP_WARN(rcu_dynticks_curr_cpu_in_eqs(), -- cgit v1.2.3 From dfe564045c653d9e6969ccca57a8a04771d333f7 Mon Sep 17 00:00:00 2001 From: chao Date: Sun, 30 Aug 2020 23:41:17 -0700 Subject: rcu: Panic after fixed number of stalls Some stalls are transient, so that system fully recovers. This commit therefore allows users to configure the number of stalls that must happen in order to trigger kernel panic. Signed-off-by: chao Signed-off-by: Paul E. McKenney --- include/linux/kernel.h | 1 + kernel/rcu/tree_stall.h | 6 ++++++ kernel/sysctl.c | 11 +++++++++++ 3 files changed, 18 insertions(+) (limited to 'kernel') diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 2f05e9128201..4b5fd3da5fe8 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -536,6 +536,7 @@ extern int panic_on_warn; extern unsigned long panic_on_taint; extern bool panic_on_taint_nousertaint; extern int sysctl_panic_on_rcu_stall; +extern int sysctl_max_rcu_stall_to_panic; extern int sysctl_panic_on_stackoverflow; extern bool crash_kexec_post_notifiers; diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index ca21d28a0f98..70d48c52fabc 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -13,6 +13,7 @@ /* panic() on RCU Stall sysctl. */ int sysctl_panic_on_rcu_stall __read_mostly; +int sysctl_max_rcu_stall_to_panic __read_mostly; #ifdef CONFIG_PROVE_RCU #define RCU_STALL_DELAY_DELTA (5 * HZ) @@ -106,6 +107,11 @@ early_initcall(check_cpu_stall_init); /* If so specified via sysctl, panic, yielding cleaner stall-warning output. */ static void panic_on_rcu_stall(void) { + static int cpu_stall; + + if (++cpu_stall < sysctl_max_rcu_stall_to_panic) + return; + if (sysctl_panic_on_rcu_stall) panic("RCU Stall\n"); } diff --git a/kernel/sysctl.c b/kernel/sysctl.c index afad085960b8..c9fbdd848138 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2650,6 +2650,17 @@ static struct ctl_table kern_table[] = { .extra2 = SYSCTL_ONE, }, #endif +#if defined(CONFIG_TREE_RCU) + { + .procname = "max_rcu_stall_to_panic", + .data = &sysctl_max_rcu_stall_to_panic, + .maxlen = sizeof(sysctl_max_rcu_stall_to_panic), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ONE, + .extra2 = SYSCTL_INT_MAX, + }, +#endif #ifdef CONFIG_STACKLEAK_RUNTIME_DISABLE { .procname = "stack_erasing", -- cgit v1.2.3 From e3771c850d3b9349b48449c9a91c98944a08650c Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 21 Sep 2020 14:43:40 +0200 Subject: rcu: Implement rcu_segcblist_is_offloaded() config dependent This commit simplifies the use of the rcu_segcblist_is_offloaded() API so that its callers no longer need to check the RCU_NOCB_CPU Kconfig option. 
Note that rcu_segcblist_is_offloaded() is defined in the header file, which means that the generated code should be just as efficient as before. Suggested-by: Paul E. McKenney Signed-off-by: Frederic Weisbecker Cc: Paul E. McKenney Cc: Josh Triplett Cc: Steven Rostedt Cc: Mathieu Desnoyers Cc: Lai Jiangshan Cc: Joel Fernandes Signed-off-by: Paul E. McKenney --- kernel/rcu/rcu_segcblist.h | 2 +- kernel/rcu/tree.c | 21 +++++++-------------- 2 files changed, 8 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcu_segcblist.h b/kernel/rcu/rcu_segcblist.h index 5c293afc07b8..492262bcb591 100644 --- a/kernel/rcu/rcu_segcblist.h +++ b/kernel/rcu/rcu_segcblist.h @@ -62,7 +62,7 @@ static inline bool rcu_segcblist_is_enabled(struct rcu_segcblist *rsclp) /* Is the specified rcu_segcblist offloaded? */ static inline bool rcu_segcblist_is_offloaded(struct rcu_segcblist *rsclp) { - return rsclp->offloaded; + return IS_ENABLED(CONFIG_RCU_NOCB_CPU) && rsclp->offloaded; } /* diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 93e1808ac3fc..0ccdca441ddf 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1603,8 +1603,7 @@ static bool __note_gp_changes(struct rcu_node *rnp, struct rcu_data *rdp) { bool ret = false; bool need_qs; - const bool offloaded = IS_ENABLED(CONFIG_RCU_NOCB_CPU) && - rcu_segcblist_is_offloaded(&rdp->cblist); + const bool offloaded = rcu_segcblist_is_offloaded(&rdp->cblist); raw_lockdep_assert_held_rcu_node(rnp); @@ -2048,8 +2047,7 @@ static void rcu_gp_cleanup(void) needgp = true; } /* Advance CBs to reduce false positives below. */ - offloaded = IS_ENABLED(CONFIG_RCU_NOCB_CPU) && - rcu_segcblist_is_offloaded(&rdp->cblist); + offloaded = rcu_segcblist_is_offloaded(&rdp->cblist); if ((offloaded || !rcu_accelerate_cbs(rnp, rdp)) && needgp) { WRITE_ONCE(rcu_state.gp_flags, RCU_GP_FLAG_INIT); WRITE_ONCE(rcu_state.gp_req_activity, jiffies); @@ -2248,8 +2246,7 @@ rcu_report_qs_rdp(struct rcu_data *rdp) unsigned long flags; unsigned long mask; bool needwake = false; - const bool offloaded = IS_ENABLED(CONFIG_RCU_NOCB_CPU) && - rcu_segcblist_is_offloaded(&rdp->cblist); + const bool offloaded = rcu_segcblist_is_offloaded(&rdp->cblist); struct rcu_node *rnp; WARN_ON_ONCE(rdp->cpu != smp_processor_id()); @@ -2417,8 +2414,7 @@ static void rcu_do_batch(struct rcu_data *rdp) { int div; unsigned long flags; - const bool offloaded = IS_ENABLED(CONFIG_RCU_NOCB_CPU) && - rcu_segcblist_is_offloaded(&rdp->cblist); + const bool offloaded = rcu_segcblist_is_offloaded(&rdp->cblist); struct rcu_head *rhp; struct rcu_cblist rcl = RCU_CBLIST_INITIALIZER(rcl); long bl, count; @@ -2675,8 +2671,7 @@ static __latent_entropy void rcu_core(void) unsigned long flags; struct rcu_data *rdp = raw_cpu_ptr(&rcu_data); struct rcu_node *rnp = rdp->mynode; - const bool offloaded = IS_ENABLED(CONFIG_RCU_NOCB_CPU) && - rcu_segcblist_is_offloaded(&rdp->cblist); + const bool offloaded = rcu_segcblist_is_offloaded(&rdp->cblist); if (cpu_is_offline(smp_processor_id())) return; @@ -2978,8 +2973,7 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func) rcu_segcblist_n_cbs(&rdp->cblist)); /* Go handle any RCU core processing required. 
*/ - if (IS_ENABLED(CONFIG_RCU_NOCB_CPU) && - unlikely(rcu_segcblist_is_offloaded(&rdp->cblist))) { + if (unlikely(rcu_segcblist_is_offloaded(&rdp->cblist))) { __call_rcu_nocb_wake(rdp, was_alldone, flags); /* unlocks */ } else { __call_rcu_core(rdp, head, flags); @@ -3712,8 +3706,7 @@ static int rcu_pending(int user) /* Has RCU gone idle with this CPU needing another grace period? */ if (!gp_in_progress && rcu_segcblist_is_enabled(&rdp->cblist) && - (!IS_ENABLED(CONFIG_RCU_NOCB_CPU) || - !rcu_segcblist_is_offloaded(&rdp->cblist)) && + !rcu_segcblist_is_offloaded(&rdp->cblist) && !rcu_segcblist_restempty(&rdp->cblist, RCU_NEXT_READY_TAIL)) return 1; -- cgit v1.2.3 From ed73860cecc3ec12aa50a6dcfb4900e5b4ae9507 Mon Sep 17 00:00:00 2001 From: Neeraj Upadhyay Date: Wed, 23 Sep 2020 12:59:33 +0530 Subject: rcu: Fix single-CPU check in rcu_blocking_is_gp() Currently, for CONFIG_PREEMPTION=n kernels, rcu_blocking_is_gp() uses num_online_cpus() to determine whether there is only one CPU online. When there is only a single CPU online, the simple fact that synchronize_rcu() could be legally called implies that a full grace period has elapsed. Therefore, in the single-CPU case, synchronize_rcu() simply returns immediately. Unfortunately, num_online_cpus() is unreliable while a CPU-hotplug operation is transitioning to or from single-CPU operation because: 1. num_online_cpus() uses atomic_read(&__num_online_cpus) to locklessly sample the number of online CPUs. The hotplug locks are not held, which means that an incoming CPU can concurrently update this count. This in turn means that an RCU read-side critical section on the incoming CPU might observe updates prior to the grace period, but also that this critical section might extend beyond the end of the optimized synchronize_rcu(). This breaks RCU's fundamental guarantee. 2. In addition, num_online_cpus() does no ordering, thus providing another way that RCU's fundamental guarantee can be broken by the current code. 3. The most probable failure mode happens on outgoing CPUs. The outgoing CPU updates the count of online CPUs in the CPUHP_TEARDOWN_CPU stop-machine handler, which is fine in and of itself due to preemption being disabled at the call to num_online_cpus(). Unfortunately, after that stop-machine handler returns, the CPU takes one last trip through the scheduler (which has RCU readers) and, after the resulting context switch, one final dive into the idle loop. During this time, RCU needs to keep track of two CPUs, but num_online_cpus() will say that there is only one, which in turn means that the surviving CPU will incorrectly ignore the outgoing CPU's RCU read-side critical sections. This problem is illustrated by the following litmus test in which P0() corresponds to synchronize_rcu() and P1() corresponds to the incoming CPU. The herd7 tool confirms that the "exists" clause can be satisfied, thus demonstrating that this breakage can happen according to the Linux kernel memory model. { int x = 0; atomic_t numonline = ATOMIC_INIT(1); } P0(int *x, atomic_t *numonline) { int r0; WRITE_ONCE(*x, 1); r0 = atomic_read(numonline); if (r0 == 1) { smp_mb(); } else { synchronize_rcu(); } WRITE_ONCE(*x, 2); } P1(int *x, atomic_t *numonline) { int r0; int r1; atomic_inc(numonline); smp_mb(); rcu_read_lock(); r0 = READ_ONCE(*x); smp_rmb(); r1 = READ_ONCE(*x); rcu_read_unlock(); } locations [x;numonline;] exists (1:r0=0 /\ 1:r1=2) It is important to note that these problems arise only when the system is transitioning to or from single-CPU operation. 
One solution would be to hold the CPU-hotplug locks while sampling num_online_cpus(), which was in fact the intent of the (redundant) preempt_disable() and preempt_enable() surrounding this call to num_online_cpus(). Actually blocking CPU hotplug would not only result in excessive overhead, but would also unnecessarily impede CPU-hotplug operations. This commit therefore follows long-standing RCU tradition by maintaining a separate RCU-specific set of CPU-hotplug books. This separate set of books is implemented by a new ->n_online_cpus field in the rcu_state structure that maintains RCU's count of the online CPUs. This count is incremented early in the CPU-online process, so that the critical transition away from single-CPU operation will occur when there is only a single CPU. Similarly for the critical transition to single-CPU operation, the counter is decremented late in the CPU-offline process, again while there is only a single CPU. Because there is only ever a single CPU when the ->n_online_cpus field undergoes the critical 1->2 and 2->1 transitions, full memory ordering and mutual exclusion is provided implicitly and, better yet, for free. In the case where the CPU is coming online, nothing will happen until the current CPU helps it come online. Therefore, the new CPU will see all accesses prior to the optimized grace period, which means that RCU does not need to further delay this new CPU. In the case where the CPU is going offline, the outgoing CPU is totally out of the picture before the optimized grace period starts, which means that this outgoing CPU cannot see any of the accesses following that grace period. Again, RCU needs no further interaction with the outgoing CPU. This does mean that synchronize_rcu() will unnecessarily do a few grace periods the hard way just before the second CPU comes online and just after the second-to-last CPU goes offline, but it is not worth optimizing this uncommon case. Reviewed-by: Joel Fernandes (Google) Signed-off-by: Neeraj Upadhyay Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 19 +++++++++++++++++-- kernel/rcu/tree.h | 1 + 2 files changed, 18 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 0ccdca441ddf..39e14cf6a9c0 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2396,6 +2396,7 @@ int rcutree_dead_cpu(unsigned int cpu) if (!IS_ENABLED(CONFIG_HOTPLUG_CPU)) return 0; + WRITE_ONCE(rcu_state.n_online_cpus, rcu_state.n_online_cpus - 1); /* Adjust any no-longer-needed kthreads. */ rcu_boost_kthread_setaffinity(rnp, -1); /* Do any needed no-CB deferred wakeups from this CPU. */ @@ -3577,7 +3578,20 @@ static int rcu_blocking_is_gp(void) return rcu_scheduler_active == RCU_SCHEDULER_INACTIVE; might_sleep(); /* Check for RCU read-side critical section. */ preempt_disable(); - ret = num_online_cpus() <= 1; + /* + * If the rcu_state.n_online_cpus counter is equal to one, + * there is only one CPU, and that CPU sees all prior accesses + * made by any CPU that was online at the time of its access. + * Furthermore, if this counter is equal to one, its value cannot + * change until after the preempt_enable() below. + * + * Furthermore, if rcu_state.n_online_cpus is equal to one here, + * all later CPUs (both this one and any that come online later + * on) are guaranteed to see all accesses prior to this point + * in the code, without the need for additional memory barriers. + * Those memory barriers are provided by CPU-hotplug code. 
+ */ + ret = READ_ONCE(rcu_state.n_online_cpus) <= 1; preempt_enable(); return ret; } @@ -3622,7 +3636,7 @@ void synchronize_rcu(void) lock_is_held(&rcu_sched_lock_map), "Illegal synchronize_rcu() in RCU read-side critical section"); if (rcu_blocking_is_gp()) - return; + return; // Context allows vacuous grace periods. if (rcu_gp_is_expedited()) synchronize_rcu_expedited(); else @@ -3962,6 +3976,7 @@ int rcutree_prepare_cpu(unsigned int cpu) raw_spin_unlock_irqrestore_rcu_node(rnp, flags); rcu_prepare_kthreads(cpu); rcu_spawn_cpu_nocb_kthread(cpu); + WRITE_ONCE(rcu_state.n_online_cpus, rcu_state.n_online_cpus + 1); return 0; } diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index e4f66b8f7c47..805c9eb6f7ae 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -298,6 +298,7 @@ struct rcu_state { /* Hierarchy levels (+1 to */ /* shut bogus gcc warning) */ int ncpus; /* # CPUs seen so far. */ + int n_online_cpus; /* # CPUs online for RCU. */ /* The following fields are guarded by the root rcu_node's lock. */ -- cgit v1.2.3 From a3941517fcd6625adc540aef5ec3f717c8fa71e8 Mon Sep 17 00:00:00 2001 From: Neeraj Upadhyay Date: Thu, 24 Sep 2020 12:04:10 +0530 Subject: rcu: Clarify nocb kthreads naming in RCU_NOCB_CPU config This commit clarifies that the "p" and the "s" in the in the RCU_NOCB_CPU config-option description refer to the "x" in the "rcuox/N" kthread name. Signed-off-by: Neeraj Upadhyay [ paulmck: While in the area, update description and advice. ] Signed-off-by: Paul E. McKenney --- kernel/rcu/Kconfig | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig index b71e21f73c40..cdc57b4f6d48 100644 --- a/kernel/rcu/Kconfig +++ b/kernel/rcu/Kconfig @@ -221,19 +221,23 @@ config RCU_NOCB_CPU Use this option to reduce OS jitter for aggressive HPC or real-time workloads. It can also be used to offload RCU callback invocation to energy-efficient CPUs in battery-powered - asymmetric multiprocessors. + asymmetric multiprocessors. The price of this reduced jitter + is that the overhead of call_rcu() increases and that some + workloads will incur significant increases in context-switch + rates. This option offloads callback invocation from the set of CPUs specified at boot time by the rcu_nocbs parameter. For each such CPU, a kthread ("rcuox/N") will be created to invoke callbacks, where the "N" is the CPU being offloaded, and where - the "p" for RCU-preempt (PREEMPTION kernels) and "s" for RCU-sched - (!PREEMPTION kernels). Nothing prevents this kthread from running - on the specified CPUs, but (1) the kthreads may be preempted - between each callback, and (2) affinity or cgroups can be used - to force the kthreads to run on whatever set of CPUs is desired. - - Say Y here if you want to help to debug reduced OS jitter. + the "x" is "p" for RCU-preempt (PREEMPTION kernels) and "s" for + RCU-sched (!PREEMPTION kernels). Nothing prevents this kthread + from running on the specified CPUs, but (1) the kthreads may be + preempted between each callback, and (2) affinity or cgroups can + be used to force the kthreads to run on whatever set of CPUs is + desired. + + Say Y here if you need reduced OS jitter, despite added overhead. Say N here if you are unsure. 
config TASKS_TRACE_RCU_READ_MB -- cgit v1.2.3 From 9f866dac94292f93d3b6bf8dbe860a44b954e555 Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Tue, 29 Sep 2020 15:29:27 -0400 Subject: rcu/tree: Add a warning if CPU being onlined did not report QS already Currently, rcu_cpu_starting() checks to see if the RCU core expects a quiescent state from the incoming CPU. However, the current interaction between RCU quiescent-state reporting and CPU-hotplug operations should mean that the incoming CPU never needs to report a quiescent state. First, the outgoing CPU reports a quiescent state if needed. Second, the race where the CPU is leaving just as RCU is initializing a new grace period is handled by an explicit check for this condition. Third, the CPU's leaf rcu_node structure's ->lock serializes these checks. This means that if rcu_cpu_starting() ever feels the need to report a quiescent state, then there is a bug somewhere in the CPU hotplug code or the RCU grace-period handling code. This commit therefore adds a WARN_ON_ONCE() to bring that bug to everyone's attention. Cc: Neeraj Upadhyay Suggested-by: Paul E. McKenney Signed-off-by: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 39e14cf6a9c0..e4d6d0b1b853 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -4075,7 +4075,9 @@ void rcu_cpu_starting(unsigned int cpu) rcu_gpnum_ovf(rnp, rdp); /* Offline-induced counter wrap? */ rdp->rcu_onl_gp_seq = READ_ONCE(rcu_state.gp_seq); rdp->rcu_onl_gp_flags = READ_ONCE(rcu_state.gp_flags); - if (rnp->qsmask & mask) { /* RCU waiting on incoming CPU? */ + + /* An incoming CPU should never be blocking a grace period. */ + if (WARN_ON_ONCE(rnp->qsmask & mask)) { /* RCU waiting on incoming CPU? */ rcu_disable_urgency_upon_qs(rdp); /* Report QS -after- changing ->qsmaskinitnext! */ rcu_report_qs_rnp(mask, rnp, rnp->gp_seq, flags); -- cgit v1.2.3 From 7c47ee5aa00817d8b10f415b4a92d5fb3ac35273 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Sat, 3 Oct 2020 17:18:08 -0700 Subject: rcu/tree: Make struct kernel_param_ops definitions const These should be const, so make it so. Signed-off-by: Joe Perches Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index e4d6d0b1b853..5f458e4efc95 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -546,12 +546,12 @@ static int param_set_next_fqs_jiffies(const char *val, const struct kernel_param return ret; } -static struct kernel_param_ops first_fqs_jiffies_ops = { +static const struct kernel_param_ops first_fqs_jiffies_ops = { .set = param_set_first_fqs_jiffies, .get = param_get_ulong, }; -static struct kernel_param_ops next_fqs_jiffies_ops = { +static const struct kernel_param_ops next_fqs_jiffies_ops = { .set = param_set_next_fqs_jiffies, .get = param_get_ulong, }; -- cgit v1.2.3 From d2098b4440981705e844c50254540ba7b5f82795 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 29 Sep 2020 13:33:40 +0200 Subject: rcu,ftrace: Fix ftrace recursion Kim reported that perf-ftrace made his box unhappy. It turns out that commit: ff5c4f5cad33 ("rcu/tree: Mark the idle relevant functions noinstr") removed one too many notrace qualifiers, probably due to there not being a helpful comment. 
This commit therefore reinstates the notrace and adds a comment to avoid losing it again. [ paulmck: Apply Steven Rostedt's feedback on the comment. ] Fixes: ff5c4f5cad33 ("rcu/tree: Mark the idle relevant functions noinstr") Reported-by: Kim Phillips Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 5f458e4efc95..d6a015e68649 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1093,8 +1093,11 @@ static void rcu_disable_urgency_upon_qs(struct rcu_data *rdp) * CPU can safely enter RCU read-side critical sections. In other words, * if the current CPU is not in its idle loop or is in an interrupt or * NMI handler, return true. + * + * Make notrace because it can be called by the internal functions of + * ftrace, and making this notrace removes unnecessary recursion calls. */ -bool rcu_is_watching(void) +notrace bool rcu_is_watching(void) { bool ret; -- cgit v1.2.3 From bd56e0a4a291bc9db2cbaddef20ec61a1aad4208 Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Wed, 7 Oct 2020 13:50:36 -0700 Subject: rcu/tree: nocb: Avoid raising softirq for offloaded ready-to-execute CBs Testing showed that rcu_pending() can return 1 when offloaded callbacks are ready to execute. This invokes RCU core processing, for example, by raising RCU_SOFTIRQ, eventually resulting in a call to rcu_core(). However, rcu_core() explicitly avoids in any way manipulating offloaded callbacks, which are instead handled by the rcuog and rcuoc kthreads, which work independently of rcu_core(). One exception to this independence is that rcu_core() invokes do_nocb_deferred_wakeup(), however, rcu_pending() also checks rcu_nocb_need_deferred_wakeup() in order to correctly handle this case, invoking rcu_core() when needed. This commit therefore avoids needlessly invoking RCU core processing by checking rcu_segcblist_ready_cbs() only on non-offloaded CPUs. This reduces overhead, for example, by reducing softirq activity. This change passed 30 minute tests of TREE01 through TREE09 each. On TREE08, there is at most 150us from the time that rcu_pending() chose not to invoke RCU core processing to the time when the ready callbacks were invoked by the rcuoc kthread. This provides further evidence that there is no need to invoke rcu_core() for offloaded callbacks that are ready to invoke. Cc: Neeraj Upadhyay Reviewed-by: Frederic Weisbecker Reviewed-by: Neeraj Upadhyay Signed-off-by: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index d6a015e68649..50d90ee6dfe1 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3718,7 +3718,8 @@ static int rcu_pending(int user) return 1; /* Does this CPU have callbacks ready to invoke? */ - if (rcu_segcblist_ready_cbs(&rdp->cblist)) + if (!rcu_segcblist_is_offloaded(&rdp->cblist) && + rcu_segcblist_ready_cbs(&rdp->cblist)) return 1; /* Has RCU gone idle with this CPU needing another grace period? */ -- cgit v1.2.3 From 4d60b475f858ebdb06c1339f01a890f287b5e587 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 13 Oct 2020 12:39:23 -0700 Subject: rcu: Prevent lockdep-RCU splats on lock acquisition/release The rcu_cpu_starting() and rcu_report_dead() functions transition the current CPU between online and offline state from an RCU perspective. 
Unfortunately, this means that the rcu_cpu_starting() function's lock acquisition and the rcu_report_dead() function's lock releases happen while the CPU is offline from an RCU perspective, which can result in lockdep-RCU splats about using RCU from an offline CPU. And this situation can also result in too-short grace periods, especially in guest OSes that are subject to vCPU preemption. This commit therefore uses sequence-count-like synchronization to forgive use of RCU while RCU thinks a CPU is offline across the full extent of the rcu_cpu_starting() and rcu_report_dead() function's lock acquisitions and releases. One approach would have been to use the actual sequence-count primitives provided by the Linux kernel. Unfortunately, the resulting code looks completely broken and wrong, and is likely to result in patches that break RCU in an attempt to address this appearance of broken wrongness. Plus there is no net savings in lines of code, given the additional explicit memory barriers required. Therefore, this sequence count is instead implemented by a new ->ofl_seq field in the rcu_node structure. If this counter's value is an odd number, RCU forgives RCU read-side critical sections on other CPUs covered by the same rcu_node structure, even if those CPUs are offline from an RCU perspective. In addition, if a given leaf rcu_node structure's ->ofl_seq counter value is an odd number, rcu_gp_init() delays starting the grace period until that counter value changes. [ paulmck: Apply Peter Zijlstra feedback. ] Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 21 ++++++++++++++++++++- kernel/rcu/tree.h | 1 + 2 files changed, 21 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 50d90ee6dfe1..34385341f66a 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1152,7 +1152,7 @@ bool rcu_lockdep_current_cpu_online(void) preempt_disable_notrace(); rdp = this_cpu_ptr(&rcu_data); rnp = rdp->mynode; - if (rdp->grpmask & rcu_rnp_online_cpus(rnp)) + if (rdp->grpmask & rcu_rnp_online_cpus(rnp) || READ_ONCE(rnp->ofl_seq) & 0x1) ret = true; preempt_enable_notrace(); return ret; @@ -1717,6 +1717,7 @@ static void rcu_strict_gp_boundary(void *unused) */ static bool rcu_gp_init(void) { + unsigned long firstseq; unsigned long flags; unsigned long oldmask; unsigned long mask; @@ -1760,6 +1761,12 @@ static bool rcu_gp_init(void) */ rcu_state.gp_state = RCU_GP_ONOFF; rcu_for_each_leaf_node(rnp) { + smp_mb(); // Pair with barriers used when updating ->ofl_seq to odd values. + firstseq = READ_ONCE(rnp->ofl_seq); + if (firstseq & 0x1) + while (firstseq == READ_ONCE(rnp->ofl_seq)) + schedule_timeout_idle(1); // Can't wake unless RCU is watching. + smp_mb(); // Pair with barriers used when updating ->ofl_seq to even values. raw_spin_lock(&rcu_state.ofl_lock); raw_spin_lock_irq_rcu_node(rnp); if (rnp->qsmaskinit == rnp->qsmaskinitnext && @@ -4069,6 +4076,9 @@ void rcu_cpu_starting(unsigned int cpu) rnp = rdp->mynode; mask = rdp->grpmask; + WRITE_ONCE(rnp->ofl_seq, rnp->ofl_seq + 1); + WARN_ON_ONCE(!(rnp->ofl_seq & 0x1)); + smp_mb(); // Pair with rcu_gp_cleanup()'s ->ofl_seq barrier(). raw_spin_lock_irqsave_rcu_node(rnp, flags); WRITE_ONCE(rnp->qsmaskinitnext, rnp->qsmaskinitnext | mask); newcpu = !(rnp->expmaskinitnext & mask); @@ -4088,6 +4098,9 @@ void rcu_cpu_starting(unsigned int cpu) } else { raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } + smp_mb(); // Pair with rcu_gp_cleanup()'s ->ofl_seq barrier(). 
+ WRITE_ONCE(rnp->ofl_seq, rnp->ofl_seq + 1); + WARN_ON_ONCE(rnp->ofl_seq & 0x1); smp_mb(); /* Ensure RCU read-side usage follows above initialization. */ } @@ -4115,6 +4128,9 @@ void rcu_report_dead(unsigned int cpu) /* Remove outgoing CPU from mask in the leaf rcu_node structure. */ mask = rdp->grpmask; + WRITE_ONCE(rnp->ofl_seq, rnp->ofl_seq + 1); + WARN_ON_ONCE(!(rnp->ofl_seq & 0x1)); + smp_mb(); // Pair with rcu_gp_cleanup()'s ->ofl_seq barrier(). raw_spin_lock(&rcu_state.ofl_lock); raw_spin_lock_irqsave_rcu_node(rnp, flags); /* Enforce GP memory-order guarantee. */ rdp->rcu_ofl_gp_seq = READ_ONCE(rcu_state.gp_seq); @@ -4127,6 +4143,9 @@ void rcu_report_dead(unsigned int cpu) WRITE_ONCE(rnp->qsmaskinitnext, rnp->qsmaskinitnext & ~mask); raw_spin_unlock_irqrestore_rcu_node(rnp, flags); raw_spin_unlock(&rcu_state.ofl_lock); + smp_mb(); // Pair with rcu_gp_cleanup()'s ->ofl_seq barrier(). + WRITE_ONCE(rnp->ofl_seq, rnp->ofl_seq + 1); + WARN_ON_ONCE(rnp->ofl_seq & 0x1); rdp->cpu_started = false; } diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 805c9eb6f7ae..7708ed161f4a 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -56,6 +56,7 @@ struct rcu_node { /* Initialized from ->qsmaskinitnext at the */ /* beginning of each grace period. */ unsigned long qsmaskinitnext; + unsigned long ofl_seq; /* CPU-hotplug operation sequence count. */ /* Online CPUs for next grace period. */ unsigned long expmask; /* CPUs or groups that need to check in */ /* to allow the current expedited GP */ -- cgit v1.2.3 From 354c3f0e22dcb17c10d0b79f6e1c5ba286eec0b0 Mon Sep 17 00:00:00 2001 From: Zhouyi Zhou Date: Thu, 15 Oct 2020 03:53:03 +0000 Subject: rcu: Fix a typo in rcu_blocking_is_gp() header comment This commit fixes a typo in the rcu_blocking_is_gp() function's header comment. Signed-off-by: Zhouyi Zhou Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 34385341f66a..0f278d6486c2 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3572,7 +3572,7 @@ void __init kfree_rcu_scheduler_running(void) * During early boot, any blocking grace-period wait automatically * implies a grace period. Later on, this is never the case for PREEMPTION. * - * Howevr, because a context switch is a grace period for !PREEMPTION, any + * However, because a context switch is a grace period for !PREEMPTION, any * blocking grace-period wait automatically implies a grace period if * there is only one CPU online at any point time during execution of * either synchronize_rcu() or synchronize_rcu_expedited(). It is OK to -- cgit v1.2.3 From bfb3aa735f82c8d98b32a669934ee7d6b346264d Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 30 Oct 2020 13:11:24 -0700 Subject: rcu: Do not report strict GPs for outgoing CPUs An outgoing CPU is marked offline in a stop-machine handler and most of that CPU's services stop at that point, including IRQ work queues. However, that CPU must take another pass through the scheduler and through a number of CPU-hotplug notifiers, many of which contain RCU readers. In the past, these readers were not a problem because the outgoing CPU has interrupts disabled, so that rcu_read_unlock_special() would not be invoked, and thus RCU would never attempt to queue IRQ work on the outgoing CPU. 
This changed with the advent of the CONFIG_RCU_STRICT_GRACE_PERIOD Kconfig option, in which rcu_read_unlock_special() is invoked upon exit from almost all RCU read-side critical sections. Worse yet, because interrupts are disabled, rcu_read_unlock_special() cannot immediately report a quiescent state and will therefore attempt to defer this reporting, for example, by queueing IRQ work, which fails with a splat because the CPU is already marked as being offline. But it turns out that there is no need to report this quiescent state because rcu_report_dead() will do this job shortly after the outgoing CPU makes its final dive into the idle loop. This commit therefore makes rcu_read_unlock_special() refrain from queuing IRQ work onto outgoing CPUs. Fixes: 44bad5b3cca2 ("rcu: Do full report for .need_qs for strict GPs") Signed-off-by: Paul E. McKenney Cc: Jann Horn --- kernel/rcu/tree_plugin.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index fd8a52e9a887..7e291ce0a1d6 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -628,7 +628,7 @@ static void rcu_read_unlock_special(struct task_struct *t) set_tsk_need_resched(current); set_preempt_need_resched(); if (IS_ENABLED(CONFIG_IRQ_WORK) && irqs_were_disabled && - !rdp->defer_qs_iw_pending && exp) { + !rdp->defer_qs_iw_pending && exp && cpu_online(rdp->cpu)) { // Get scheduler to re-evaluate and call hooks. // If !IRQ_WORK, FQS scan will eventually IPI. init_irq_work(&rdp->defer_qs_iw, -- cgit v1.2.3 From 56292e8609e39537297a7468dda4d87b9bd81d6a Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Thu, 29 Oct 2020 17:50:04 +0100 Subject: rcu/tree: Defer kvfree_rcu() allocation to a clean context The current memory-allocation interface causes the following difficulties for kvfree_rcu(): a) If built with CONFIG_PROVE_RAW_LOCK_NESTING, lockdep will complain about a violation of the nesting rules, as in "BUG: Invalid wait context". This Kconfig option checks for proper raw_spinlock vs. spinlock nesting, in particular, it is not legal to acquire a spinlock_t while holding a raw_spinlock_t. This is a problem because kfree_rcu() uses raw_spinlock_t whereas the "page allocator" internally deals with spinlock_t to access its zones. The code can also be broken from a higher-level point of view: raw_spin_lock(&some_lock); kfree_rcu(some_pointer, some_field_offset); b) If built with CONFIG_PREEMPT_RT, spinlock_t is converted into a sleeping lock. This means that invoking the page allocator from atomic contexts results in "BUG: scheduling while atomic". c) Please note that call_rcu() is already invoked from raw atomic context, so it is only reasonable to expect that kfree_rcu() and kvfree_rcu() will also be called from raw atomic context. This commit therefore defers page allocation to a clean context using the combination of an hrtimer and a workqueue. The hrtimer stage is required in order to avoid deadlocks with the scheduler. This deferred allocation is required only when kvfree_rcu()'s per-CPU page cache is empty. Link: https://lore.kernel.org/lkml/20200630164543.4mdcf6zb4zfclhln@linutronix.de/ Fixes: 3042f83f19be ("rcu: Support reclaim for head-less object") Reported-by: Sebastian Andrzej Siewior Signed-off-by: Uladzislau Rezki (Sony) Signed-off-by: Paul E.
McKenney --- kernel/rcu/tree.c | 109 +++++++++++++++++++++++++++++++++--------------------- 1 file changed, 66 insertions(+), 43 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 0f278d6486c2..01918d8cffb3 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -177,7 +177,7 @@ module_param(rcu_unlock_delay, int, 0444); * per-CPU. Object size is equal to one page. This value * can be changed at boot time. */ -static int rcu_min_cached_objs = 2; +static int rcu_min_cached_objs = 5; module_param(rcu_min_cached_objs, int, 0444); /* Retrieve RCU kthreads priority for rcutorture */ @@ -3089,6 +3089,9 @@ struct kfree_rcu_cpu_work { * In order to save some per-cpu space the list is singular. * Even though it is lockless an access has to be protected by the * per-cpu lock. + * @page_cache_work: A work to refill the cache when it is empty + * @work_in_progress: Indicates that page_cache_work is running + * @hrtimer: A hrtimer for scheduling a page_cache_work * @nr_bkv_objs: number of allocated objects at @bkvcache. * * This is a per-CPU structure. The reason that it is not included in @@ -3105,6 +3108,11 @@ struct kfree_rcu_cpu { bool monitor_todo; bool initialized; int count; + + struct work_struct page_cache_work; + atomic_t work_in_progress; + struct hrtimer hrtimer; + struct llist_head bkvcache; int nr_bkv_objs; }; @@ -3222,10 +3230,10 @@ static void kfree_rcu_work(struct work_struct *work) } rcu_lock_release(&rcu_callback_map); - krcp = krc_this_cpu_lock(&flags); + raw_spin_lock_irqsave(&krcp->lock, flags); if (put_cached_bnode(krcp, bkvhead[i])) bkvhead[i] = NULL; - krc_this_cpu_unlock(krcp, flags); + raw_spin_unlock_irqrestore(&krcp->lock, flags); if (bkvhead[i]) free_page((unsigned long) bkvhead[i]); @@ -3352,6 +3360,57 @@ static void kfree_rcu_monitor(struct work_struct *work) raw_spin_unlock_irqrestore(&krcp->lock, flags); } +static enum hrtimer_restart +schedule_page_work_fn(struct hrtimer *t) +{ + struct kfree_rcu_cpu *krcp = + container_of(t, struct kfree_rcu_cpu, hrtimer); + + queue_work(system_highpri_wq, &krcp->page_cache_work); + return HRTIMER_NORESTART; +} + +static void fill_page_cache_func(struct work_struct *work) +{ + struct kvfree_rcu_bulk_data *bnode; + struct kfree_rcu_cpu *krcp = + container_of(work, struct kfree_rcu_cpu, + page_cache_work); + unsigned long flags; + bool pushed; + int i; + + for (i = 0; i < rcu_min_cached_objs; i++) { + bnode = (struct kvfree_rcu_bulk_data *) + __get_free_page(GFP_KERNEL | __GFP_NOWARN); + + if (bnode) { + raw_spin_lock_irqsave(&krcp->lock, flags); + pushed = put_cached_bnode(krcp, bnode); + raw_spin_unlock_irqrestore(&krcp->lock, flags); + + if (!pushed) { + free_page((unsigned long) bnode); + break; + } + } + } + + atomic_set(&krcp->work_in_progress, 0); +} + +static void +run_page_cache_worker(struct kfree_rcu_cpu *krcp) +{ + if (rcu_scheduler_active == RCU_SCHEDULER_RUNNING && + !atomic_xchg(&krcp->work_in_progress, 1)) { + hrtimer_init(&krcp->hrtimer, CLOCK_MONOTONIC, + HRTIMER_MODE_REL); + krcp->hrtimer.function = schedule_page_work_fn; + hrtimer_start(&krcp->hrtimer, 0, HRTIMER_MODE_REL); + } +} + static inline bool kvfree_call_rcu_add_ptr_to_bulk(struct kfree_rcu_cpu *krcp, void *ptr) { @@ -3368,32 +3427,8 @@ kvfree_call_rcu_add_ptr_to_bulk(struct kfree_rcu_cpu *krcp, void *ptr) if (!krcp->bkvhead[idx] || krcp->bkvhead[idx]->nr_records == KVFREE_BULK_MAX_ENTR) { bnode = get_cached_bnode(krcp); - if (!bnode) { - /* - * To keep this path working on raw non-preemptible - * sections, prevent 
the optional entry into the - * allocator as it uses sleeping locks. In fact, even - * if the caller of kfree_rcu() is preemptible, this - * path still is not, as krcp->lock is a raw spinlock. - * With additional page pre-allocation in the works, - * hitting this return is going to be much less likely. - */ - if (IS_ENABLED(CONFIG_PREEMPT_RT)) - return false; - - /* - * NOTE: For one argument of kvfree_rcu() we can - * drop the lock and get the page in sleepable - * context. That would allow to maintain an array - * for the CONFIG_PREEMPT_RT as well if no cached - * pages are available. - */ - bnode = (struct kvfree_rcu_bulk_data *) - __get_free_page(GFP_NOWAIT | __GFP_NOWARN); - } - /* Switch to emergency path. */ - if (unlikely(!bnode)) + if (!bnode) return false; /* Initialize the new block. */ @@ -3457,12 +3492,10 @@ void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func) goto unlock_return; } - /* - * Under high memory pressure GFP_NOWAIT can fail, - * in that case the emergency path is maintained. - */ success = kvfree_call_rcu_add_ptr_to_bulk(krcp, ptr); if (!success) { + run_page_cache_worker(krcp); + if (head == NULL) // Inline if kvfree_rcu(one_arg) call. goto unlock_return; @@ -4482,24 +4515,14 @@ static void __init kfree_rcu_batch_init(void) for_each_possible_cpu(cpu) { struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu); - struct kvfree_rcu_bulk_data *bnode; for (i = 0; i < KFREE_N_BATCHES; i++) { INIT_RCU_WORK(&krcp->krw_arr[i].rcu_work, kfree_rcu_work); krcp->krw_arr[i].krcp = krcp; } - for (i = 0; i < rcu_min_cached_objs; i++) { - bnode = (struct kvfree_rcu_bulk_data *) - __get_free_page(GFP_NOWAIT | __GFP_NOWARN); - - if (bnode) - put_cached_bnode(krcp, bnode); - else - pr_err("Failed to preallocate for %d CPU!\n", cpu); - } - INIT_DELAYED_WORK(&krcp->monitor_work, kfree_rcu_monitor); + INIT_WORK(&krcp->page_cache_work, fill_page_cache_func); krcp->initialized = true; } if (register_shrinker(&kfree_rcu_shrinker)) -- cgit v1.2.3 From 50edb988534c621a56ca103c0c16ac59e7399f01 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 10 Sep 2020 11:54:42 -0700 Subject: srcu: Take early exit on memory-allocation failure It turns out that init_srcu_struct() can be invoked from usermode tasks, and that fatal signals received by these tasks can cause memory-allocation failures. These failures are not handled well by init_srcu_struct(), so much so that NULL pointer dereferences can result. This commit therefore causes init_srcu_struct() to take an early exit upon detection of memory-allocation failure. Link: https://lore.kernel.org/lkml/20200908144306.33355-1-aik@ozlabs.ru/ Reported-by: Alexey Kardashevskiy Tested-by: Alexey Kardashevskiy Signed-off-by: Paul E. McKenney --- kernel/rcu/srcutree.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index c13348ee80a5..6f7880acfdd5 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -177,11 +177,13 @@ static int init_srcu_struct_fields(struct srcu_struct *ssp, bool is_static) INIT_DELAYED_WORK(&ssp->work, process_srcu); if (!is_static) ssp->sda = alloc_percpu(struct srcu_data); + if (!ssp->sda) + return -ENOMEM; init_srcu_struct_nodes(ssp, is_static); ssp->srcu_gp_seq_needed_exp = 0; ssp->srcu_last_gp_end = ktime_get_mono_fast_ns(); smp_store_release(&ssp->srcu_gp_seq_needed, 0); /* Init done. */ - return ssp->sda ? 
0 : -ENOMEM; + return 0; } #ifdef CONFIG_DEBUG_LOCK_ALLOC -- cgit v1.2.3 From a24d22b225ce158651378869a6b88105c4bdb887 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Thu, 12 Nov 2020 21:20:21 -0800 Subject: crypto: sha - split sha.h into sha1.h and sha2.h Currently include/crypto/sha.h contains declarations for both SHA-1 and SHA-2, and include/crypto/sha3.h contains declarations for SHA-3. This organization is inconsistent, but more importantly SHA-1 is no longer considered to be cryptographically secure. So to the extent possible, SHA-1 shouldn't be grouped together with any of the other SHA versions, and usage of it should be phased out. Therefore, split include/crypto/sha.h into two headers, include/crypto/sha1.h and include/crypto/sha2.h, and make everyone explicitly specify whether they want the declarations for SHA-1, SHA-2, or both. This avoids making the SHA-1 declarations visible to files that don't want anything to do with SHA-1. It also prepares for potentially moving sha1.h into a new insecure/ or dangerous/ directory. Signed-off-by: Eric Biggers Acked-by: Ard Biesheuvel Acked-by: Jason A. Donenfeld Signed-off-by: Herbert Xu --- arch/arm/crypto/sha1-ce-glue.c | 2 +- arch/arm/crypto/sha1.h | 2 +- arch/arm/crypto/sha1_glue.c | 2 +- arch/arm/crypto/sha1_neon_glue.c | 2 +- arch/arm/crypto/sha2-ce-glue.c | 2 +- arch/arm/crypto/sha256_glue.c | 2 +- arch/arm/crypto/sha256_neon_glue.c | 2 +- arch/arm/crypto/sha512-glue.c | 2 +- arch/arm/crypto/sha512-neon-glue.c | 2 +- arch/arm64/crypto/aes-glue.c | 2 +- arch/arm64/crypto/sha1-ce-glue.c | 2 +- arch/arm64/crypto/sha2-ce-glue.c | 2 +- arch/arm64/crypto/sha256-glue.c | 2 +- arch/arm64/crypto/sha512-ce-glue.c | 2 +- arch/arm64/crypto/sha512-glue.c | 2 +- arch/mips/cavium-octeon/crypto/octeon-sha1.c | 2 +- arch/mips/cavium-octeon/crypto/octeon-sha256.c | 2 +- arch/mips/cavium-octeon/crypto/octeon-sha512.c | 2 +- arch/powerpc/crypto/sha1-spe-glue.c | 2 +- arch/powerpc/crypto/sha1.c | 2 +- arch/powerpc/crypto/sha256-spe-glue.c | 2 +- arch/s390/crypto/sha.h | 3 +- arch/s390/crypto/sha1_s390.c | 2 +- arch/s390/crypto/sha256_s390.c | 2 +- arch/s390/crypto/sha3_256_s390.c | 1 - arch/s390/crypto/sha3_512_s390.c | 1 - arch/s390/crypto/sha512_s390.c | 2 +- arch/s390/purgatory/purgatory.c | 2 +- arch/sparc/crypto/sha1_glue.c | 2 +- arch/sparc/crypto/sha256_glue.c | 2 +- arch/sparc/crypto/sha512_glue.c | 2 +- arch/x86/crypto/sha1_ssse3_glue.c | 2 +- arch/x86/crypto/sha256_ssse3_glue.c | 2 +- arch/x86/crypto/sha512_ssse3_glue.c | 2 +- arch/x86/purgatory/purgatory.c | 2 +- crypto/asymmetric_keys/asym_tpm.c | 2 +- crypto/sha1_generic.c | 2 +- crypto/sha256_generic.c | 2 +- crypto/sha512_generic.c | 2 +- drivers/char/random.c | 2 +- drivers/crypto/allwinner/sun4i-ss/sun4i-ss.h | 2 +- drivers/crypto/allwinner/sun8i-ce/sun8i-ce-hash.c | 3 +- drivers/crypto/allwinner/sun8i-ce/sun8i-ce.h | 3 +- drivers/crypto/allwinner/sun8i-ss/sun8i-ss-hash.c | 3 +- drivers/crypto/allwinner/sun8i-ss/sun8i-ss.h | 3 +- drivers/crypto/amcc/crypto4xx_alg.c | 2 +- drivers/crypto/amcc/crypto4xx_core.c | 2 +- drivers/crypto/atmel-authenc.h | 3 +- drivers/crypto/atmel-sha.c | 3 +- drivers/crypto/axis/artpec6_crypto.c | 3 +- drivers/crypto/bcm/cipher.c | 3 +- drivers/crypto/bcm/cipher.h | 3 +- drivers/crypto/bcm/spu.h | 3 +- drivers/crypto/caam/compat.h | 3 +- drivers/crypto/cavium/nitrox/nitrox_aead.c | 1 - drivers/crypto/ccp/ccp-crypto-sha.c | 3 +- drivers/crypto/ccp/ccp-crypto.h | 3 +- drivers/crypto/ccree/cc_driver.h | 3 +- drivers/crypto/chelsio/chcr_algo.c | 3 +- drivers/crypto/hisilicon/sec2/sec_crypto.c | 3 +- drivers/crypto/img-hash.c | 3 +-
drivers/crypto/inside-secure/safexcel.h | 3 +- drivers/crypto/inside-secure/safexcel_cipher.c | 3 +- drivers/crypto/inside-secure/safexcel_hash.c | 3 +- drivers/crypto/ixp4xx_crypto.c | 2 +- drivers/crypto/marvell/cesa/hash.c | 3 +- drivers/crypto/marvell/octeontx/otx_cptvf_algs.c | 3 +- drivers/crypto/mediatek/mtk-sha.c | 3 +- drivers/crypto/mxs-dcp.c | 3 +- drivers/crypto/n2_core.c | 3 +- drivers/crypto/nx/nx-sha256.c | 2 +- drivers/crypto/nx/nx-sha512.c | 2 +- drivers/crypto/nx/nx.c | 2 +- drivers/crypto/omap-sham.c | 3 +- drivers/crypto/padlock-sha.c | 3 +- drivers/crypto/picoxcell_crypto.c | 3 +- drivers/crypto/qat/qat_common/qat_algs.c | 3 +- drivers/crypto/qce/common.c | 3 +- drivers/crypto/qce/core.c | 1 - drivers/crypto/qce/sha.h | 3 +- drivers/crypto/rockchip/rk3288_crypto.h | 3 +- drivers/crypto/s5p-sss.c | 3 +- drivers/crypto/sa2ul.c | 3 +- drivers/crypto/sa2ul.h | 3 +- drivers/crypto/sahara.c | 3 +- drivers/crypto/stm32/stm32-hash.c | 3 +- drivers/crypto/talitos.c | 3 +- drivers/crypto/ux500/hash/hash_core.c | 3 +- drivers/firmware/efi/embedded-firmware.c | 2 +- .../chelsio/inline_crypto/ch_ipsec/chcr_ipsec.c | 3 +- .../ethernet/chelsio/inline_crypto/chtls/chtls.h | 3 +- drivers/nfc/s3fwrn5/firmware.c | 2 +- drivers/tee/tee_core.c | 2 +- fs/crypto/fname.c | 2 +- fs/crypto/hkdf.c | 2 +- fs/ubifs/auth.c | 1 - fs/verity/fsverity_private.h | 2 +- include/crypto/hash_info.h | 3 +- include/crypto/sha.h | 167 --------------------- include/crypto/sha1.h | 46 ++++++ include/crypto/sha1_base.h | 2 +- include/crypto/sha2.h | 134 +++++++++++++++++ include/crypto/sha256_base.h | 2 +- include/crypto/sha512_base.h | 2 +- include/linux/ccp.h | 3 +- include/linux/filter.h | 2 +- include/linux/purgatory.h | 2 +- kernel/crash_core.c | 2 +- kernel/kexec_core.c | 1 - kernel/kexec_file.c | 2 +- lib/crypto/sha256.c | 2 +- lib/digsig.c | 2 +- lib/sha1.c | 2 +- net/ipv6/seg6_hmac.c | 1 - net/mptcp/crypto.c | 2 +- net/mptcp/options.c | 2 +- net/mptcp/subflow.c | 2 +- security/integrity/integrity.h | 2 +- security/keys/encrypted-keys/encrypted.c | 2 +- security/keys/trusted-keys/trusted_tpm1.c | 2 +- sound/soc/codecs/cros_ec_codec.c | 2 +- 121 files changed, 335 insertions(+), 285 deletions(-) delete mode 100644 include/crypto/sha.h create mode 100644 include/crypto/sha1.h create mode 100644 include/crypto/sha2.h (limited to 'kernel') diff --git a/arch/arm/crypto/sha1-ce-glue.c b/arch/arm/crypto/sha1-ce-glue.c index e79b1fb4b4dc..de9100c67b37 100644 --- a/arch/arm/crypto/sha1-ce-glue.c +++ b/arch/arm/crypto/sha1-ce-glue.c @@ -7,7 +7,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/arm/crypto/sha1.h b/arch/arm/crypto/sha1.h index 758db3e9ff0a..b1b7e21da2c3 100644 --- a/arch/arm/crypto/sha1.h +++ b/arch/arm/crypto/sha1.h @@ -3,7 +3,7 @@ #define ASM_ARM_CRYPTO_SHA1_H #include -#include +#include extern int sha1_update_arm(struct shash_desc *desc, const u8 *data, unsigned int len); diff --git a/arch/arm/crypto/sha1_glue.c b/arch/arm/crypto/sha1_glue.c index 4e954b3f7ecd..6c2b849e459d 100644 --- a/arch/arm/crypto/sha1_glue.c +++ b/arch/arm/crypto/sha1_glue.c @@ -15,7 +15,7 @@ #include #include #include -#include +#include #include #include diff --git a/arch/arm/crypto/sha1_neon_glue.c b/arch/arm/crypto/sha1_neon_glue.c index 0071e5e4411a..cfe36ae0f3f5 100644 --- a/arch/arm/crypto/sha1_neon_glue.c +++ b/arch/arm/crypto/sha1_neon_glue.c @@ -19,7 +19,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/arm/crypto/sha2-ce-glue.c 
b/arch/arm/crypto/sha2-ce-glue.c index 87f0b62386c6..c62ce89dd3e0 100644 --- a/arch/arm/crypto/sha2-ce-glue.c +++ b/arch/arm/crypto/sha2-ce-glue.c @@ -7,7 +7,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/arm/crypto/sha256_glue.c b/arch/arm/crypto/sha256_glue.c index b8a4f79020cf..433ee4ddce6c 100644 --- a/arch/arm/crypto/sha256_glue.c +++ b/arch/arm/crypto/sha256_glue.c @@ -17,7 +17,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/arm/crypto/sha256_neon_glue.c b/arch/arm/crypto/sha256_neon_glue.c index 79820b9e2541..701706262ef3 100644 --- a/arch/arm/crypto/sha256_neon_glue.c +++ b/arch/arm/crypto/sha256_neon_glue.c @@ -13,7 +13,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/arm/crypto/sha512-glue.c b/arch/arm/crypto/sha512-glue.c index 8775aa42bbbe..0635a65aa488 100644 --- a/arch/arm/crypto/sha512-glue.c +++ b/arch/arm/crypto/sha512-glue.c @@ -6,7 +6,7 @@ */ #include -#include +#include #include #include #include diff --git a/arch/arm/crypto/sha512-neon-glue.c b/arch/arm/crypto/sha512-neon-glue.c index 96cb94403540..c879ad32db51 100644 --- a/arch/arm/crypto/sha512-neon-glue.c +++ b/arch/arm/crypto/sha512-neon-glue.c @@ -7,7 +7,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/arm64/crypto/aes-glue.c b/arch/arm64/crypto/aes-glue.c index 395bbf64b2ab..34b8a89197be 100644 --- a/arch/arm64/crypto/aes-glue.c +++ b/arch/arm64/crypto/aes-glue.c @@ -10,7 +10,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/arm64/crypto/sha1-ce-glue.c b/arch/arm64/crypto/sha1-ce-glue.c index c63b99211db3..c93121bcfdeb 100644 --- a/arch/arm64/crypto/sha1-ce-glue.c +++ b/arch/arm64/crypto/sha1-ce-glue.c @@ -10,7 +10,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/arm64/crypto/sha2-ce-glue.c b/arch/arm64/crypto/sha2-ce-glue.c index 5e956d7582a5..31ba3da5e61b 100644 --- a/arch/arm64/crypto/sha2-ce-glue.c +++ b/arch/arm64/crypto/sha2-ce-glue.c @@ -10,7 +10,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/arm64/crypto/sha256-glue.c b/arch/arm64/crypto/sha256-glue.c index 77bc6e72abae..9462f6088b3f 100644 --- a/arch/arm64/crypto/sha256-glue.c +++ b/arch/arm64/crypto/sha256-glue.c @@ -10,7 +10,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/arm64/crypto/sha512-ce-glue.c b/arch/arm64/crypto/sha512-ce-glue.c index dc890a719f54..faa83f6cf376 100644 --- a/arch/arm64/crypto/sha512-ce-glue.c +++ b/arch/arm64/crypto/sha512-ce-glue.c @@ -14,7 +14,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/arm64/crypto/sha512-glue.c b/arch/arm64/crypto/sha512-glue.c index 370ccb29602f..2acff1c7df5d 100644 --- a/arch/arm64/crypto/sha512-glue.c +++ b/arch/arm64/crypto/sha512-glue.c @@ -8,7 +8,7 @@ #include #include #include -#include +#include #include #include diff --git a/arch/mips/cavium-octeon/crypto/octeon-sha1.c b/arch/mips/cavium-octeon/crypto/octeon-sha1.c index 75e79b47abfe..30f1d75208a5 100644 --- a/arch/mips/cavium-octeon/crypto/octeon-sha1.c +++ b/arch/mips/cavium-octeon/crypto/octeon-sha1.c @@ -14,7 +14,7 @@ */ #include -#include +#include #include #include #include diff --git a/arch/mips/cavium-octeon/crypto/octeon-sha256.c b/arch/mips/cavium-octeon/crypto/octeon-sha256.c index a682ce76716a..36cb92895d72 100644 --- 
a/arch/mips/cavium-octeon/crypto/octeon-sha256.c +++ b/arch/mips/cavium-octeon/crypto/octeon-sha256.c @@ -15,7 +15,7 @@ */ #include -#include +#include #include #include #include diff --git a/arch/mips/cavium-octeon/crypto/octeon-sha512.c b/arch/mips/cavium-octeon/crypto/octeon-sha512.c index 50722a0cfb53..359f039820d8 100644 --- a/arch/mips/cavium-octeon/crypto/octeon-sha512.c +++ b/arch/mips/cavium-octeon/crypto/octeon-sha512.c @@ -14,7 +14,7 @@ */ #include -#include +#include #include #include #include diff --git a/arch/powerpc/crypto/sha1-spe-glue.c b/arch/powerpc/crypto/sha1-spe-glue.c index cb57be4ada61..b1e577cbf00c 100644 --- a/arch/powerpc/crypto/sha1-spe-glue.c +++ b/arch/powerpc/crypto/sha1-spe-glue.c @@ -12,7 +12,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/powerpc/crypto/sha1.c b/arch/powerpc/crypto/sha1.c index b40dc50a6908..7a55d790cdb1 100644 --- a/arch/powerpc/crypto/sha1.c +++ b/arch/powerpc/crypto/sha1.c @@ -17,7 +17,7 @@ #include #include #include -#include +#include #include void powerpc_sha_transform(u32 *state, const u8 *src); diff --git a/arch/powerpc/crypto/sha256-spe-glue.c b/arch/powerpc/crypto/sha256-spe-glue.c index ceb0b6c980b3..88530ae0791f 100644 --- a/arch/powerpc/crypto/sha256-spe-glue.c +++ b/arch/powerpc/crypto/sha256-spe-glue.c @@ -13,7 +13,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/s390/crypto/sha.h b/arch/s390/crypto/sha.h index ada2f98c27b7..65ea12fc87a1 100644 --- a/arch/s390/crypto/sha.h +++ b/arch/s390/crypto/sha.h @@ -11,7 +11,8 @@ #define _CRYPTO_ARCH_S390_SHA_H #include -#include +#include +#include #include /* must be big enough for the largest SHA variant */ diff --git a/arch/s390/crypto/sha1_s390.c b/arch/s390/crypto/sha1_s390.c index 698b1e6d3c14..a3fabf310a38 100644 --- a/arch/s390/crypto/sha1_s390.c +++ b/arch/s390/crypto/sha1_s390.c @@ -22,7 +22,7 @@ #include #include #include -#include +#include #include #include "sha.h" diff --git a/arch/s390/crypto/sha256_s390.c b/arch/s390/crypto/sha256_s390.c index b52c87e44939..24983f175676 100644 --- a/arch/s390/crypto/sha256_s390.c +++ b/arch/s390/crypto/sha256_s390.c @@ -12,7 +12,7 @@ #include #include #include -#include +#include #include #include "sha.h" diff --git a/arch/s390/crypto/sha3_256_s390.c b/arch/s390/crypto/sha3_256_s390.c index 460cbbbaa44a..30ac49b635bf 100644 --- a/arch/s390/crypto/sha3_256_s390.c +++ b/arch/s390/crypto/sha3_256_s390.c @@ -12,7 +12,6 @@ #include #include #include -#include #include #include diff --git a/arch/s390/crypto/sha3_512_s390.c b/arch/s390/crypto/sha3_512_s390.c index 72cf460a53e5..e70d50f7620f 100644 --- a/arch/s390/crypto/sha3_512_s390.c +++ b/arch/s390/crypto/sha3_512_s390.c @@ -11,7 +11,6 @@ #include #include #include -#include #include #include diff --git a/arch/s390/crypto/sha512_s390.c b/arch/s390/crypto/sha512_s390.c index ad29db085a18..29a6bd404c59 100644 --- a/arch/s390/crypto/sha512_s390.c +++ b/arch/s390/crypto/sha512_s390.c @@ -8,7 +8,7 @@ * Author(s): Jan Glauber (jang@de.ibm.com) */ #include -#include +#include #include #include #include diff --git a/arch/s390/purgatory/purgatory.c b/arch/s390/purgatory/purgatory.c index 0a423bcf6746..030efda05dbe 100644 --- a/arch/s390/purgatory/purgatory.c +++ b/arch/s390/purgatory/purgatory.c @@ -9,7 +9,7 @@ #include #include -#include +#include #include int verify_sha256_digest(void) diff --git a/arch/sparc/crypto/sha1_glue.c b/arch/sparc/crypto/sha1_glue.c index dc017782be52..86a654cce5ab 100644 
--- a/arch/sparc/crypto/sha1_glue.c +++ b/arch/sparc/crypto/sha1_glue.c @@ -16,7 +16,7 @@ #include #include #include -#include +#include #include #include diff --git a/arch/sparc/crypto/sha256_glue.c b/arch/sparc/crypto/sha256_glue.c index ca2547df9652..60ec524cf9ca 100644 --- a/arch/sparc/crypto/sha256_glue.c +++ b/arch/sparc/crypto/sha256_glue.c @@ -16,7 +16,7 @@ #include #include #include -#include +#include #include #include diff --git a/arch/sparc/crypto/sha512_glue.c b/arch/sparc/crypto/sha512_glue.c index 3b2ca732ff7a..273ce21918c1 100644 --- a/arch/sparc/crypto/sha512_glue.c +++ b/arch/sparc/crypto/sha512_glue.c @@ -15,7 +15,7 @@ #include #include #include -#include +#include #include #include diff --git a/arch/x86/crypto/sha1_ssse3_glue.c b/arch/x86/crypto/sha1_ssse3_glue.c index 18200135603f..44340a1139e0 100644 --- a/arch/x86/crypto/sha1_ssse3_glue.c +++ b/arch/x86/crypto/sha1_ssse3_glue.c @@ -22,7 +22,7 @@ #include #include #include -#include +#include #include #include diff --git a/arch/x86/crypto/sha256_ssse3_glue.c b/arch/x86/crypto/sha256_ssse3_glue.c index dd06249229e1..3a5f6be7dbba 100644 --- a/arch/x86/crypto/sha256_ssse3_glue.c +++ b/arch/x86/crypto/sha256_ssse3_glue.c @@ -35,7 +35,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/x86/crypto/sha512_ssse3_glue.c b/arch/x86/crypto/sha512_ssse3_glue.c index b0b05c93409e..30e70f4fe2f7 100644 --- a/arch/x86/crypto/sha512_ssse3_glue.c +++ b/arch/x86/crypto/sha512_ssse3_glue.c @@ -34,7 +34,7 @@ #include #include #include -#include +#include #include #include diff --git a/arch/x86/purgatory/purgatory.c b/arch/x86/purgatory/purgatory.c index 7b37a412f829..f03b64d9cb51 100644 --- a/arch/x86/purgatory/purgatory.c +++ b/arch/x86/purgatory/purgatory.c @@ -9,7 +9,7 @@ */ #include -#include +#include #include #include "../boot/string.h" diff --git a/crypto/asymmetric_keys/asym_tpm.c b/crypto/asymmetric_keys/asym_tpm.c index 378b18b9bc34..511932aa94a6 100644 --- a/crypto/asymmetric_keys/asym_tpm.c +++ b/crypto/asymmetric_keys/asym_tpm.c @@ -10,7 +10,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/crypto/sha1_generic.c b/crypto/sha1_generic.c index 1d43472fecbd..325b57fe28dc 100644 --- a/crypto/sha1_generic.c +++ b/crypto/sha1_generic.c @@ -16,7 +16,7 @@ #include #include #include -#include +#include #include #include diff --git a/crypto/sha256_generic.c b/crypto/sha256_generic.c index 88156e3e2a33..3b377197236e 100644 --- a/crypto/sha256_generic.c +++ b/crypto/sha256_generic.c @@ -12,7 +12,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/crypto/sha512_generic.c b/crypto/sha512_generic.c index e34d09dd9971..c72d72ad828e 100644 --- a/crypto/sha512_generic.c +++ b/crypto/sha512_generic.c @@ -12,7 +12,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/char/random.c b/drivers/char/random.c index 2a41b21623ae..5f3b8ac9d97b 100644 --- a/drivers/char/random.c +++ b/drivers/char/random.c @@ -336,7 +336,7 @@ #include #include #include -#include +#include #include #include diff --git a/drivers/crypto/allwinner/sun4i-ss/sun4i-ss.h b/drivers/crypto/allwinner/sun4i-ss/sun4i-ss.h index 163962f9e284..5c291e4a6857 100644 --- a/drivers/crypto/allwinner/sun4i-ss/sun4i-ss.h +++ b/drivers/crypto/allwinner/sun4i-ss/sun4i-ss.h @@ -25,7 +25,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/crypto/allwinner/sun8i-ce/sun8i-ce-hash.c 
b/drivers/crypto/allwinner/sun8i-ce/sun8i-ce-hash.c index fa2f1b4fad7b..4927a6c82d32 100644 --- a/drivers/crypto/allwinner/sun8i-ce/sun8i-ce-hash.c +++ b/drivers/crypto/allwinner/sun8i-ce/sun8i-ce-hash.c @@ -13,7 +13,8 @@ #include #include #include -#include +#include +#include #include #include "sun8i-ce.h" diff --git a/drivers/crypto/allwinner/sun8i-ce/sun8i-ce.h b/drivers/crypto/allwinner/sun8i-ce/sun8i-ce.h index 558027516aed..cec781d5063c 100644 --- a/drivers/crypto/allwinner/sun8i-ce/sun8i-ce.h +++ b/drivers/crypto/allwinner/sun8i-ce/sun8i-ce.h @@ -16,7 +16,8 @@ #include #include #include -#include +#include +#include /* CE Registers */ #define CE_TDQ 0x00 diff --git a/drivers/crypto/allwinner/sun8i-ss/sun8i-ss-hash.c b/drivers/crypto/allwinner/sun8i-ss/sun8i-ss-hash.c index b6ab2054f217..11cbcbc83a7b 100644 --- a/drivers/crypto/allwinner/sun8i-ss/sun8i-ss-hash.c +++ b/drivers/crypto/allwinner/sun8i-ss/sun8i-ss-hash.c @@ -13,7 +13,8 @@ #include #include #include -#include +#include +#include #include #include "sun8i-ss.h" diff --git a/drivers/crypto/allwinner/sun8i-ss/sun8i-ss.h b/drivers/crypto/allwinner/sun8i-ss/sun8i-ss.h index 1a66457f4a20..28188685b910 100644 --- a/drivers/crypto/allwinner/sun8i-ss/sun8i-ss.h +++ b/drivers/crypto/allwinner/sun8i-ss/sun8i-ss.h @@ -15,7 +15,8 @@ #include #include #include -#include +#include +#include #define SS_START 1 diff --git a/drivers/crypto/amcc/crypto4xx_alg.c b/drivers/crypto/amcc/crypto4xx_alg.c index 7729a637fb02..a3fa849b139a 100644 --- a/drivers/crypto/amcc/crypto4xx_alg.c +++ b/drivers/crypto/amcc/crypto4xx_alg.c @@ -20,7 +20,7 @@ #include #include #include -#include +#include #include #include #include "crypto4xx_reg_def.h" diff --git a/drivers/crypto/amcc/crypto4xx_core.c b/drivers/crypto/amcc/crypto4xx_core.c index 2e3690f65786..8d1b918a0533 100644 --- a/drivers/crypto/amcc/crypto4xx_core.c +++ b/drivers/crypto/amcc/crypto4xx_core.c @@ -30,7 +30,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/crypto/atmel-authenc.h b/drivers/crypto/atmel-authenc.h index c6530a1c8c20..45171e89a7d2 100644 --- a/drivers/crypto/atmel-authenc.h +++ b/drivers/crypto/atmel-authenc.h @@ -16,7 +16,8 @@ #include #include -#include +#include +#include #include "atmel-sha-regs.h" struct atmel_aes_dev; diff --git a/drivers/crypto/atmel-sha.c b/drivers/crypto/atmel-sha.c index 0eb6f54e3b66..352d80cb5ae9 100644 --- a/drivers/crypto/atmel-sha.c +++ b/drivers/crypto/atmel-sha.c @@ -33,7 +33,8 @@ #include #include #include -#include +#include +#include #include #include #include "atmel-sha-regs.h" diff --git a/drivers/crypto/axis/artpec6_crypto.c b/drivers/crypto/axis/artpec6_crypto.c index 809c3033ca74..9ad188cffd0d 100644 --- a/drivers/crypto/axis/artpec6_crypto.c +++ b/drivers/crypto/axis/artpec6_crypto.c @@ -28,7 +28,8 @@ #include #include #include -#include +#include +#include #include /* Max length of a line in all cache levels for Artpec SoCs. 
*/ diff --git a/drivers/crypto/bcm/cipher.c b/drivers/crypto/bcm/cipher.c index 50d169e61b41..30390a7324b2 100644 --- a/drivers/crypto/bcm/cipher.c +++ b/drivers/crypto/bcm/cipher.c @@ -26,11 +26,12 @@ #include #include #include -#include #include #include #include #include +#include +#include #include #include "util.h" diff --git a/drivers/crypto/bcm/cipher.h b/drivers/crypto/bcm/cipher.h index 035c8389cb3d..0ad5892b445d 100644 --- a/drivers/crypto/bcm/cipher.h +++ b/drivers/crypto/bcm/cipher.h @@ -16,7 +16,8 @@ #include #include #include -#include +#include +#include #include #include "spu.h" diff --git a/drivers/crypto/bcm/spu.h b/drivers/crypto/bcm/spu.h index dd132389bcaa..1c386a2d5506 100644 --- a/drivers/crypto/bcm/spu.h +++ b/drivers/crypto/bcm/spu.h @@ -17,7 +17,8 @@ #include #include -#include +#include +#include enum spu_cipher_alg { CIPHER_ALG_NONE = 0x0, diff --git a/drivers/crypto/caam/compat.h b/drivers/crypto/caam/compat.h index c3c22a8de4c0..c4f79764172b 100644 --- a/drivers/crypto/caam/compat.h +++ b/drivers/crypto/caam/compat.h @@ -34,7 +34,8 @@ #include #include #include -#include +#include +#include #include #include #include diff --git a/drivers/crypto/cavium/nitrox/nitrox_aead.c b/drivers/crypto/cavium/nitrox/nitrox_aead.c index e5d8607ecb1d..c93c4e41d267 100644 --- a/drivers/crypto/cavium/nitrox/nitrox_aead.c +++ b/drivers/crypto/cavium/nitrox/nitrox_aead.c @@ -7,7 +7,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/crypto/ccp/ccp-crypto-sha.c b/drivers/crypto/ccp/ccp-crypto-sha.c index 8fbfdb9e8cd3..74fa5360e722 100644 --- a/drivers/crypto/ccp/ccp-crypto-sha.c +++ b/drivers/crypto/ccp/ccp-crypto-sha.c @@ -17,7 +17,8 @@ #include #include #include -#include +#include +#include #include #include diff --git a/drivers/crypto/ccp/ccp-crypto.h b/drivers/crypto/ccp/ccp-crypto.h index aed3d2192d01..e42450d07168 100644 --- a/drivers/crypto/ccp/ccp-crypto.h +++ b/drivers/crypto/ccp/ccp-crypto.h @@ -19,7 +19,8 @@ #include #include #include -#include +#include +#include #include #include #include diff --git a/drivers/crypto/ccree/cc_driver.h b/drivers/crypto/ccree/cc_driver.h index af77b2020350..ed2b2f13a256 100644 --- a/drivers/crypto/ccree/cc_driver.h +++ b/drivers/crypto/ccree/cc_driver.h @@ -17,7 +17,8 @@ #include #include #include -#include +#include +#include #include #include #include diff --git a/drivers/crypto/chelsio/chcr_algo.c b/drivers/crypto/chelsio/chcr_algo.c index 13b908ea4873..f5a336634daa 100644 --- a/drivers/crypto/chelsio/chcr_algo.c +++ b/drivers/crypto/chelsio/chcr_algo.c @@ -53,7 +53,8 @@ #include #include #include -#include +#include +#include #include #include #include diff --git a/drivers/crypto/hisilicon/sec2/sec_crypto.c b/drivers/crypto/hisilicon/sec2/sec_crypto.c index 891e04914615..2eaa516b3231 100644 --- a/drivers/crypto/hisilicon/sec2/sec_crypto.c +++ b/drivers/crypto/hisilicon/sec2/sec_crypto.c @@ -7,7 +7,8 @@ #include #include #include -#include +#include +#include #include #include #include diff --git a/drivers/crypto/img-hash.c b/drivers/crypto/img-hash.c index 91f555ccbb31..e813115d5432 100644 --- a/drivers/crypto/img-hash.c +++ b/drivers/crypto/img-hash.c @@ -19,7 +19,8 @@ #include #include -#include +#include +#include #define CR_RESET 0 #define CR_RESET_SET 1 diff --git a/drivers/crypto/inside-secure/safexcel.h b/drivers/crypto/inside-secure/safexcel.h index 9045f2d7f4c6..ce1e611a163e 100644 --- a/drivers/crypto/inside-secure/safexcel.h +++ b/drivers/crypto/inside-secure/safexcel.h @@ -11,7 +11,8 
@@ #include #include #include -#include +#include +#include #include #include #include diff --git a/drivers/crypto/inside-secure/safexcel_cipher.c b/drivers/crypto/inside-secure/safexcel_cipher.c index 9bcfb79a030f..d68ef16650d4 100644 --- a/drivers/crypto/inside-secure/safexcel_cipher.c +++ b/drivers/crypto/inside-secure/safexcel_cipher.c @@ -18,7 +18,8 @@ #include #include #include -#include +#include +#include #include #include #include diff --git a/drivers/crypto/inside-secure/safexcel_hash.c b/drivers/crypto/inside-secure/safexcel_hash.c index 56d5ccb5cc00..50fb6d90a2e0 100644 --- a/drivers/crypto/inside-secure/safexcel_hash.c +++ b/drivers/crypto/inside-secure/safexcel_hash.c @@ -8,7 +8,8 @@ #include #include #include -#include +#include +#include #include #include #include diff --git a/drivers/crypto/ixp4xx_crypto.c b/drivers/crypto/ixp4xx_crypto.c index 276012e7c482..8b0f17fc09fb 100644 --- a/drivers/crypto/ixp4xx_crypto.c +++ b/drivers/crypto/ixp4xx_crypto.c @@ -20,7 +20,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/crypto/marvell/cesa/hash.c b/drivers/crypto/marvell/cesa/hash.c index add7ea011c98..8cf9fd518d86 100644 --- a/drivers/crypto/marvell/cesa/hash.c +++ b/drivers/crypto/marvell/cesa/hash.c @@ -11,7 +11,8 @@ #include #include -#include +#include +#include #include #include diff --git a/drivers/crypto/marvell/octeontx/otx_cptvf_algs.c b/drivers/crypto/marvell/octeontx/otx_cptvf_algs.c index 90bb31329d4b..ccbef01888d4 100644 --- a/drivers/crypto/marvell/octeontx/otx_cptvf_algs.c +++ b/drivers/crypto/marvell/octeontx/otx_cptvf_algs.c @@ -13,7 +13,8 @@ #include #include #include -#include +#include +#include #include #include #include diff --git a/drivers/crypto/mediatek/mtk-sha.c b/drivers/crypto/mediatek/mtk-sha.c index 3d5d7d68b03b..f55aacdafbef 100644 --- a/drivers/crypto/mediatek/mtk-sha.c +++ b/drivers/crypto/mediatek/mtk-sha.c @@ -10,7 +10,8 @@ */ #include -#include +#include +#include #include "mtk-platform.h" #define SHA_ALIGN_MSK (sizeof(u32) - 1) diff --git a/drivers/crypto/mxs-dcp.c b/drivers/crypto/mxs-dcp.c index 909a7eb748e3..d6a7784d2988 100644 --- a/drivers/crypto/mxs-dcp.c +++ b/drivers/crypto/mxs-dcp.c @@ -17,7 +17,8 @@ #include #include -#include +#include +#include #include #include #include diff --git a/drivers/crypto/n2_core.c b/drivers/crypto/n2_core.c index 3642bf83d809..3b0bf6fea491 100644 --- a/drivers/crypto/n2_core.c +++ b/drivers/crypto/n2_core.c @@ -15,7 +15,8 @@ #include #include #include -#include +#include +#include #include #include #include diff --git a/drivers/crypto/nx/nx-sha256.c b/drivers/crypto/nx/nx-sha256.c index 02fb53453195..90d9a37a57f6 100644 --- a/drivers/crypto/nx/nx-sha256.c +++ b/drivers/crypto/nx/nx-sha256.c @@ -8,7 +8,7 @@ */ #include -#include +#include #include #include #include diff --git a/drivers/crypto/nx/nx-sha512.c b/drivers/crypto/nx/nx-sha512.c index 4c7a3e3eeebf..eb8627a0f317 100644 --- a/drivers/crypto/nx/nx-sha512.c +++ b/drivers/crypto/nx/nx-sha512.c @@ -8,7 +8,7 @@ */ #include -#include +#include #include #include diff --git a/drivers/crypto/nx/nx.c b/drivers/crypto/nx/nx.c index 40882d6d52c1..0d2dc5be7f19 100644 --- a/drivers/crypto/nx/nx.c +++ b/drivers/crypto/nx/nx.c @@ -10,7 +10,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/drivers/crypto/omap-sham.c b/drivers/crypto/omap-sham.c index a3b38d2c92e7..ae0d320d3c60 100644 --- a/drivers/crypto/omap-sham.c +++ b/drivers/crypto/omap-sham.c @@ -35,7 +35,8 @@ 
#include #include #include -#include +#include +#include #include #include #include diff --git a/drivers/crypto/padlock-sha.c b/drivers/crypto/padlock-sha.c index a697a4a3f2d0..6865c7f1fc1a 100644 --- a/drivers/crypto/padlock-sha.c +++ b/drivers/crypto/padlock-sha.c @@ -9,7 +9,8 @@ #include #include -#include +#include +#include #include #include #include diff --git a/drivers/crypto/picoxcell_crypto.c b/drivers/crypto/picoxcell_crypto.c index fb34bf92861d..84f9c16d984c 100644 --- a/drivers/crypto/picoxcell_crypto.c +++ b/drivers/crypto/picoxcell_crypto.c @@ -8,7 +8,8 @@ #include #include #include -#include +#include +#include #include #include #include diff --git a/drivers/crypto/qat/qat_common/qat_algs.c b/drivers/crypto/qat/qat_common/qat_algs.c index 0fab8bb8ca59..b3a68d986417 100644 --- a/drivers/crypto/qat/qat_common/qat_algs.c +++ b/drivers/crypto/qat/qat_common/qat_algs.c @@ -6,7 +6,8 @@ #include #include #include -#include +#include +#include #include #include #include diff --git a/drivers/crypto/qce/common.c b/drivers/crypto/qce/common.c index 5006e74c40cd..a73db2a5637f 100644 --- a/drivers/crypto/qce/common.c +++ b/drivers/crypto/qce/common.c @@ -7,7 +7,8 @@ #include #include #include -#include +#include +#include #include "cipher.h" #include "common.h" diff --git a/drivers/crypto/qce/core.c b/drivers/crypto/qce/core.c index ea616b7259ae..5e6717f9bbda 100644 --- a/drivers/crypto/qce/core.c +++ b/drivers/crypto/qce/core.c @@ -13,7 +13,6 @@ #include #include #include -#include #include "core.h" #include "cipher.h" diff --git a/drivers/crypto/qce/sha.h b/drivers/crypto/qce/sha.h index d63526e3804d..a22695361f16 100644 --- a/drivers/crypto/qce/sha.h +++ b/drivers/crypto/qce/sha.h @@ -7,7 +7,8 @@ #define _SHA_H_ #include -#include +#include +#include #include "common.h" #include "core.h" diff --git a/drivers/crypto/rockchip/rk3288_crypto.h b/drivers/crypto/rockchip/rk3288_crypto.h index 3db595570c9c..97278c2574ff 100644 --- a/drivers/crypto/rockchip/rk3288_crypto.h +++ b/drivers/crypto/rockchip/rk3288_crypto.h @@ -12,7 +12,8 @@ #include #include -#include +#include +#include #define _SBF(v, f) ((v) << (f)) diff --git a/drivers/crypto/s5p-sss.c b/drivers/crypto/s5p-sss.c index 88a6c853ffd7..682c8a450a57 100644 --- a/drivers/crypto/s5p-sss.c +++ b/drivers/crypto/s5p-sss.c @@ -30,7 +30,8 @@ #include #include -#include +#include +#include #include #define _SBF(s, v) ((v) << (s)) diff --git a/drivers/crypto/sa2ul.c b/drivers/crypto/sa2ul.c index c357010a159e..f300b0a5958a 100644 --- a/drivers/crypto/sa2ul.c +++ b/drivers/crypto/sa2ul.c @@ -25,7 +25,8 @@ #include #include #include -#include +#include +#include #include "sa2ul.h" diff --git a/drivers/crypto/sa2ul.h b/drivers/crypto/sa2ul.h index bb40df3876e5..f597ddecde34 100644 --- a/drivers/crypto/sa2ul.h +++ b/drivers/crypto/sa2ul.h @@ -13,7 +13,8 @@ #define _K3_SA2UL_ #include -#include +#include +#include #define SA_ENGINE_ENABLE_CONTROL 0x1000 diff --git a/drivers/crypto/sahara.c b/drivers/crypto/sahara.c index d60679c79822..8b5be29cb4dc 100644 --- a/drivers/crypto/sahara.c +++ b/drivers/crypto/sahara.c @@ -15,7 +15,8 @@ #include #include #include -#include +#include +#include #include #include diff --git a/drivers/crypto/stm32/stm32-hash.c b/drivers/crypto/stm32/stm32-hash.c index e3e25278a970..7ac0573ef663 100644 --- a/drivers/crypto/stm32/stm32-hash.c +++ b/drivers/crypto/stm32/stm32-hash.c @@ -25,7 +25,8 @@ #include #include #include -#include +#include +#include #include #define HASH_CR 0x00 diff --git a/drivers/crypto/talitos.c 
b/drivers/crypto/talitos.c index a713a35dc502..4fd85f31630a 100644 --- a/drivers/crypto/talitos.c +++ b/drivers/crypto/talitos.c @@ -31,7 +31,8 @@ #include #include #include -#include +#include +#include #include #include #include diff --git a/drivers/crypto/ux500/hash/hash_core.c b/drivers/crypto/ux500/hash/hash_core.c index 3d407eebb2ba..da284b0ea1b2 100644 --- a/drivers/crypto/ux500/hash/hash_core.c +++ b/drivers/crypto/ux500/hash/hash_core.c @@ -31,7 +31,8 @@ #include #include -#include +#include +#include #include #include diff --git a/drivers/firmware/efi/embedded-firmware.c b/drivers/firmware/efi/embedded-firmware.c index 21ae0c48232a..f5be8e22305b 100644 --- a/drivers/firmware/efi/embedded-firmware.c +++ b/drivers/firmware/efi/embedded-firmware.c @@ -12,7 +12,7 @@ #include #include #include -#include +#include /* Exported for use by lib/test_firmware.c only */ LIST_HEAD(efi_embedded_fw_list); diff --git a/drivers/net/ethernet/chelsio/inline_crypto/ch_ipsec/chcr_ipsec.c b/drivers/net/ethernet/chelsio/inline_crypto/ch_ipsec/chcr_ipsec.c index 072299b14b8d..47d9268a7e3c 100644 --- a/drivers/net/ethernet/chelsio/inline_crypto/ch_ipsec/chcr_ipsec.c +++ b/drivers/net/ethernet/chelsio/inline_crypto/ch_ipsec/chcr_ipsec.c @@ -51,7 +51,8 @@ #include #include #include -#include +#include +#include #include #include #include diff --git a/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls.h b/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls.h index 2d3dfdd2a716..65617752c630 100644 --- a/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls.h +++ b/drivers/net/ethernet/chelsio/inline_crypto/chtls/chtls.h @@ -9,7 +9,8 @@ #include #include #include -#include +#include +#include #include #include #include diff --git a/drivers/nfc/s3fwrn5/firmware.c b/drivers/nfc/s3fwrn5/firmware.c index ec930ee2c847..5d5ad8307211 100644 --- a/drivers/nfc/s3fwrn5/firmware.c +++ b/drivers/nfc/s3fwrn5/firmware.c @@ -9,7 +9,7 @@ #include #include #include -#include +#include #include "s3fwrn5.h" #include "firmware.h" diff --git a/drivers/tee/tee_core.c b/drivers/tee/tee_core.c index f53bf336c0a2..d70d4be91096 100644 --- a/drivers/tee/tee_core.c +++ b/drivers/tee/tee_core.c @@ -14,7 +14,7 @@ #include #include #include -#include +#include #include "tee_private.h" #define TEE_NUM_DEVICES 32 diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c index 1fbe6c24d705..cf06ea3870eb 100644 --- a/fs/crypto/fname.c +++ b/fs/crypto/fname.c @@ -14,7 +14,7 @@ #include #include #include -#include +#include #include #include "fscrypt_private.h" diff --git a/fs/crypto/hkdf.c b/fs/crypto/hkdf.c index 0cba7928446d..e0ec21055505 100644 --- a/fs/crypto/hkdf.c +++ b/fs/crypto/hkdf.c @@ -10,7 +10,7 @@ */ #include -#include +#include #include "fscrypt_private.h" diff --git a/fs/ubifs/auth.c b/fs/ubifs/auth.c index b93b3cd10bfd..0886d835f597 100644 --- a/fs/ubifs/auth.c +++ b/fs/ubifs/auth.c @@ -12,7 +12,6 @@ #include #include #include -#include #include #include #include diff --git a/fs/verity/fsverity_private.h b/fs/verity/fsverity_private.h index e96d99d5145e..6a8f2e3cce6c 100644 --- a/fs/verity/fsverity_private.h +++ b/fs/verity/fsverity_private.h @@ -14,7 +14,7 @@ #define pr_fmt(fmt) "fs-verity: " fmt -#include +#include #include #include diff --git a/include/crypto/hash_info.h b/include/crypto/hash_info.h index eb9d2e368969..dd4f06785049 100644 --- a/include/crypto/hash_info.h +++ b/include/crypto/hash_info.h @@ -8,7 +8,8 @@ #ifndef _CRYPTO_HASH_INFO_H #define _CRYPTO_HASH_INFO_H -#include +#include +#include #include #include 
diff --git a/include/crypto/sha.h b/include/crypto/sha.h deleted file mode 100644 index 4ff3da816630..000000000000 --- a/include/crypto/sha.h +++ /dev/null @@ -1,167 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * Common values for SHA algorithms - */ - -#ifndef _CRYPTO_SHA_H -#define _CRYPTO_SHA_H - -#include - -#define SHA1_DIGEST_SIZE 20 -#define SHA1_BLOCK_SIZE 64 - -#define SHA224_DIGEST_SIZE 28 -#define SHA224_BLOCK_SIZE 64 - -#define SHA256_DIGEST_SIZE 32 -#define SHA256_BLOCK_SIZE 64 - -#define SHA384_DIGEST_SIZE 48 -#define SHA384_BLOCK_SIZE 128 - -#define SHA512_DIGEST_SIZE 64 -#define SHA512_BLOCK_SIZE 128 - -#define SHA1_H0 0x67452301UL -#define SHA1_H1 0xefcdab89UL -#define SHA1_H2 0x98badcfeUL -#define SHA1_H3 0x10325476UL -#define SHA1_H4 0xc3d2e1f0UL - -#define SHA224_H0 0xc1059ed8UL -#define SHA224_H1 0x367cd507UL -#define SHA224_H2 0x3070dd17UL -#define SHA224_H3 0xf70e5939UL -#define SHA224_H4 0xffc00b31UL -#define SHA224_H5 0x68581511UL -#define SHA224_H6 0x64f98fa7UL -#define SHA224_H7 0xbefa4fa4UL - -#define SHA256_H0 0x6a09e667UL -#define SHA256_H1 0xbb67ae85UL -#define SHA256_H2 0x3c6ef372UL -#define SHA256_H3 0xa54ff53aUL -#define SHA256_H4 0x510e527fUL -#define SHA256_H5 0x9b05688cUL -#define SHA256_H6 0x1f83d9abUL -#define SHA256_H7 0x5be0cd19UL - -#define SHA384_H0 0xcbbb9d5dc1059ed8ULL -#define SHA384_H1 0x629a292a367cd507ULL -#define SHA384_H2 0x9159015a3070dd17ULL -#define SHA384_H3 0x152fecd8f70e5939ULL -#define SHA384_H4 0x67332667ffc00b31ULL -#define SHA384_H5 0x8eb44a8768581511ULL -#define SHA384_H6 0xdb0c2e0d64f98fa7ULL -#define SHA384_H7 0x47b5481dbefa4fa4ULL - -#define SHA512_H0 0x6a09e667f3bcc908ULL -#define SHA512_H1 0xbb67ae8584caa73bULL -#define SHA512_H2 0x3c6ef372fe94f82bULL -#define SHA512_H3 0xa54ff53a5f1d36f1ULL -#define SHA512_H4 0x510e527fade682d1ULL -#define SHA512_H5 0x9b05688c2b3e6c1fULL -#define SHA512_H6 0x1f83d9abfb41bd6bULL -#define SHA512_H7 0x5be0cd19137e2179ULL - -extern const u8 sha1_zero_message_hash[SHA1_DIGEST_SIZE]; - -extern const u8 sha224_zero_message_hash[SHA224_DIGEST_SIZE]; - -extern const u8 sha256_zero_message_hash[SHA256_DIGEST_SIZE]; - -extern const u8 sha384_zero_message_hash[SHA384_DIGEST_SIZE]; - -extern const u8 sha512_zero_message_hash[SHA512_DIGEST_SIZE]; - -struct sha1_state { - u32 state[SHA1_DIGEST_SIZE / 4]; - u64 count; - u8 buffer[SHA1_BLOCK_SIZE]; -}; - -struct sha256_state { - u32 state[SHA256_DIGEST_SIZE / 4]; - u64 count; - u8 buf[SHA256_BLOCK_SIZE]; -}; - -struct sha512_state { - u64 state[SHA512_DIGEST_SIZE / 8]; - u64 count[2]; - u8 buf[SHA512_BLOCK_SIZE]; -}; - -struct shash_desc; - -extern int crypto_sha1_update(struct shash_desc *desc, const u8 *data, - unsigned int len); - -extern int crypto_sha1_finup(struct shash_desc *desc, const u8 *data, - unsigned int len, u8 *hash); - -extern int crypto_sha256_update(struct shash_desc *desc, const u8 *data, - unsigned int len); - -extern int crypto_sha256_finup(struct shash_desc *desc, const u8 *data, - unsigned int len, u8 *hash); - -extern int crypto_sha512_update(struct shash_desc *desc, const u8 *data, - unsigned int len); - -extern int crypto_sha512_finup(struct shash_desc *desc, const u8 *data, - unsigned int len, u8 *hash); - -/* - * An implementation of SHA-1's compression function. Don't use in new code! - * You shouldn't be using SHA-1, and even if you *have* to use SHA-1, this isn't - * the correct way to hash something with SHA-1 (use crypto_shash instead). 
- */ -#define SHA1_DIGEST_WORDS (SHA1_DIGEST_SIZE / 4) -#define SHA1_WORKSPACE_WORDS 16 -void sha1_init(__u32 *buf); -void sha1_transform(__u32 *digest, const char *data, __u32 *W); - -/* - * Stand-alone implementation of the SHA256 algorithm. It is designed to - * have as little dependencies as possible so it can be used in the - * kexec_file purgatory. In other cases you should generally use the - * hash APIs from include/crypto/hash.h. Especially when hashing large - * amounts of data as those APIs may be hw-accelerated. - * - * For details see lib/crypto/sha256.c - */ - -static inline void sha256_init(struct sha256_state *sctx) -{ - sctx->state[0] = SHA256_H0; - sctx->state[1] = SHA256_H1; - sctx->state[2] = SHA256_H2; - sctx->state[3] = SHA256_H3; - sctx->state[4] = SHA256_H4; - sctx->state[5] = SHA256_H5; - sctx->state[6] = SHA256_H6; - sctx->state[7] = SHA256_H7; - sctx->count = 0; -} -void sha256_update(struct sha256_state *sctx, const u8 *data, unsigned int len); -void sha256_final(struct sha256_state *sctx, u8 *out); -void sha256(const u8 *data, unsigned int len, u8 *out); - -static inline void sha224_init(struct sha256_state *sctx) -{ - sctx->state[0] = SHA224_H0; - sctx->state[1] = SHA224_H1; - sctx->state[2] = SHA224_H2; - sctx->state[3] = SHA224_H3; - sctx->state[4] = SHA224_H4; - sctx->state[5] = SHA224_H5; - sctx->state[6] = SHA224_H6; - sctx->state[7] = SHA224_H7; - sctx->count = 0; -} -void sha224_update(struct sha256_state *sctx, const u8 *data, unsigned int len); -void sha224_final(struct sha256_state *sctx, u8 *out); - -#endif diff --git a/include/crypto/sha1.h b/include/crypto/sha1.h new file mode 100644 index 000000000000..044ecea60ac8 --- /dev/null +++ b/include/crypto/sha1.h @@ -0,0 +1,46 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Common values for SHA-1 algorithms + */ + +#ifndef _CRYPTO_SHA1_H +#define _CRYPTO_SHA1_H + +#include + +#define SHA1_DIGEST_SIZE 20 +#define SHA1_BLOCK_SIZE 64 + +#define SHA1_H0 0x67452301UL +#define SHA1_H1 0xefcdab89UL +#define SHA1_H2 0x98badcfeUL +#define SHA1_H3 0x10325476UL +#define SHA1_H4 0xc3d2e1f0UL + +extern const u8 sha1_zero_message_hash[SHA1_DIGEST_SIZE]; + +struct sha1_state { + u32 state[SHA1_DIGEST_SIZE / 4]; + u64 count; + u8 buffer[SHA1_BLOCK_SIZE]; +}; + +struct shash_desc; + +extern int crypto_sha1_update(struct shash_desc *desc, const u8 *data, + unsigned int len); + +extern int crypto_sha1_finup(struct shash_desc *desc, const u8 *data, + unsigned int len, u8 *hash); + +/* + * An implementation of SHA-1's compression function. Don't use in new code! + * You shouldn't be using SHA-1, and even if you *have* to use SHA-1, this isn't + * the correct way to hash something with SHA-1 (use crypto_shash instead). 
+ */ +#define SHA1_DIGEST_WORDS (SHA1_DIGEST_SIZE / 4) +#define SHA1_WORKSPACE_WORDS 16 +void sha1_init(__u32 *buf); +void sha1_transform(__u32 *digest, const char *data, __u32 *W); + +#endif /* _CRYPTO_SHA1_H */ diff --git a/include/crypto/sha1_base.h b/include/crypto/sha1_base.h index a5d6033efef7..2e0e7c3827d1 100644 --- a/include/crypto/sha1_base.h +++ b/include/crypto/sha1_base.h @@ -9,7 +9,7 @@ #define _CRYPTO_SHA1_BASE_H #include -#include +#include #include #include #include diff --git a/include/crypto/sha2.h b/include/crypto/sha2.h new file mode 100644 index 000000000000..2838f529f31e --- /dev/null +++ b/include/crypto/sha2.h @@ -0,0 +1,134 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Common values for SHA-2 algorithms + */ + +#ifndef _CRYPTO_SHA2_H +#define _CRYPTO_SHA2_H + +#include + +#define SHA224_DIGEST_SIZE 28 +#define SHA224_BLOCK_SIZE 64 + +#define SHA256_DIGEST_SIZE 32 +#define SHA256_BLOCK_SIZE 64 + +#define SHA384_DIGEST_SIZE 48 +#define SHA384_BLOCK_SIZE 128 + +#define SHA512_DIGEST_SIZE 64 +#define SHA512_BLOCK_SIZE 128 + +#define SHA224_H0 0xc1059ed8UL +#define SHA224_H1 0x367cd507UL +#define SHA224_H2 0x3070dd17UL +#define SHA224_H3 0xf70e5939UL +#define SHA224_H4 0xffc00b31UL +#define SHA224_H5 0x68581511UL +#define SHA224_H6 0x64f98fa7UL +#define SHA224_H7 0xbefa4fa4UL + +#define SHA256_H0 0x6a09e667UL +#define SHA256_H1 0xbb67ae85UL +#define SHA256_H2 0x3c6ef372UL +#define SHA256_H3 0xa54ff53aUL +#define SHA256_H4 0x510e527fUL +#define SHA256_H5 0x9b05688cUL +#define SHA256_H6 0x1f83d9abUL +#define SHA256_H7 0x5be0cd19UL + +#define SHA384_H0 0xcbbb9d5dc1059ed8ULL +#define SHA384_H1 0x629a292a367cd507ULL +#define SHA384_H2 0x9159015a3070dd17ULL +#define SHA384_H3 0x152fecd8f70e5939ULL +#define SHA384_H4 0x67332667ffc00b31ULL +#define SHA384_H5 0x8eb44a8768581511ULL +#define SHA384_H6 0xdb0c2e0d64f98fa7ULL +#define SHA384_H7 0x47b5481dbefa4fa4ULL + +#define SHA512_H0 0x6a09e667f3bcc908ULL +#define SHA512_H1 0xbb67ae8584caa73bULL +#define SHA512_H2 0x3c6ef372fe94f82bULL +#define SHA512_H3 0xa54ff53a5f1d36f1ULL +#define SHA512_H4 0x510e527fade682d1ULL +#define SHA512_H5 0x9b05688c2b3e6c1fULL +#define SHA512_H6 0x1f83d9abfb41bd6bULL +#define SHA512_H7 0x5be0cd19137e2179ULL + +extern const u8 sha224_zero_message_hash[SHA224_DIGEST_SIZE]; + +extern const u8 sha256_zero_message_hash[SHA256_DIGEST_SIZE]; + +extern const u8 sha384_zero_message_hash[SHA384_DIGEST_SIZE]; + +extern const u8 sha512_zero_message_hash[SHA512_DIGEST_SIZE]; + +struct sha256_state { + u32 state[SHA256_DIGEST_SIZE / 4]; + u64 count; + u8 buf[SHA256_BLOCK_SIZE]; +}; + +struct sha512_state { + u64 state[SHA512_DIGEST_SIZE / 8]; + u64 count[2]; + u8 buf[SHA512_BLOCK_SIZE]; +}; + +struct shash_desc; + +extern int crypto_sha256_update(struct shash_desc *desc, const u8 *data, + unsigned int len); + +extern int crypto_sha256_finup(struct shash_desc *desc, const u8 *data, + unsigned int len, u8 *hash); + +extern int crypto_sha512_update(struct shash_desc *desc, const u8 *data, + unsigned int len); + +extern int crypto_sha512_finup(struct shash_desc *desc, const u8 *data, + unsigned int len, u8 *hash); + +/* + * Stand-alone implementation of the SHA256 algorithm. It is designed to + * have as little dependencies as possible so it can be used in the + * kexec_file purgatory. In other cases you should generally use the + * hash APIs from include/crypto/hash.h. Especially when hashing large + * amounts of data as those APIs may be hw-accelerated. 
+ * + * For details see lib/crypto/sha256.c + */ + +static inline void sha256_init(struct sha256_state *sctx) +{ + sctx->state[0] = SHA256_H0; + sctx->state[1] = SHA256_H1; + sctx->state[2] = SHA256_H2; + sctx->state[3] = SHA256_H3; + sctx->state[4] = SHA256_H4; + sctx->state[5] = SHA256_H5; + sctx->state[6] = SHA256_H6; + sctx->state[7] = SHA256_H7; + sctx->count = 0; +} +void sha256_update(struct sha256_state *sctx, const u8 *data, unsigned int len); +void sha256_final(struct sha256_state *sctx, u8 *out); +void sha256(const u8 *data, unsigned int len, u8 *out); + +static inline void sha224_init(struct sha256_state *sctx) +{ + sctx->state[0] = SHA224_H0; + sctx->state[1] = SHA224_H1; + sctx->state[2] = SHA224_H2; + sctx->state[3] = SHA224_H3; + sctx->state[4] = SHA224_H4; + sctx->state[5] = SHA224_H5; + sctx->state[6] = SHA224_H6; + sctx->state[7] = SHA224_H7; + sctx->count = 0; +} +void sha224_update(struct sha256_state *sctx, const u8 *data, unsigned int len); +void sha224_final(struct sha256_state *sctx, u8 *out); + +#endif /* _CRYPTO_SHA2_H */ diff --git a/include/crypto/sha256_base.h b/include/crypto/sha256_base.h index 93f9fd21cc06..76173c613058 100644 --- a/include/crypto/sha256_base.h +++ b/include/crypto/sha256_base.h @@ -9,7 +9,7 @@ #define _CRYPTO_SHA256_BASE_H #include -#include +#include #include #include #include diff --git a/include/crypto/sha512_base.h b/include/crypto/sha512_base.h index 93ab73baa38e..b370b3340b16 100644 --- a/include/crypto/sha512_base.h +++ b/include/crypto/sha512_base.h @@ -9,7 +9,7 @@ #define _CRYPTO_SHA512_BASE_H #include -#include +#include #include #include #include diff --git a/include/linux/ccp.h b/include/linux/ccp.h index a5dfbaf2470d..868924dec5a1 100644 --- a/include/linux/ccp.h +++ b/include/linux/ccp.h @@ -15,7 +15,8 @@ #include #include #include -#include +#include +#include struct ccp_device; struct ccp_cmd; diff --git a/include/linux/filter.h b/include/linux/filter.h index 72d62cbc1578..6c00140538b9 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -21,7 +21,7 @@ #include #include #include -#include +#include #include diff --git a/include/linux/purgatory.h b/include/linux/purgatory.h index b950e961cfa8..d7dc1559427f 100644 --- a/include/linux/purgatory.h +++ b/include/linux/purgatory.h @@ -3,7 +3,7 @@ #define _LINUX_PURGATORY_H #include -#include +#include #include struct kexec_sha_region { diff --git a/kernel/crash_core.c b/kernel/crash_core.c index 106e4500fd53..4fcfe0b70c4e 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -11,7 +11,7 @@ #include #include -#include +#include /* vmcoreinfo stuff */ unsigned char *vmcoreinfo_data; diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index 8798a8183974..4f8efc278aa7 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -42,7 +42,6 @@ #include #include -#include #include "kexec_internal.h" DEFINE_MUTEX(kexec_mutex); diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index e21f6b9234f7..b02086d70492 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -20,7 +20,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/lib/crypto/sha256.c b/lib/crypto/sha256.c index cdef37c05972..72a4b0b1df28 100644 --- a/lib/crypto/sha256.c +++ b/lib/crypto/sha256.c @@ -15,7 +15,7 @@ #include #include #include -#include +#include #include static const u32 SHA256_K[] = { diff --git a/lib/digsig.c b/lib/digsig.c index e0627c3e53b2..04b5e55ed95f 100644 --- a/lib/digsig.c +++ b/lib/digsig.c @@ -20,7 +20,7 @@ #include 
#include #include -#include +#include #include #include #include diff --git a/lib/sha1.c b/lib/sha1.c index 49257a915bb6..9bd1935a1472 100644 --- a/lib/sha1.c +++ b/lib/sha1.c @@ -9,7 +9,7 @@ #include #include #include -#include +#include #include /* diff --git a/net/ipv6/seg6_hmac.c b/net/ipv6/seg6_hmac.c index 85dddfe3a2c6..687d95dce085 100644 --- a/net/ipv6/seg6_hmac.c +++ b/net/ipv6/seg6_hmac.c @@ -35,7 +35,6 @@ #include #include -#include #include #include #include diff --git a/net/mptcp/crypto.c b/net/mptcp/crypto.c index 05d398d3fde4..b472dc149856 100644 --- a/net/mptcp/crypto.c +++ b/net/mptcp/crypto.c @@ -21,7 +21,7 @@ */ #include -#include +#include #include #include "protocol.h" diff --git a/net/mptcp/options.c b/net/mptcp/options.c index a044dd43411d..90cd52df99a6 100644 --- a/net/mptcp/options.c +++ b/net/mptcp/options.c @@ -7,7 +7,7 @@ #define pr_fmt(fmt) "MPTCP: " fmt #include -#include +#include #include #include #include "protocol.h" diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index ac4a1fe3550b..b229ae914d76 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -10,7 +10,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/security/integrity/integrity.h b/security/integrity/integrity.h index 413c803c5208..547425c20e11 100644 --- a/security/integrity/integrity.h +++ b/security/integrity/integrity.h @@ -14,7 +14,7 @@ #include #include -#include +#include #include #include diff --git a/security/keys/encrypted-keys/encrypted.c b/security/keys/encrypted-keys/encrypted.c index 192e531c146f..87432b35d771 100644 --- a/security/keys/encrypted-keys/encrypted.c +++ b/security/keys/encrypted-keys/encrypted.c @@ -29,7 +29,7 @@ #include #include #include -#include +#include #include #include "encrypted.h" diff --git a/security/keys/trusted-keys/trusted_tpm1.c b/security/keys/trusted-keys/trusted_tpm1.c index b9fe02e5f84f..74d82093cbaa 100644 --- a/security/keys/trusted-keys/trusted_tpm1.c +++ b/security/keys/trusted-keys/trusted_tpm1.c @@ -22,7 +22,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/sound/soc/codecs/cros_ec_codec.c b/sound/soc/codecs/cros_ec_codec.c index 28f039adfa13..58894bf47514 100644 --- a/sound/soc/codecs/cros_ec_codec.c +++ b/sound/soc/codecs/cros_ec_codec.c @@ -8,7 +8,7 @@ * EC for audio function. */ -#include +#include #include #include #include -- cgit v1.2.3 From 3cffa06aeef7ece30f6b5ac0ea51f264e8fea4d0 Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Wed, 11 Nov 2020 14:54:50 +0100 Subject: printk/console: Allow to disable console output by using console="" or console=null The commit 48021f98130880dd74 ("printk: handle blank console arguments passed in.") prevented crash caused by empty console= parameter value. Unfortunately, this value is widely used on Chromebooks to disable the console output. The above commit caused performance regression because the messages were pushed on slow console even though nobody was watching it. Use ttynull driver explicitly for console="" and console=null parameters. It has been created for exactly this purpose. It causes that preferred_console is set. As a result, ttySX and ttyX are not used as a fallback. And only ttynull console gets registered by default. It still allows to register other consoles either by additional console= parameters or SPCR. It prevents regression because it worked this way even before. Also it is a sane semantic. 
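To illustrate the resulting behaviour (example command lines, not part of the patch):

    console=null                          only the ttynull console is registered, output is silenced
    console="" console=ttyS0,115200       ttynull plus an explicitly requested serial console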
Preventing output on all consoles should be done another way, for example, by introducing mute_console parameter. Link: https://lore.kernel.org/r/20201006025935.GA597@jagdpanzerIV.localdomain Suggested-by: Sergey Senozhatsky Reviewed-by: Guenter Roeck Tested-by: Guenter Roeck Acked-by: Sergey Senozhatsky Signed-off-by: Petr Mladek Link: https://lore.kernel.org/r/20201111135450.11214-3-pmladek@suse.com --- kernel/printk/printk.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index fe64a49344bf..ac440b879a2c 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -2189,8 +2189,15 @@ static int __init console_setup(char *str) char *s, *options, *brl_options = NULL; int idx; - if (str[0] == 0) + /* + * console="" or console=null have been suggested as a way to + * disable console output. Use ttynull that has been created + * for exacly this purpose. + */ + if (str[0] == 0 || strcmp(str, "null") == 0) { + __add_preferred_console("ttynull", 0, NULL, NULL, true); return 1; + } if (_braille_console_setup(&str, &brl_options)) return 1; -- cgit v1.2.3 From f9d480b6ffbeb336bf7f6ce44825c00f61b3abae Mon Sep 17 00:00:00 2001 From: YiFei Zhu Date: Sun, 11 Oct 2020 10:47:42 -0500 Subject: seccomp/cache: Lookup syscall allowlist bitmap for fast path The overhead of running Seccomp filters has been part of some past discussions [1][2][3]. Oftentimes, the filters have a large number of instructions that check syscall numbers one by one and jump based on that. Some users chain BPF filters which further enlarge the overhead. A recent work [6] comprehensively measures the Seccomp overhead and shows that the overhead is non-negligible and has a non-trivial impact on application performance. We observed some common filters, such as docker's [4] or systemd's [5], will make most decisions based only on the syscall numbers, and as past discussions considered, a bitmap where each bit represents a syscall makes most sense for these filters. The fast (common) path for seccomp should be that the filter permits the syscall to pass through, and failing seccomp is expected to be an exceptional case; it is not expected for userspace to call a denylisted syscall over and over. When it can be concluded that an allow must occur for the given architecture and syscall pair (this determination is introduced in the next commit), seccomp will immediately allow the syscall, bypassing further BPF execution. Each architecture number has its own bitmap. The architecture number in seccomp_data is checked against the defined architecture number constant before proceeding to test the bit against the bitmap with the syscall number as the index of the bit in the bitmap, and if the bit is set, seccomp returns allow. The bitmaps are all clear in this patch and will be initialized in the next commit. When only one architecture exists, the check against architecture number is skipped, suggested by Kees Cook [7]. 
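As a rough standalone sketch of that fast path (userspace C, not the kernel code; the syscall count and bitmap contents here are assumptions, and the real lookup additionally verifies the architecture number and uses a speculation-safe index):

    #include <limits.h>
    #include <stdbool.h>
    #include <stdio.h>

    #define NR_SYSCALLS 64   /* assumption: number of syscalls covered by the bitmap */
    #define BITS_PER_LONG (sizeof(unsigned long) * CHAR_BIT)

    static unsigned long allow_bitmap[(NR_SYSCALLS + BITS_PER_LONG - 1) / BITS_PER_LONG];

    /* Fast path: true means the filter is known to always allow this syscall. */
    static bool cache_check_allow(int nr)
    {
            if (nr < 0 || nr >= NR_SYSCALLS)   /* out of range: run the BPF filter */
                    return false;
            return allow_bitmap[nr / BITS_PER_LONG] & (1UL << (nr % BITS_PER_LONG));
    }

    int main(void)
    {
            /* pretend syscall 1 is provably always allowed by the filter */
            allow_bitmap[1 / BITS_PER_LONG] |= 1UL << (1 % BITS_PER_LONG);
            printf("nr 1 -> %s\n", cache_check_allow(1) ? "ALLOW" : "run filter");
            printf("nr 2 -> %s\n", cache_check_allow(2) ? "ALLOW" : "run filter");
            return 0;
    }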
[1] https://lore.kernel.org/linux-security-module/c22a6c3cefc2412cad00ae14c1371711@huawei.com/T/ [2] https://lore.kernel.org/lkml/202005181120.971232B7B@keescook/T/ [3] https://github.com/seccomp/libseccomp/issues/116 [4] https://github.com/moby/moby/blob/ae0ef82b90356ac613f329a8ef5ee42ca923417d/profiles/seccomp/default.json [5] https://github.com/systemd/systemd/blob/6743a1caf4037f03dc51a1277855018e4ab61957/src/shared/seccomp-util.c#L270 [6] Draco: Architectural and Operating System Support for System Call Security https://tianyin.github.io/pub/draco.pdf, MICRO-53, Oct. 2020 [7] https://lore.kernel.org/bpf/202010091614.8BB0EB64@keescook/ Co-developed-by: Dimitrios Skarlatos Signed-off-by: Dimitrios Skarlatos Signed-off-by: YiFei Zhu Reviewed-by: Jann Horn Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/10f91a367ec4fcdea7fc3f086de3f5f13a4a7436.1602431034.git.yifeifz2@illinois.edu --- kernel/seccomp.c | 77 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 77 insertions(+) (limited to 'kernel') diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 8ad7a293255a..fe35f4f38949 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -143,6 +143,34 @@ struct notification { struct list_head notifications; }; +#ifdef SECCOMP_ARCH_NATIVE +/** + * struct action_cache - per-filter cache of seccomp actions per + * arch/syscall pair + * + * @allow_native: A bitmap where each bit represents whether the + * filter will always allow the syscall, for the + * native architecture. + * @allow_compat: A bitmap where each bit represents whether the + * filter will always allow the syscall, for the + * compat architecture. + */ +struct action_cache { + DECLARE_BITMAP(allow_native, SECCOMP_ARCH_NATIVE_NR); +#ifdef SECCOMP_ARCH_COMPAT + DECLARE_BITMAP(allow_compat, SECCOMP_ARCH_COMPAT_NR); +#endif +}; +#else +struct action_cache { }; + +static inline bool seccomp_cache_check_allow(const struct seccomp_filter *sfilter, + const struct seccomp_data *sd) +{ + return false; +} +#endif /* SECCOMP_ARCH_NATIVE */ + /** * struct seccomp_filter - container for seccomp BPF programs * @@ -298,6 +326,52 @@ static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen) return 0; } +#ifdef SECCOMP_ARCH_NATIVE +static inline bool seccomp_cache_check_allow_bitmap(const void *bitmap, + size_t bitmap_size, + int syscall_nr) +{ + if (unlikely(syscall_nr < 0 || syscall_nr >= bitmap_size)) + return false; + syscall_nr = array_index_nospec(syscall_nr, bitmap_size); + + return test_bit(syscall_nr, bitmap); +} + +/** + * seccomp_cache_check_allow - lookup seccomp cache + * @sfilter: The seccomp filter + * @sd: The seccomp data to lookup the cache with + * + * Returns true if the seccomp_data is cached and allowed. + */ +static inline bool seccomp_cache_check_allow(const struct seccomp_filter *sfilter, + const struct seccomp_data *sd) +{ + int syscall_nr = sd->nr; + const struct action_cache *cache = &sfilter->cache; + +#ifndef SECCOMP_ARCH_COMPAT + /* A native-only architecture doesn't need to check sd->arch. 
*/ + return seccomp_cache_check_allow_bitmap(cache->allow_native, + SECCOMP_ARCH_NATIVE_NR, + syscall_nr); +#else + if (likely(sd->arch == SECCOMP_ARCH_NATIVE)) + return seccomp_cache_check_allow_bitmap(cache->allow_native, + SECCOMP_ARCH_NATIVE_NR, + syscall_nr); + if (likely(sd->arch == SECCOMP_ARCH_COMPAT)) + return seccomp_cache_check_allow_bitmap(cache->allow_compat, + SECCOMP_ARCH_COMPAT_NR, + syscall_nr); +#endif /* SECCOMP_ARCH_COMPAT */ + + WARN_ON_ONCE(true); + return false; +} +#endif /* SECCOMP_ARCH_NATIVE */ + /** * seccomp_run_filters - evaluates all seccomp filters against @sd * @sd: optional seccomp data to be passed to filters @@ -320,6 +394,9 @@ static u32 seccomp_run_filters(const struct seccomp_data *sd, if (WARN_ON(f == NULL)) return SECCOMP_RET_KILL_PROCESS; + if (seccomp_cache_check_allow(f, sd)) + return SECCOMP_RET_ALLOW; + /* * All filters in the list are evaluated and the lowest BPF return * value always takes priority (ignoring the DATA). -- cgit v1.2.3 From 8e01b51a31a1e08e2c3e8fcc0ef6790441be2f61 Mon Sep 17 00:00:00 2001 From: YiFei Zhu Date: Sun, 11 Oct 2020 10:47:43 -0500 Subject: seccomp/cache: Add "emulator" to check if filter is constant allow SECCOMP_CACHE will only operate on syscalls that do not access any syscall arguments or instruction pointer. To facilitate this we need a static analyser to know whether a filter will return allow regardless of syscall arguments for a given architecture number / syscall number pair. This is implemented here with a pseudo-emulator, and stored in a per-filter bitmap. In order to build this bitmap at filter attach time, each filter is emulated for every syscall (under each possible architecture), and checked for any accesses of struct seccomp_data that are not the "arch" nor "nr" (syscall) members. If only "arch" and "nr" are examined, and the program returns allow, then we can be sure that the filter must return allow independent from syscall arguments. Nearly all seccomp filters are built from these cBPF instructions: BPF_LD | BPF_W | BPF_ABS BPF_JMP | BPF_JEQ | BPF_K BPF_JMP | BPF_JGE | BPF_K BPF_JMP | BPF_JGT | BPF_K BPF_JMP | BPF_JSET | BPF_K BPF_JMP | BPF_JA BPF_RET | BPF_K BPF_ALU | BPF_AND | BPF_K Each of these instructions are emulated. Any weirdness or loading from a syscall argument will cause the emulator to bail. The emulation is also halted if it reaches a return. In that case, if it returns an SECCOMP_RET_ALLOW, the syscall is marked as good. Emulator structure and comments are from Kees [1] and Jann [2]. Emulation is done at attach time. If a filter depends on more filters, and if the dependee does not guarantee to allow the syscall, then we skip the emulation of this syscall. 
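For reference, a filter of exactly this shape, one that loads only the arch and nr members and returns constants, is what the emulator can prove constant-allow per syscall. A standalone userspace sketch (x86-64 assumed; the allowed syscalls and the EPERM fallback are purely illustrative):

    #include <errno.h>
    #include <linux/audit.h>
    #include <linux/filter.h>
    #include <linux/seccomp.h>
    #include <stddef.h>
    #include <sys/prctl.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    static struct sock_filter filter[] = {
            /* kill the process on an unexpected architecture */
            BPF_STMT(BPF_LD | BPF_W | BPF_ABS, offsetof(struct seccomp_data, arch)),
            BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, AUDIT_ARCH_X86_64, 1, 0),
            BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_KILL_PROCESS),
            /* allow a handful of syscalls by number, fail everything else with EPERM */
            BPF_STMT(BPF_LD | BPF_W | BPF_ABS, offsetof(struct seccomp_data, nr)),
            BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_read, 3, 0),
            BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_write, 2, 0),
            BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_exit_group, 1, 0),
            BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ERRNO | EPERM),
            BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),
    };

    int main(void)
    {
            struct sock_fprog prog = {
                    .len = sizeof(filter) / sizeof(filter[0]),
                    .filter = filter,
            };

            if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0))
                    return 1;
            if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog))
                    return 1;
            write(1, "filter installed\n", 17);   /* write() and exit_group() stay allowed */
            return 0;
    }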
[1] https://lore.kernel.org/lkml/20200923232923.3142503-5-keescook@chromium.org/ [2] https://lore.kernel.org/lkml/CAG48ez1p=dR_2ikKq=xVxkoGg0fYpTBpkhJSv1w-6BG=76PAvw@mail.gmail.com/ Suggested-by: Jann Horn Signed-off-by: YiFei Zhu Reviewed-by: Jann Horn Co-developed-by: Kees Cook Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/71c7be2db5ee08905f41c3be5c1ad6e2601ce88f.1602431034.git.yifeifz2@illinois.edu --- kernel/seccomp.c | 156 ++++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 155 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/seccomp.c b/kernel/seccomp.c index fe35f4f38949..d8cf468dbe1e 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -169,6 +169,10 @@ static inline bool seccomp_cache_check_allow(const struct seccomp_filter *sfilte { return false; } + +static inline void seccomp_cache_prepare(struct seccomp_filter *sfilter) +{ +} #endif /* SECCOMP_ARCH_NATIVE */ /** @@ -187,6 +191,7 @@ static inline bool seccomp_cache_check_allow(const struct seccomp_filter *sfilte * this filter after reaching 0. The @users count is always smaller * or equal to @refs. Hence, reaching 0 for @users does not mean * the filter can be freed. + * @cache: cache of arch/syscall mappings to actions * @log: true if all actions except for SECCOMP_RET_ALLOW should be logged * @prev: points to a previously installed, or inherited, filter * @prog: the BPF program to evaluate @@ -208,6 +213,7 @@ struct seccomp_filter { refcount_t refs; refcount_t users; bool log; + struct action_cache cache; struct seccomp_filter *prev; struct bpf_prog *prog; struct notification *notif; @@ -621,7 +627,12 @@ static struct seccomp_filter *seccomp_prepare_filter(struct sock_fprog *fprog) { struct seccomp_filter *sfilter; int ret; - const bool save_orig = IS_ENABLED(CONFIG_CHECKPOINT_RESTORE); + const bool save_orig = +#if defined(CONFIG_CHECKPOINT_RESTORE) || defined(SECCOMP_ARCH_NATIVE) + true; +#else + false; +#endif if (fprog->len == 0 || fprog->len > BPF_MAXINSNS) return ERR_PTR(-EINVAL); @@ -687,6 +698,148 @@ out: return filter; } +#ifdef SECCOMP_ARCH_NATIVE +/** + * seccomp_is_const_allow - check if filter is constant allow with given data + * @fprog: The BPF programs + * @sd: The seccomp data to check against, only syscall number and arch + * number are considered constant. 
+ */ +static bool seccomp_is_const_allow(struct sock_fprog_kern *fprog, + struct seccomp_data *sd) +{ + unsigned int reg_value = 0; + unsigned int pc; + bool op_res; + + if (WARN_ON_ONCE(!fprog)) + return false; + + for (pc = 0; pc < fprog->len; pc++) { + struct sock_filter *insn = &fprog->filter[pc]; + u16 code = insn->code; + u32 k = insn->k; + + switch (code) { + case BPF_LD | BPF_W | BPF_ABS: + switch (k) { + case offsetof(struct seccomp_data, nr): + reg_value = sd->nr; + break; + case offsetof(struct seccomp_data, arch): + reg_value = sd->arch; + break; + default: + /* can't optimize (non-constant value load) */ + return false; + } + break; + case BPF_RET | BPF_K: + /* reached return with constant values only, check allow */ + return k == SECCOMP_RET_ALLOW; + case BPF_JMP | BPF_JA: + pc += insn->k; + break; + case BPF_JMP | BPF_JEQ | BPF_K: + case BPF_JMP | BPF_JGE | BPF_K: + case BPF_JMP | BPF_JGT | BPF_K: + case BPF_JMP | BPF_JSET | BPF_K: + switch (BPF_OP(code)) { + case BPF_JEQ: + op_res = reg_value == k; + break; + case BPF_JGE: + op_res = reg_value >= k; + break; + case BPF_JGT: + op_res = reg_value > k; + break; + case BPF_JSET: + op_res = !!(reg_value & k); + break; + default: + /* can't optimize (unknown jump) */ + return false; + } + + pc += op_res ? insn->jt : insn->jf; + break; + case BPF_ALU | BPF_AND | BPF_K: + reg_value &= k; + break; + default: + /* can't optimize (unknown insn) */ + return false; + } + } + + /* ran off the end of the filter?! */ + WARN_ON(1); + return false; +} + +static void seccomp_cache_prepare_bitmap(struct seccomp_filter *sfilter, + void *bitmap, const void *bitmap_prev, + size_t bitmap_size, int arch) +{ + struct sock_fprog_kern *fprog = sfilter->prog->orig_prog; + struct seccomp_data sd; + int nr; + + if (bitmap_prev) { + /* The new filter must be as restrictive as the last. */ + bitmap_copy(bitmap, bitmap_prev, bitmap_size); + } else { + /* Before any filters, all syscalls are always allowed. */ + bitmap_fill(bitmap, bitmap_size); + } + + for (nr = 0; nr < bitmap_size; nr++) { + /* No bitmap change: not a cacheable action. */ + if (!test_bit(nr, bitmap)) + continue; + + sd.nr = nr; + sd.arch = arch; + + /* No bitmap change: continue to always allow. */ + if (seccomp_is_const_allow(fprog, &sd)) + continue; + + /* + * Not a cacheable action: always run filters. + * atomic clear_bit() not needed, filter not visible yet. + */ + __clear_bit(nr, bitmap); + } +} + +/** + * seccomp_cache_prepare - emulate the filter to find cachable syscalls + * @sfilter: The seccomp filter + * + * Returns 0 if successful or -errno if error occurred. + */ +static void seccomp_cache_prepare(struct seccomp_filter *sfilter) +{ + struct action_cache *cache = &sfilter->cache; + const struct action_cache *cache_prev = + sfilter->prev ? &sfilter->prev->cache : NULL; + + seccomp_cache_prepare_bitmap(sfilter, cache->allow_native, + cache_prev ? cache_prev->allow_native : NULL, + SECCOMP_ARCH_NATIVE_NR, + SECCOMP_ARCH_NATIVE); + +#ifdef SECCOMP_ARCH_COMPAT + seccomp_cache_prepare_bitmap(sfilter, cache->allow_compat, + cache_prev ? cache_prev->allow_compat : NULL, + SECCOMP_ARCH_COMPAT_NR, + SECCOMP_ARCH_COMPAT); +#endif /* SECCOMP_ARCH_COMPAT */ +} +#endif /* SECCOMP_ARCH_NATIVE */ + /** * seccomp_attach_filter: validate and attach filter * @flags: flags to change filter behavior @@ -736,6 +889,7 @@ static long seccomp_attach_filter(unsigned int flags, * task reference. 
*/ filter->prev = current->seccomp.filter; + seccomp_cache_prepare(filter); current->seccomp.filter = filter; atomic_inc(&current->seccomp.filter_count); -- cgit v1.2.3 From 0d8315dddd2899f519fe1ca3d4d5cdaf44ea421e Mon Sep 17 00:00:00 2001 From: YiFei Zhu Date: Wed, 11 Nov 2020 07:33:54 -0600 Subject: seccomp/cache: Report cache data through /proc/pid/seccomp_cache Currently the kernel does not provide an infrastructure to translate architecture numbers to a human-readable name. Translating syscall numbers to syscall names is possible through FTRACE_SYSCALL infrastructure but it does not provide support for compat syscalls. This will create a file for each PID as /proc/pid/seccomp_cache. The file will be empty when no seccomp filters are loaded, or be in the format of: <arch_name> <syscall_nr> <ALLOW|FILTER> where ALLOW means the cache is guaranteed to allow the syscall, and FILTER means the cache will pass the syscall to the BPF filter. For the docker default profile on x86_64 it looks like: x86_64 0 ALLOW x86_64 1 ALLOW x86_64 2 ALLOW x86_64 3 ALLOW [...] x86_64 132 ALLOW x86_64 133 ALLOW x86_64 134 FILTER x86_64 135 FILTER x86_64 136 FILTER x86_64 137 ALLOW x86_64 138 ALLOW x86_64 139 FILTER x86_64 140 ALLOW x86_64 141 ALLOW [...] This file is guarded by CONFIG_SECCOMP_CACHE_DEBUG with a default of N because I think certain users of seccomp might not want the application to know which syscalls are definitely usable. For the same reason, it is also guarded by CAP_SYS_ADMIN. Suggested-by: Jann Horn Link: https://lore.kernel.org/lkml/CAG48ez3Ofqp4crXGksLmZY6=fGrF_tWyUCg7PBkAetvbbOPeOA@mail.gmail.com/ Signed-off-by: YiFei Zhu Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/94e663fa53136f5a11f432c661794d1ee7060779.1605101222.git.yifeifz2@illinois.edu --- arch/Kconfig | 17 ++++++++++++++ fs/proc/base.c | 6 +++++ include/linux/seccomp.h | 7 ++++++ kernel/seccomp.c | 59 +++++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 89 insertions(+) (limited to 'kernel') diff --git a/arch/Kconfig b/arch/Kconfig index 56b6ccc0e32d..35c9463b7d10 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -486,6 +486,9 @@ config HAVE_ARCH_SECCOMP_FILTER - secure_computing return value is checked and a return value of -1 results in the system call being skipped immediately. - seccomp syscall wired up + - if !HAVE_SPARSE_SYSCALL_NR, have SECCOMP_ARCH_NATIVE, + SECCOMP_ARCH_NATIVE_NR, SECCOMP_ARCH_NATIVE_NAME defined. If + COMPAT is supported, have the SECCOMP_ARCH_COMPAT* defines too. config SECCOMP prompt "Enable seccomp to safely execute untrusted bytecode" @@ -514,6 +517,20 @@ config SECCOMP_FILTER See Documentation/userspace-api/seccomp_filter.rst for details. +config SECCOMP_CACHE_DEBUG + bool "Show seccomp filter cache status in /proc/pid/seccomp_cache" + depends on SECCOMP_FILTER && !HAVE_SPARSE_SYSCALL_NR + depends on PROC_FS + help + This enables the /proc/pid/seccomp_cache interface to monitor + seccomp cache data. The file format is subject to change. Reading + the file requires CAP_SYS_ADMIN. + + This option is for debugging only. Enabling presents the risk that + an adversary may be able to infer the seccomp filter logic. + + If unsure, say N.
+ config HAVE_ARCH_STACKLEAK bool help diff --git a/fs/proc/base.c b/fs/proc/base.c index b362523a9829..8a7d682ba881 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -3263,6 +3263,9 @@ static const struct pid_entry tgid_base_stuff[] = { #ifdef CONFIG_PROC_PID_ARCH_STATUS ONE("arch_status", S_IRUGO, proc_pid_arch_status), #endif +#ifdef CONFIG_SECCOMP_CACHE_DEBUG + ONE("seccomp_cache", S_IRUSR, proc_pid_seccomp_cache), +#endif }; static int proc_tgid_base_readdir(struct file *file, struct dir_context *ctx) @@ -3592,6 +3595,9 @@ static const struct pid_entry tid_base_stuff[] = { #ifdef CONFIG_PROC_PID_ARCH_STATUS ONE("arch_status", S_IRUGO, proc_pid_arch_status), #endif +#ifdef CONFIG_SECCOMP_CACHE_DEBUG + ONE("seccomp_cache", S_IRUSR, proc_pid_seccomp_cache), +#endif }; static int proc_tid_base_readdir(struct file *file, struct dir_context *ctx) diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h index 02aef2844c38..76963ec4641a 100644 --- a/include/linux/seccomp.h +++ b/include/linux/seccomp.h @@ -121,4 +121,11 @@ static inline long seccomp_get_metadata(struct task_struct *task, return -EINVAL; } #endif /* CONFIG_SECCOMP_FILTER && CONFIG_CHECKPOINT_RESTORE */ + +#ifdef CONFIG_SECCOMP_CACHE_DEBUG +struct seq_file; + +int proc_pid_seccomp_cache(struct seq_file *m, struct pid_namespace *ns, + struct pid *pid, struct task_struct *task); +#endif #endif /* _LINUX_SECCOMP_H */ diff --git a/kernel/seccomp.c b/kernel/seccomp.c index d8cf468dbe1e..76f524e320b1 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -553,6 +553,9 @@ void seccomp_filter_release(struct task_struct *tsk) { struct seccomp_filter *orig = tsk->seccomp.filter; + /* We are effectively holding the siglock by not having any sighand. */ + WARN_ON(tsk->sighand != NULL); + /* Detach task from its filter tree. */ tsk->seccomp.filter = NULL; __seccomp_filter_release(orig); @@ -2335,3 +2338,59 @@ static int __init seccomp_sysctl_init(void) device_initcall(seccomp_sysctl_init) #endif /* CONFIG_SYSCTL */ + +#ifdef CONFIG_SECCOMP_CACHE_DEBUG +/* Currently CONFIG_SECCOMP_CACHE_DEBUG implies SECCOMP_ARCH_NATIVE */ +static void proc_pid_seccomp_cache_arch(struct seq_file *m, const char *name, + const void *bitmap, size_t bitmap_size) +{ + int nr; + + for (nr = 0; nr < bitmap_size; nr++) { + bool cached = test_bit(nr, bitmap); + char *status = cached ? "ALLOW" : "FILTER"; + + seq_printf(m, "%s %d %s\n", name, nr, status); + } +} + +int proc_pid_seccomp_cache(struct seq_file *m, struct pid_namespace *ns, + struct pid *pid, struct task_struct *task) +{ + struct seccomp_filter *f; + unsigned long flags; + + /* + * We don't want some sandboxed process to know what their seccomp + * filters consist of. 
+ */ + if (!file_ns_capable(m->file, &init_user_ns, CAP_SYS_ADMIN)) + return -EACCES; + + if (!lock_task_sighand(task, &flags)) + return -ESRCH; + + f = READ_ONCE(task->seccomp.filter); + if (!f) { + unlock_task_sighand(task, &flags); + return 0; + } + + /* prevent filter from being freed while we are printing it */ + __get_seccomp_filter(f); + unlock_task_sighand(task, &flags); + + proc_pid_seccomp_cache_arch(m, SECCOMP_ARCH_NATIVE_NAME, + f->cache.allow_native, + SECCOMP_ARCH_NATIVE_NR); + +#ifdef SECCOMP_ARCH_COMPAT + proc_pid_seccomp_cache_arch(m, SECCOMP_ARCH_COMPAT_NAME, + f->cache.allow_compat, + SECCOMP_ARCH_COMPAT_NR); +#endif /* SECCOMP_ARCH_COMPAT */ + + __put_seccomp_filter(f); + return 0; +} +#endif /* CONFIG_SECCOMP_CACHE_DEBUG */ -- cgit v1.2.3 From 91b2db27d3ff9ad29e8b3108dfbf1e2f49fe9bd3 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Thu, 19 Nov 2020 16:28:33 -0800 Subject: bpf: Simplify task_file_seq_get_next() Simplify task_file_seq_get_next() by removing two in/out arguments: task and fstruct. Use info->task and info->files instead. Signed-off-by: Song Liu Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20201120002833.2481110-1-songliubraving@fb.com --- kernel/bpf/task_iter.c | 54 ++++++++++++++++---------------------------------- 1 file changed, 17 insertions(+), 37 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c index 1fdb2fc196cd..0458a40edf10 100644 --- a/kernel/bpf/task_iter.c +++ b/kernel/bpf/task_iter.c @@ -136,8 +136,7 @@ struct bpf_iter_seq_task_file_info { }; static struct file * -task_file_seq_get_next(struct bpf_iter_seq_task_file_info *info, - struct task_struct **task, struct files_struct **fstruct) +task_file_seq_get_next(struct bpf_iter_seq_task_file_info *info) { struct pid_namespace *ns = info->common.ns; u32 curr_tid = info->tid, max_fds; @@ -150,14 +149,17 @@ task_file_seq_get_next(struct bpf_iter_seq_task_file_info *info, * Otherwise, it does not hold any reference. 
*/ again: - if (*task) { - curr_task = *task; - curr_files = *fstruct; + if (info->task) { + curr_task = info->task; + curr_files = info->files; curr_fd = info->fd; } else { curr_task = task_seq_get_next(ns, &curr_tid, true); - if (!curr_task) + if (!curr_task) { + info->task = NULL; + info->files = NULL; return NULL; + } curr_files = get_files_struct(curr_task); if (!curr_files) { @@ -167,9 +169,8 @@ again: goto again; } - /* set *fstruct, *task and info->tid */ - *fstruct = curr_files; - *task = curr_task; + info->files = curr_files; + info->task = curr_task; if (curr_tid == info->tid) { curr_fd = info->fd; } else { @@ -199,8 +200,8 @@ again: rcu_read_unlock(); put_files_struct(curr_files); put_task_struct(curr_task); - *task = NULL; - *fstruct = NULL; + info->task = NULL; + info->files = NULL; info->fd = 0; curr_tid = ++(info->tid); goto again; @@ -209,21 +210,13 @@ again: static void *task_file_seq_start(struct seq_file *seq, loff_t *pos) { struct bpf_iter_seq_task_file_info *info = seq->private; - struct files_struct *files = NULL; - struct task_struct *task = NULL; struct file *file; - file = task_file_seq_get_next(info, &task, &files); - if (!file) { - info->files = NULL; - info->task = NULL; - return NULL; - } - - if (*pos == 0) + info->task = NULL; + info->files = NULL; + file = task_file_seq_get_next(info); + if (file && *pos == 0) ++*pos; - info->task = task; - info->files = files; return file; } @@ -231,24 +224,11 @@ static void *task_file_seq_start(struct seq_file *seq, loff_t *pos) static void *task_file_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct bpf_iter_seq_task_file_info *info = seq->private; - struct files_struct *files = info->files; - struct task_struct *task = info->task; - struct file *file; ++*pos; ++info->fd; fput((struct file *)v); - file = task_file_seq_get_next(info, &task, &files); - if (!file) { - info->files = NULL; - info->task = NULL; - return NULL; - } - - info->task = task; - info->files = files; - - return file; + return task_file_seq_get_next(info); } struct bpf_iter__task_file { -- cgit v1.2.3 From fab686eb0307121e7a2890b6d6c57edd2457863d Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Fri, 20 Nov 2020 18:05:45 +0100 Subject: seccomp: Remove bogus __user annotations Buffers that are passed to read_actions_logged() and write_actions_logged() are in kernel memory; the sysctl core takes care of copying from/to userspace. 
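A minimal sketch of a handler written against that convention (hypothetical handler name; it mirrors the seccomp handlers changed below, which now take plain kernel pointers):

    #include <linux/sysctl.h>

    static int example_handler(struct ctl_table *table, int write,
                               void *buffer, size_t *lenp, loff_t *ppos)
    {
            /* buffer already points at kernel memory; no copy_{from,to}_user() here */
            return proc_dostring(table, write, buffer, lenp, ppos);
    }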
Fixes: 32927393dc1c ("sysctl: pass kernel pointers to ->proc_handler") Reviewed-by: Tyler Hicks Signed-off-by: Jann Horn Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/20201120170545.1419332-1-jannh@google.com --- kernel/seccomp.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 76f524e320b1..0e0e369d2fcb 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -2202,7 +2202,7 @@ static bool seccomp_actions_logged_from_names(u32 *actions_logged, char *names) return true; } -static int read_actions_logged(struct ctl_table *ro_table, void __user *buffer, +static int read_actions_logged(struct ctl_table *ro_table, void *buffer, size_t *lenp, loff_t *ppos) { char names[sizeof(seccomp_actions_avail)]; @@ -2220,7 +2220,7 @@ static int read_actions_logged(struct ctl_table *ro_table, void __user *buffer, return proc_dostring(&table, 0, buffer, lenp, ppos); } -static int write_actions_logged(struct ctl_table *ro_table, void __user *buffer, +static int write_actions_logged(struct ctl_table *ro_table, void *buffer, size_t *lenp, loff_t *ppos, u32 *actions_logged) { char names[sizeof(seccomp_actions_avail)]; -- cgit v1.2.3 From ae9ef58996a4447dd44aa638759f913c883ba816 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 13 Nov 2020 15:02:18 +0100 Subject: softirq: Move related code into one section To prepare for adding a RT aware variant of softirq serialization and processing move related code into one section so the necessary #ifdeffery is reduced to one. Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Link: https://lore.kernel.org/r/20201113141733.974214480@linutronix.de --- kernel/softirq.c | 107 ++++++++++++++++++++++++++++--------------------------- 1 file changed, 54 insertions(+), 53 deletions(-) (limited to 'kernel') diff --git a/kernel/softirq.c b/kernel/softirq.c index 09229ad82209..617009ccd82c 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -92,6 +92,13 @@ static bool ksoftirqd_running(unsigned long pending) !__kthread_should_park(tsk); } +#ifdef CONFIG_TRACE_IRQFLAGS +DEFINE_PER_CPU(int, hardirqs_enabled); +DEFINE_PER_CPU(int, hardirq_context); +EXPORT_PER_CPU_SYMBOL_GPL(hardirqs_enabled); +EXPORT_PER_CPU_SYMBOL_GPL(hardirq_context); +#endif + /* * preempt_count and SOFTIRQ_OFFSET usage: * - preempt_count is changed by SOFTIRQ_OFFSET on entering or leaving @@ -102,17 +109,11 @@ static bool ksoftirqd_running(unsigned long pending) * softirq and whether we just have bh disabled. */ +#ifdef CONFIG_TRACE_IRQFLAGS /* - * This one is for softirq.c-internal use, - * where hardirqs are disabled legitimately: + * This is for softirq.c-internal use, where hardirqs are disabled + * legitimately: */ -#ifdef CONFIG_TRACE_IRQFLAGS - -DEFINE_PER_CPU(int, hardirqs_enabled); -DEFINE_PER_CPU(int, hardirq_context); -EXPORT_PER_CPU_SYMBOL_GPL(hardirqs_enabled); -EXPORT_PER_CPU_SYMBOL_GPL(hardirq_context); - void __local_bh_disable_ip(unsigned long ip, unsigned int cnt) { unsigned long flags; @@ -203,6 +204,50 @@ void __local_bh_enable_ip(unsigned long ip, unsigned int cnt) } EXPORT_SYMBOL(__local_bh_enable_ip); +static inline void invoke_softirq(void) +{ + if (ksoftirqd_running(local_softirq_pending())) + return; + + if (!force_irqthreads) { +#ifdef CONFIG_HAVE_IRQ_EXIT_ON_IRQ_STACK + /* + * We can safely execute softirq on the current stack if + * it is the irq stack, because it should be near empty + * at this stage. 
+ */ + __do_softirq(); +#else + /* + * Otherwise, irq_exit() is called on the task stack that can + * be potentially deep already. So call softirq in its own stack + * to prevent from any overrun. + */ + do_softirq_own_stack(); +#endif + } else { + wakeup_softirqd(); + } +} + +asmlinkage __visible void do_softirq(void) +{ + __u32 pending; + unsigned long flags; + + if (in_interrupt()) + return; + + local_irq_save(flags); + + pending = local_softirq_pending(); + + if (pending && !ksoftirqd_running(pending)) + do_softirq_own_stack(); + + local_irq_restore(flags); +} + /* * We restart softirq processing for at most MAX_SOFTIRQ_RESTART times, * but break the loop if need_resched() is set or after 2 ms. @@ -327,24 +372,6 @@ restart: current_restore_flags(old_flags, PF_MEMALLOC); } -asmlinkage __visible void do_softirq(void) -{ - __u32 pending; - unsigned long flags; - - if (in_interrupt()) - return; - - local_irq_save(flags); - - pending = local_softirq_pending(); - - if (pending && !ksoftirqd_running(pending)) - do_softirq_own_stack(); - - local_irq_restore(flags); -} - /** * irq_enter_rcu - Enter an interrupt context with RCU watching */ @@ -371,32 +398,6 @@ void irq_enter(void) irq_enter_rcu(); } -static inline void invoke_softirq(void) -{ - if (ksoftirqd_running(local_softirq_pending())) - return; - - if (!force_irqthreads) { -#ifdef CONFIG_HAVE_IRQ_EXIT_ON_IRQ_STACK - /* - * We can safely execute softirq on the current stack if - * it is the irq stack, because it should be near empty - * at this stage. - */ - __do_softirq(); -#else - /* - * Otherwise, irq_exit() is called on the task stack that can - * be potentially deep already. So call softirq in its own stack - * to prevent from any overrun. - */ - do_softirq_own_stack(); -#endif - } else { - wakeup_softirqd(); - } -} - static inline void tick_irq_exit(void) { #ifdef CONFIG_NO_HZ_COMMON -- cgit v1.2.3 From 23acdc76f1798b090bb9dcc90671cd29d929834e Mon Sep 17 00:00:00 2001 From: Peter Collingbourne Date: Thu, 12 Nov 2020 18:53:34 -0800 Subject: signal: clear non-uapi flag bits when passing/returning sa_flags Previously we were not clearing non-uapi flag bits in sigaction.sa_flags when storing the userspace-provided sa_flags or when returning them via oldact. Start doing so. This allows userspace to detect missing support for flag bits and allows the kernel to use non-uapi bits internally, as we are already doing in arch/x86 for two flag bits. Now that this change is in place, we no longer need the code in arch/x86 that was hiding these bits from userspace, so remove it. This is technically a userspace-visible behavior change for sigaction, as the unknown bits returned via oldact.sa_flags are no longer set. However, we are free to define the behavior for unknown bits exactly because their behavior is currently undefined, so for now we can define the meaning of each of them to be "clear the bit in oldact.sa_flags unless the bit becomes known in the future". Furthermore, this behavior is consistent with OpenBSD [1], illumos [2] and XNU [3] (FreeBSD [4] and NetBSD [5] fail the syscall if unknown bits are set). So there is some precedent for this behavior in other kernels, and in particular in XNU, which is probably the most popular kernel among those that I looked at, which means that this change is less likely to be a compatibility issue. 
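A userspace sketch of the detection this enables (the probe bit 0x00000400 is the value the follow-up patch below names SA_UNSUPPORTED):

    #include <signal.h>
    #include <stdio.h>
    #include <string.h>

    #define SA_PROBE_BIT 0x00000400   /* defined as SA_UNSUPPORTED by the next patch */

    int main(void)
    {
            struct sigaction act, oldact;

            memset(&act, 0, sizeof(act));
            act.sa_handler = SIG_IGN;
            act.sa_flags = SA_PROBE_BIT;
            sigaction(SIGUSR1, &act, NULL);
            sigaction(SIGUSR1, NULL, &oldact);

            if (oldact.sa_flags & SA_PROBE_BIT)
                    printf("old kernel: unknown sa_flags bits are not cleared\n");
            else
                    printf("new kernel: unknown sa_flags bits are cleared, flag support is detectable\n");
            return 0;
    }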
Link: [1] https://github.com/openbsd/src/blob/f634a6a4b5bf832e9c1de77f7894ae2625e74484/sys/kern/kern_sig.c#L278 Link: [2] https://github.com/illumos/illumos-gate/blob/76f19f5fdc974fe5be5c82a556e43a4df93f1de1/usr/src/uts/common/syscall/sigaction.c#L86 Link: [3] https://github.com/apple/darwin-xnu/blob/a449c6a3b8014d9406c2ddbdc81795da24aa7443/bsd/kern/kern_sig.c#L480 Link: [4] https://github.com/freebsd/freebsd/blob/eded70c37057857c6e23fae51f86b8f8f43cd2d0/sys/kern/kern_sig.c#L699 Link: [5] https://github.com/NetBSD/src/blob/3365779becdcedfca206091a645a0e8e22b2946e/sys/kern/sys_sig.c#L473 Signed-off-by: Peter Collingbourne Reviewed-by: Dave Martin Acked-by: "Eric W. Biederman" Link: https://linux-review.googlesource.com/id/I35aab6f5be932505d90f3b3450c083b4db1eca86 Link: https://lkml.kernel.org/r/878dbcb5f47bc9b11881c81f745c0bef5c23f97f.1605235762.git.pcc@google.com Signed-off-by: Eric W. Biederman --- arch/arm/include/asm/signal.h | 2 ++ arch/parisc/include/asm/signal.h | 2 ++ arch/x86/kernel/signal_compat.c | 7 ------- include/linux/signal_types.h | 12 ++++++++++++ kernel/signal.c | 10 ++++++++++ 5 files changed, 26 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/arch/arm/include/asm/signal.h b/arch/arm/include/asm/signal.h index 65530a042009..430be7774402 100644 --- a/arch/arm/include/asm/signal.h +++ b/arch/arm/include/asm/signal.h @@ -17,6 +17,8 @@ typedef struct { unsigned long sig[_NSIG_WORDS]; } sigset_t; +#define __ARCH_UAPI_SA_FLAGS (SA_THIRTYTWO | SA_RESTORER) + #define __ARCH_HAS_SA_RESTORER #include diff --git a/arch/parisc/include/asm/signal.h b/arch/parisc/include/asm/signal.h index 715c96ba2ec8..30dd1e43ef88 100644 --- a/arch/parisc/include/asm/signal.h +++ b/arch/parisc/include/asm/signal.h @@ -21,6 +21,8 @@ typedef struct { unsigned long sig[_NSIG_WORDS]; } sigset_t; +#define __ARCH_UAPI_SA_FLAGS _SA_SIGGFAULT + #include #endif /* !__ASSEMBLY */ diff --git a/arch/x86/kernel/signal_compat.c b/arch/x86/kernel/signal_compat.c index a7f3e12cfbdb..ddfd919be46c 100644 --- a/arch/x86/kernel/signal_compat.c +++ b/arch/x86/kernel/signal_compat.c @@ -165,16 +165,9 @@ void sigaction_compat_abi(struct k_sigaction *act, struct k_sigaction *oact) { signal_compat_build_tests(); - /* Don't leak in-kernel non-uapi flags to user-space */ - if (oact) - oact->sa.sa_flags &= ~(SA_IA32_ABI | SA_X32_ABI); - if (!act) return; - /* Don't let flags to be set from userspace */ - act->sa.sa_flags &= ~(SA_IA32_ABI | SA_X32_ABI); - if (in_ia32_syscall()) act->sa.sa_flags |= SA_IA32_ABI; if (in_x32_syscall()) diff --git a/include/linux/signal_types.h b/include/linux/signal_types.h index f8a90ae9c6ec..a7887ad84d36 100644 --- a/include/linux/signal_types.h +++ b/include/linux/signal_types.h @@ -68,4 +68,16 @@ struct ksignal { int sig; }; +#ifndef __ARCH_UAPI_SA_FLAGS +#ifdef SA_RESTORER +#define __ARCH_UAPI_SA_FLAGS SA_RESTORER +#else +#define __ARCH_UAPI_SA_FLAGS 0 +#endif +#endif + +#define UAPI_SA_FLAGS \ + (SA_NOCLDSTOP | SA_NOCLDWAIT | SA_SIGINFO | SA_ONSTACK | SA_RESTART | \ + SA_NODEFER | SA_RESETHAND | __ARCH_UAPI_SA_FLAGS) + #endif /* _LINUX_SIGNAL_TYPES_H */ diff --git a/kernel/signal.c b/kernel/signal.c index ef8f2a28d37c..8f5bd12ee41b 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -3985,6 +3985,16 @@ int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact) if (oact) *oact = *k; + /* + * Clear unknown flag bits in order to allow userspace to detect missing + * support for flag bits and to allow the kernel to use non-uapi bits + * internally. 
+ */ + if (act) + act->sa.sa_flags &= UAPI_SA_FLAGS; + if (oact) + oact->sa.sa_flags &= UAPI_SA_FLAGS; + sigaction_compat_abi(act, oact); if (act) { -- cgit v1.2.3 From a54f0dfda754c5cecc89a14dab68a3edc1e497b5 Mon Sep 17 00:00:00 2001 From: Peter Collingbourne Date: Mon, 16 Nov 2020 19:17:25 -0800 Subject: signal: define the SA_UNSUPPORTED bit in sa_flags Define a sa_flags bit, SA_UNSUPPORTED, which will never be supported in the uapi. The purpose of this flag bit is to allow userspace to distinguish an old kernel that does not clear unknown sa_flags bits from a kernel that supports every flag bit. In other words, if userspace does something like: act.sa_flags |= SA_UNSUPPORTED; sigaction(SIGSEGV, &act, 0); sigaction(SIGSEGV, 0, &oldact); and finds that SA_UNSUPPORTED remains set in oldact.sa_flags, it means that the kernel cannot be trusted to have cleared unknown flag bits from sa_flags, so no assumptions about flag bit support can be made. Signed-off-by: Peter Collingbourne Reviewed-by: Dave Martin Link: https://linux-review.googlesource.com/id/Ic2501ad150a3a79c1cf27fb8c99be342e9dffbcb Link: https://lkml.kernel.org/r/bda7ddff8895a9bc4ffc5f3cf3d4d37a32118077.1605582887.git.pcc@google.com Signed-off-by: Eric W. Biederman --- include/uapi/asm-generic/signal-defs.h | 7 +++++++ kernel/signal.c | 6 ++++++ 2 files changed, 13 insertions(+) (limited to 'kernel') diff --git a/include/uapi/asm-generic/signal-defs.h b/include/uapi/asm-generic/signal-defs.h index 44f070982752..c790f67304ba 100644 --- a/include/uapi/asm-generic/signal-defs.h +++ b/include/uapi/asm-generic/signal-defs.h @@ -14,6 +14,12 @@ * SA_RESTART flag to get restarting signals (which were the default long ago) * SA_NODEFER prevents the current signal from being masked in the handler. * SA_RESETHAND clears the handler when the signal is delivered. + * SA_UNSUPPORTED is a flag bit that will never be supported. Kernels from + * before the introduction of SA_UNSUPPORTED did not clear unknown bits from + * sa_flags when read using the oldact argument to sigaction and rt_sigaction, + * so this bit allows flag bit support to be detected from userspace while + * allowing an old kernel to be distinguished from a kernel that supports every + * flag bit. * * SA_ONESHOT and SA_NOMASK are the historical Linux names for the Single * Unix names RESETHAND and NODEFER respectively. @@ -34,6 +40,7 @@ /* 0x00000080 used on parisc */ /* 0x00000100 used on sparc */ /* 0x00000200 used on sparc */ +#define SA_UNSUPPORTED 0x00000400 /* 0x00010000 used on mips */ /* 0x01000000 used on x86 */ /* 0x02000000 used on x86 */ diff --git a/kernel/signal.c b/kernel/signal.c index 8f5bd12ee41b..8f34819e80de 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -3985,6 +3985,12 @@ int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact) if (oact) *oact = *k; + /* + * Make sure that we never accidentally claim to support SA_UNSUPPORTED, + * e.g. by having an architecture use the bit in their uapi. 
+ */ + BUILD_BUG_ON(UAPI_SA_FLAGS & SA_UNSUPPORTED); + /* * Clear unknown flag bits in order to allow userspace to detect missing * support for flag bits and to allow the kernel to use non-uapi bits -- cgit v1.2.3 From 6ac05e832a9e96f9b1c42a8917cdd317d7b6c8fa Mon Sep 17 00:00:00 2001 From: Peter Collingbourne Date: Fri, 20 Nov 2020 12:33:45 -0800 Subject: signal: define the SA_EXPOSE_TAGBITS bit in sa_flags Architectures that support address tagging, such as arm64, may want to expose fault address tag bits to the signal handler to help diagnose memory errors. However, these bits have not been previously set, and their presence may confuse unaware user applications. Therefore, introduce a SA_EXPOSE_TAGBITS flag bit in sa_flags that a signal handler may use to explicitly request that the bits are set. The generic signal handler APIs expect to receive tagged addresses. Architectures may specify how to untag addresses in the case where SA_EXPOSE_TAGBITS is clear by defining the arch_untagged_si_addr function. Signed-off-by: Peter Collingbourne Acked-by: "Eric W. Biederman" Link: https://linux-review.googlesource.com/id/I16dd0ed2081f091fce97be0190cb8caa874c26cb Link: https://lkml.kernel.org/r/13cf24d00ebdd8e1f55caf1821c7c29d54100191.1605904350.git.pcc@google.com Signed-off-by: Eric W. Biederman --- include/linux/signal.h | 14 ++++++++++++++ include/linux/signal_types.h | 2 +- include/uapi/asm-generic/signal-defs.h | 3 +++ kernel/signal.c | 24 ++++++++++++++++++++++++ 4 files changed, 42 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/linux/signal.h b/include/linux/signal.h index b256f9c65661..205526c4003a 100644 --- a/include/linux/signal.h +++ b/include/linux/signal.h @@ -469,4 +469,18 @@ struct seq_file; extern void render_sigset_t(struct seq_file *, const char *, sigset_t *); #endif +#ifndef arch_untagged_si_addr +/* + * Given a fault address and a signal and si_code which correspond to the + * _sigfault union member, returns the address that must appear in si_addr if + * the signal handler does not have SA_EXPOSE_TAGBITS enabled in sa_flags. + */ +static inline void __user *arch_untagged_si_addr(void __user *addr, + unsigned long sig, + unsigned long si_code) +{ + return addr; +} +#endif + #endif /* _LINUX_SIGNAL_H */ diff --git a/include/linux/signal_types.h b/include/linux/signal_types.h index a7887ad84d36..68e06c75c5b2 100644 --- a/include/linux/signal_types.h +++ b/include/linux/signal_types.h @@ -78,6 +78,6 @@ struct ksignal { #define UAPI_SA_FLAGS \ (SA_NOCLDSTOP | SA_NOCLDWAIT | SA_SIGINFO | SA_ONSTACK | SA_RESTART | \ - SA_NODEFER | SA_RESETHAND | __ARCH_UAPI_SA_FLAGS) + SA_NODEFER | SA_RESETHAND | SA_EXPOSE_TAGBITS | __ARCH_UAPI_SA_FLAGS) #endif /* _LINUX_SIGNAL_TYPES_H */ diff --git a/include/uapi/asm-generic/signal-defs.h b/include/uapi/asm-generic/signal-defs.h index c790f67304ba..fe929e7b77ca 100644 --- a/include/uapi/asm-generic/signal-defs.h +++ b/include/uapi/asm-generic/signal-defs.h @@ -20,6 +20,8 @@ * so this bit allows flag bit support to be detected from userspace while * allowing an old kernel to be distinguished from a kernel that supports every * flag bit. + * SA_EXPOSE_TAGBITS exposes an architecture-defined set of tag bits in + * siginfo.si_addr. * * SA_ONESHOT and SA_NOMASK are the historical Linux names for the Single * Unix names RESETHAND and NODEFER respectively. 
@@ -41,6 +43,7 @@ /* 0x00000100 used on sparc */ /* 0x00000200 used on sparc */ #define SA_UNSUPPORTED 0x00000400 +#define SA_EXPOSE_TAGBITS 0x00000800 /* 0x00010000 used on mips */ /* 0x01000000 used on x86 */ /* 0x02000000 used on x86 */ diff --git a/kernel/signal.c b/kernel/signal.c index 8f34819e80de..26018c59821d 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2524,6 +2524,26 @@ static int ptrace_signal(int signr, kernel_siginfo_t *info) return signr; } +static void hide_si_addr_tag_bits(struct ksignal *ksig) +{ + switch (siginfo_layout(ksig->sig, ksig->info.si_code)) { + case SIL_FAULT: + case SIL_FAULT_MCEERR: + case SIL_FAULT_BNDERR: + case SIL_FAULT_PKUERR: + ksig->info.si_addr = arch_untagged_si_addr( + ksig->info.si_addr, ksig->sig, ksig->info.si_code); + break; + case SIL_KILL: + case SIL_TIMER: + case SIL_POLL: + case SIL_CHLD: + case SIL_RT: + case SIL_SYS: + break; + } +} + bool get_signal(struct ksignal *ksig) { struct sighand_struct *sighand = current->sighand; @@ -2761,6 +2781,10 @@ relock: spin_unlock_irq(&sighand->siglock); ksig->sig = signr; + + if (!(ksig->ka.sa.sa_flags & SA_EXPOSE_TAGBITS)) + hide_si_addr_tag_bits(ksig); + return ksig->sig > 0; } -- cgit v1.2.3 From ab150c3f80dcce670926ab3ca412be5047011d22 Mon Sep 17 00:00:00 2001 From: Alex Shi Date: Fri, 13 Nov 2020 16:58:10 +0800 Subject: PM / suspend: fix kernel-doc markup Add parameter explanation to fix kernel-doc marks: kernel/power/suspend.c:233: warning: Function parameter or member 'state' not described in 'suspend_valid_only_mem' kernel/power/suspend.c:344: warning: Function parameter or member 'state' not described in 'suspend_prepare' Signed-off-by: Alex Shi [ rjw: Change the proposed parameter descriptions. ] Signed-off-by: Rafael J. Wysocki --- kernel/power/suspend.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 32391acc806b..d8cae434f9eb 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -224,6 +224,7 @@ EXPORT_SYMBOL_GPL(suspend_set_ops); /** * suspend_valid_only_mem - Generic memory-only valid callback. + * @state: Target system sleep state. * * Platform drivers that implement mem suspend only and only need to check for * that in their .valid() callback can use this instead of rolling their own @@ -335,6 +336,7 @@ static int suspend_test(int level) /** * suspend_prepare - Prepare for entering system sleep state. + * @state: Target system sleep state. * * Common code run for every system sleep state that can be entered (except for * hibernation). Run suspend notifiers, allocate the "suspend" console and -- cgit v1.2.3 From 74d862b682f51e45d25b95b1ecf212428a4967b0 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 18 Nov 2020 20:48:42 +0100 Subject: sched: Make migrate_disable/enable() independent of RT Now that the scheduler can deal with migrate disable properly, there is no real compelling reason to make it only available for RT. There are quite some code pathes which needlessly disable preemption in order to prevent migration and some constructs like kmap_atomic() enforce it implicitly. Making it available independent of RT allows to provide a preemptible variant of kmap_atomic() and makes the code more consistent in general. 
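As a usage sketch (the per-CPU variable is hypothetical, not taken from the patch), this is the pattern that now works independently of PREEMPT_RT: the task is pinned to its CPU but remains preemptible.

    #include <linux/percpu.h>
    #include <linux/preempt.h>
    #include <linux/types.h>

    static DEFINE_PER_CPU(u64, example_counter);   /* hypothetical per-CPU data */

    static void example_update(void)
    {
            u64 *p;

            migrate_disable();              /* stays on this CPU, preemption stays enabled */
            p = this_cpu_ptr(&example_counter);
            *p += 1;                        /* safe against migration, not against other tasks on this CPU */
            migrate_enable();
    }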
Signed-off-by: Thomas Gleixner Grudgingly-Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20201118204007.269943012@linutronix.de --- include/linux/kernel.h | 21 ++++++++++++++------- include/linux/preempt.h | 38 +++----------------------------------- include/linux/sched.h | 2 +- kernel/sched/core.c | 45 +++++++++++++++++++++++++++++++++++---------- kernel/sched/sched.h | 4 ++-- lib/smp_processor_id.c | 2 +- 6 files changed, 56 insertions(+), 56 deletions(-) (limited to 'kernel') diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 2f05e9128201..665837f9a831 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -204,6 +204,7 @@ extern int _cond_resched(void); extern void ___might_sleep(const char *file, int line, int preempt_offset); extern void __might_sleep(const char *file, int line, int preempt_offset); extern void __cant_sleep(const char *file, int line, int preempt_offset); +extern void __cant_migrate(const char *file, int line); /** * might_sleep - annotation for functions that can sleep @@ -227,6 +228,18 @@ extern void __cant_sleep(const char *file, int line, int preempt_offset); # define cant_sleep() \ do { __cant_sleep(__FILE__, __LINE__, 0); } while (0) # define sched_annotate_sleep() (current->task_state_change = 0) + +/** + * cant_migrate - annotation for functions that cannot migrate + * + * Will print a stack trace if executed in code which is migratable + */ +# define cant_migrate() \ + do { \ + if (IS_ENABLED(CONFIG_SMP)) \ + __cant_migrate(__FILE__, __LINE__); \ + } while (0) + /** * non_block_start - annotate the start of section where sleeping is prohibited * @@ -251,6 +264,7 @@ extern void __cant_sleep(const char *file, int line, int preempt_offset); int preempt_offset) { } # define might_sleep() do { might_resched(); } while (0) # define cant_sleep() do { } while (0) +# define cant_migrate() do { } while (0) # define sched_annotate_sleep() do { } while (0) # define non_block_start() do { } while (0) # define non_block_end() do { } while (0) @@ -258,13 +272,6 @@ extern void __cant_sleep(const char *file, int line, int preempt_offset); #define might_sleep_if(cond) do { if (cond) might_sleep(); } while (0) -#ifndef CONFIG_PREEMPT_RT -# define cant_migrate() cant_sleep() -#else - /* Placeholder for now */ -# define cant_migrate() do { } while (0) -#endif - /** * abs - return absolute value of an argument * @x: the value. If it is unsigned type, it is converted to signed type first. diff --git a/include/linux/preempt.h b/include/linux/preempt.h index 8b43922e65df..6df63cbe8bb0 100644 --- a/include/linux/preempt.h +++ b/include/linux/preempt.h @@ -322,7 +322,7 @@ static inline void preempt_notifier_init(struct preempt_notifier *notifier, #endif -#if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT) +#ifdef CONFIG_SMP /* * Migrate-Disable and why it is undesired. @@ -382,43 +382,11 @@ static inline void preempt_notifier_init(struct preempt_notifier *notifier, extern void migrate_disable(void); extern void migrate_enable(void); -#elif defined(CONFIG_PREEMPT_RT) +#else static inline void migrate_disable(void) { } static inline void migrate_enable(void) { } -#else /* !CONFIG_PREEMPT_RT */ - -/** - * migrate_disable - Prevent migration of the current task - * - * Maps to preempt_disable() which also disables preemption. Use - * migrate_disable() to annotate that the intent is to prevent migration, - * but not necessarily preemption. 
- * - * Can be invoked nested like preempt_disable() and needs the corresponding - * number of migrate_enable() invocations. - */ -static __always_inline void migrate_disable(void) -{ - preempt_disable(); -} - -/** - * migrate_enable - Allow migration of the current task - * - * Counterpart to migrate_disable(). - * - * As migrate_disable() can be invoked nested, only the outermost invocation - * reenables migration. - * - * Currently mapped to preempt_enable(). - */ -static __always_inline void migrate_enable(void) -{ - preempt_enable(); -} - -#endif /* CONFIG_SMP && CONFIG_PREEMPT_RT */ +#endif /* CONFIG_SMP */ #endif /* __LINUX_PREEMPT_H */ diff --git a/include/linux/sched.h b/include/linux/sched.h index 3af9d52fe093..a33f35f68060 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -715,7 +715,7 @@ struct task_struct { const cpumask_t *cpus_ptr; cpumask_t cpus_mask; void *migration_pending; -#if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT) +#ifdef CONFIG_SMP unsigned short migration_disabled; #endif unsigned short migration_flags; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index e6473ecaab3c..c962922784d1 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1728,8 +1728,6 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) #ifdef CONFIG_SMP -#ifdef CONFIG_PREEMPT_RT - static void __do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask, u32 flags); @@ -1800,8 +1798,6 @@ static inline bool rq_has_pinned_tasks(struct rq *rq) return rq->nr_pinned; } -#endif - /* * Per-CPU kthreads are allowed to run on !active && online CPUs, see * __set_cpus_allowed_ptr() and select_fallback_rq(). @@ -2882,7 +2878,7 @@ void sched_set_stop_task(int cpu, struct task_struct *stop) } } -#else +#else /* CONFIG_SMP */ static inline int __set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask, @@ -2891,10 +2887,6 @@ static inline int __set_cpus_allowed_ptr(struct task_struct *p, return set_cpus_allowed_ptr(p, new_mask); } -#endif /* CONFIG_SMP */ - -#if !defined(CONFIG_SMP) || !defined(CONFIG_PREEMPT_RT) - static inline void migrate_disable_switch(struct rq *rq, struct task_struct *p) { } static inline bool rq_has_pinned_tasks(struct rq *rq) @@ -2902,7 +2894,7 @@ static inline bool rq_has_pinned_tasks(struct rq *rq) return false; } -#endif +#endif /* !CONFIG_SMP */ static void ttwu_stat(struct task_struct *p, int cpu, int wake_flags) @@ -7924,6 +7916,39 @@ void __cant_sleep(const char *file, int line, int preempt_offset) add_taint(TAINT_WARN, LOCKDEP_STILL_OK); } EXPORT_SYMBOL_GPL(__cant_sleep); + +#ifdef CONFIG_SMP +void __cant_migrate(const char *file, int line) +{ + static unsigned long prev_jiffy; + + if (irqs_disabled()) + return; + + if (is_migration_disabled(current)) + return; + + if (!IS_ENABLED(CONFIG_PREEMPT_COUNT)) + return; + + if (preempt_count() > 0) + return; + + if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) + return; + prev_jiffy = jiffies; + + pr_err("BUG: assuming non migratable context at %s:%d\n", file, line); + pr_err("in_atomic(): %d, irqs_disabled(): %d, migration_disabled() %u pid: %d, name: %s\n", + in_atomic(), irqs_disabled(), is_migration_disabled(current), + current->pid, current->comm); + + debug_show_held_locks(current); + dump_stack(); + add_taint(TAINT_WARN, LOCKDEP_STILL_OK); +} +EXPORT_SYMBOL_GPL(__cant_migrate); +#endif #endif #ifdef CONFIG_MAGIC_SYSRQ diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 590e6f27068c..f5acb6c5ce49 100644 --- 
a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1056,7 +1056,7 @@ struct rq { struct cpuidle_state *idle_state; #endif -#if defined(CONFIG_PREEMPT_RT) && defined(CONFIG_SMP) +#ifdef CONFIG_SMP unsigned int nr_pinned; #endif unsigned int push_busy; @@ -1092,7 +1092,7 @@ static inline int cpu_of(struct rq *rq) static inline bool is_migration_disabled(struct task_struct *p) { -#if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT) +#ifdef CONFIG_SMP return p->migration_disabled; #else return false; diff --git a/lib/smp_processor_id.c b/lib/smp_processor_id.c index faaa927ac2c8..1c1dbd300325 100644 --- a/lib/smp_processor_id.c +++ b/lib/smp_processor_id.c @@ -26,7 +26,7 @@ unsigned int check_preemption_disabled(const char *what1, const char *what2) if (current->nr_cpus_allowed == 1) goto out; -#if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT) +#ifdef CONFIG_SMP if (current->migration_disabled) goto out; #endif -- cgit v1.2.3 From 5fbda3ecd14a5343644979c98d6eb65b7e7de9d8 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 18 Nov 2020 20:48:43 +0100 Subject: sched: highmem: Store local kmaps in task struct Instead of storing the map per CPU provide and use per task storage. That prepares for local kmaps which are preemptible. The context switch code is preparatory and not yet in use because kmap_atomic() runs with preemption disabled. Will be made usable in the next step. The context switch logic is safe even when an interrupt happens after clearing or before restoring the kmaps. The kmap index in task struct is not modified so any nesting kmap in an interrupt will use unused indices and on return the counter is the same as before. Also add an assert into the return to user space code. Going back to user space with an active kmap local is a nono. 
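To make the nesting argument concrete, here is a minimal sketch of the per-task index scheme. It is illustrative user-space code, not the kernel implementation, and it assumes a slot increment of one (the real code below uses KM_INCR and guard slots under DEBUG_KMAP_LOCAL): the sched-out path only walks the slots recorded so far and never modifies the index, so an interrupt taking its own local kmap pushes into the next unused slot and has popped back before the switch continues.

#include <assert.h>

#define TOY_KM_MAX	16

struct toy_kmap_ctrl {
	int idx;				/* number of live kmaps for this task */
	unsigned long pteval[TOY_KM_MAX];	/* stand-in for the pte_t array */
};

/* kmap_local_*() equivalent: claim the next unused slot. */
static int toy_kmap_push(struct toy_kmap_ctrl *kc, unsigned long pte)
{
	assert(kc->idx < TOY_KM_MAX);
	kc->pteval[kc->idx] = pte;
	return kc->idx++;
}

/* kunmap_local() equivalent: release the most recently used slot. */
static void toy_kunmap_pop(struct toy_kmap_ctrl *kc)
{
	assert(kc->idx > 0);
	kc->pteval[--kc->idx] = 0;
}

/*
 * Scenario from the changelog: the outgoing task holds two maps (idx == 2).
 * The sched-out loop reads slots 0 and 1 and leaves idx alone.  An interrupt
 * arriving in the middle pushes into slot 2 and pops back to idx == 2, so
 * the task's recorded state is exactly what the sched-in path will restore.
 */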
Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20201118204007.372935758@linutronix.de --- include/linux/highmem-internal.h | 10 ++++ include/linux/sched.h | 9 ++++ kernel/entry/common.c | 2 + kernel/fork.c | 1 + kernel/sched/core.c | 25 ++++++++++ mm/highmem.c | 99 ++++++++++++++++++++++++++++++++++++---- 6 files changed, 136 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/include/linux/highmem-internal.h b/include/linux/highmem-internal.h index 6ceed907b14e..c5a22177db85 100644 --- a/include/linux/highmem-internal.h +++ b/include/linux/highmem-internal.h @@ -9,6 +9,16 @@ void *__kmap_local_pfn_prot(unsigned long pfn, pgprot_t prot); void *__kmap_local_page_prot(struct page *page, pgprot_t prot); void kunmap_local_indexed(void *vaddr); +void kmap_local_fork(struct task_struct *tsk); +void __kmap_local_sched_out(void); +void __kmap_local_sched_in(void); +static inline void kmap_assert_nomap(void) +{ + DEBUG_LOCKS_WARN_ON(current->kmap_ctrl.idx); +} +#else +static inline void kmap_local_fork(struct task_struct *tsk) { } +static inline void kmap_assert_nomap(void) { } #endif #ifdef CONFIG_HIGHMEM diff --git a/include/linux/sched.h b/include/linux/sched.h index a33f35f68060..8946911cee9f 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -34,6 +34,7 @@ #include #include #include +#include /* task_struct member predeclarations (sorted alphabetically): */ struct audit_context; @@ -629,6 +630,13 @@ struct wake_q_node { struct wake_q_node *next; }; +struct kmap_ctrl { +#ifdef CONFIG_KMAP_LOCAL + int idx; + pte_t pteval[KM_MAX_IDX]; +#endif +}; + struct task_struct { #ifdef CONFIG_THREAD_INFO_IN_TASK /* @@ -1294,6 +1302,7 @@ struct task_struct { unsigned int sequential_io; unsigned int sequential_io_avg; #endif + struct kmap_ctrl kmap_ctrl; #ifdef CONFIG_DEBUG_ATOMIC_SLEEP unsigned long task_state_change; #endif diff --git a/kernel/entry/common.c b/kernel/entry/common.c index 2b8366693d5c..4ae1fe0898e9 100644 --- a/kernel/entry/common.c +++ b/kernel/entry/common.c @@ -2,6 +2,7 @@ #include #include +#include #include #include @@ -194,6 +195,7 @@ static void exit_to_user_mode_prepare(struct pt_regs *regs) /* Ensure that the address limit is intact and no locks are held */ addr_limit_user_check(); + kmap_assert_nomap(); lockdep_assert_irqs_disabled(); lockdep_sys_exit(); } diff --git a/kernel/fork.c b/kernel/fork.c index 32083db7a2a2..17dcd1817799 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -930,6 +930,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) account_kernel_stack(tsk, 1); kcov_task_init(tsk); + kmap_local_fork(tsk); #ifdef CONFIG_FAULT_INJECTION tsk->fail_nth = 0; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index c962922784d1..953abdbe1472 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4094,6 +4094,22 @@ static inline void finish_lock_switch(struct rq *rq) # define finish_arch_post_lock_switch() do { } while (0) #endif +static inline void kmap_local_sched_out(void) +{ +#ifdef CONFIG_KMAP_LOCAL + if (unlikely(current->kmap_ctrl.idx)) + __kmap_local_sched_out(); +#endif +} + +static inline void kmap_local_sched_in(void) +{ +#ifdef CONFIG_KMAP_LOCAL + if (unlikely(current->kmap_ctrl.idx)) + __kmap_local_sched_in(); +#endif +} + /** * prepare_task_switch - prepare to switch tasks * @rq: the runqueue preparing to switch @@ -4116,6 +4132,7 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev, perf_event_task_sched_out(prev, next); 
rseq_preempt(prev); fire_sched_out_preempt_notifiers(prev, next); + kmap_local_sched_out(); prepare_task(next); prepare_arch_switch(next); } @@ -4182,6 +4199,14 @@ static struct rq *finish_task_switch(struct task_struct *prev) finish_lock_switch(rq); finish_arch_post_lock_switch(); kcov_finish_switch(current); + /* + * kmap_local_sched_out() is invoked with rq::lock held and + * interrupts disabled. There is no requirement for that, but the + * sched out code does not have an interrupt enabled section. + * Restoring the maps on sched in does not require interrupts being + * disabled either. + */ + kmap_local_sched_in(); fire_sched_in_preempt_notifiers(current); /* diff --git a/mm/highmem.c b/mm/highmem.c index 39aaca1a1ece..d1ef06aa6de6 100644 --- a/mm/highmem.c +++ b/mm/highmem.c @@ -365,8 +365,6 @@ EXPORT_SYMBOL(kunmap_high); #include -static DEFINE_PER_CPU(int, __kmap_local_idx); - /* * With DEBUG_KMAP_LOCAL the stack depth is doubled and every second * slot is unused which acts as a guard page @@ -379,23 +377,21 @@ static DEFINE_PER_CPU(int, __kmap_local_idx); static inline int kmap_local_idx_push(void) { - int idx = __this_cpu_add_return(__kmap_local_idx, KM_INCR) - 1; - WARN_ON_ONCE(in_irq() && !irqs_disabled()); - BUG_ON(idx >= KM_MAX_IDX); - return idx; + current->kmap_ctrl.idx += KM_INCR; + BUG_ON(current->kmap_ctrl.idx >= KM_MAX_IDX); + return current->kmap_ctrl.idx - 1; } static inline int kmap_local_idx(void) { - return __this_cpu_read(__kmap_local_idx) - 1; + return current->kmap_ctrl.idx - 1; } static inline void kmap_local_idx_pop(void) { - int idx = __this_cpu_sub_return(__kmap_local_idx, KM_INCR); - - BUG_ON(idx < 0); + current->kmap_ctrl.idx -= KM_INCR; + BUG_ON(current->kmap_ctrl.idx < 0); } #ifndef arch_kmap_local_post_map @@ -464,6 +460,7 @@ void *__kmap_local_pfn_prot(unsigned long pfn, pgprot_t prot) pteval = pfn_pte(pfn, prot); set_pte_at(&init_mm, vaddr, kmap_pte - idx, pteval); arch_kmap_local_post_map(vaddr, pteval); + current->kmap_ctrl.pteval[kmap_local_idx()] = pteval; preempt_enable(); return (void *)vaddr; @@ -522,10 +519,92 @@ void kunmap_local_indexed(void *vaddr) arch_kmap_local_pre_unmap(addr); pte_clear(&init_mm, addr, kmap_pte - idx); arch_kmap_local_post_unmap(addr); + current->kmap_ctrl.pteval[kmap_local_idx()] = __pte(0); kmap_local_idx_pop(); preempt_enable(); } EXPORT_SYMBOL(kunmap_local_indexed); + +/* + * Invoked before switch_to(). This is safe even when during or after + * clearing the maps an interrupt which needs a kmap_local happens because + * the task::kmap_ctrl.idx is not modified by the unmapping code so a + * nested kmap_local will use the next unused index and restore the index + * on unmap. The already cleared kmaps of the outgoing task are irrelevant + * because the interrupt context does not know about them. The same applies + * when scheduling back in for an interrupt which happens before the + * restore is complete. + */ +void __kmap_local_sched_out(void) +{ + struct task_struct *tsk = current; + pte_t *kmap_pte = kmap_get_pte(); + int i; + + /* Clear kmaps */ + for (i = 0; i < tsk->kmap_ctrl.idx; i++) { + pte_t pteval = tsk->kmap_ctrl.pteval[i]; + unsigned long addr; + int idx; + + /* With debug all even slots are unmapped and act as guard */ + if (IS_ENABLED(CONFIG_DEBUG_HIGHMEM) && !(i & 0x01)) { + WARN_ON_ONCE(!pte_none(pteval)); + continue; + } + if (WARN_ON_ONCE(pte_none(pteval))) + continue; + + /* + * This is a horrible hack for XTENSA to calculate the + * coloured PTE index. 
Uses the PFN encoded into the pteval + * and the map index calculation because the actual mapped + * virtual address is not stored in task::kmap_ctrl. + * For any sane architecture this is optimized out. + */ + idx = arch_kmap_local_map_idx(i, pte_pfn(pteval)); + + addr = __fix_to_virt(FIX_KMAP_BEGIN + idx); + arch_kmap_local_pre_unmap(addr); + pte_clear(&init_mm, addr, kmap_pte - idx); + arch_kmap_local_post_unmap(addr); + } +} + +void __kmap_local_sched_in(void) +{ + struct task_struct *tsk = current; + pte_t *kmap_pte = kmap_get_pte(); + int i; + + /* Restore kmaps */ + for (i = 0; i < tsk->kmap_ctrl.idx; i++) { + pte_t pteval = tsk->kmap_ctrl.pteval[i]; + unsigned long addr; + int idx; + + /* With debug all even slots are unmapped and act as guard */ + if (IS_ENABLED(CONFIG_DEBUG_HIGHMEM) && !(i & 0x01)) { + WARN_ON_ONCE(!pte_none(pteval)); + continue; + } + if (WARN_ON_ONCE(pte_none(pteval))) + continue; + + /* See comment in __kmap_local_sched_out() */ + idx = arch_kmap_local_map_idx(i, pte_pfn(pteval)); + addr = __fix_to_virt(FIX_KMAP_BEGIN + idx); + set_pte_at(&init_mm, addr, kmap_pte - idx, pteval); + arch_kmap_local_post_map(addr, pteval); + } +} + +void kmap_local_fork(struct task_struct *tsk) +{ + if (WARN_ON_ONCE(tsk->kmap_ctrl.idx)) + memset(&tsk->kmap_ctrl, 0, sizeof(tsk->kmap_ctrl)); +} + #endif #if defined(HASHED_PAGE_VIRTUAL) -- cgit v1.2.3 From 58c644ba512cfbc2e39b758dd979edd1d6d00e27 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 20 Nov 2020 11:50:35 +0100 Subject: sched/idle: Fix arch_cpu_idle() vs tracing We call arch_cpu_idle() with RCU disabled, but then use local_irq_{en,dis}able(), which invokes tracing, which relies on RCU. Switch all arch_cpu_idle() implementations to use raw_local_irq_{en,dis}able() and carefully manage the lockdep,rcu,tracing state like we do in entry. 
(XXX: we really should change arch_cpu_idle() to not return with interrupts enabled) Reported-by: Sven Schnelle Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Mark Rutland Tested-by: Mark Rutland Link: https://lkml.kernel.org/r/20201120114925.594122626@infradead.org --- arch/alpha/kernel/process.c | 2 +- arch/arm/kernel/process.c | 2 +- arch/arm64/kernel/process.c | 2 +- arch/csky/kernel/process.c | 2 +- arch/h8300/kernel/process.c | 2 +- arch/hexagon/kernel/process.c | 2 +- arch/ia64/kernel/process.c | 2 +- arch/microblaze/kernel/process.c | 2 +- arch/mips/kernel/idle.c | 12 ++++++------ arch/nios2/kernel/process.c | 2 +- arch/openrisc/kernel/process.c | 2 +- arch/parisc/kernel/process.c | 2 +- arch/powerpc/kernel/idle.c | 4 ++-- arch/riscv/kernel/process.c | 2 +- arch/s390/kernel/idle.c | 6 +++--- arch/sh/kernel/idle.c | 2 +- arch/sparc/kernel/leon_pmc.c | 4 ++-- arch/sparc/kernel/process_32.c | 2 +- arch/sparc/kernel/process_64.c | 4 ++-- arch/um/kernel/process.c | 2 +- arch/x86/include/asm/mwait.h | 2 -- arch/x86/kernel/process.c | 12 +++++++----- kernel/sched/idle.c | 28 +++++++++++++++++++++++++++- 23 files changed, 64 insertions(+), 38 deletions(-) (limited to 'kernel') diff --git a/arch/alpha/kernel/process.c b/arch/alpha/kernel/process.c index 7462a7911002..4c7b0414a3ff 100644 --- a/arch/alpha/kernel/process.c +++ b/arch/alpha/kernel/process.c @@ -57,7 +57,7 @@ EXPORT_SYMBOL(pm_power_off); void arch_cpu_idle(void) { wtint(0); - local_irq_enable(); + raw_local_irq_enable(); } void arch_cpu_idle_dead(void) diff --git a/arch/arm/kernel/process.c b/arch/arm/kernel/process.c index 8e6ace03e960..9f199b1e8383 100644 --- a/arch/arm/kernel/process.c +++ b/arch/arm/kernel/process.c @@ -71,7 +71,7 @@ void arch_cpu_idle(void) arm_pm_idle(); else cpu_do_idle(); - local_irq_enable(); + raw_local_irq_enable(); } void arch_cpu_idle_prepare(void) diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c index 4784011cecac..9ebe02574127 100644 --- a/arch/arm64/kernel/process.c +++ b/arch/arm64/kernel/process.c @@ -126,7 +126,7 @@ void arch_cpu_idle(void) * tricks */ cpu_do_idle(); - local_irq_enable(); + raw_local_irq_enable(); } #ifdef CONFIG_HOTPLUG_CPU diff --git a/arch/csky/kernel/process.c b/arch/csky/kernel/process.c index f730869e21ee..69af6bc87e64 100644 --- a/arch/csky/kernel/process.c +++ b/arch/csky/kernel/process.c @@ -102,6 +102,6 @@ void arch_cpu_idle(void) #ifdef CONFIG_CPU_PM_STOP asm volatile("stop\n"); #endif - local_irq_enable(); + raw_local_irq_enable(); } #endif diff --git a/arch/h8300/kernel/process.c b/arch/h8300/kernel/process.c index aea0a40b77a9..bc1364db58fe 100644 --- a/arch/h8300/kernel/process.c +++ b/arch/h8300/kernel/process.c @@ -57,7 +57,7 @@ asmlinkage void ret_from_kernel_thread(void); */ void arch_cpu_idle(void) { - local_irq_enable(); + raw_local_irq_enable(); __asm__("sleep"); } diff --git a/arch/hexagon/kernel/process.c b/arch/hexagon/kernel/process.c index 5a0a95d93ddb..67767c5ed98c 100644 --- a/arch/hexagon/kernel/process.c +++ b/arch/hexagon/kernel/process.c @@ -44,7 +44,7 @@ void arch_cpu_idle(void) { __vmwait(); /* interrupts wake us up, but irqs are still disabled */ - local_irq_enable(); + raw_local_irq_enable(); } /* diff --git a/arch/ia64/kernel/process.c b/arch/ia64/kernel/process.c index 6b61a703bcf5..c9ff8796b509 100644 --- a/arch/ia64/kernel/process.c +++ b/arch/ia64/kernel/process.c @@ -239,7 +239,7 @@ void arch_cpu_idle(void) if (mark_idle) (*mark_idle)(1); - safe_halt(); + raw_safe_halt(); if (mark_idle) (*mark_idle)(0); diff 
--git a/arch/microblaze/kernel/process.c b/arch/microblaze/kernel/process.c index a9e46e525cd0..f99860771ff4 100644 --- a/arch/microblaze/kernel/process.c +++ b/arch/microblaze/kernel/process.c @@ -149,5 +149,5 @@ int dump_fpu(struct pt_regs *regs, elf_fpregset_t *fpregs) void arch_cpu_idle(void) { - local_irq_enable(); + raw_local_irq_enable(); } diff --git a/arch/mips/kernel/idle.c b/arch/mips/kernel/idle.c index 5bc3b04693c7..18e69ebf5691 100644 --- a/arch/mips/kernel/idle.c +++ b/arch/mips/kernel/idle.c @@ -33,19 +33,19 @@ static void __cpuidle r3081_wait(void) { unsigned long cfg = read_c0_conf(); write_c0_conf(cfg | R30XX_CONF_HALT); - local_irq_enable(); + raw_local_irq_enable(); } static void __cpuidle r39xx_wait(void) { if (!need_resched()) write_c0_conf(read_c0_conf() | TX39_CONF_HALT); - local_irq_enable(); + raw_local_irq_enable(); } void __cpuidle r4k_wait(void) { - local_irq_enable(); + raw_local_irq_enable(); __r4k_wait(); } @@ -64,7 +64,7 @@ void __cpuidle r4k_wait_irqoff(void) " .set arch=r4000 \n" " wait \n" " .set pop \n"); - local_irq_enable(); + raw_local_irq_enable(); } /* @@ -84,7 +84,7 @@ static void __cpuidle rm7k_wait_irqoff(void) " wait \n" " mtc0 $1, $12 # stalls until W stage \n" " .set pop \n"); - local_irq_enable(); + raw_local_irq_enable(); } /* @@ -257,7 +257,7 @@ void arch_cpu_idle(void) if (cpu_wait) cpu_wait(); else - local_irq_enable(); + raw_local_irq_enable(); } #ifdef CONFIG_CPU_IDLE diff --git a/arch/nios2/kernel/process.c b/arch/nios2/kernel/process.c index 4ffe857e6ada..50b4eb19a6cc 100644 --- a/arch/nios2/kernel/process.c +++ b/arch/nios2/kernel/process.c @@ -33,7 +33,7 @@ EXPORT_SYMBOL(pm_power_off); void arch_cpu_idle(void) { - local_irq_enable(); + raw_local_irq_enable(); } /* diff --git a/arch/openrisc/kernel/process.c b/arch/openrisc/kernel/process.c index 0ff391f00334..3c98728cce24 100644 --- a/arch/openrisc/kernel/process.c +++ b/arch/openrisc/kernel/process.c @@ -79,7 +79,7 @@ void machine_power_off(void) */ void arch_cpu_idle(void) { - local_irq_enable(); + raw_local_irq_enable(); if (mfspr(SPR_UPR) & SPR_UPR_PMP) mtspr(SPR_PMR, mfspr(SPR_PMR) | SPR_PMR_DME); } diff --git a/arch/parisc/kernel/process.c b/arch/parisc/kernel/process.c index f196d96e2f9f..a92a23d6acd9 100644 --- a/arch/parisc/kernel/process.c +++ b/arch/parisc/kernel/process.c @@ -169,7 +169,7 @@ void __cpuidle arch_cpu_idle_dead(void) void __cpuidle arch_cpu_idle(void) { - local_irq_enable(); + raw_local_irq_enable(); /* nop on real hardware, qemu will idle sleep. */ asm volatile("or %%r10,%%r10,%%r10\n":::); diff --git a/arch/powerpc/kernel/idle.c b/arch/powerpc/kernel/idle.c index ae0e2632393d..1f835539fda4 100644 --- a/arch/powerpc/kernel/idle.c +++ b/arch/powerpc/kernel/idle.c @@ -52,9 +52,9 @@ void arch_cpu_idle(void) * interrupts enabled, some don't. */ if (irqs_disabled()) - local_irq_enable(); + raw_local_irq_enable(); } else { - local_irq_enable(); + raw_local_irq_enable(); /* * Go into low thread priority and possibly * low power mode. 
diff --git a/arch/riscv/kernel/process.c b/arch/riscv/kernel/process.c index 19225ec65db6..dd5f985b1f40 100644 --- a/arch/riscv/kernel/process.c +++ b/arch/riscv/kernel/process.c @@ -36,7 +36,7 @@ extern asmlinkage void ret_from_kernel_thread(void); void arch_cpu_idle(void) { wait_for_interrupt(); - local_irq_enable(); + raw_local_irq_enable(); } void show_regs(struct pt_regs *regs) diff --git a/arch/s390/kernel/idle.c b/arch/s390/kernel/idle.c index f7f1e64e0d98..2b85096964f8 100644 --- a/arch/s390/kernel/idle.c +++ b/arch/s390/kernel/idle.c @@ -33,10 +33,10 @@ void enabled_wait(void) PSW_MASK_IO | PSW_MASK_EXT | PSW_MASK_MCHECK; clear_cpu_flag(CIF_NOHZ_DELAY); - local_irq_save(flags); + raw_local_irq_save(flags); /* Call the assembler magic in entry.S */ psw_idle(idle, psw_mask); - local_irq_restore(flags); + raw_local_irq_restore(flags); /* Account time spent with enabled wait psw loaded as idle time. */ raw_write_seqcount_begin(&idle->seqcount); @@ -123,7 +123,7 @@ void arch_cpu_idle_enter(void) void arch_cpu_idle(void) { enabled_wait(); - local_irq_enable(); + raw_local_irq_enable(); } void arch_cpu_idle_exit(void) diff --git a/arch/sh/kernel/idle.c b/arch/sh/kernel/idle.c index 0dc0f52f9bb8..f59814983bd5 100644 --- a/arch/sh/kernel/idle.c +++ b/arch/sh/kernel/idle.c @@ -22,7 +22,7 @@ static void (*sh_idle)(void); void default_idle(void) { set_bl_bit(); - local_irq_enable(); + raw_local_irq_enable(); /* Isn't this racy ? */ cpu_sleep(); clear_bl_bit(); diff --git a/arch/sparc/kernel/leon_pmc.c b/arch/sparc/kernel/leon_pmc.c index 065e2d4b7290..396f46bca52e 100644 --- a/arch/sparc/kernel/leon_pmc.c +++ b/arch/sparc/kernel/leon_pmc.c @@ -50,7 +50,7 @@ static void pmc_leon_idle_fixup(void) register unsigned int address = (unsigned int)leon3_irqctrl_regs; /* Interrupts need to be enabled to not hang the CPU */ - local_irq_enable(); + raw_local_irq_enable(); __asm__ __volatile__ ( "wr %%g0, %%asr19\n" @@ -66,7 +66,7 @@ static void pmc_leon_idle_fixup(void) static void pmc_leon_idle(void) { /* Interrupts need to be enabled to not hang the CPU */ - local_irq_enable(); + raw_local_irq_enable(); /* For systems without power-down, this will be no-op */ __asm__ __volatile__ ("wr %g0, %asr19\n\t"); diff --git a/arch/sparc/kernel/process_32.c b/arch/sparc/kernel/process_32.c index adfcaeab3ddc..a02363735915 100644 --- a/arch/sparc/kernel/process_32.c +++ b/arch/sparc/kernel/process_32.c @@ -74,7 +74,7 @@ void arch_cpu_idle(void) { if (sparc_idle) (*sparc_idle)(); - local_irq_enable(); + raw_local_irq_enable(); } /* XXX cli/sti -> local_irq_xxx here, check this works once SMP is fixed. */ diff --git a/arch/sparc/kernel/process_64.c b/arch/sparc/kernel/process_64.c index a75093b993f9..6f8c7822fc06 100644 --- a/arch/sparc/kernel/process_64.c +++ b/arch/sparc/kernel/process_64.c @@ -62,11 +62,11 @@ void arch_cpu_idle(void) { if (tlb_type != hypervisor) { touch_nmi_watchdog(); - local_irq_enable(); + raw_local_irq_enable(); } else { unsigned long pstate; - local_irq_enable(); + raw_local_irq_enable(); /* The sun4v sleeping code requires that we have PSTATE.IE cleared over * the cpu sleep hypervisor call. 
diff --git a/arch/um/kernel/process.c b/arch/um/kernel/process.c index 3bed09538dd9..9505a7e87396 100644 --- a/arch/um/kernel/process.c +++ b/arch/um/kernel/process.c @@ -217,7 +217,7 @@ void arch_cpu_idle(void) { cpu_tasks[current_thread_info()->cpu].pid = os_getpid(); um_idle_sleep(); - local_irq_enable(); + raw_local_irq_enable(); } int __cant_sleep(void) { diff --git a/arch/x86/include/asm/mwait.h b/arch/x86/include/asm/mwait.h index e039a933aca3..29dd27b5a339 100644 --- a/arch/x86/include/asm/mwait.h +++ b/arch/x86/include/asm/mwait.h @@ -88,8 +88,6 @@ static inline void __mwaitx(unsigned long eax, unsigned long ebx, static inline void __sti_mwait(unsigned long eax, unsigned long ecx) { - trace_hardirqs_on(); - mds_idle_clear_cpu_buffers(); /* "mwait %eax, %ecx;" */ asm volatile("sti; .byte 0x0f, 0x01, 0xc9;" diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index ba4593a913fa..145a7ac0c19a 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -685,7 +685,7 @@ void arch_cpu_idle(void) */ void __cpuidle default_idle(void) { - safe_halt(); + raw_safe_halt(); } #if defined(CONFIG_APM_MODULE) || defined(CONFIG_HALTPOLL_CPUIDLE_MODULE) EXPORT_SYMBOL(default_idle); @@ -736,6 +736,8 @@ void stop_this_cpu(void *dummy) /* * AMD Erratum 400 aware idle routine. We handle it the same way as C3 power * states (local apic timer and TSC stop). + * + * XXX this function is completely buggered vs RCU and tracing. */ static void amd_e400_idle(void) { @@ -757,9 +759,9 @@ static void amd_e400_idle(void) * The switch back from broadcast mode needs to be called with * interrupts disabled. */ - local_irq_disable(); + raw_local_irq_disable(); tick_broadcast_exit(); - local_irq_enable(); + raw_local_irq_enable(); } /* @@ -801,9 +803,9 @@ static __cpuidle void mwait_idle(void) if (!need_resched()) __sti_mwait(0, 0); else - local_irq_enable(); + raw_local_irq_enable(); } else { - local_irq_enable(); + raw_local_irq_enable(); } __current_clr_polling(); } diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 24d0ee26377d..c6932b8f4467 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -78,7 +78,7 @@ void __weak arch_cpu_idle_dead(void) { } void __weak arch_cpu_idle(void) { cpu_idle_force_poll = 1; - local_irq_enable(); + raw_local_irq_enable(); } /** @@ -94,9 +94,35 @@ void __cpuidle default_idle_call(void) trace_cpu_idle(1, smp_processor_id()); stop_critical_timings(); + + /* + * arch_cpu_idle() is supposed to enable IRQs, however + * we can't do that because of RCU and tracing. + * + * Trace IRQs enable here, then switch off RCU, and have + * arch_cpu_idle() use raw_local_irq_enable(). Note that + * rcu_idle_enter() relies on lockdep IRQ state, so switch that + * last -- this is very similar to the entry code. + */ + trace_hardirqs_on_prepare(); + lockdep_hardirqs_on_prepare(_THIS_IP_); rcu_idle_enter(); + lockdep_hardirqs_on(_THIS_IP_); + arch_cpu_idle(); + + /* + * OK, so IRQs are enabled here, but RCU needs them disabled to + * turn itself back on.. funny thing is that disabling IRQs + * will cause tracing, which needs RCU. Jump through hoops to + * make it 'work'. 
+ */ + raw_local_irq_disable(); + lockdep_hardirqs_off(_THIS_IP_); rcu_idle_exit(); + lockdep_hardirqs_on(_THIS_IP_); + raw_local_irq_enable(); + start_critical_timings(); trace_cpu_idle(PWR_EVENT_EXIT, smp_processor_id()); } -- cgit v1.2.3 From abeae76a47005aa3f07c9be12d8076365622e25c Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Fri, 20 Nov 2020 09:06:27 +0000 Subject: sched/numa: Rename nr_running and break out the magic number This is simply a preparation patch to make the following patches easier to read. No functional change. Signed-off-by: Mel Gorman Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Vincent Guittot Link: https://lkml.kernel.org/r/20201120090630.3286-2-mgorman@techsingularity.net --- kernel/sched/fair.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 6691e28fa3da..9d10abe00f72 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1559,7 +1559,7 @@ struct task_numa_env { static unsigned long cpu_load(struct rq *rq); static unsigned long cpu_runnable(struct rq *rq); static unsigned long cpu_util(int cpu); -static inline long adjust_numa_imbalance(int imbalance, int nr_running); +static inline long adjust_numa_imbalance(int imbalance, int dst_running); static inline enum numa_type numa_classify(unsigned int imbalance_pct, @@ -8991,7 +8991,9 @@ next_group: } } -static inline long adjust_numa_imbalance(int imbalance, int nr_running) +#define NUMA_IMBALANCE_MIN 2 + +static inline long adjust_numa_imbalance(int imbalance, int dst_running) { unsigned int imbalance_min; @@ -8999,8 +9001,8 @@ static inline long adjust_numa_imbalance(int imbalance, int nr_running) * Allow a small imbalance based on a simple pair of communicating * tasks that remain local when the source domain is almost idle. */ - imbalance_min = 2; - if (nr_running <= imbalance_min) + imbalance_min = NUMA_IMBALANCE_MIN; + if (dst_running <= imbalance_min) return 0; return imbalance; -- cgit v1.2.3 From 5c339005f854fa75aa46078ad640919425658b3e Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Fri, 20 Nov 2020 09:06:28 +0000 Subject: sched: Avoid unnecessary calculation of load imbalance at clone time In find_idlest_group(), the load imbalance is only relevant when the group is either overloaded or fully busy but it is calculated unconditionally. This patch moves the imbalance calculation to the context it is required. Technically, it is a micro-optimisation but really the benefit is avoiding confusing one type of imbalance with another depending on the group_type in the next patch. No functional change. 
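For a rough sense of the quantity being computed lazily here: assuming scale_load_down(NICE_0_LOAD) == 1024 and an imbalance_pct of 117 (typical values, assumed for illustration rather than taken from this patch), the allowed imbalance works out to 1024 * 17 / 100 == 174 load units, roughly 17% of one nice-0 task's weight, and after this change that multiplication only happens in the overloaded and fully-busy cases of the switch in the hunk that follows.

/* Back-of-the-envelope version of the threshold moved into the switch below. */
static long toy_allowed_imbalance(long nice0_load, int imbalance_pct)
{
	/* 1024 * (117 - 100) / 100 == 174 with the assumed defaults above */
	return nice0_load * (imbalance_pct - 100) / 100;
}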
Signed-off-by: Mel Gorman Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Vincent Guittot Link: https://lkml.kernel.org/r/20201120090630.3286-3-mgorman@techsingularity.net --- kernel/sched/fair.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 9d10abe00f72..2626c6bac9f7 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -8777,9 +8777,6 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) .group_type = group_overloaded, }; - imbalance = scale_load_down(NICE_0_LOAD) * - (sd->imbalance_pct-100) / 100; - do { int local_group; @@ -8833,6 +8830,11 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) switch (local_sgs.group_type) { case group_overloaded: case group_fully_busy: + + /* Calculate allowed imbalance based on load */ + imbalance = scale_load_down(NICE_0_LOAD) * + (sd->imbalance_pct-100) / 100; + /* * When comparing groups across NUMA domains, it's possible for * the local domain to be very lightly loaded relative to the -- cgit v1.2.3 From 7d2b5dd0bcc48095651f1b85f751eef610b3e034 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Fri, 20 Nov 2020 09:06:29 +0000 Subject: sched/numa: Allow a floating imbalance between NUMA nodes Currently, an imbalance is only allowed when a destination node is almost completely idle. This solved one basic class of problems and was the cautious approach. This patch revisits the possibility that NUMA nodes can be imbalanced until 25% of the CPUs are occupied. The reasoning behind 25% is somewhat superficial -- it's half the cores when HT is enabled. At higher utilisations, balancing should continue as normal and keep things even until scheduler domains are fully busy or over utilised. Note that this is not expected to be a universal win. Any benchmark that prefers spreading as wide as possible with limited communication will favour the old behaviour as there is more memory bandwidth. Workloads that communicate heavily in pairs such as netperf or tbench benefit. For the tests I ran, the vast majority of workloads saw a benefit so it seems to be a worthwhile trade-off. 
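The 25% rule becomes the dst_weight >> 2 test added in the hunk below; a short worked example follows, where the 40-CPU node size is an assumption chosen purely for illustration.

#define NUMA_IMBALANCE_MIN	2

/* Mirrors the check this patch introduces in adjust_numa_imbalance(). */
static long toy_adjust_numa_imbalance(long imbalance, int dst_running,
				      int dst_weight)
{
	/* Tolerate a tiny imbalance while the destination node is <25% busy. */
	if (dst_running < (dst_weight >> 2) && imbalance <= NUMA_IMBALANCE_MIN)
		return 0;
	return imbalance;
}

/*
 * With a 40-CPU destination node: an imbalance of up to 2 tasks is ignored
 * while fewer than 40 >> 2 == 10 tasks run there; once 10 or more run, or
 * the imbalance exceeds 2, regular load balancing takes over.
 */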
Signed-off-by: Mel Gorman Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Vincent Guittot Link: https://lkml.kernel.org/r/20201120090630.3286-4-mgorman@techsingularity.net --- kernel/sched/fair.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 2626c6bac9f7..377c77b35751 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1559,7 +1559,8 @@ struct task_numa_env { static unsigned long cpu_load(struct rq *rq); static unsigned long cpu_runnable(struct rq *rq); static unsigned long cpu_util(int cpu); -static inline long adjust_numa_imbalance(int imbalance, int dst_running); +static inline long adjust_numa_imbalance(int imbalance, + int dst_running, int dst_weight); static inline enum numa_type numa_classify(unsigned int imbalance_pct, @@ -1939,7 +1940,8 @@ static void task_numa_find_cpu(struct task_numa_env *env, src_running = env->src_stats.nr_running - 1; dst_running = env->dst_stats.nr_running + 1; imbalance = max(0, dst_running - src_running); - imbalance = adjust_numa_imbalance(imbalance, dst_running); + imbalance = adjust_numa_imbalance(imbalance, dst_running, + env->dst_stats.weight); /* Use idle CPU if there is no imbalance */ if (!imbalance) { @@ -8995,16 +8997,14 @@ next_group: #define NUMA_IMBALANCE_MIN 2 -static inline long adjust_numa_imbalance(int imbalance, int dst_running) +static inline long adjust_numa_imbalance(int imbalance, + int dst_running, int dst_weight) { - unsigned int imbalance_min; - /* * Allow a small imbalance based on a simple pair of communicating - * tasks that remain local when the source domain is almost idle. + * tasks that remain local when the destination is lightly loaded. */ - imbalance_min = NUMA_IMBALANCE_MIN; - if (dst_running <= imbalance_min) + if (dst_running < (dst_weight >> 2) && imbalance <= NUMA_IMBALANCE_MIN) return 0; return imbalance; @@ -9106,9 +9106,10 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s } /* Consider allowing a small imbalance between NUMA groups */ - if (env->sd->flags & SD_NUMA) + if (env->sd->flags & SD_NUMA) { env->imbalance = adjust_numa_imbalance(env->imbalance, - busiest->sum_nr_running); + busiest->sum_nr_running, busiest->group_weight); + } return; } -- cgit v1.2.3 From 23e6082a522e32232f7377540b4d42d8304253b8 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Fri, 20 Nov 2020 09:06:30 +0000 Subject: sched: Limit the amount of NUMA imbalance that can exist at fork time At fork time currently, a local node can be allowed to fill completely and allow the periodic load balancer to fix the problem. This can be problematic in cases where a task creates lots of threads that idle until woken as part of a worker poll causing a memory bandwidth problem. However, a "real" workload suffers badly from this behaviour. The workload in question is mostly NUMA aware but spawns large numbers of threads that act as a worker pool that can be called from anywhere. These need to spread early to get reasonable behaviour. This patch limits how much a local node can fill before spilling over to another node and it will not be a universal win. Specifically, very short-lived workloads that fit within a NUMA node would prefer the memory bandwidth. As I cannot describe the "real" workload, the best proxy measure I found for illustration was a page fault microbenchmark. It's not representative of the workload but demonstrates the hazard of the current behaviour. 
pft timings 5.10.0-rc2 5.10.0-rc2 imbalancefloat-v2 forkspread-v2 Amean elapsed-1 46.37 ( 0.00%) 46.05 * 0.69%* Amean elapsed-4 12.43 ( 0.00%) 12.49 * -0.47%* Amean elapsed-7 7.61 ( 0.00%) 7.55 * 0.81%* Amean elapsed-12 4.79 ( 0.00%) 4.80 ( -0.17%) Amean elapsed-21 3.13 ( 0.00%) 2.89 * 7.74%* Amean elapsed-30 3.65 ( 0.00%) 2.27 * 37.62%* Amean elapsed-48 3.08 ( 0.00%) 2.13 * 30.69%* Amean elapsed-79 2.00 ( 0.00%) 1.90 * 4.95%* Amean elapsed-80 2.00 ( 0.00%) 1.90 * 4.70%* This is showing the time to fault regions belonging to threads. The target machine has 80 logical CPUs and two nodes. Note the ~30% gain when the machine is approximately the point where one node becomes fully utilised. The slower results are borderline noise. Kernel building shows similar benefits around the same balance point. Generally performance was either neutral or better in the tests conducted. The main consideration with this patch is the point where fork stops spreading a task so some workloads may benefit from different balance points but it would be a risky tuning parameter. Signed-off-by: Mel Gorman Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Vincent Guittot Link: https://lkml.kernel.org/r/20201120090630.3286-5-mgorman@techsingularity.net --- kernel/sched/fair.c | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 377c77b35751..2e8aadeac3a7 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -8761,6 +8761,16 @@ static bool update_pick_idlest(struct sched_group *idlest, return true; } +/* + * Allow a NUMA imbalance if busy CPUs is less than 25% of the domain. + * This is an approximation as the number of running tasks may not be + * related to the number of busy CPUs due to sched_setaffinity. + */ +static inline bool allow_numa_imbalance(int dst_running, int dst_weight) +{ + return (dst_running < (dst_weight >> 2)); +} + /* * find_idlest_group() finds and returns the least busy CPU group within the * domain. @@ -8893,7 +8903,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) * a real need of migration, periodic load balance will * take care of it. */ - if (local_sgs.idle_cpus) + if (allow_numa_imbalance(local_sgs.sum_nr_running, sd->span_weight)) return NULL; } @@ -9000,11 +9010,14 @@ next_group: static inline long adjust_numa_imbalance(int imbalance, int dst_running, int dst_weight) { + if (!allow_numa_imbalance(dst_running, dst_weight)) + return imbalance; + /* * Allow a small imbalance based on a simple pair of communicating * tasks that remain local when the destination is lightly loaded. */ - if (dst_running < (dst_weight >> 2) && imbalance <= NUMA_IMBALANCE_MIN) + if (imbalance <= NUMA_IMBALANCE_MIN) return 0; return imbalance; -- cgit v1.2.3 From 7a9f50a05843fee8366bd3a65addbebaa7cf7f07 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 15 Jun 2020 11:51:29 +0200 Subject: irq_work: Cleanup Get rid of the __call_single_node union and clean up the API a little to avoid external code relying on the structure layout as much. 
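The practical effect of the new initializers and predicates shows up in the conversions below (printk, RCU, tick-sched, bpf): callers no longer reach into the flags word themselves. A minimal usage sketch, with a made-up handler name and shown only to illustrate the macros and helpers added in this patch:

#include <linux/irq_work.h>
#include <linux/percpu.h>

static void my_kick_func(struct irq_work *work)
{
	/* invoked from IRQ context, deferred to the next tick for lazy work */
}

/* Static init without open-coding .flags = ATOMIC_INIT(IRQ_WORK_LAZY). */
static DEFINE_PER_CPU(struct irq_work, my_kick_work) =
	IRQ_WORK_INIT_LAZY(my_kick_func);

static void my_kick(void)
{
	struct irq_work *work = this_cpu_ptr(&my_kick_work);

	/* Replaces open-coded atomic_read(&work->flags) & IRQ_WORK_BUSY tests. */
	if (!irq_work_is_busy(work))
		irq_work_queue(work);
}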
Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Frederic Weisbecker --- drivers/gpu/drm/i915/i915_request.c | 4 ++-- include/linux/irq_work.h | 33 +++++++++++++++++++++------------ include/linux/irqflags.h | 4 ++-- kernel/bpf/stackmap.c | 2 +- kernel/irq_work.c | 18 +++++++++--------- kernel/printk/printk.c | 6 ++---- kernel/rcu/tree.c | 3 +-- kernel/time/tick-sched.c | 6 ++---- kernel/trace/bpf_trace.c | 2 +- 9 files changed, 41 insertions(+), 37 deletions(-) (limited to 'kernel') diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c index 0e813819b041..5385b081a376 100644 --- a/drivers/gpu/drm/i915/i915_request.c +++ b/drivers/gpu/drm/i915/i915_request.c @@ -197,7 +197,7 @@ __notify_execute_cb(struct i915_request *rq, bool (*fn)(struct irq_work *wrk)) llist_for_each_entry_safe(cb, cn, llist_del_all(&rq->execute_cb), - work.llnode) + work.node.llist) fn(&cb->work); } @@ -460,7 +460,7 @@ __await_execution(struct i915_request *rq, * callback first, then checking the ACTIVE bit, we serialise with * the completed/retired request. */ - if (llist_add(&cb->work.llnode, &signal->execute_cb)) { + if (llist_add(&cb->work.node.llist, &signal->execute_cb)) { if (i915_request_is_active(signal) || __request_in_flight(signal)) __notify_execute_cb_imm(signal); diff --git a/include/linux/irq_work.h b/include/linux/irq_work.h index 30823780c192..ec2a47a81e42 100644 --- a/include/linux/irq_work.h +++ b/include/linux/irq_work.h @@ -14,28 +14,37 @@ */ struct irq_work { - union { - struct __call_single_node node; - struct { - struct llist_node llnode; - atomic_t flags; - }; - }; + struct __call_single_node node; void (*func)(struct irq_work *); }; +#define __IRQ_WORK_INIT(_func, _flags) (struct irq_work){ \ + .node = { .u_flags = (_flags), }, \ + .func = (_func), \ +} + +#define IRQ_WORK_INIT(_func) __IRQ_WORK_INIT(_func, 0) +#define IRQ_WORK_INIT_LAZY(_func) __IRQ_WORK_INIT(_func, IRQ_WORK_LAZY) +#define IRQ_WORK_INIT_HARD(_func) __IRQ_WORK_INIT(_func, IRQ_WORK_HARD_IRQ) + +#define DEFINE_IRQ_WORK(name, _f) \ + struct irq_work name = IRQ_WORK_INIT(_f) + static inline void init_irq_work(struct irq_work *work, void (*func)(struct irq_work *)) { - atomic_set(&work->flags, 0); - work->func = func; + *work = IRQ_WORK_INIT(func); } -#define DEFINE_IRQ_WORK(name, _f) struct irq_work name = { \ - .flags = ATOMIC_INIT(0), \ - .func = (_f) \ +static inline bool irq_work_is_pending(struct irq_work *work) +{ + return atomic_read(&work->node.a_flags) & IRQ_WORK_PENDING; } +static inline bool irq_work_is_busy(struct irq_work *work) +{ + return atomic_read(&work->node.a_flags) & IRQ_WORK_BUSY; +} bool irq_work_queue(struct irq_work *work); bool irq_work_queue_on(struct irq_work *work, int cpu); diff --git a/include/linux/irqflags.h b/include/linux/irqflags.h index 3ed4e8771b64..fef2d43a7a1d 100644 --- a/include/linux/irqflags.h +++ b/include/linux/irqflags.h @@ -109,12 +109,12 @@ do { \ # define lockdep_irq_work_enter(__work) \ do { \ - if (!(atomic_read(&__work->flags) & IRQ_WORK_HARD_IRQ))\ + if (!(atomic_read(&__work->node.a_flags) & IRQ_WORK_HARD_IRQ))\ current->irq_config = 1; \ } while (0) # define lockdep_irq_work_exit(__work) \ do { \ - if (!(atomic_read(&__work->flags) & IRQ_WORK_HARD_IRQ))\ + if (!(atomic_read(&__work->node.a_flags) & IRQ_WORK_HARD_IRQ))\ current->irq_config = 0; \ } while (0) diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index 06065fa27124..599041cd0c8a 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -298,7 +298,7 @@ static void 
stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs, if (irqs_disabled()) { if (!IS_ENABLED(CONFIG_PREEMPT_RT)) { work = this_cpu_ptr(&up_read_work); - if (atomic_read(&work->irq_work.flags) & IRQ_WORK_BUSY) { + if (irq_work_is_busy(&work->irq_work)) { /* cannot queue more up_read, fallback */ irq_work_busy = true; } diff --git a/kernel/irq_work.c b/kernel/irq_work.c index eca83965b631..fbff25adb574 100644 --- a/kernel/irq_work.c +++ b/kernel/irq_work.c @@ -31,7 +31,7 @@ static bool irq_work_claim(struct irq_work *work) { int oflags; - oflags = atomic_fetch_or(IRQ_WORK_CLAIMED | CSD_TYPE_IRQ_WORK, &work->flags); + oflags = atomic_fetch_or(IRQ_WORK_CLAIMED | CSD_TYPE_IRQ_WORK, &work->node.a_flags); /* * If the work is already pending, no need to raise the IPI. * The pairing atomic_fetch_andnot() in irq_work_run() makes sure @@ -53,12 +53,12 @@ void __weak arch_irq_work_raise(void) static void __irq_work_queue_local(struct irq_work *work) { /* If the work is "lazy", handle it from next tick if any */ - if (atomic_read(&work->flags) & IRQ_WORK_LAZY) { - if (llist_add(&work->llnode, this_cpu_ptr(&lazy_list)) && + if (atomic_read(&work->node.a_flags) & IRQ_WORK_LAZY) { + if (llist_add(&work->node.llist, this_cpu_ptr(&lazy_list)) && tick_nohz_tick_stopped()) arch_irq_work_raise(); } else { - if (llist_add(&work->llnode, this_cpu_ptr(&raised_list))) + if (llist_add(&work->node.llist, this_cpu_ptr(&raised_list))) arch_irq_work_raise(); } } @@ -102,7 +102,7 @@ bool irq_work_queue_on(struct irq_work *work, int cpu) if (cpu != smp_processor_id()) { /* Arch remote IPI send/receive backend aren't NMI safe */ WARN_ON_ONCE(in_nmi()); - __smp_call_single_queue(cpu, &work->llnode); + __smp_call_single_queue(cpu, &work->node.llist); } else { __irq_work_queue_local(work); } @@ -142,7 +142,7 @@ void irq_work_single(void *arg) * to claim that work don't rely on us to handle their data * while we are in the middle of the func. */ - flags = atomic_fetch_andnot(IRQ_WORK_PENDING, &work->flags); + flags = atomic_fetch_andnot(IRQ_WORK_PENDING, &work->node.a_flags); lockdep_irq_work_enter(work); work->func(work); @@ -152,7 +152,7 @@ void irq_work_single(void *arg) * no-one else claimed it meanwhile. 
*/ flags &= ~IRQ_WORK_PENDING; - (void)atomic_cmpxchg(&work->flags, flags, flags & ~IRQ_WORK_BUSY); + (void)atomic_cmpxchg(&work->node.a_flags, flags, flags & ~IRQ_WORK_BUSY); } static void irq_work_run_list(struct llist_head *list) @@ -166,7 +166,7 @@ static void irq_work_run_list(struct llist_head *list) return; llnode = llist_del_all(list); - llist_for_each_entry_safe(work, tmp, llnode, llnode) + llist_for_each_entry_safe(work, tmp, llnode, node.llist) irq_work_single(work); } @@ -198,7 +198,7 @@ void irq_work_sync(struct irq_work *work) { lockdep_assert_irqs_enabled(); - while (atomic_read(&work->flags) & IRQ_WORK_BUSY) + while (irq_work_is_busy(work)) cpu_relax(); } EXPORT_SYMBOL_GPL(irq_work_sync); diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index fe64a49344bf..9ef23d4b07c7 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -3025,10 +3025,8 @@ static void wake_up_klogd_work_func(struct irq_work *irq_work) wake_up_interruptible(&log_wait); } -static DEFINE_PER_CPU(struct irq_work, wake_up_klogd_work) = { - .func = wake_up_klogd_work_func, - .flags = ATOMIC_INIT(IRQ_WORK_LAZY), -}; +static DEFINE_PER_CPU(struct irq_work, wake_up_klogd_work) = + IRQ_WORK_INIT_LAZY(wake_up_klogd_work_func); void wake_up_klogd(void) { diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 06895ef85d69..a41e84f1b55a 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1311,8 +1311,6 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) if (IS_ENABLED(CONFIG_IRQ_WORK) && !rdp->rcu_iw_pending && rdp->rcu_iw_gp_seq != rnp->gp_seq && (rnp->ffmask & rdp->grpmask)) { - init_irq_work(&rdp->rcu_iw, rcu_iw_handler); - atomic_set(&rdp->rcu_iw.flags, IRQ_WORK_HARD_IRQ); rdp->rcu_iw_pending = true; rdp->rcu_iw_gp_seq = rnp->gp_seq; irq_work_queue_on(&rdp->rcu_iw, rdp->cpu); @@ -3964,6 +3962,7 @@ int rcutree_prepare_cpu(unsigned int cpu) rdp->cpu_no_qs.b.norm = true; rdp->core_needs_qs = false; rdp->rcu_iw_pending = false; + rdp->rcu_iw = IRQ_WORK_INIT_HARD(rcu_iw_handler); rdp->rcu_iw_gp_seq = rdp->gp_seq - 1; trace_rcu_grace_period(rcu_state.name, rdp->gp_seq, TPS("cpuonl")); raw_spin_unlock_irqrestore_rcu_node(rnp, flags); diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 81632cd5e3b7..1b734070f028 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -243,10 +243,8 @@ static void nohz_full_kick_func(struct irq_work *work) /* Empty, the tick restart happens on tick_nohz_irq_exit() */ } -static DEFINE_PER_CPU(struct irq_work, nohz_full_kick_work) = { - .func = nohz_full_kick_func, - .flags = ATOMIC_INIT(IRQ_WORK_HARD_IRQ), -}; +static DEFINE_PER_CPU(struct irq_work, nohz_full_kick_work) = + IRQ_WORK_INIT_HARD(nohz_full_kick_func); /* * Kick this CPU if it's full dynticks in order to force it to diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 4517c8b66518..a6903912f7a0 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1086,7 +1086,7 @@ static int bpf_send_signal_common(u32 sig, enum pid_type type) return -EINVAL; work = this_cpu_ptr(&send_signal_work); - if (atomic_read(&work->irq_work.flags) & IRQ_WORK_BUSY) + if (irq_work_is_busy(&work->irq_work)) return -EBUSY; /* Add the current task, which is the target of sending signal, -- cgit v1.2.3 From 545b8c8df41f9ecbaf806332d4095bc4bc7c14e8 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 15 Jun 2020 11:29:31 +0200 Subject: smp: Cleanup smp_call_function*() Get rid of the __call_single_node union and cleanup the API a little to 
avoid external code relying on the structure layout as much. Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Frederic Weisbecker --- arch/mips/kernel/process.c | 5 ++- arch/mips/kernel/smp.c | 25 +++---------- arch/s390/pci/pci_irq.c | 4 +- arch/x86/kernel/cpuid.c | 7 ++-- arch/x86/lib/msr-smp.c | 7 ++-- block/blk-mq.c | 4 +- drivers/cpuidle/coupled.c | 3 +- drivers/net/ethernet/cavium/liquidio/lio_core.c | 9 +---- include/linux/smp.h | 19 +++++----- kernel/debug/debug_core.c | 6 +-- kernel/sched/core.c | 12 +----- kernel/smp.c | 50 ++++++++++++------------- net/core/dev.c | 3 +- 13 files changed, 60 insertions(+), 94 deletions(-) (limited to 'kernel') diff --git a/arch/mips/kernel/process.c b/arch/mips/kernel/process.c index 75ebd8d7bd5d..d7e288f3a1e7 100644 --- a/arch/mips/kernel/process.c +++ b/arch/mips/kernel/process.c @@ -702,7 +702,6 @@ unsigned long arch_align_stack(unsigned long sp) return sp & ALMASK; } -static DEFINE_PER_CPU(call_single_data_t, backtrace_csd); static struct cpumask backtrace_csd_busy; static void handle_backtrace(void *info) @@ -711,6 +710,9 @@ static void handle_backtrace(void *info) cpumask_clear_cpu(smp_processor_id(), &backtrace_csd_busy); } +static DEFINE_PER_CPU(call_single_data_t, backtrace_csd) = + CSD_INIT(handle_backtrace, NULL); + static void raise_backtrace(cpumask_t *mask) { call_single_data_t *csd; @@ -730,7 +732,6 @@ static void raise_backtrace(cpumask_t *mask) } csd = &per_cpu(backtrace_csd, cpu); - csd->func = handle_backtrace; smp_call_function_single_async(cpu, csd); } } diff --git a/arch/mips/kernel/smp.c b/arch/mips/kernel/smp.c index 48d84d5fcc36..74b9102fd06e 100644 --- a/arch/mips/kernel/smp.c +++ b/arch/mips/kernel/smp.c @@ -687,36 +687,23 @@ EXPORT_SYMBOL(flush_tlb_one); #ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST -static DEFINE_PER_CPU(call_single_data_t, tick_broadcast_csd); - -void tick_broadcast(const struct cpumask *mask) -{ - call_single_data_t *csd; - int cpu; - - for_each_cpu(cpu, mask) { - csd = &per_cpu(tick_broadcast_csd, cpu); - smp_call_function_single_async(cpu, csd); - } -} - static void tick_broadcast_callee(void *info) { tick_receive_broadcast(); } -static int __init tick_broadcast_init(void) +static DEFINE_PER_CPU(call_single_data_t, tick_broadcast_csd) = + CSD_INIT(tick_broadcast_callee, NULL); + +void tick_broadcast(const struct cpumask *mask) { call_single_data_t *csd; int cpu; - for (cpu = 0; cpu < NR_CPUS; cpu++) { + for_each_cpu(cpu, mask) { csd = &per_cpu(tick_broadcast_csd, cpu); - csd->func = tick_broadcast_callee; + smp_call_function_single_async(cpu, csd); } - - return 0; } -early_initcall(tick_broadcast_init); #endif /* CONFIG_GENERIC_CLOCKEVENTS_BROADCAST */ diff --git a/arch/s390/pci/pci_irq.c b/arch/s390/pci/pci_irq.c index 743f257cf2cb..1311b6f9d6dd 100644 --- a/arch/s390/pci/pci_irq.c +++ b/arch/s390/pci/pci_irq.c @@ -178,9 +178,7 @@ static void zpci_handle_fallback_irq(void) if (atomic_inc_return(&cpu_data->scheduled) > 1) continue; - cpu_data->csd.func = zpci_handle_remote_irq; - cpu_data->csd.info = &cpu_data->scheduled; - cpu_data->csd.flags = 0; + INIT_CSD(&cpu_data->csd, zpci_handle_remote_irq, &cpu_data->scheduled); smp_call_function_single_async(cpu, &cpu_data->csd); } } diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c index 3492aa36bf09..6f7b8cc1bc9f 100644 --- a/arch/x86/kernel/cpuid.c +++ b/arch/x86/kernel/cpuid.c @@ -74,10 +74,9 @@ static ssize_t cpuid_read(struct file *file, char __user *buf, init_completion(&cmd.done); for (; count; count -= 16) { - call_single_data_t csd 
= { - .func = cpuid_smp_cpuid, - .info = &cmd, - }; + call_single_data_t csd; + + INIT_CSD(&csd, cpuid_smp_cpuid, &cmd); cmd.regs.eax = pos; cmd.regs.ecx = pos >> 32; diff --git a/arch/x86/lib/msr-smp.c b/arch/x86/lib/msr-smp.c index fee8b9c0520c..75a0915b0d01 100644 --- a/arch/x86/lib/msr-smp.c +++ b/arch/x86/lib/msr-smp.c @@ -169,12 +169,11 @@ static void __wrmsr_safe_on_cpu(void *info) int rdmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h) { struct msr_info_completion rv; - call_single_data_t csd = { - .func = __rdmsr_safe_on_cpu, - .info = &rv, - }; + call_single_data_t csd; int err; + INIT_CSD(&csd, __rdmsr_safe_on_cpu, &rv); + memset(&rv, 0, sizeof(rv)); init_completion(&rv.done); rv.msr.msr_no = msr_no; diff --git a/block/blk-mq.c b/block/blk-mq.c index 55bcee5dc032..d35b3c0c876a 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -671,9 +671,7 @@ bool blk_mq_complete_request_remote(struct request *rq) return false; if (blk_mq_complete_need_ipi(rq)) { - rq->csd.func = __blk_mq_complete_request_remote; - rq->csd.info = rq; - rq->csd.flags = 0; + INIT_CSD(&rq->csd, __blk_mq_complete_request_remote, rq); smp_call_function_single_async(rq->mq_ctx->cpu, &rq->csd); } else { if (rq->q->nr_hw_queues > 1) diff --git a/drivers/cpuidle/coupled.c b/drivers/cpuidle/coupled.c index 04003b90dc49..74068742cef3 100644 --- a/drivers/cpuidle/coupled.c +++ b/drivers/cpuidle/coupled.c @@ -674,8 +674,7 @@ have_coupled: coupled->refcnt++; csd = &per_cpu(cpuidle_coupled_poke_cb, dev->cpu); - csd->func = cpuidle_coupled_handle_poke; - csd->info = (void *)(unsigned long)dev->cpu; + INIT_CSD(csd, cpuidle_coupled_handle_poke, (void *)(unsigned long)dev->cpu); return 0; } diff --git a/drivers/net/ethernet/cavium/liquidio/lio_core.c b/drivers/net/ethernet/cavium/liquidio/lio_core.c index 9ef172976b35..37d064193f0f 100644 --- a/drivers/net/ethernet/cavium/liquidio/lio_core.c +++ b/drivers/net/ethernet/cavium/liquidio/lio_core.c @@ -729,13 +729,8 @@ static void liquidio_napi_drv_callback(void *arg) droq->cpu_id == this_cpu) { napi_schedule_irqoff(&droq->napi); } else { - call_single_data_t *csd = &droq->csd; - - csd->func = napi_schedule_wrapper; - csd->info = &droq->napi; - csd->flags = 0; - - smp_call_function_single_async(droq->cpu_id, csd); + INIT_CSD(&droq->csd, napi_schedule_wrapper, &droq->napi); + smp_call_function_single_async(droq->cpu_id, &droq->csd); } } diff --git a/include/linux/smp.h b/include/linux/smp.h index 9f13966d3d92..70c6f6284dcf 100644 --- a/include/linux/smp.h +++ b/include/linux/smp.h @@ -21,24 +21,23 @@ typedef bool (*smp_cond_func_t)(int cpu, void *info); * structure shares (partial) layout with struct irq_work */ struct __call_single_data { - union { - struct __call_single_node node; - struct { - struct llist_node llist; - unsigned int flags; -#ifdef CONFIG_64BIT - u16 src, dst; -#endif - }; - }; + struct __call_single_node node; smp_call_func_t func; void *info; }; +#define CSD_INIT(_func, _info) \ + (struct __call_single_data){ .func = (_func), .info = (_info), } + /* Use __aligned() to avoid to use 2 cache lines for 1 csd */ typedef struct __call_single_data call_single_data_t __aligned(sizeof(struct __call_single_data)); +#define INIT_CSD(_csd, _func, _info) \ +do { \ + *(_csd) = CSD_INIT((_func), (_info)); \ +} while (0) + /* * Enqueue a llist_node on the call_single_queue; be very careful, read * flush_smp_call_function_queue() in detail. 
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index 1e75a8923a8d..af6e8b4fb359 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c @@ -225,8 +225,6 @@ NOKPROBE_SYMBOL(kgdb_skipexception); * Default (weak) implementation for kgdb_roundup_cpus */ -static DEFINE_PER_CPU(call_single_data_t, kgdb_roundup_csd); - void __weak kgdb_call_nmi_hook(void *ignored) { /* @@ -241,6 +239,9 @@ void __weak kgdb_call_nmi_hook(void *ignored) } NOKPROBE_SYMBOL(kgdb_call_nmi_hook); +static DEFINE_PER_CPU(call_single_data_t, kgdb_roundup_csd) = + CSD_INIT(kgdb_call_nmi_hook, NULL); + void __weak kgdb_roundup_cpus(void) { call_single_data_t *csd; @@ -267,7 +268,6 @@ void __weak kgdb_roundup_cpus(void) continue; kgdb_info[cpu].rounding_up = true; - csd->func = kgdb_call_nmi_hook; ret = smp_call_function_single_async(cpu, csd); if (ret) kgdb_info[cpu].rounding_up = false; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index c962922784d1..b943b459b77a 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -320,14 +320,6 @@ void update_rq_clock(struct rq *rq) update_rq_clock_task(rq, delta); } -static inline void -rq_csd_init(struct rq *rq, call_single_data_t *csd, smp_call_func_t func) -{ - csd->flags = 0; - csd->func = func; - csd->info = rq; -} - #ifdef CONFIG_SCHED_HRTICK /* * Use HR-timers to deliver accurate preemption points. @@ -428,7 +420,7 @@ void hrtick_start(struct rq *rq, u64 delay) static void hrtick_rq_init(struct rq *rq) { #ifdef CONFIG_SMP - rq_csd_init(rq, &rq->hrtick_csd, __hrtick_start); + INIT_CSD(&rq->hrtick_csd, __hrtick_start, rq); #endif hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); rq->hrtick_timer.function = hrtick; @@ -7774,7 +7766,7 @@ void __init sched_init(void) rq->last_blocked_load_update_tick = jiffies; atomic_set(&rq->nohz_flags, 0); - rq_csd_init(rq, &rq->nohz_csd, nohz_csd_func); + INIT_CSD(&rq->nohz_csd, nohz_csd_func, rq); #endif #ifdef CONFIG_HOTPLUG_CPU rcuwait_init(&rq->hotplug_wait); diff --git a/kernel/smp.c b/kernel/smp.c index 4d17501433be..faf1a3ace6a9 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -27,7 +27,7 @@ #include "smpboot.h" #include "sched/smp.h" -#define CSD_TYPE(_csd) ((_csd)->flags & CSD_FLAG_TYPE_MASK) +#define CSD_TYPE(_csd) ((_csd)->node.u_flags & CSD_FLAG_TYPE_MASK) struct call_function_data { call_single_data_t __percpu *csd; @@ -146,7 +146,7 @@ static __always_inline bool csd_lock_wait_toolong(call_single_data_t *csd, u64 t bool firsttime; u64 ts2, ts_delta; call_single_data_t *cpu_cur_csd; - unsigned int flags = READ_ONCE(csd->flags); + unsigned int flags = READ_ONCE(csd->node.u_flags); if (!(flags & CSD_FLAG_LOCK)) { if (!unlikely(*bug_id)) @@ -224,14 +224,14 @@ static void csd_lock_record(call_single_data_t *csd) static __always_inline void csd_lock_wait(call_single_data_t *csd) { - smp_cond_load_acquire(&csd->flags, !(VAL & CSD_FLAG_LOCK)); + smp_cond_load_acquire(&csd->node.u_flags, !(VAL & CSD_FLAG_LOCK)); } #endif static __always_inline void csd_lock(call_single_data_t *csd) { csd_lock_wait(csd); - csd->flags |= CSD_FLAG_LOCK; + csd->node.u_flags |= CSD_FLAG_LOCK; /* * prevent CPU from reordering the above assignment @@ -243,12 +243,12 @@ static __always_inline void csd_lock(call_single_data_t *csd) static __always_inline void csd_unlock(call_single_data_t *csd) { - WARN_ON(!(csd->flags & CSD_FLAG_LOCK)); + WARN_ON(!(csd->node.u_flags & CSD_FLAG_LOCK)); /* * ensure we're all done before releasing data: */ - smp_store_release(&csd->flags, 0); + 
smp_store_release(&csd->node.u_flags, 0); } static DEFINE_PER_CPU_SHARED_ALIGNED(call_single_data_t, csd_data); @@ -300,7 +300,7 @@ static int generic_exec_single(int cpu, call_single_data_t *csd) return -ENXIO; } - __smp_call_single_queue(cpu, &csd->llist); + __smp_call_single_queue(cpu, &csd->node.llist); return 0; } @@ -353,7 +353,7 @@ static void flush_smp_call_function_queue(bool warn_cpu_offline) * We don't have to use the _safe() variant here * because we are not invoking the IPI handlers yet. */ - llist_for_each_entry(csd, entry, llist) { + llist_for_each_entry(csd, entry, node.llist) { switch (CSD_TYPE(csd)) { case CSD_TYPE_ASYNC: case CSD_TYPE_SYNC: @@ -378,16 +378,16 @@ static void flush_smp_call_function_queue(bool warn_cpu_offline) * First; run all SYNC callbacks, people are waiting for us. */ prev = NULL; - llist_for_each_entry_safe(csd, csd_next, entry, llist) { + llist_for_each_entry_safe(csd, csd_next, entry, node.llist) { /* Do we wait until *after* callback? */ if (CSD_TYPE(csd) == CSD_TYPE_SYNC) { smp_call_func_t func = csd->func; void *info = csd->info; if (prev) { - prev->next = &csd_next->llist; + prev->next = &csd_next->node.llist; } else { - entry = &csd_next->llist; + entry = &csd_next->node.llist; } csd_lock_record(csd); @@ -395,7 +395,7 @@ static void flush_smp_call_function_queue(bool warn_cpu_offline) csd_unlock(csd); csd_lock_record(NULL); } else { - prev = &csd->llist; + prev = &csd->node.llist; } } @@ -406,14 +406,14 @@ static void flush_smp_call_function_queue(bool warn_cpu_offline) * Second; run all !SYNC callbacks. */ prev = NULL; - llist_for_each_entry_safe(csd, csd_next, entry, llist) { + llist_for_each_entry_safe(csd, csd_next, entry, node.llist) { int type = CSD_TYPE(csd); if (type != CSD_TYPE_TTWU) { if (prev) { - prev->next = &csd_next->llist; + prev->next = &csd_next->node.llist; } else { - entry = &csd_next->llist; + entry = &csd_next->node.llist; } if (type == CSD_TYPE_ASYNC) { @@ -429,7 +429,7 @@ static void flush_smp_call_function_queue(bool warn_cpu_offline) } } else { - prev = &csd->llist; + prev = &csd->node.llist; } } @@ -465,7 +465,7 @@ int smp_call_function_single(int cpu, smp_call_func_t func, void *info, { call_single_data_t *csd; call_single_data_t csd_stack = { - .flags = CSD_FLAG_LOCK | CSD_TYPE_SYNC, + .node = { .u_flags = CSD_FLAG_LOCK | CSD_TYPE_SYNC, }, }; int this_cpu; int err; @@ -502,8 +502,8 @@ int smp_call_function_single(int cpu, smp_call_func_t func, void *info, csd->func = func; csd->info = info; #ifdef CONFIG_CSD_LOCK_WAIT_DEBUG - csd->src = smp_processor_id(); - csd->dst = cpu; + csd->node.src = smp_processor_id(); + csd->node.dst = cpu; #endif err = generic_exec_single(cpu, csd); @@ -544,12 +544,12 @@ int smp_call_function_single_async(int cpu, call_single_data_t *csd) preempt_disable(); - if (csd->flags & CSD_FLAG_LOCK) { + if (csd->node.u_flags & CSD_FLAG_LOCK) { err = -EBUSY; goto out; } - csd->flags = CSD_FLAG_LOCK; + csd->node.u_flags = CSD_FLAG_LOCK; smp_wmb(); err = generic_exec_single(cpu, csd); @@ -667,14 +667,14 @@ static void smp_call_function_many_cond(const struct cpumask *mask, csd_lock(csd); if (wait) - csd->flags |= CSD_TYPE_SYNC; + csd->node.u_flags |= CSD_TYPE_SYNC; csd->func = func; csd->info = info; #ifdef CONFIG_CSD_LOCK_WAIT_DEBUG - csd->src = smp_processor_id(); - csd->dst = cpu; + csd->node.src = smp_processor_id(); + csd->node.dst = cpu; #endif - if (llist_add(&csd->llist, &per_cpu(call_single_queue, cpu))) + if (llist_add(&csd->node.llist, &per_cpu(call_single_queue, cpu))) 
__cpumask_set_cpu(cpu, cfd->cpumask_ipi); } diff --git a/net/core/dev.c b/net/core/dev.c index 82dc6b48e45f..57352605f82c 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -11165,8 +11165,7 @@ static int __init net_dev_init(void) INIT_LIST_HEAD(&sd->poll_list); sd->output_queue_tailp = &sd->output_queue; #ifdef CONFIG_RPS - sd->csd.func = rps_trigger_softirq; - sd->csd.info = sd; + INIT_CSD(&sd->csd, rps_trigger_softirq, sd); sd->cpu = i; #endif -- cgit v1.2.3 From 2914b0ba61a9d253535e51af16c7122a8148995d Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 18 Jun 2020 22:28:37 +0200 Subject: irq_work: Optimize irq_work_single() Trade one atomic op for a full memory barrier. Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Frederic Weisbecker --- include/linux/irqflags.h | 8 ++++---- kernel/irq_work.c | 29 +++++++++++++++++------------ 2 files changed, 21 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/include/linux/irqflags.h b/include/linux/irqflags.h index fef2d43a7a1d..8de0e1373de7 100644 --- a/include/linux/irqflags.h +++ b/include/linux/irqflags.h @@ -107,14 +107,14 @@ do { \ current->irq_config = 0; \ } while (0) -# define lockdep_irq_work_enter(__work) \ +# define lockdep_irq_work_enter(_flags) \ do { \ - if (!(atomic_read(&__work->node.a_flags) & IRQ_WORK_HARD_IRQ))\ + if (!((_flags) & IRQ_WORK_HARD_IRQ)) \ current->irq_config = 1; \ } while (0) -# define lockdep_irq_work_exit(__work) \ +# define lockdep_irq_work_exit(_flags) \ do { \ - if (!(atomic_read(&__work->node.a_flags) & IRQ_WORK_HARD_IRQ))\ + if (!((_flags) & IRQ_WORK_HARD_IRQ)) \ current->irq_config = 0; \ } while (0) diff --git a/kernel/irq_work.c b/kernel/irq_work.c index fbff25adb574..e8da1e71583a 100644 --- a/kernel/irq_work.c +++ b/kernel/irq_work.c @@ -34,7 +34,7 @@ static bool irq_work_claim(struct irq_work *work) oflags = atomic_fetch_or(IRQ_WORK_CLAIMED | CSD_TYPE_IRQ_WORK, &work->node.a_flags); /* * If the work is already pending, no need to raise the IPI. - * The pairing atomic_fetch_andnot() in irq_work_run() makes sure + * The pairing smp_mb() in irq_work_single() makes sure * everything we did before is visible. */ if (oflags & IRQ_WORK_PENDING) @@ -136,22 +136,27 @@ void irq_work_single(void *arg) int flags; /* - * Clear the PENDING bit, after this point the @work - * can be re-used. - * Make it immediately visible so that other CPUs trying - * to claim that work don't rely on us to handle their data - * while we are in the middle of the func. + * Clear the PENDING bit, after this point the @work can be re-used. + * The PENDING bit acts as a lock, and we own it, so we can clear it + * without atomic ops. */ - flags = atomic_fetch_andnot(IRQ_WORK_PENDING, &work->node.a_flags); + flags = atomic_read(&work->node.a_flags); + flags &= ~IRQ_WORK_PENDING; + atomic_set(&work->node.a_flags, flags); + + /* + * See irq_work_claim(). + */ + smp_mb(); - lockdep_irq_work_enter(work); + lockdep_irq_work_enter(flags); work->func(work); - lockdep_irq_work_exit(work); + lockdep_irq_work_exit(flags); + /* - * Clear the BUSY bit and return to the free state if - * no-one else claimed it meanwhile. + * Clear the BUSY bit, if set, and return to the free state if no-one + * else claimed it meanwhile. 
*/ - flags &= ~IRQ_WORK_PENDING; (void)atomic_cmpxchg(&work->node.a_flags, flags, flags & ~IRQ_WORK_BUSY); } -- cgit v1.2.3 From 607c543f939d8ca6fed7afe90b3a8d6f6684dd17 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Fri, 20 Nov 2020 23:08:29 -0800 Subject: bpf: Sanitize BTF data pointer after module is loaded Given .BTF section is not allocatable, it will get trimmed after module is loaded. BPF system handles that properly by creating an independent copy of data. But prevent any accidental misused by resetting the pointer to BTF data. Fixes: 36e68442d1af ("bpf: Load and verify kernel module BTFs") Suggested-by: Jessica Yu Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Acked-by: Jessica Yu Cc: Greg Kroah-Hartman Link: https://lore.kernel.org/bpf/20201121070829.2612884-2-andrii@kernel.org --- kernel/module.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index f2996b02ab2e..18f259d61d14 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -3709,6 +3709,11 @@ static noinline int do_init_module(struct module *mod) mod->init_layout.ro_size = 0; mod->init_layout.ro_after_init_size = 0; mod->init_layout.text_size = 0; +#ifdef CONFIG_DEBUG_INFO_BTF_MODULES + /* .BTF is not SHF_ALLOC and will get removed, so sanitize pointer */ + mod->btf_data = NULL; + mod->btf_data_size = 0; +#endif /* * We want to free module_init, but be aware that kallsyms may be * walking this with preempt disabled. In all the failure paths, we -- cgit v1.2.3 From ba59eae723857257a791618092d8022ad82efaa4 Mon Sep 17 00:00:00 2001 From: Alex Shi Date: Fri, 6 Nov 2020 16:31:22 +0800 Subject: audit: fix macros warnings Some unused macros could cause gcc warning: kernel/audit.c:68:0: warning: macro "AUDIT_UNINITIALIZED" is not used [-Wunused-macros] kernel/auditsc.c:104:0: warning: macro "AUDIT_AUX_IPCPERM" is not used [-Wunused-macros] kernel/auditsc.c:82:0: warning: macro "AUDITSC_INVALID" is not used [-Wunused-macros] AUDIT_UNINITIALIZED and AUDITSC_INVALID are still meaningful and should be in incorporated. Just remove AUDIT_AUX_IPCPERM. Thanks comments from Richard Guy Briggs and Paul Moore. Signed-off-by: Alex Shi Cc: Paul Moore Cc: Richard Guy Briggs Cc: Eric Paris Cc: linux-audit@redhat.com Cc: linux-kernel@vger.kernel.org Signed-off-by: Paul Moore --- kernel/audit.c | 2 +- kernel/auditsc.c | 11 +++++------ 2 files changed, 6 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index ac0aeaa99937..e22f22bdc000 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -67,7 +67,7 @@ #define AUDIT_DISABLED -1 #define AUDIT_UNINITIALIZED 0 #define AUDIT_INITIALIZED 1 -static int audit_initialized; +static int audit_initialized = AUDIT_UNINITIALIZED; u32 audit_enabled = AUDIT_OFF; bool audit_ever_enabled = !!AUDIT_OFF; diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 183d79cc2e12..9cbe6d5437be 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -102,8 +102,6 @@ struct audit_aux_data { int type; }; -#define AUDIT_AUX_IPCPERM 0 - /* Number of target pids per aux struct. 
*/ #define AUDIT_AUX_PIDS 16 @@ -552,11 +550,11 @@ static int audit_filter_rules(struct task_struct *tsk, break; case AUDIT_EXIT: - if (ctx && ctx->return_valid) + if (ctx && ctx->return_valid != AUDITSC_INVALID) result = audit_comparator(ctx->return_code, f->op, f->val); break; case AUDIT_SUCCESS: - if (ctx && ctx->return_valid) { + if (ctx && ctx->return_valid != AUDITSC_INVALID) { if (f->val) result = audit_comparator(ctx->return_valid, f->op, AUDITSC_SUCCESS); else @@ -930,6 +928,7 @@ static inline struct audit_context *audit_alloc_context(enum audit_state state) INIT_LIST_HEAD(&context->killed_trees); INIT_LIST_HEAD(&context->names_list); context->fds[0] = -1; + context->return_valid = AUDITSC_INVALID; return context; } @@ -1488,7 +1487,7 @@ static void audit_log_exit(void) context->arch, context->major); if (context->personality != PER_LINUX) audit_log_format(ab, " per=%lx", context->personality); - if (context->return_valid) + if (context->return_valid != AUDITSC_INVALID) audit_log_format(ab, " success=%s exit=%ld", (context->return_valid==AUDITSC_SUCCESS)?"yes":"no", context->return_code); @@ -1625,7 +1624,7 @@ void __audit_free(struct task_struct *tsk) * need to log via audit_log_exit(). */ if (tsk == current && !context->dummy && context->in_syscall) { - context->return_valid = 0; + context->return_valid = AUDITSC_INVALID; context->return_code = 0; audit_filter_syscall(tsk, context, -- cgit v1.2.3 From 59e2e27d227a0a4e7ec0e22c63ca36a5ad1ab438 Mon Sep 17 00:00:00 2001 From: Wedson Almeida Filho Date: Sat, 21 Nov 2020 01:55:09 +0000 Subject: bpf: Refactor check_cfg to use a structured loop. The current implementation uses a number of gotos to implement a loop and different paths within the loop, which makes the code less readable than it would be with an explicit while-loop. This patch also replaces a chain of if/if-elses keyed on the same expression with a switch statement. No change in behaviour is intended. 
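For readers skimming past the diff, the shape of the rewrite is roughly the stand-alone sketch below. It is a simplification under assumptions (the stack handling and the visit callback body are placeholders), not the verifier code itself; only the DONE_EXPLORING/KEEP_EXPLORING convention is taken from the patch.

enum { DONE_EXPLORING = 0, KEEP_EXPLORING = 1 };

/*
 * Generic worklist loop: visit() returns DONE_EXPLORING, KEEP_EXPLORING,
 * or a negative error.  This replaces the goto-driven "peek_stack" pattern
 * with an explicit while loop.
 */
static int explore(int *stack, int *depth, int (*visit)(int node))
{
	while (*depth > 0) {
		int t = stack[*depth - 1];
		int ret = visit(t);

		if (ret < 0)
			return ret;		/* propagate the error */
		if (ret == DONE_EXPLORING)
			(*depth)--;		/* node fully explored: pop it */
		/* KEEP_EXPLORING: visit() pushed new work; just loop again */
	}
	return 0;
}
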
Signed-off-by: Wedson Almeida Filho Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20201121015509.3594191-1-wedsonaf@google.com --- kernel/bpf/verifier.c | 179 +++++++++++++++++++++++++++----------------------- 1 file changed, 95 insertions(+), 84 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index fb2943ea715d..e333ce43f281 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -8047,6 +8047,11 @@ static void init_explored_state(struct bpf_verifier_env *env, int idx) env->insn_aux_data[idx].prune_point = true; } +enum { + DONE_EXPLORING = 0, + KEEP_EXPLORING = 1, +}; + /* t, w, e - match pseudo-code above: * t - index of current instruction * w - next instruction @@ -8059,10 +8064,10 @@ static int push_insn(int t, int w, int e, struct bpf_verifier_env *env, int *insn_state = env->cfg.insn_state; if (e == FALLTHROUGH && insn_state[t] >= (DISCOVERED | FALLTHROUGH)) - return 0; + return DONE_EXPLORING; if (e == BRANCH && insn_state[t] >= (DISCOVERED | BRANCH)) - return 0; + return DONE_EXPLORING; if (w < 0 || w >= env->prog->len) { verbose_linfo(env, t, "%d: ", t); @@ -8081,10 +8086,10 @@ static int push_insn(int t, int w, int e, struct bpf_verifier_env *env, if (env->cfg.cur_stack >= env->prog->len) return -E2BIG; insn_stack[env->cfg.cur_stack++] = w; - return 1; + return KEEP_EXPLORING; } else if ((insn_state[w] & 0xF0) == DISCOVERED) { if (loop_ok && env->bpf_capable) - return 0; + return DONE_EXPLORING; verbose_linfo(env, t, "%d: ", t); verbose_linfo(env, w, "%d: ", w); verbose(env, "back-edge from insn %d to %d\n", t, w); @@ -8096,7 +8101,74 @@ static int push_insn(int t, int w, int e, struct bpf_verifier_env *env, verbose(env, "insn state internal bug\n"); return -EFAULT; } - return 0; + return DONE_EXPLORING; +} + +/* Visits the instruction at index t and returns one of the following: + * < 0 - an error occurred + * DONE_EXPLORING - the instruction was fully explored + * KEEP_EXPLORING - there is still work to be done before it is fully explored + */ +static int visit_insn(int t, int insn_cnt, struct bpf_verifier_env *env) +{ + struct bpf_insn *insns = env->prog->insnsi; + int ret; + + /* All non-branch instructions have a single fall-through edge. */ + if (BPF_CLASS(insns[t].code) != BPF_JMP && + BPF_CLASS(insns[t].code) != BPF_JMP32) + return push_insn(t, t + 1, FALLTHROUGH, env, false); + + switch (BPF_OP(insns[t].code)) { + case BPF_EXIT: + return DONE_EXPLORING; + + case BPF_CALL: + ret = push_insn(t, t + 1, FALLTHROUGH, env, false); + if (ret) + return ret; + + if (t + 1 < insn_cnt) + init_explored_state(env, t + 1); + if (insns[t].src_reg == BPF_PSEUDO_CALL) { + init_explored_state(env, t); + ret = push_insn(t, t + insns[t].imm + 1, BRANCH, + env, false); + } + return ret; + + case BPF_JA: + if (BPF_SRC(insns[t].code) != BPF_K) + return -EINVAL; + + /* unconditional jump with single edge */ + ret = push_insn(t, t + insns[t].off + 1, FALLTHROUGH, env, + true); + if (ret) + return ret; + + /* unconditional jmp is not a good pruning point, + * but it's marked, since backtracking needs + * to record jmp history in is_state_visited(). 
+ */ + init_explored_state(env, t + insns[t].off + 1); + /* tell verifier to check for equivalent states + * after every call and jump + */ + if (t + 1 < insn_cnt) + init_explored_state(env, t + 1); + + return ret; + + default: + /* conditional jump with two edges */ + init_explored_state(env, t); + ret = push_insn(t, t + 1, FALLTHROUGH, env, true); + if (ret) + return ret; + + return push_insn(t, t + insns[t].off + 1, BRANCH, env, true); + } } /* non-recursive depth-first-search to detect loops in BPF program @@ -8104,11 +8176,10 @@ static int push_insn(int t, int w, int e, struct bpf_verifier_env *env, */ static int check_cfg(struct bpf_verifier_env *env) { - struct bpf_insn *insns = env->prog->insnsi; int insn_cnt = env->prog->len; int *insn_stack, *insn_state; int ret = 0; - int i, t; + int i; insn_state = env->cfg.insn_state = kvcalloc(insn_cnt, sizeof(int), GFP_KERNEL); if (!insn_state) @@ -8124,92 +8195,32 @@ static int check_cfg(struct bpf_verifier_env *env) insn_stack[0] = 0; /* 0 is the first instruction */ env->cfg.cur_stack = 1; -peek_stack: - if (env->cfg.cur_stack == 0) - goto check_state; - t = insn_stack[env->cfg.cur_stack - 1]; - - if (BPF_CLASS(insns[t].code) == BPF_JMP || - BPF_CLASS(insns[t].code) == BPF_JMP32) { - u8 opcode = BPF_OP(insns[t].code); - - if (opcode == BPF_EXIT) { - goto mark_explored; - } else if (opcode == BPF_CALL) { - ret = push_insn(t, t + 1, FALLTHROUGH, env, false); - if (ret == 1) - goto peek_stack; - else if (ret < 0) - goto err_free; - if (t + 1 < insn_cnt) - init_explored_state(env, t + 1); - if (insns[t].src_reg == BPF_PSEUDO_CALL) { - init_explored_state(env, t); - ret = push_insn(t, t + insns[t].imm + 1, BRANCH, - env, false); - if (ret == 1) - goto peek_stack; - else if (ret < 0) - goto err_free; - } - } else if (opcode == BPF_JA) { - if (BPF_SRC(insns[t].code) != BPF_K) { - ret = -EINVAL; - goto err_free; - } - /* unconditional jump with single edge */ - ret = push_insn(t, t + insns[t].off + 1, - FALLTHROUGH, env, true); - if (ret == 1) - goto peek_stack; - else if (ret < 0) - goto err_free; - /* unconditional jmp is not a good pruning point, - * but it's marked, since backtracking needs - * to record jmp history in is_state_visited(). 
- */ - init_explored_state(env, t + insns[t].off + 1); - /* tell verifier to check for equivalent states - * after every call and jump - */ - if (t + 1 < insn_cnt) - init_explored_state(env, t + 1); - } else { - /* conditional jump with two edges */ - init_explored_state(env, t); - ret = push_insn(t, t + 1, FALLTHROUGH, env, true); - if (ret == 1) - goto peek_stack; - else if (ret < 0) - goto err_free; + while (env->cfg.cur_stack > 0) { + int t = insn_stack[env->cfg.cur_stack - 1]; - ret = push_insn(t, t + insns[t].off + 1, BRANCH, env, true); - if (ret == 1) - goto peek_stack; - else if (ret < 0) - goto err_free; - } - } else { - /* all other non-branch instructions with single - * fall-through edge - */ - ret = push_insn(t, t + 1, FALLTHROUGH, env, false); - if (ret == 1) - goto peek_stack; - else if (ret < 0) + ret = visit_insn(t, insn_cnt, env); + switch (ret) { + case DONE_EXPLORING: + insn_state[t] = EXPLORED; + env->cfg.cur_stack--; + break; + case KEEP_EXPLORING: + break; + default: + if (ret > 0) { + verbose(env, "visit_insn internal bug\n"); + ret = -EFAULT; + } goto err_free; + } } -mark_explored: - insn_state[t] = EXPLORED; - if (env->cfg.cur_stack-- <= 0) { + if (env->cfg.cur_stack < 0) { verbose(env, "pop stack internal bug\n"); ret = -EFAULT; goto err_free; } - goto peek_stack; -check_state: for (i = 0; i < insn_cnt; i++) { if (insn_state[i] != EXPLORED) { verbose(env, "unreachable insn %d\n", i); -- cgit v1.2.3 From b112082c8930e7aa72422484b2d31d3aa06f58bc Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Mon, 23 Nov 2020 11:23:15 +0100 Subject: module: simplify version-attribute handling Instead of using the array-of-pointers trick to avoid having gcc mess up the built-in module-version array stride, specify type alignment when declaring entries to prevent gcc from increasing alignment. This is essentially an alternative (one-line) fix to the problem addressed by commit b4bc842802db ("module: deal with alignment issues in built-in module versions"). gcc can increase the alignment of larger objects with static extent as an optimisation, but this can be suppressed by using the aligned attribute when declaring variables. Note that we have been relying on this behaviour for kernel parameters for 16 years and it indeed hasn't changed since the introduction of the aligned attribute in gcc-3.1. 
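The mechanism generalizes beyond module version attributes. The following self-contained sketch (illustrative names, not the kernel macros) shows why forcing each section entry's alignment to __alignof__() of the struct keeps the start/stop pointer walk valid: every entry then has the same stride as sizeof() of the struct, so the section really is an array.

struct entry {
	const char *name;
	const char *version;
};

/* Emit one entry into a named section without letting the compiler pad it
 * to a larger alignment (which would change the stride of the "array").
 */
#define DEFINE_ENTRY(_name, _ver)					\
	static const struct entry entry_##_name				\
	__attribute__((used, section("my_entries"),			\
		       aligned(__alignof__(struct entry))))		\
	= { #_name, _ver }

/* The linker provides these for sections whose names are C identifiers. */
extern const struct entry __start_my_entries[];
extern const struct entry __stop_my_entries[];

static void walk_entries(void (*fn)(const struct entry *))
{
	const struct entry *e;

	/* stride is sizeof(struct entry), which the aligned attribute ensures */
	for (e = __start_my_entries; e < __stop_my_entries; e++)
		fn(e);
}

DEFINE_ENTRY(demo, "1.0");
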
Link: https://lore.kernel.org/lkml/20201103175711.10731-1-johan@kernel.org Signed-off-by: Johan Hovold Signed-off-by: Jessica Yu --- include/linux/module.h | 26 +++++++++++++------------- kernel/params.c | 10 ++++------ 2 files changed, 17 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/include/linux/module.h b/include/linux/module.h index 293250958512..5958075ea3f4 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -266,20 +266,20 @@ extern typeof(name) __mod_##type##__##name##_device_table \ #else #define MODULE_VERSION(_version) \ MODULE_INFO(version, _version); \ - static struct module_version_attribute ___modver_attr = { \ - .mattr = { \ - .attr = { \ - .name = "version", \ - .mode = S_IRUGO, \ + static struct module_version_attribute __modver_attr \ + __used __section("__modver") \ + __aligned(__alignof__(struct module_version_attribute)) \ + = { \ + .mattr = { \ + .attr = { \ + .name = "version", \ + .mode = S_IRUGO, \ + }, \ + .show = __modver_version_show, \ }, \ - .show = __modver_version_show, \ - }, \ - .module_name = KBUILD_MODNAME, \ - .version = _version, \ - }; \ - static const struct module_version_attribute \ - __used __section("__modver") \ - * __moduleparam_const __modver_attr = &___modver_attr + .module_name = KBUILD_MODNAME, \ + .version = _version, \ + }; #endif /* Optional firmware file (or files) needed by the module diff --git a/kernel/params.c b/kernel/params.c index 3835fb82c64b..aa7d6f2213f1 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -843,18 +843,16 @@ ssize_t __modver_version_show(struct module_attribute *mattr, return scnprintf(buf, PAGE_SIZE, "%s\n", vattr->version); } -extern const struct module_version_attribute *__start___modver[]; -extern const struct module_version_attribute *__stop___modver[]; +extern const struct module_version_attribute __start___modver[]; +extern const struct module_version_attribute __stop___modver[]; static void __init version_sysfs_builtin(void) { - const struct module_version_attribute **p; + const struct module_version_attribute *vattr; struct module_kobject *mk; int err; - for (p = __start___modver; p < __stop___modver; p++) { - const struct module_version_attribute *vattr = *p; - + for (vattr = __start___modver; vattr < __stop___modver; vattr++) { mk = locate_module_kobject(vattr->module_name); if (mk) { err = sysfs_create_file(&mk->kobj, &vattr->mattr.attr); -- cgit v1.2.3 From b87e745945e3de3e4d5c5eeb53e0e455e5cd5416 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Wed, 25 Nov 2020 18:16:26 +0200 Subject: resource: provide meaningful MODULE_LICENSE() in test suite modpost complains that module has no licence provided. Provide it via meaningful MODULE_LICENSE(). Reported-by: Stephen Rothwell Signed-off-by: Andy Shevchenko Signed-off-by: Rafael J. Wysocki --- kernel/resource_kunit.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/resource_kunit.c b/kernel/resource_kunit.c index 9fdbca8426f1..58ab9f914602 100644 --- a/kernel/resource_kunit.c +++ b/kernel/resource_kunit.c @@ -148,3 +148,5 @@ static struct kunit_suite resource_test_suite = { .test_cases = resource_test_cases, }; kunit_test_suite(resource_test_suite); + +MODULE_LICENSE("GPL"); -- cgit v1.2.3 From 01341fbd0d8d4e717fc1231cdffe00343088ce0b Mon Sep 17 00:00:00 2001 From: Yunfeng Ye Date: Thu, 19 Nov 2020 14:21:25 +0800 Subject: workqueue: Kick a worker based on the actual activation of delayed works In realtime scenario, We do not want to have interference on the isolated cpu cores. 
but when invoking alloc_workqueue() for percpu wq on the housekeeping cpu, it kick a kworker on the isolated cpu. alloc_workqueue pwq_adjust_max_active wake_up_worker The comment in pwq_adjust_max_active() said: "Need to kick a worker after thawed or an unbound wq's max_active is bumped" So it is unnecessary to kick a kworker for percpu's wq when invoking alloc_workqueue(). this patch only kick a worker based on the actual activation of delayed works. Signed-off-by: Yunfeng Ye Reviewed-by: Lai Jiangshan Signed-off-by: Tejun Heo --- kernel/workqueue.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 437935e7a199..0695c7895c89 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -3728,17 +3728,24 @@ static void pwq_adjust_max_active(struct pool_workqueue *pwq) * is updated and visible. */ if (!freezable || !workqueue_freezing) { + bool kick = false; + pwq->max_active = wq->saved_max_active; while (!list_empty(&pwq->delayed_works) && - pwq->nr_active < pwq->max_active) + pwq->nr_active < pwq->max_active) { pwq_activate_first_delayed(pwq); + kick = true; + } /* * Need to kick a worker after thawed or an unbound wq's - * max_active is bumped. It's a slow path. Do it always. + * max_active is bumped. In realtime scenarios, always kicking a + * worker will cause interference on the isolated cpu cores, so + * let's kick iff work items were activated. */ - wake_up_worker(pwq->pool); + if (kick) + wake_up_worker(pwq->pool); } else { pwq->max_active = 0; } -- cgit v1.2.3 From 58315c96651152b9f438e5e56c910994234e2c7a Mon Sep 17 00:00:00 2001 From: Bhaskar Chowdhury Date: Mon, 9 Nov 2020 16:01:11 +0530 Subject: kernel: cgroup: Mundane spelling fixes throughout the file Few spelling fixes throughout the file. Signed-off-by: Bhaskar Chowdhury Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index e41c21819ba0..690f477d537c 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -244,7 +244,7 @@ bool cgroup_ssid_enabled(int ssid) * * The default hierarchy is the v2 interface of cgroup and this function * can be used to test whether a cgroup is on the default hierarchy for - * cases where a subsystem should behave differnetly depending on the + * cases where a subsystem should behave differently depending on the * interface version. * * List of changed behaviors: @@ -262,7 +262,7 @@ bool cgroup_ssid_enabled(int ssid) * "cgroup.procs" instead. * * - "cgroup.procs" is not sorted. pids will be unique unless they got - * recycled inbetween reads. + * recycled in-between reads. * * - "release_agent" and "notify_on_release" are removed. Replacement * notification mechanism will be implemented. @@ -345,7 +345,7 @@ static bool cgroup_is_mixable(struct cgroup *cgrp) return !cgroup_parent(cgrp); } -/* can @cgrp become a thread root? should always be true for a thread root */ +/* can @cgrp become a thread root? Should always be true for a thread root */ static bool cgroup_can_be_thread_root(struct cgroup *cgrp) { /* mixables don't care */ @@ -530,7 +530,7 @@ static struct cgroup_subsys_state *cgroup_e_css_by_mask(struct cgroup *cgrp, * the root css is returned, so this function always returns a valid css. * * The returned css is not guaranteed to be online, and therefore it is the - * callers responsiblity to tryget a reference for it. 
+ * callers responsibility to try get a reference for it. */ struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp, struct cgroup_subsys *ss) @@ -702,7 +702,7 @@ EXPORT_SYMBOL_GPL(of_css); ; \ else -/* walk live descendants in preorder */ +/* walk live descendants in pre order */ #define cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) \ css_for_each_descendant_pre((d_css), cgroup_css((cgrp), NULL)) \ if (({ lockdep_assert_held(&cgroup_mutex); \ @@ -936,7 +936,7 @@ void put_css_set_locked(struct css_set *cset) WARN_ON_ONCE(!list_empty(&cset->threaded_csets)); - /* This css_set is dead. unlink it and release cgroup and css refs */ + /* This css_set is dead. Unlink it and release cgroup and css refs */ for_each_subsys(ss, ssid) { list_del(&cset->e_cset_node[ssid]); css_put(cset->subsys[ssid]); @@ -1061,7 +1061,7 @@ static struct css_set *find_existing_css_set(struct css_set *old_cset, /* * Build the set of subsystem state objects that we want to see in the - * new css_set. while subsystems can change globally, the entries here + * new css_set. While subsystems can change globally, the entries here * won't change, so no need for locking. */ for_each_subsys(ss, i) { @@ -1151,7 +1151,7 @@ static void link_css_set(struct list_head *tmp_links, struct css_set *cset, /* * Always add links to the tail of the lists so that the lists are - * in choronological order. + * in chronological order. */ list_move_tail(&link->cset_link, &cgrp->cset_links); list_add_tail(&link->cgrp_link, &cset->cgrp_links); @@ -4137,7 +4137,7 @@ struct cgroup_subsys_state *css_next_child(struct cgroup_subsys_state *pos, * implies that if we observe !CSS_RELEASED on @pos in this RCU * critical section, the one pointed to by its next pointer is * guaranteed to not have finished its RCU grace period even if we - * have dropped rcu_read_lock() inbetween iterations. + * have dropped rcu_read_lock() in-between iterations. * * If @pos has CSS_RELEASED set, its next pointer can't be * dereferenced; however, as each css is given a monotonically @@ -4385,7 +4385,7 @@ static struct css_set *css_task_iter_next_css_set(struct css_task_iter *it) } /** - * css_task_iter_advance_css_set - advance a task itererator to the next css_set + * css_task_iter_advance_css_set - advance a task iterator to the next css_set * @it: the iterator to advance * * Advance @it to the next css_set to walk. @@ -6320,7 +6320,7 @@ struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss) * * Find the cgroup at @path on the default hierarchy, increment its * reference count and return it. Returns pointer to the found cgroup on - * success, ERR_PTR(-ENOENT) if @path doens't exist and ERR_PTR(-ENOTDIR) + * success, ERR_PTR(-ENOENT) if @path doesn't exist and ERR_PTR(-ENOTDIR) * if @path points to a non-directory. */ struct cgroup *cgroup_get_from_path(const char *path) -- cgit v1.2.3 From 5a7b5f32c5aa628841502d19a813c633ff6ecbe4 Mon Sep 17 00:00:00 2001 From: Hui Su Date: Fri, 6 Nov 2020 22:47:40 +0800 Subject: cgroup/cgroup.c: replace 'of->kn->priv' with of_cft() we have supplied the inline function: of_cft() in cgroup.h. So replace the direct use 'of->kn->priv' with inline func of_cft(), which is more readable. 
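For context, the accessor in question is, to the best of my knowledge, a one-line typed wrapper, which is why the change is purely a readability improvement. Quoted from memory rather than verbatim, so double-check the tree you are reading:

/* struct kernfs_open_file and struct kernfs_node come from <linux/kernfs.h> */
static inline struct cftype *of_cft(struct kernfs_open_file *of)
{
	return of->kn->priv;
}
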
Signed-off-by: Hui Su Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 690f477d537c..385f80cf7d0c 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -3657,7 +3657,7 @@ static ssize_t cgroup_freeze_write(struct kernfs_open_file *of, static int cgroup_file_open(struct kernfs_open_file *of) { - struct cftype *cft = of->kn->priv; + struct cftype *cft = of_cft(of); if (cft->open) return cft->open(of); @@ -3666,7 +3666,7 @@ static int cgroup_file_open(struct kernfs_open_file *of) static void cgroup_file_release(struct kernfs_open_file *of) { - struct cftype *cft = of->kn->priv; + struct cftype *cft = of_cft(of); if (cft->release) cft->release(of); @@ -3677,7 +3677,7 @@ static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf, { struct cgroup_namespace *ns = current->nsproxy->cgroup_ns; struct cgroup *cgrp = of->kn->parent->priv; - struct cftype *cft = of->kn->priv; + struct cftype *cft = of_cft(of); struct cgroup_subsys_state *css; int ret; @@ -3727,7 +3727,7 @@ static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf, static __poll_t cgroup_file_poll(struct kernfs_open_file *of, poll_table *pt) { - struct cftype *cft = of->kn->priv; + struct cftype *cft = of_cft(of); if (cft->poll) return cft->poll(of, pt); -- cgit v1.2.3 From 27672f0d280a3f286a410a8db2004f46ace72a17 Mon Sep 17 00:00:00 2001 From: KP Singh Date: Tue, 24 Nov 2020 15:12:09 +0000 Subject: bpf: Add a BPF helper for getting the IMA hash of an inode Provide a wrapper function to get the IMA hash of an inode. This helper is useful in fingerprinting files (e.g executables on execution) and using these fingerprints in detections like an executable unlinking itself. Since the ima_inode_hash can sleep, it's only allowed for sleepable LSM hooks. Signed-off-by: KP Singh Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20201124151210.1081188-3-kpsingh@chromium.org --- include/uapi/linux/bpf.h | 11 +++++++++++ kernel/bpf/bpf_lsm.c | 26 ++++++++++++++++++++++++++ scripts/bpf_helpers_doc.py | 2 ++ tools/include/uapi/linux/bpf.h | 11 +++++++++++ 4 files changed, 50 insertions(+) (limited to 'kernel') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 3ca6146f001a..c3458ec1f30a 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3807,6 +3807,16 @@ union bpf_attr { * See: **clock_gettime**\ (**CLOCK_MONOTONIC_COARSE**) * Return * Current *ktime*. + * + * long bpf_ima_inode_hash(struct inode *inode, void *dst, u32 size) + * Description + * Returns the stored IMA hash of the *inode* (if it's avaialable). + * If the hash is larger than *size*, then only *size* + * bytes will be copied to *dst* + * Return + * The **hash_algo** is returned on success, + * **-EOPNOTSUP** if IMA is disabled or **-EINVAL** if + * invalid arguments are passed. 
*/ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3970,6 +3980,7 @@ union bpf_attr { FN(get_current_task_btf), \ FN(bprm_opts_set), \ FN(ktime_get_coarse_ns), \ + FN(ima_inode_hash), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c index b4f27a874092..70e5e0b6d69d 100644 --- a/kernel/bpf/bpf_lsm.c +++ b/kernel/bpf/bpf_lsm.c @@ -15,6 +15,7 @@ #include #include #include +#include /* For every LSM hook that allows attachment of BPF programs, declare a nop * function where a BPF program can be attached. @@ -75,6 +76,29 @@ const static struct bpf_func_proto bpf_bprm_opts_set_proto = { .arg2_type = ARG_ANYTHING, }; +BPF_CALL_3(bpf_ima_inode_hash, struct inode *, inode, void *, dst, u32, size) +{ + return ima_inode_hash(inode, dst, size); +} + +static bool bpf_ima_inode_hash_allowed(const struct bpf_prog *prog) +{ + return bpf_lsm_is_sleepable_hook(prog->aux->attach_btf_id); +} + +BTF_ID_LIST_SINGLE(bpf_ima_inode_hash_btf_ids, struct, inode) + +const static struct bpf_func_proto bpf_ima_inode_hash_proto = { + .func = bpf_ima_inode_hash, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_BTF_ID, + .arg1_btf_id = &bpf_ima_inode_hash_btf_ids[0], + .arg2_type = ARG_PTR_TO_UNINIT_MEM, + .arg3_type = ARG_CONST_SIZE, + .allowed = bpf_ima_inode_hash_allowed, +}; + static const struct bpf_func_proto * bpf_lsm_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { @@ -97,6 +121,8 @@ bpf_lsm_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_task_storage_delete_proto; case BPF_FUNC_bprm_opts_set: return &bpf_bprm_opts_set_proto; + case BPF_FUNC_ima_inode_hash: + return prog->aux->sleepable ? &bpf_ima_inode_hash_proto : NULL; default: return tracing_prog_func_proto(func_id, prog); } diff --git a/scripts/bpf_helpers_doc.py b/scripts/bpf_helpers_doc.py index c5bc947a70ad..8b829748d488 100755 --- a/scripts/bpf_helpers_doc.py +++ b/scripts/bpf_helpers_doc.py @@ -436,6 +436,7 @@ class PrinterHelpers(Printer): 'struct xdp_md', 'struct path', 'struct btf_ptr', + 'struct inode', ] known_types = { '...', @@ -480,6 +481,7 @@ class PrinterHelpers(Printer): 'struct task_struct', 'struct path', 'struct btf_ptr', + 'struct inode', } mapped_types = { 'u8': '__u8', diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 3ca6146f001a..c3458ec1f30a 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -3807,6 +3807,16 @@ union bpf_attr { * See: **clock_gettime**\ (**CLOCK_MONOTONIC_COARSE**) * Return * Current *ktime*. + * + * long bpf_ima_inode_hash(struct inode *inode, void *dst, u32 size) + * Description + * Returns the stored IMA hash of the *inode* (if it's avaialable). + * If the hash is larger than *size*, then only *size* + * bytes will be copied to *dst* + * Return + * The **hash_algo** is returned on success, + * **-EOPNOTSUP** if IMA is disabled or **-EINVAL** if + * invalid arguments are passed. 
*/ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3970,6 +3980,7 @@ union bpf_attr { FN(get_current_task_btf), \ FN(bprm_opts_set), \ FN(ktime_get_coarse_ns), \ + FN(ima_inode_hash), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper -- cgit v1.2.3 From 8ff00399b153440c1c83e20c43020385b416415b Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Thu, 26 Nov 2020 20:25:29 +1000 Subject: kernel/cpu: add arch override for clear_tasks_mm_cpumask() mm handling powerpc/64s keeps a counter in the mm which counts bits set in mm_cpumask as well as other things. This means it can't use generic code to clear bits out of the mask and doesn't adjust the arch specific counter. Add an arch override that allows powerpc/64s to use clear_tasks_mm_cpumask(). Signed-off-by: Nicholas Piggin Reviewed-by: Aneesh Kumar K.V Acked-by: Peter Zijlstra (Intel) Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20201126102530.691335-4-npiggin@gmail.com --- kernel/cpu.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 6ff2578ecf17..2b8d7a5db383 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -815,6 +815,10 @@ void __init cpuhp_threads_init(void) } #ifdef CONFIG_HOTPLUG_CPU +#ifndef arch_clear_mm_cpumask_cpu +#define arch_clear_mm_cpumask_cpu(cpu, mm) cpumask_clear_cpu(cpu, mm_cpumask(mm)) +#endif + /** * clear_tasks_mm_cpumask - Safely clear tasks' mm_cpumask for a CPU * @cpu: a CPU id @@ -850,7 +854,7 @@ void clear_tasks_mm_cpumask(int cpu) t = find_lock_task_mm(p); if (!t) continue; - cpumask_clear_cpu(cpu, mm_cpumask(t->mm)); + arch_clear_mm_cpumask_cpu(cpu, t->mm); task_unlock(t); } rcu_read_unlock(); -- cgit v1.2.3 From 8d8d53cf8fd028310b1189165b939cde124895d7 Mon Sep 17 00:00:00 2001 From: Alexey Kardashevskiy Date: Thu, 29 Oct 2020 12:52:40 +1100 Subject: dma-mapping: Allow mixing bypass and mapped DMA operation At the moment we allow bypassing DMA ops only when we can do this for the entire RAM. However there are configs with mixed type memory where we could still allow bypassing IOMMU in most cases; POWERPC with persistent memory is one example. This adds an arch hook to determine where bypass can still work and we invoke direct DMA API. The following patch checks the bus limit on POWERPC to allow or disallow direct mapping. This adds a ARCH_HAS_DMA_MAP_DIRECT config option to make the arch_xxxx hooks no-op by default. 
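What an arch-side hook might look like is sketched below. This is a hypothetical example for illustration only: the comparison against dev->bus_dma_limit is an assumption made up for this sketch, not the actual powerpc implementation that the follow-up patch adds with its own bus-limit logic.

#include <linux/device.h>
#include <linux/types.h>

/*
 * Hypothetical arch hook: with ARCH_HAS_DMA_MAP_DIRECT selected, the
 * platform decides per mapping whether the direct (no-IOMMU) path is safe.
 */
bool arch_dma_map_page_direct(struct device *dev, phys_addr_t addr)
{
	/* 'addr' is the physical end of the mapping being created */
	return dev->bus_dma_limit == 0 || addr <= dev->bus_dma_limit;
}
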
Signed-off-by: Alexey Kardashevskiy Signed-off-by: Christoph Hellwig --- include/linux/dma-map-ops.h | 14 ++++++++++++++ kernel/dma/Kconfig | 4 ++++ kernel/dma/mapping.c | 12 ++++++++---- 3 files changed, 26 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/include/linux/dma-map-ops.h b/include/linux/dma-map-ops.h index a5f89fc4d6df..38c8a4558e08 100644 --- a/include/linux/dma-map-ops.h +++ b/include/linux/dma-map-ops.h @@ -314,6 +314,20 @@ static inline void arch_dma_mark_clean(phys_addr_t paddr, size_t size) void *arch_dma_set_uncached(void *addr, size_t size); void arch_dma_clear_uncached(void *addr, size_t size); +#ifdef CONFIG_ARCH_HAS_DMA_MAP_DIRECT +bool arch_dma_map_page_direct(struct device *dev, phys_addr_t addr); +bool arch_dma_unmap_page_direct(struct device *dev, dma_addr_t dma_handle); +bool arch_dma_map_sg_direct(struct device *dev, struct scatterlist *sg, + int nents); +bool arch_dma_unmap_sg_direct(struct device *dev, struct scatterlist *sg, + int nents); +#else +#define arch_dma_map_page_direct(d, a) (false) +#define arch_dma_unmap_page_direct(d, a) (false) +#define arch_dma_map_sg_direct(d, s, n) (false) +#define arch_dma_unmap_sg_direct(d, s, n) (false) +#endif + #ifdef CONFIG_ARCH_HAS_SETUP_DMA_OPS void arch_setup_dma_ops(struct device *dev, u64 dma_base, u64 size, const struct iommu_ops *iommu, bool coherent); diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig index c99de4a21458..43d106598e82 100644 --- a/kernel/dma/Kconfig +++ b/kernel/dma/Kconfig @@ -20,6 +20,10 @@ config DMA_OPS config DMA_OPS_BYPASS bool +# Lets platform IOMMU driver choose between bypass and IOMMU +config ARCH_HAS_DMA_MAP_DIRECT + bool + config NEED_SG_DMA_LENGTH bool diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index 51bb8fa8eb89..f87a89d08654 100644 --- a/kernel/dma/mapping.c +++ b/kernel/dma/mapping.c @@ -149,7 +149,8 @@ dma_addr_t dma_map_page_attrs(struct device *dev, struct page *page, if (WARN_ON_ONCE(!dev->dma_mask)) return DMA_MAPPING_ERROR; - if (dma_map_direct(dev, ops)) + if (dma_map_direct(dev, ops) || + arch_dma_map_page_direct(dev, page_to_phys(page) + offset + size)) addr = dma_direct_map_page(dev, page, offset, size, dir, attrs); else addr = ops->map_page(dev, page, offset, size, dir, attrs); @@ -165,7 +166,8 @@ void dma_unmap_page_attrs(struct device *dev, dma_addr_t addr, size_t size, const struct dma_map_ops *ops = get_dma_ops(dev); BUG_ON(!valid_dma_direction(dir)); - if (dma_map_direct(dev, ops)) + if (dma_map_direct(dev, ops) || + arch_dma_unmap_page_direct(dev, addr + size)) dma_direct_unmap_page(dev, addr, size, dir, attrs); else if (ops->unmap_page) ops->unmap_page(dev, addr, size, dir, attrs); @@ -188,7 +190,8 @@ int dma_map_sg_attrs(struct device *dev, struct scatterlist *sg, int nents, if (WARN_ON_ONCE(!dev->dma_mask)) return 0; - if (dma_map_direct(dev, ops)) + if (dma_map_direct(dev, ops) || + arch_dma_map_sg_direct(dev, sg, nents)) ents = dma_direct_map_sg(dev, sg, nents, dir, attrs); else ents = ops->map_sg(dev, sg, nents, dir, attrs); @@ -207,7 +210,8 @@ void dma_unmap_sg_attrs(struct device *dev, struct scatterlist *sg, BUG_ON(!valid_dma_direction(dir)); debug_dma_unmap_sg(dev, sg, nents, dir); - if (dma_map_direct(dev, ops)) + if (dma_map_direct(dev, ops) || + arch_dma_unmap_sg_direct(dev, sg, nents)) dma_direct_unmap_sg(dev, sg, nents, dir, attrs); else if (ops->unmap_sg) ops->unmap_sg(dev, sg, nents, dir, attrs); -- cgit v1.2.3 From 94035edcb4e3bbc9f445bee706722ef64e044095 Mon Sep 17 00:00:00 2001 From: Tiezhu Yang Date: Sat, 7 
Nov 2020 18:03:12 +0800 Subject: dma-pool: no need to check return value of debugfs_create functions When calling debugfs functions, there is no need to ever check the return value. The function can work or not, but the code logic should never do something different based on this. Signed-off-by: Tiezhu Yang Reviewed-by: Robin Murphy Signed-off-by: Christoph Hellwig --- kernel/dma/pool.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/pool.c b/kernel/dma/pool.c index d4637f72239b..5f84e6cdb78e 100644 --- a/kernel/dma/pool.c +++ b/kernel/dma/pool.c @@ -38,9 +38,6 @@ static void __init dma_atomic_pool_debugfs_init(void) struct dentry *root; root = debugfs_create_dir("dma_pools", NULL); - if (IS_ERR_OR_NULL(root)) - return; - debugfs_create_ulong("pool_size_dma", 0400, root, &pool_size_dma); debugfs_create_ulong("pool_size_dma32", 0400, root, &pool_size_dma32); debugfs_create_ulong("pool_size_kernel", 0400, root, &pool_size_kernel); -- cgit v1.2.3 From 819b70ad620119d21a9e4be6ad665ece26fc0db8 Mon Sep 17 00:00:00 2001 From: tangjianqiang Date: Tue, 24 Nov 2020 18:40:19 +0800 Subject: dma-contiguous: fix a typo error in a comment Fix a typo error in cma description comment: "then" -> "than". Signed-off-by: tangjianqiang Signed-off-by: Christoph Hellwig --- kernel/dma/contiguous.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/dma/contiguous.c b/kernel/dma/contiguous.c index 16b95ff12e4d..3d63d91cba5c 100644 --- a/kernel/dma/contiguous.c +++ b/kernel/dma/contiguous.c @@ -20,7 +20,7 @@ * coders, etc. * * Such devices often require big memory buffers (a full HD frame - * is, for instance, more then 2 mega pixels large, i.e. more than 6 + * is, for instance, more than 2 mega pixels large, i.e. more than 6 * MB of memory), which makes mechanisms such as kmalloc() or * alloc_page() ineffective. * -- cgit v1.2.3 From 65789daa8087e125927230ccb7e1eab13999b0cf Mon Sep 17 00:00:00 2001 From: Barry Song Date: Mon, 16 Nov 2020 19:08:47 +1300 Subject: dma-mapping: add benchmark support for streaming DMA APIs Nowadays, there are increasing requirements to benchmark the performance of dma_map and dma_unmap, particularly while the device is attached to an IOMMU. This patch enables the support. Users can run a specified number of threads to do dma_map_page and dma_unmap_page on a specific NUMA node for a specified duration. Then dma_map_benchmark will calculate the average latency for map and unmap. A difficulty for this benchmark is that the dma_map/unmap APIs must run on a particular device. Each device might have a different backend: IOMMU or non-IOMMU.
So we use the driver_override to bind dma_map_benchmark to a particual device by: For platform devices: echo dma_map_benchmark > /sys/bus/platform/devices/xxx/driver_override echo xxx > /sys/bus/platform/drivers/xxx/unbind echo xxx > /sys/bus/platform/drivers/dma_map_benchmark/bind For PCI devices: echo dma_map_benchmark > /sys/bus/pci/devices/0000:00:01.0/driver_override echo 0000:00:01.0 > /sys/bus/pci/drivers/xxx/unbind echo 0000:00:01.0 > /sys/bus/pci/drivers/dma_map_benchmark/bind Cc: Will Deacon Cc: Shuah Khan Cc: Christoph Hellwig Cc: Marek Szyprowski Cc: Robin Murphy Signed-off-by: Barry Song [hch: folded in two fixes from Colin Ian King ] Signed-off-by: Christoph Hellwig --- kernel/dma/Kconfig | 9 ++ kernel/dma/Makefile | 1 + kernel/dma/map_benchmark.c | 361 +++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 371 insertions(+) create mode 100644 kernel/dma/map_benchmark.c (limited to 'kernel') diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig index 43d106598e82..ba50d4588bc5 100644 --- a/kernel/dma/Kconfig +++ b/kernel/dma/Kconfig @@ -229,3 +229,12 @@ config DMA_API_DEBUG_SG is technically out-of-spec. If unsure, say N. + +config DMA_MAP_BENCHMARK + bool "Enable benchmarking of streaming DMA mapping" + depends on DEBUG_FS + help + Provides /sys/kernel/debug/dma_map_benchmark that helps with testing + performance of dma_(un)map_page. + + See tools/testing/selftests/dma/dma_map_benchmark.c diff --git a/kernel/dma/Makefile b/kernel/dma/Makefile index dc755ab68aab..7aa6b26b1348 100644 --- a/kernel/dma/Makefile +++ b/kernel/dma/Makefile @@ -10,3 +10,4 @@ obj-$(CONFIG_DMA_API_DEBUG) += debug.o obj-$(CONFIG_SWIOTLB) += swiotlb.o obj-$(CONFIG_DMA_COHERENT_POOL) += pool.o obj-$(CONFIG_DMA_REMAP) += remap.o +obj-$(CONFIG_DMA_MAP_BENCHMARK) += map_benchmark.o diff --git a/kernel/dma/map_benchmark.c b/kernel/dma/map_benchmark.c new file mode 100644 index 000000000000..b1496e744c68 --- /dev/null +++ b/kernel/dma/map_benchmark.c @@ -0,0 +1,361 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2020 Hisilicon Limited. 
+ */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define DMA_MAP_BENCHMARK _IOWR('d', 1, struct map_benchmark) +#define DMA_MAP_MAX_THREADS 1024 +#define DMA_MAP_MAX_SECONDS 300 + +#define DMA_MAP_BIDIRECTIONAL 0 +#define DMA_MAP_TO_DEVICE 1 +#define DMA_MAP_FROM_DEVICE 2 + +struct map_benchmark { + __u64 avg_map_100ns; /* average map latency in 100ns */ + __u64 map_stddev; /* standard deviation of map latency */ + __u64 avg_unmap_100ns; /* as above */ + __u64 unmap_stddev; + __u32 threads; /* how many threads will do map/unmap in parallel */ + __u32 seconds; /* how long the test will last */ + __s32 node; /* which numa node this benchmark will run on */ + __u32 dma_bits; /* DMA addressing capability */ + __u32 dma_dir; /* DMA data direction */ + __u64 expansion[10]; /* For future use */ +}; + +struct map_benchmark_data { + struct map_benchmark bparam; + struct device *dev; + struct dentry *debugfs; + enum dma_data_direction dir; + atomic64_t sum_map_100ns; + atomic64_t sum_unmap_100ns; + atomic64_t sum_sq_map; + atomic64_t sum_sq_unmap; + atomic64_t loops; +}; + +static int map_benchmark_thread(void *data) +{ + void *buf; + dma_addr_t dma_addr; + struct map_benchmark_data *map = data; + int ret = 0; + + buf = (void *)__get_free_page(GFP_KERNEL); + if (!buf) + return -ENOMEM; + + while (!kthread_should_stop()) { + u64 map_100ns, unmap_100ns, map_sq, unmap_sq; + ktime_t map_stime, map_etime, unmap_stime, unmap_etime; + ktime_t map_delta, unmap_delta; + + /* + * for a non-coherent device, if we don't stain them in the + * cache, this will give an underestimate of the real-world + * overhead of BIDIRECTIONAL or TO_DEVICE mappings; + * 66 means evertything goes well! 66 is lucky. 
+ */ + if (map->dir != DMA_FROM_DEVICE) + memset(buf, 0x66, PAGE_SIZE); + + map_stime = ktime_get(); + dma_addr = dma_map_single(map->dev, buf, PAGE_SIZE, map->dir); + if (unlikely(dma_mapping_error(map->dev, dma_addr))) { + pr_err("dma_map_single failed on %s\n", + dev_name(map->dev)); + ret = -ENOMEM; + goto out; + } + map_etime = ktime_get(); + map_delta = ktime_sub(map_etime, map_stime); + + unmap_stime = ktime_get(); + dma_unmap_single(map->dev, dma_addr, PAGE_SIZE, map->dir); + unmap_etime = ktime_get(); + unmap_delta = ktime_sub(unmap_etime, unmap_stime); + + /* calculate sum and sum of squares */ + + map_100ns = div64_ul(map_delta, 100); + unmap_100ns = div64_ul(unmap_delta, 100); + map_sq = map_100ns * map_100ns; + unmap_sq = unmap_100ns * unmap_100ns; + + atomic64_add(map_100ns, &map->sum_map_100ns); + atomic64_add(unmap_100ns, &map->sum_unmap_100ns); + atomic64_add(map_sq, &map->sum_sq_map); + atomic64_add(unmap_sq, &map->sum_sq_unmap); + atomic64_inc(&map->loops); + } + +out: + free_page((unsigned long)buf); + return ret; +} + +static int do_map_benchmark(struct map_benchmark_data *map) +{ + struct task_struct **tsk; + int threads = map->bparam.threads; + int node = map->bparam.node; + const cpumask_t *cpu_mask = cpumask_of_node(node); + u64 loops; + int ret = 0; + int i; + + tsk = kmalloc_array(threads, sizeof(*tsk), GFP_KERNEL); + if (!tsk) + return -ENOMEM; + + get_device(map->dev); + + for (i = 0; i < threads; i++) { + tsk[i] = kthread_create_on_node(map_benchmark_thread, map, + map->bparam.node, "dma-map-benchmark/%d", i); + if (IS_ERR(tsk[i])) { + pr_err("create dma_map thread failed\n"); + ret = PTR_ERR(tsk[i]); + goto out; + } + + if (node != NUMA_NO_NODE) + kthread_bind_mask(tsk[i], cpu_mask); + } + + /* clear the old value in the previous benchmark */ + atomic64_set(&map->sum_map_100ns, 0); + atomic64_set(&map->sum_unmap_100ns, 0); + atomic64_set(&map->sum_sq_map, 0); + atomic64_set(&map->sum_sq_unmap, 0); + atomic64_set(&map->loops, 0); + + for (i = 0; i < threads; i++) + wake_up_process(tsk[i]); + + msleep_interruptible(map->bparam.seconds * 1000); + + /* wait for the completion of benchmark threads */ + for (i = 0; i < threads; i++) { + ret = kthread_stop(tsk[i]); + if (ret) + goto out; + } + + loops = atomic64_read(&map->loops); + if (likely(loops > 0)) { + u64 map_variance, unmap_variance; + u64 sum_map = atomic64_read(&map->sum_map_100ns); + u64 sum_unmap = atomic64_read(&map->sum_unmap_100ns); + u64 sum_sq_map = atomic64_read(&map->sum_sq_map); + u64 sum_sq_unmap = atomic64_read(&map->sum_sq_unmap); + + /* average latency */ + map->bparam.avg_map_100ns = div64_u64(sum_map, loops); + map->bparam.avg_unmap_100ns = div64_u64(sum_unmap, loops); + + /* standard deviation of latency */ + map_variance = div64_u64(sum_sq_map, loops) - + map->bparam.avg_map_100ns * + map->bparam.avg_map_100ns; + unmap_variance = div64_u64(sum_sq_unmap, loops) - + map->bparam.avg_unmap_100ns * + map->bparam.avg_unmap_100ns; + map->bparam.map_stddev = int_sqrt64(map_variance); + map->bparam.unmap_stddev = int_sqrt64(unmap_variance); + } + +out: + put_device(map->dev); + kfree(tsk); + return ret; +} + +static long map_benchmark_ioctl(struct file *file, unsigned int cmd, + unsigned long arg) +{ + struct map_benchmark_data *map = file->private_data; + void __user *argp = (void __user *)arg; + u64 old_dma_mask; + + int ret; + + if (copy_from_user(&map->bparam, argp, sizeof(map->bparam))) + return -EFAULT; + + switch (cmd) { + case DMA_MAP_BENCHMARK: + if (map->bparam.threads == 0 || + 
map->bparam.threads > DMA_MAP_MAX_THREADS) { + pr_err("invalid thread number\n"); + return -EINVAL; + } + + if (map->bparam.seconds == 0 || + map->bparam.seconds > DMA_MAP_MAX_SECONDS) { + pr_err("invalid duration seconds\n"); + return -EINVAL; + } + + if (map->bparam.node != NUMA_NO_NODE && + !node_possible(map->bparam.node)) { + pr_err("invalid numa node\n"); + return -EINVAL; + } + + switch (map->bparam.dma_dir) { + case DMA_MAP_BIDIRECTIONAL: + map->dir = DMA_BIDIRECTIONAL; + break; + case DMA_MAP_FROM_DEVICE: + map->dir = DMA_FROM_DEVICE; + break; + case DMA_MAP_TO_DEVICE: + map->dir = DMA_TO_DEVICE; + break; + default: + pr_err("invalid DMA direction\n"); + return -EINVAL; + } + + old_dma_mask = dma_get_mask(map->dev); + + ret = dma_set_mask(map->dev, + DMA_BIT_MASK(map->bparam.dma_bits)); + if (ret) { + pr_err("failed to set dma_mask on device %s\n", + dev_name(map->dev)); + return -EINVAL; + } + + ret = do_map_benchmark(map); + + /* + * restore the original dma_mask as many devices' dma_mask are + * set by architectures, acpi, busses. When we bind them back + * to their original drivers, those drivers shouldn't see + * dma_mask changed by benchmark + */ + dma_set_mask(map->dev, old_dma_mask); + break; + default: + return -EINVAL; + } + + if (copy_to_user(argp, &map->bparam, sizeof(map->bparam))) + return -EFAULT; + + return ret; +} + +static const struct file_operations map_benchmark_fops = { + .open = simple_open, + .unlocked_ioctl = map_benchmark_ioctl, +}; + +static void map_benchmark_remove_debugfs(void *data) +{ + struct map_benchmark_data *map = (struct map_benchmark_data *)data; + + debugfs_remove(map->debugfs); +} + +static int __map_benchmark_probe(struct device *dev) +{ + struct dentry *entry; + struct map_benchmark_data *map; + int ret; + + map = devm_kzalloc(dev, sizeof(*map), GFP_KERNEL); + if (!map) + return -ENOMEM; + map->dev = dev; + + ret = devm_add_action(dev, map_benchmark_remove_debugfs, map); + if (ret) { + pr_err("Can't add debugfs remove action\n"); + return ret; + } + + /* + * we only permit a device bound with this driver, 2nd probe + * will fail + */ + entry = debugfs_create_file("dma_map_benchmark", 0600, NULL, map, + &map_benchmark_fops); + if (IS_ERR(entry)) + return PTR_ERR(entry); + map->debugfs = entry; + + return 0; +} + +static int map_benchmark_platform_probe(struct platform_device *pdev) +{ + return __map_benchmark_probe(&pdev->dev); +} + +static struct platform_driver map_benchmark_platform_driver = { + .driver = { + .name = "dma_map_benchmark", + }, + .probe = map_benchmark_platform_probe, +}; + +static int +map_benchmark_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) +{ + return __map_benchmark_probe(&pdev->dev); +} + +static struct pci_driver map_benchmark_pci_driver = { + .name = "dma_map_benchmark", + .probe = map_benchmark_pci_probe, +}; + +static int __init map_benchmark_init(void) +{ + int ret; + + ret = pci_register_driver(&map_benchmark_pci_driver); + if (ret) + return ret; + + ret = platform_driver_register(&map_benchmark_platform_driver); + if (ret) { + pci_unregister_driver(&map_benchmark_pci_driver); + return ret; + } + + return 0; +} + +static void __exit map_benchmark_cleanup(void) +{ + platform_driver_unregister(&map_benchmark_platform_driver); + pci_unregister_driver(&map_benchmark_pci_driver); +} + +module_init(map_benchmark_init); +module_exit(map_benchmark_cleanup); + +MODULE_AUTHOR("Barry Song "); +MODULE_DESCRIPTION("dma_map benchmark driver"); +MODULE_LICENSE("GPL"); -- cgit v1.2.3 From 
4ad9921af4f18490980369f7d60f90ade0195812 Mon Sep 17 00:00:00 2001 From: John Ogness Date: Thu, 26 Nov 2020 12:54:36 +0106 Subject: printk: finalize records with trailing newlines Any record with a trailing newline (LOG_NEWLINE flag) cannot be continued because the newline has been stripped and will not be visible if the message is appended. This was already handled correctly when committing in log_output() but was not handled correctly when committing in log_store(). Fixes: f5f022e53b87 ("printk: reimplement log_cont using record extension") Link: https://lore.kernel.org/r/20201126114836.14750-1-john.ogness@linutronix.de Reported-by: Kefeng Wang Signed-off-by: John Ogness Tested-by: Kefeng Wang Reviewed-by: Petr Mladek Signed-off-by: Petr Mladek --- kernel/printk/printk.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index fe64a49344bf..bc1e3b5a97bd 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -528,8 +528,8 @@ static int log_store(u32 caller_id, int facility, int level, if (dev_info) memcpy(&r.info->dev_info, dev_info, sizeof(r.info->dev_info)); - /* insert message */ - if ((flags & LOG_CONT) || !(flags & LOG_NEWLINE)) + /* A message without a trailing newline can be continued. */ + if (!(flags & LOG_NEWLINE)) prb_commit(&e); else prb_final_commit(&e); -- cgit v1.2.3 From bb4c6910c8b41623104c2e64a30615682689a54d Mon Sep 17 00:00:00 2001 From: Laurent Vivier Date: Thu, 26 Nov 2020 09:28:51 +0100 Subject: genirq/irqdomain: Add an irq_create_mapping_affinity() function There is currently no way to convey the affinity of an interrupt via irq_create_mapping(), which creates issues for devices that expect that affinity to be managed by the kernel. In order to sort this out, rename irq_create_mapping() to irq_create_mapping_affinity() with an additional affinity parameter that can be passed down to irq_domain_alloc_descs(). irq_create_mapping() is re-implemented as a wrapper around irq_create_mapping_affinity(). No functional change. Fixes: e75eafb9b039 ("genirq/msi: Switch to new irq spreading infrastructure") Signed-off-by: Laurent Vivier Signed-off-by: Thomas Gleixner Reviewed-by: Greg Kurz Cc: Michael Ellerman Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20201126082852.1178497-2-lvivier@redhat.com --- include/linux/irqdomain.h | 12 ++++++++++-- kernel/irq/irqdomain.c | 13 ++++++++----- 2 files changed, 18 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h index 71535e87109f..ea5a337e0f8b 100644 --- a/include/linux/irqdomain.h +++ b/include/linux/irqdomain.h @@ -384,11 +384,19 @@ extern void irq_domain_associate_many(struct irq_domain *domain, extern void irq_domain_disassociate(struct irq_domain *domain, unsigned int irq); -extern unsigned int irq_create_mapping(struct irq_domain *host, - irq_hw_number_t hwirq); +extern unsigned int irq_create_mapping_affinity(struct irq_domain *host, + irq_hw_number_t hwirq, + const struct irq_affinity_desc *affinity); extern unsigned int irq_create_fwspec_mapping(struct irq_fwspec *fwspec); extern void irq_dispose_mapping(unsigned int virq); +static inline unsigned int irq_create_mapping(struct irq_domain *host, + irq_hw_number_t hwirq) +{ + return irq_create_mapping_affinity(host, hwirq, NULL); +} + + /** * irq_linear_revmap() - Find a linux irq from a hw irq number. 
* @domain: domain owning this hardware interrupt diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index cf8b374b892d..e4ca69608f3b 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -624,17 +624,19 @@ unsigned int irq_create_direct_mapping(struct irq_domain *domain) EXPORT_SYMBOL_GPL(irq_create_direct_mapping); /** - * irq_create_mapping() - Map a hardware interrupt into linux irq space + * irq_create_mapping_affinity() - Map a hardware interrupt into linux irq space * @domain: domain owning this hardware interrupt or NULL for default domain * @hwirq: hardware irq number in that domain space + * @affinity: irq affinity * * Only one mapping per hardware interrupt is permitted. Returns a linux * irq number. * If the sense/trigger is to be specified, set_irq_type() should be called * on the number returned from that call. */ -unsigned int irq_create_mapping(struct irq_domain *domain, - irq_hw_number_t hwirq) +unsigned int irq_create_mapping_affinity(struct irq_domain *domain, + irq_hw_number_t hwirq, + const struct irq_affinity_desc *affinity) { struct device_node *of_node; int virq; @@ -660,7 +662,8 @@ unsigned int irq_create_mapping(struct irq_domain *domain, } /* Allocate a virtual interrupt number */ - virq = irq_domain_alloc_descs(-1, 1, hwirq, of_node_to_nid(of_node), NULL); + virq = irq_domain_alloc_descs(-1, 1, hwirq, of_node_to_nid(of_node), + affinity); if (virq <= 0) { pr_debug("-> virq allocation failed\n"); return 0; @@ -676,7 +679,7 @@ unsigned int irq_create_mapping(struct irq_domain *domain, return virq; } -EXPORT_SYMBOL_GPL(irq_create_mapping); +EXPORT_SYMBOL_GPL(irq_create_mapping_affinity); /** * irq_create_strict_mappings() - Map a range of hw irqs to fixed linux irqs -- cgit v1.2.3 From 4615fbc3788ddc8e7c6d697714ad35a53729aa2c Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Sun, 29 Nov 2020 13:55:51 +0000 Subject: genirq/irqdomain: Don't try to free an interrupt that has no mapping When an interrupt allocation fails for N interrupts, it is pretty common for the error handling code to free the same number of interrupts, no matter how many interrupts have actually been allocated. This may result in the domain freeing code to be unexpectedly called for interrupts that have no mapping in that domain. Things end pretty badly. Instead, add some checks to irq_domain_free_irqs_hierarchy() to make sure that thiss does not follow the hierarchy if no mapping exists for a given interrupt. 
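In other words, the teardown path must only hand back interrupts that are actually mapped in the domain. A simplified paraphrase of the guarded loop in the diff below (illustrative function name, same irqdomain calls) is:

/*
 * Only interrupts that have irq_data in this domain are passed to the
 * domain's ->free() callback; descriptors that were never mapped here
 * are skipped instead of being "freed".
 */
static void free_only_mapped(struct irq_domain *domain,
			     unsigned int irq_base, unsigned int nr_irqs)
{
	unsigned int i;

	if (!domain->ops->free)
		return;

	for (i = 0; i < nr_irqs; i++) {
		if (irq_domain_get_irq_data(domain, irq_base + i))
			domain->ops->free(domain, irq_base + i, 1);
	}
}
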
Fixes: 6a6544e520abe ("genirq/irqdomain: Remove auto-recursive hierarchy support") Signed-off-by: Marc Zyngier Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20201129135551.396777-1-maz@kernel.org --- kernel/irq/irqdomain.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 3d7463fd6453..30a78872a5cf 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -1381,8 +1381,15 @@ static void irq_domain_free_irqs_hierarchy(struct irq_domain *domain, unsigned int irq_base, unsigned int nr_irqs) { - if (domain->ops->free) - domain->ops->free(domain, irq_base, nr_irqs); + unsigned int i; + + if (!domain->ops->free) + return; + + for (i = 0; i < nr_irqs; i++) { + if (irq_domain_get_irq_data(domain, irq_base + i)) + domain->ops->free(domain, irq_base + i, 1); + } } int irq_domain_alloc_irqs_hierarchy(struct irq_domain *domain, -- cgit v1.2.3 From 55ea4cf403800af2ce6b125bc3d853117e0c0456 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Fri, 27 Nov 2020 11:20:58 -0500 Subject: ring-buffer: Update write stamp with the correct ts The write stamp, used to calculate deltas between events, was updated with the stale "ts" value in the "info" structure, and not with the updated "ts" variable. This caused the deltas between events to be inaccurate, and when crossing into a new sub buffer, had time go backwards. Link: https://lkml.kernel.org/r/20201124223917.795844-1-elavila@google.com Cc: stable@vger.kernel.org Fixes: a389d86f7fd09 ("ring-buffer: Have nested events still record running time stamp") Reported-by: "J. Avila" Tested-by: Daniel Mentz Tested-by: Will McVicker Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ring_buffer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index dc83b3fa9fe7..bccaf88d3706 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -3291,7 +3291,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, /* Nothing came after this event between C and E */ info->delta = ts - info->after; (void)rb_time_cmpxchg(&cpu_buffer->write_stamp, - info->after, info->ts); + info->after, ts); info->ts = ts; } else { /* -- cgit v1.2.3 From 8785f51a17083eee7c37606079c6447afc6ba102 Mon Sep 17 00:00:00 2001 From: Andrea Righi Date: Sat, 28 Nov 2020 10:15:17 +0100 Subject: ring-buffer: Set the right timestamp in the slow path of __rb_reserve_next() In the slow path of __rb_reserve_next() a nested event(s) can happen between evaluating the timestamp delta of the current event and updating write_stamp via local_cmpxchg(); in this case the delta is not valid anymore and it should be set to 0 (same timestamp as the interrupting event), since the event that we are currently processing is not the last event in the buffer. 
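A hedged sketch of the pattern the fix below relies on (the function and names here are assumed for illustration, not taken from the kernel): the result of the cmpxchg that publishes the new stamp decides whether the computed delta is still trustworthy.

/*
 * If a nested writer updated the stamp between the read and the
 * cmpxchg, the exchange fails and the caller falls back to a zero
 * delta instead of using the now-stale difference.
 */
static u64 publish_stamp_delta(atomic64_t *stamp, u64 old, u64 now)
{
	if (atomic64_cmpxchg(stamp, old, now) == old)
		return now - old;	/* nothing slipped in between */
	return 0;			/* nested event wins; same timestamp */
}

In the actual patch the same idea is expressed by moving rb_time_cmpxchg() into the condition, so the delta assignment only runs when the exchange succeeds.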
Link: https://lkml.kernel.org/r/X8IVJcp1gRE+FJCJ@xps-13-7390 Cc: Ingo Molnar Cc: Masami Hiramatsu Cc: stable@vger.kernel.org Link: https://lwn.net/Articles/831207 Fixes: a389d86f7fd0 ("ring-buffer: Have nested events still record running time stamp") Signed-off-by: Andrea Righi Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ring_buffer.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index bccaf88d3706..35d91b20d47a 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -3287,11 +3287,11 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, ts = rb_time_stamp(cpu_buffer->buffer); barrier(); /*E*/ if (write == (local_read(&tail_page->write) & RB_WRITE_MASK) && - info->after < ts) { + info->after < ts && + rb_time_cmpxchg(&cpu_buffer->write_stamp, + info->after, ts)) { /* Nothing came after this event between C and E */ info->delta = ts - info->after; - (void)rb_time_cmpxchg(&cpu_buffer->write_stamp, - info->after, ts); info->ts = ts; } else { /* -- cgit v1.2.3 From 310e3a4b5a4fc718a72201c1e4cf5c64ac6f5442 Mon Sep 17 00:00:00 2001 From: Vasily Averin Date: Wed, 18 Nov 2020 15:05:20 +0300 Subject: tracing: Remove WARN_ON in start_thread() This patch reverts commit 978defee11a5 ("tracing: Do a WARN_ON() if start_thread() in hwlat is called when thread exists"). The .start hook can legally be called several times if the corresponding tracer is stopped: screen window 1 [root@localhost ~]# echo 1 > /sys/kernel/tracing/events/kmem/kfree/enable [root@localhost ~]# echo 1 > /sys/kernel/tracing/options/pause-on-trace [root@localhost ~]# less -F /sys/kernel/tracing/trace screen window 2 [root@localhost ~]# cat /sys/kernel/debug/tracing/tracing_on 0 [root@localhost ~]# echo hwlat > /sys/kernel/debug/tracing/current_tracer [root@localhost ~]# echo 1 > /sys/kernel/debug/tracing/tracing_on [root@localhost ~]# cat /sys/kernel/debug/tracing/tracing_on 0 [root@localhost ~]# echo 2 > /sys/kernel/debug/tracing/tracing_on triggers a warning in dmesg: WARNING: CPU: 3 PID: 1403 at kernel/trace/trace_hwlat.c:371 hwlat_tracer_start+0xc9/0xd0 Link: https://lkml.kernel.org/r/bd4d3e70-400d-9c82-7b73-a2d695e86b58@virtuozzo.com Cc: Ingo Molnar Cc: stable@vger.kernel.org Fixes: 978defee11a5 ("tracing: Do a WARN_ON() if start_thread() in hwlat is called when thread exists") Signed-off-by: Vasily Averin Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_hwlat.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_hwlat.c b/kernel/trace/trace_hwlat.c index c9ad5c6fbaad..d071fc271eef 100644 --- a/kernel/trace/trace_hwlat.c +++ b/kernel/trace/trace_hwlat.c @@ -368,7 +368,7 @@ static int start_kthread(struct trace_array *tr) struct task_struct *kthread; int next_cpu; - if (WARN_ON(hwlat_kthread)) + if (hwlat_kthread) return 0; /* Just pick the first CPU on first iteration */ -- cgit v1.2.3 From 8fa655a3a0013a0c2a2aada6f39a93ee6fc25549 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Wed, 25 Nov 2020 14:56:54 -0800 Subject: tracing: Fix alignment of static buffer With a 5.9 kernel on ARM64, I found that ftrace_dump output was broken, while the normal output ("cat /sys/kernel/debug/tracing/trace") had no problem. On investigation, it turns out that copying the data into a temporary buffer breaks the alignment that binary printf expects if the static buffer is not 4-byte aligned.
IIUC, get_arg in bstr_printf expects that args is already correctly aligned before being decoded, and seq_buf_bprintf says ``the arguments are saved in a 32bit word array that is defined by the format string constraints``. So if we do not preserve that alignment when copying into the temporary buffer, the output gets broken by some bytes being shifted. This patch fixes it. Link: https://lkml.kernel.org/r/20201125225654.1618966-1-minchan@kernel.org Cc: Fixes: 8e99cf91b99bb ("tracing: Do not allocate buffer in trace_find_next_entry() in atomic") Signed-off-by: Namhyung Kim Signed-off-by: Minchan Kim Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 410cfeb16db5..7d53c5bdea3e 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3534,7 +3534,7 @@ __find_next_entry(struct trace_iterator *iter, int *ent_cpu, } #define STATIC_TEMP_BUF_SIZE 128 -static char static_temp_buf[STATIC_TEMP_BUF_SIZE]; +static char static_temp_buf[STATIC_TEMP_BUF_SIZE] __aligned(4); /* Find the next real entry, without updating the iterator itself */ struct trace_entry *trace_find_next_entry(struct trace_iterator *iter, -- cgit v1.2.3 From 4c75b0ff4e4bf7a45b5aef9639799719c28d0073 Mon Sep 17 00:00:00 2001 From: "Naveen N. Rao" Date: Thu, 26 Nov 2020 23:38:38 +0530 Subject: ftrace: Fix updating FTRACE_FL_TRAMP On powerpc, kprobe-direct.tc triggered FTRACE_WARN_ON() in ftrace_get_addr_new() followed by the below message: Bad trampoline accounting at: 000000004222522f (wake_up_process+0xc/0x20) (f0000001) The set of steps leading to this involved: - modprobe ftrace-direct-too - enable_probe - modprobe ftrace-direct - rmmod ftrace-direct <-- trigger The problem turned out to be that we were not updating flags in the ftrace record properly. From the above message about the trampoline accounting being bad, it can be seen that the ftrace record still has FTRACE_FL_TRAMP set though the ftrace-direct module is going away. This happens because we are checking if any ftrace_ops has the FTRACE_FL_TRAMP flag set _before_ updating the filter hash. The fix for this is to look for any _other_ ftrace_ops that also needs FTRACE_FL_TRAMP. Link: https://lkml.kernel.org/r/56c113aa9c3e10c19144a36d9684c7882bf09af5.1606412433.git.naveen.n.rao@linux.vnet.ibm.com Cc: stable@vger.kernel.org Fixes: a124692b698b0 ("ftrace: Enable trampoline when rec count returns back to one") Signed-off-by: Naveen N. Rao Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 8185f7240095..9c1bba8cc51b 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1629,6 +1629,8 @@ static bool test_rec_ops_needs_regs(struct dyn_ftrace *rec) static struct ftrace_ops * ftrace_find_tramp_ops_any(struct dyn_ftrace *rec); static struct ftrace_ops * +ftrace_find_tramp_ops_any_other(struct dyn_ftrace *rec, struct ftrace_ops *op_exclude); +static struct ftrace_ops * ftrace_find_tramp_ops_next(struct dyn_ftrace *rec, struct ftrace_ops *ops); static bool __ftrace_hash_rec_update(struct ftrace_ops *ops, @@ -1778,7 +1780,7 @@ static bool __ftrace_hash_rec_update(struct ftrace_ops *ops, * to it.
*/ if (ftrace_rec_count(rec) == 1 && - ftrace_find_tramp_ops_any(rec)) + ftrace_find_tramp_ops_any_other(rec, ops)) rec->flags |= FTRACE_FL_TRAMP; else rec->flags &= ~FTRACE_FL_TRAMP; @@ -2244,6 +2246,24 @@ ftrace_find_tramp_ops_any(struct dyn_ftrace *rec) return NULL; } +static struct ftrace_ops * +ftrace_find_tramp_ops_any_other(struct dyn_ftrace *rec, struct ftrace_ops *op_exclude) +{ + struct ftrace_ops *op; + unsigned long ip = rec->ip; + + do_for_each_ftrace_op(op, ftrace_ops_list) { + + if (op == op_exclude || !op->trampoline) + continue; + + if (hash_contains_ip(ip, op->func_hash)) + return op; + } while_for_each_ftrace_op(op); + + return NULL; +} + static struct ftrace_ops * ftrace_find_tramp_ops_next(struct dyn_ftrace *rec, struct ftrace_ops *op) -- cgit v1.2.3 From 49a962c075dfa41c78e34784772329bc8784d217 Mon Sep 17 00:00:00 2001 From: "Naveen N. Rao" Date: Thu, 26 Nov 2020 23:38:39 +0530 Subject: ftrace: Fix DYNAMIC_FTRACE_WITH_DIRECT_CALLS dependency DYNAMIC_FTRACE_WITH_DIRECT_CALLS should depend on DYNAMIC_FTRACE_WITH_REGS since we need ftrace_regs_caller(). Link: https://lkml.kernel.org/r/fc4b257ea8689a36f086d2389a9ed989496ca63a.1606412433.git.naveen.n.rao@linux.vnet.ibm.com Cc: stable@vger.kernel.org Fixes: 763e34e74bb7d5c ("ftrace: Add register_ftrace_direct()") Signed-off-by: Naveen N. Rao Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index a4020c0b4508..e1bf5228fb69 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -202,7 +202,7 @@ config DYNAMIC_FTRACE_WITH_REGS config DYNAMIC_FTRACE_WITH_DIRECT_CALLS def_bool y - depends on DYNAMIC_FTRACE + depends on DYNAMIC_FTRACE_WITH_REGS depends on HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS config FUNCTION_PROFILER -- cgit v1.2.3 From 68e10d5ff512b503dcba1246ad5620f32035e135 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Mon, 30 Nov 2020 23:16:03 -0500 Subject: ring-buffer: Always check to put back before stamp when crossing pages The current ring buffer logic checks to see if the updating of the event buffer was interrupted, and if it is, it will try to fix up the before stamp with the write stamp to make them equal again. This logic is flawed, because if it is not interrupted, the two are guaranteed to be different, as the current event just updated the before stamp before allocation. This guarantees that the next event (this one or another interrupting one) will think it interrupted the time updates of a previous event and inject an absolute time stamp to compensate. The correct logic is to always update the timestamps when traversing to a new sub buffer. 
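The ordering that makes the old check unreliable can be sketched as follows (a simplified model with assumed names, not kernel code):

#include <stdint.h>

struct stamps {
	uint64_t before;	/* published before the event is reserved */
	uint64_t write;		/* updated when the event is committed */
};

/*
 * Every writer does this first, so before != write is the normal state
 * while an event is in flight; inequality cannot be used to detect an
 * interruption.
 */
static void start_event(struct stamps *s, uint64_t ts)
{
	s->before = ts;
}

/*
 * The corrected rule: when the write crosses into a new sub-buffer,
 * reconcile the stamps unconditionally.
 */
static void cross_sub_buffer(struct stamps *s)
{
	if (s->before != s->write)
		s->before = s->write;
}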
Cc: stable@vger.kernel.org Fixes: a389d86f7fd09 ("ring-buffer: Have nested events still record running time stamp") Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ring_buffer.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 35d91b20d47a..a6268e09160a 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -3234,14 +3234,12 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, /* See if we shot pass the end of this buffer page */ if (unlikely(write > BUF_PAGE_SIZE)) { - if (tail != w) { - /* before and after may now different, fix it up*/ - b_ok = rb_time_read(&cpu_buffer->before_stamp, &info->before); - a_ok = rb_time_read(&cpu_buffer->write_stamp, &info->after); - if (a_ok && b_ok && info->before != info->after) - (void)rb_time_cmpxchg(&cpu_buffer->before_stamp, - info->before, info->after); - } + /* before and after may now different, fix it up*/ + b_ok = rb_time_read(&cpu_buffer->before_stamp, &info->before); + a_ok = rb_time_read(&cpu_buffer->write_stamp, &info->after); + if (a_ok && b_ok && info->before != info->after) + (void)rb_time_cmpxchg(&cpu_buffer->before_stamp, + info->before, info->after); return rb_move_tail(cpu_buffer, tail, info); } -- cgit v1.2.3 From a2abe7cbd8fe2db5ff386c968e2273d9dc6c468d Mon Sep 17 00:00:00 2001 From: Sami Tolvanen Date: Mon, 30 Nov 2020 15:34:41 -0800 Subject: scs: switch to vmapped shadow stacks The kernel currently uses kmem_cache to allocate shadow call stacks, which means an overflow may not be immediately detected and can potentially result in another task's shadow stack being overwritten. This change switches SCS to use virtually mapped shadow stacks for tasks, which increases shadow stack size to a full page and provides more robust overflow detection, similarly to VMAP_STACK. Signed-off-by: Sami Tolvanen Acked-by: Will Deacon Link: https://lore.kernel.org/r/20201130233442.2562064-2-samitolvanen@google.com Signed-off-by: Will Deacon --- include/linux/scs.h | 12 ++++----- kernel/scs.c | 71 ++++++++++++++++++++++++++++++++++++++++++++--------- 2 files changed, 66 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/include/linux/scs.h b/include/linux/scs.h index 6dec390cf154..2a506c2a16f4 100644 --- a/include/linux/scs.h +++ b/include/linux/scs.h @@ -15,12 +15,8 @@ #ifdef CONFIG_SHADOW_CALL_STACK -/* - * In testing, 1 KiB shadow stack size (i.e. 128 stack frames on a 64-bit - * architecture) provided ~40% safety margin on stack usage while keeping - * memory allocation overhead reasonable. - */ -#define SCS_SIZE SZ_1K +#define SCS_ORDER 0 +#define SCS_SIZE (PAGE_SIZE << SCS_ORDER) #define GFP_SCS (GFP_KERNEL | __GFP_ZERO) /* An illegal pointer value to mark the end of the shadow stack.
*/ @@ -33,6 +29,8 @@ #define task_scs(tsk) (task_thread_info(tsk)->scs_base) #define task_scs_sp(tsk) (task_thread_info(tsk)->scs_sp) +void *scs_alloc(int node); +void scs_free(void *s); void scs_init(void); int scs_prepare(struct task_struct *tsk, int node); void scs_release(struct task_struct *tsk); @@ -61,6 +59,8 @@ static inline bool task_scs_end_corrupted(struct task_struct *tsk) #else /* CONFIG_SHADOW_CALL_STACK */ +static inline void *scs_alloc(int node) { return NULL; } +static inline void scs_free(void *s) {} static inline void scs_init(void) {} static inline void scs_task_reset(struct task_struct *tsk) {} static inline int scs_prepare(struct task_struct *tsk, int node) { return 0; } diff --git a/kernel/scs.c b/kernel/scs.c index 4ff4a7ba0094..e2a71fc82fa0 100644 --- a/kernel/scs.c +++ b/kernel/scs.c @@ -5,26 +5,49 @@ * Copyright (C) 2019 Google LLC */ +#include #include #include #include -#include +#include #include -static struct kmem_cache *scs_cache; - static void __scs_account(void *s, int account) { - struct page *scs_page = virt_to_page(s); + struct page *scs_page = vmalloc_to_page(s); mod_node_page_state(page_pgdat(scs_page), NR_KERNEL_SCS_KB, account * (SCS_SIZE / SZ_1K)); } -static void *scs_alloc(int node) +/* Matches NR_CACHED_STACKS for VMAP_STACK */ +#define NR_CACHED_SCS 2 +static DEFINE_PER_CPU(void *, scs_cache[NR_CACHED_SCS]); + +static void *__scs_alloc(int node) { - void *s = kmem_cache_alloc_node(scs_cache, GFP_SCS, node); + int i; + void *s; + + for (i = 0; i < NR_CACHED_SCS; i++) { + s = this_cpu_xchg(scs_cache[i], NULL); + if (s) { + kasan_unpoison_vmalloc(s, SCS_SIZE); + memset(s, 0, SCS_SIZE); + return s; + } + } + + return __vmalloc_node_range(SCS_SIZE, 1, VMALLOC_START, VMALLOC_END, + GFP_SCS, PAGE_KERNEL, 0, node, + __builtin_return_address(0)); +} +void *scs_alloc(int node) +{ + void *s; + + s = __scs_alloc(node); if (!s) return NULL; @@ -34,21 +57,47 @@ static void *scs_alloc(int node) * Poison the allocation to catch unintentional accesses to * the shadow stack when KASAN is enabled. */ - kasan_poison_object_data(scs_cache, s); + kasan_poison_vmalloc(s, SCS_SIZE); __scs_account(s, 1); return s; } -static void scs_free(void *s) +void scs_free(void *s) { + int i; + __scs_account(s, -1); - kasan_unpoison_object_data(scs_cache, s); - kmem_cache_free(scs_cache, s); + + /* + * We cannot sleep as this can be called in interrupt context, + * so use this_cpu_cmpxchg to update the cache, and vfree_atomic + * to free the stack. + */ + + for (i = 0; i < NR_CACHED_SCS; i++) + if (this_cpu_cmpxchg(scs_cache[i], 0, s) == NULL) + return; + + vfree_atomic(s); +} + +static int scs_cleanup(unsigned int cpu) +{ + int i; + void **cache = per_cpu_ptr(scs_cache, cpu); + + for (i = 0; i < NR_CACHED_SCS; i++) { + vfree(cache[i]); + cache[i] = NULL; + } + + return 0; } void __init scs_init(void) { - scs_cache = kmem_cache_create("scs_cache", SCS_SIZE, 0, 0, NULL); + cpuhp_setup_state(CPUHP_BP_PREPARE_DYN, "scs:scs_cache", NULL, + scs_cleanup); } int scs_prepare(struct task_struct *tsk, int node) -- cgit v1.2.3 From a782483cc1f875355690625d8253a232f2581418 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 26 Nov 2020 18:43:37 +0100 Subject: block: remove the nr_sects field in struct hd_struct Now that the hd_struct always has a block device attached to it, there is no need for having two size field that just get out of sync. Additionally the field in hd_struct did not use proper serialization, possibly allowing for torn writes. 
By only using the block_device field this problem also gets fixed. Signed-off-by: Christoph Hellwig Reviewed-by: Greg Kroah-Hartman Reviewed-by: Jan Kara Reviewed-by: Hannes Reinecke Acked-by: Coly Li [bcache] Acked-by: Chao Yu [f2fs] Signed-off-by: Jens Axboe --- block/bio.c | 4 +-- block/blk-core.c | 2 +- block/blk.h | 53 --------------------------- block/genhd.c | 59 +++++++++++++++++------------- block/partitions/core.c | 17 +++++---- drivers/block/loop.c | 1 - drivers/block/nbd.c | 2 +- drivers/block/xen-blkback/common.h | 4 +-- drivers/md/bcache/super.c | 2 +- drivers/s390/block/dasd_ioctl.c | 4 +-- drivers/target/target_core_pscsi.c | 5 ++- fs/block_dev.c | 73 ++------------------------------------ fs/f2fs/super.c | 2 +- fs/pstore/blk.c | 2 +- include/linux/genhd.h | 29 ++++----------- kernel/trace/blktrace.c | 2 +- 16 files changed, 67 insertions(+), 194 deletions(-) (limited to 'kernel') diff --git a/block/bio.c b/block/bio.c index fa01bef35bb1..669bb47a3198 100644 --- a/block/bio.c +++ b/block/bio.c @@ -613,8 +613,8 @@ void guard_bio_eod(struct bio *bio) rcu_read_lock(); part = __disk_get_part(bio->bi_disk, bio->bi_partno); if (part) - maxsector = part_nr_sects_read(part); - else + maxsector = bdev_nr_sectors(part->bdev); + else maxsector = get_capacity(bio->bi_disk); rcu_read_unlock(); diff --git a/block/blk-core.c b/block/blk-core.c index 2db8bda43b6e..988f45094a38 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -755,7 +755,7 @@ static inline int blk_partition_remap(struct bio *bio) goto out; if (bio_sectors(bio)) { - if (bio_check_eod(bio, part_nr_sects_read(p))) + if (bio_check_eod(bio, bdev_nr_sectors(p->bdev))) goto out; bio->bi_iter.bi_sector += p->start_sect; trace_block_bio_remap(bio->bi_disk->queue, bio, part_devt(p), diff --git a/block/blk.h b/block/blk.h index c4839abcfa27..09cee7024fb4 100644 --- a/block/blk.h +++ b/block/blk.h @@ -387,59 +387,6 @@ static inline void hd_free_part(struct hd_struct *part) percpu_ref_exit(&part->ref); } -/* - * Any access of part->nr_sects which is not protected by partition - * bd_mutex or gendisk bdev bd_mutex, should be done using this - * accessor function. - * - * Code written along the lines of i_size_read() and i_size_write(). - * CONFIG_PREEMPTION case optimizes the case of UP kernel with preemption - * on. - */ -static inline sector_t part_nr_sects_read(struct hd_struct *part) -{ -#if BITS_PER_LONG==32 && defined(CONFIG_SMP) - sector_t nr_sects; - unsigned seq; - do { - seq = read_seqcount_begin(&part->nr_sects_seq); - nr_sects = part->nr_sects; - } while (read_seqcount_retry(&part->nr_sects_seq, seq)); - return nr_sects; -#elif BITS_PER_LONG==32 && defined(CONFIG_PREEMPTION) - sector_t nr_sects; - - preempt_disable(); - nr_sects = part->nr_sects; - preempt_enable(); - return nr_sects; -#else - return part->nr_sects; -#endif -} - -/* - * Should be called with mutex lock held (typically bd_mutex) of partition - * to provide mutual exlusion among writers otherwise seqcount might be - * left in wrong state leaving the readers spinning infinitely. 
- */ -static inline void part_nr_sects_write(struct hd_struct *part, sector_t size) -{ -#if BITS_PER_LONG==32 && defined(CONFIG_SMP) - preempt_disable(); - write_seqcount_begin(&part->nr_sects_seq); - part->nr_sects = size; - write_seqcount_end(&part->nr_sects_seq); - preempt_enable(); -#elif BITS_PER_LONG==32 && defined(CONFIG_PREEMPTION) - preempt_disable(); - part->nr_sects = size; - preempt_enable(); -#else - part->nr_sects = size; -#endif -} - int bio_add_hw_page(struct request_queue *q, struct bio *bio, struct page *page, unsigned int len, unsigned int offset, unsigned int max_sectors, bool *same_page); diff --git a/block/genhd.c b/block/genhd.c index bf8fa82f135f..c65f485b9db5 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -40,6 +40,16 @@ static void disk_add_events(struct gendisk *disk); static void disk_del_events(struct gendisk *disk); static void disk_release_events(struct gendisk *disk); +void set_capacity(struct gendisk *disk, sector_t sectors) +{ + struct block_device *bdev = disk->part0.bdev; + + spin_lock(&bdev->bd_size_lock); + i_size_write(bdev->bd_inode, (loff_t)sectors << SECTOR_SHIFT); + spin_unlock(&bdev->bd_size_lock); +} +EXPORT_SYMBOL(set_capacity); + /* * Set disk capacity and notify if the size is not currently zero and will not * be set to zero. Returns true if a uevent was sent, otherwise false. @@ -47,18 +57,30 @@ static void disk_release_events(struct gendisk *disk); bool set_capacity_and_notify(struct gendisk *disk, sector_t size) { sector_t capacity = get_capacity(disk); + char *envp[] = { "RESIZE=1", NULL }; set_capacity(disk, size); - revalidate_disk_size(disk, true); - if (capacity != size && capacity != 0 && size != 0) { - char *envp[] = { "RESIZE=1", NULL }; + /* + * Only print a message and send a uevent if the gendisk is user visible + * and alive. This avoids spamming the log and udev when setting the + * initial capacity during probing. + */ + if (size == capacity || + (disk->flags & (GENHD_FL_UP | GENHD_FL_HIDDEN)) != GENHD_FL_UP) + return false; - kobject_uevent_env(&disk_to_dev(disk)->kobj, KOBJ_CHANGE, envp); - return true; - } + pr_info("%s: detected capacity change from %lld to %lld\n", + disk->disk_name, size, capacity); - return false; + /* + * Historically we did not send a uevent for changes to/from an empty + * device. + */ + if (!capacity || !size) + return false; + kobject_uevent_env(&disk_to_dev(disk)->kobj, KOBJ_CHANGE, envp); + return true; } EXPORT_SYMBOL_GPL(set_capacity_and_notify); @@ -247,7 +269,7 @@ struct hd_struct *disk_part_iter_next(struct disk_part_iter *piter) part = rcu_dereference(ptbl->part[piter->idx]); if (!part) continue; - if (!part_nr_sects_read(part) && + if (!bdev_nr_sectors(part->bdev) && !(piter->flags & DISK_PITER_INCL_EMPTY) && !(piter->flags & DISK_PITER_INCL_EMPTY_PART0 && piter->idx == 0)) @@ -284,7 +306,7 @@ EXPORT_SYMBOL_GPL(disk_part_iter_exit); static inline int sector_in_part(struct hd_struct *part, sector_t sector) { return part->start_sect <= sector && - sector < part->start_sect + part_nr_sects_read(part); + sector < part->start_sect + bdev_nr_sectors(part->bdev); } /** @@ -986,8 +1008,8 @@ void __init printk_all_partitions(void) printk("%s%s %10llu %s %s", is_part0 ? "" : " ", bdevt_str(part_devt(part), devt_buf), - (unsigned long long)part_nr_sects_read(part) >> 1 - , disk_name(disk, part->partno, name_buf), + bdev_nr_sectors(part->bdev) >> 1, + disk_name(disk, part->partno, name_buf), part->info ? 
part->info->uuid : ""); if (is_part0) { if (dev->parent && dev->parent->driver) @@ -1079,7 +1101,7 @@ static int show_partition(struct seq_file *seqf, void *v) while ((part = disk_part_iter_next(&piter))) seq_printf(seqf, "%4d %7d %10llu %s\n", MAJOR(part_devt(part)), MINOR(part_devt(part)), - (unsigned long long)part_nr_sects_read(part) >> 1, + bdev_nr_sectors(part->bdev) >> 1, disk_name(sgp, part->partno, buf)); disk_part_iter_exit(&piter); @@ -1161,8 +1183,7 @@ ssize_t part_size_show(struct device *dev, { struct hd_struct *p = dev_to_part(dev); - return sprintf(buf, "%llu\n", - (unsigned long long)part_nr_sects_read(p)); + return sprintf(buf, "%llu\n", bdev_nr_sectors(p->bdev)); } ssize_t part_stat_show(struct device *dev, @@ -1618,16 +1639,6 @@ struct gendisk *__alloc_disk_node(int minors, int node_id) ptbl = rcu_dereference_protected(disk->part_tbl, 1); rcu_assign_pointer(ptbl->part[0], &disk->part0); - /* - * set_capacity() and get_capacity() currently don't use - * seqcounter to read/update the part0->nr_sects. Still init - * the counter as we can read the sectors in IO submission - * patch using seqence counters. - * - * TODO: Ideally set_capacity() and get_capacity() should be - * converted to make use of bd_mutex and sequence counters. - */ - hd_sects_seq_init(&disk->part0); if (hd_ref_init(&disk->part0)) goto out_free_bdstats; diff --git a/block/partitions/core.c b/block/partitions/core.c index 696bd9ff63c6..bcfa8215bd5e 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -85,6 +85,13 @@ static int (*check_part[])(struct parsed_partitions *) = { NULL }; +static void bdev_set_nr_sectors(struct block_device *bdev, sector_t sectors) +{ + spin_lock(&bdev->bd_size_lock); + i_size_write(bdev->bd_inode, (loff_t)sectors << SECTOR_SHIFT); + spin_unlock(&bdev->bd_size_lock); +} + static struct parsed_partitions *allocate_partitions(struct gendisk *hd) { struct parsed_partitions *state; @@ -295,7 +302,7 @@ static void hd_struct_free_work(struct work_struct *work) put_device(disk_to_dev(disk)); part->start_sect = 0; - part->nr_sects = 0; + bdev_set_nr_sectors(part->bdev, 0); part_stat_set_all(part, 0); put_device(part_to_dev(part)); } @@ -412,11 +419,10 @@ static struct hd_struct *add_partition(struct gendisk *disk, int partno, goto out_free_stats; p->bdev = bdev; - hd_sects_seq_init(p); pdev = part_to_dev(p); p->start_sect = start; - p->nr_sects = len; + bdev_set_nr_sectors(bdev, len); p->partno = partno; p->policy = get_disk_ro(disk); @@ -509,7 +515,7 @@ static bool partition_overlaps(struct gendisk *disk, sector_t start, disk_part_iter_init(&piter, disk, DISK_PITER_INCL_EMPTY); while ((part = disk_part_iter_next(&piter))) { if (part->partno == skip_partno || - start >= part->start_sect + part->nr_sects || + start >= part->start_sect + bdev_nr_sectors(part->bdev) || start + length <= part->start_sect) continue; overlap = true; @@ -600,8 +606,7 @@ int bdev_resize_partition(struct block_device *bdev, int partno, if (partition_overlaps(bdev->bd_disk, start, length, partno)) goto out_unlock; - part_nr_sects_write(part, length); - bd_set_nr_sectors(bdevp, length); + bdev_set_nr_sectors(bdevp, length); ret = 0; out_unlock: diff --git a/drivers/block/loop.c b/drivers/block/loop.c index d643c67be6ac..d2ce1ddc192d 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -1241,7 +1241,6 @@ static int __loop_clr_fd(struct loop_device *lo, bool release) set_capacity(lo->lo_disk, 0); loop_sysfs_exit(lo); if (bdev) { - bd_set_nr_sectors(bdev, 0); /* let user-space know about 
this change */ kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, KOBJ_CHANGE); } diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 45b0423ef2c5..014683968ce1 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -1132,7 +1132,7 @@ static void nbd_bdev_reset(struct block_device *bdev) { if (bdev->bd_openers > 1) return; - bd_set_nr_sectors(bdev, 0); + set_capacity(bdev->bd_disk, 0); } static void nbd_parse_flags(struct nbd_device *nbd) diff --git a/drivers/block/xen-blkback/common.h b/drivers/block/xen-blkback/common.h index c6ea5d38c509..0762db247b41 100644 --- a/drivers/block/xen-blkback/common.h +++ b/drivers/block/xen-blkback/common.h @@ -358,9 +358,7 @@ struct pending_req { }; -#define vbd_sz(_v) ((_v)->bdev->bd_part ? \ - (_v)->bdev->bd_part->nr_sects : \ - get_capacity((_v)->bdev->bd_disk)) +#define vbd_sz(_v) bdev_nr_sectors((_v)->bdev) #define xen_blkif_get(_b) (atomic_inc(&(_b)->refcnt)) #define xen_blkif_put(_b) \ diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index c55d3c58a7ef..04fa40868fbe 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -1408,7 +1408,7 @@ static int cached_dev_init(struct cached_dev *dc, unsigned int block_size) q->limits.raid_partial_stripes_expensive; ret = bcache_device_init(&dc->disk, block_size, - dc->bdev->bd_part->nr_sects - dc->sb.data_offset, + bdev_nr_sectors(dc->bdev) - dc->sb.data_offset, dc->bdev, &bcache_cached_ops); if (ret) return ret; diff --git a/drivers/s390/block/dasd_ioctl.c b/drivers/s390/block/dasd_ioctl.c index 3359559517bf..304eba1acf16 100644 --- a/drivers/s390/block/dasd_ioctl.c +++ b/drivers/s390/block/dasd_ioctl.c @@ -54,8 +54,6 @@ dasd_ioctl_enable(struct block_device *bdev) return -ENODEV; dasd_enable_device(base); - /* Formatting the dasd device can change the capacity. */ - bd_set_nr_sectors(bdev, get_capacity(base->block->gdp)); dasd_put_device(base); return 0; } @@ -88,7 +86,7 @@ dasd_ioctl_disable(struct block_device *bdev) * Set i_size to zero, since read, write, etc. check against this * value. */ - bd_set_nr_sectors(bdev, 0); + set_capacity(bdev->bd_disk, 0); dasd_put_device(base); return 0; } diff --git a/drivers/target/target_core_pscsi.c b/drivers/target/target_core_pscsi.c index 4e37fa9b409d..7994f27e4527 100644 --- a/drivers/target/target_core_pscsi.c +++ b/drivers/target/target_core_pscsi.c @@ -1029,9 +1029,8 @@ static sector_t pscsi_get_blocks(struct se_device *dev) { struct pscsi_dev_virt *pdv = PSCSI_DEV(dev); - if (pdv->pdv_bd && pdv->pdv_bd->bd_part) - return pdv->pdv_bd->bd_part->nr_sects; - + if (pdv->pdv_bd) + return bdev_nr_sectors(pdv->pdv_bd); return 0; } diff --git a/fs/block_dev.c b/fs/block_dev.c index a5b6955a841f..31ee5a857f71 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -1208,70 +1208,6 @@ void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk) EXPORT_SYMBOL_GPL(bd_unlink_disk_holder); #endif -/** - * check_disk_size_change - checks for disk size change and adjusts bdev size. - * @disk: struct gendisk to check - * @bdev: struct bdev to adjust. - * @verbose: if %true log a message about a size change if there is any - * - * This routine checks to see if the bdev size does not match the disk size - * and adjusts it if it differs. When shrinking the bdev size, its all caches - * are freed. 
- */ -static void check_disk_size_change(struct gendisk *disk, - struct block_device *bdev, bool verbose) -{ - loff_t disk_size, bdev_size; - - spin_lock(&bdev->bd_size_lock); - disk_size = (loff_t)get_capacity(disk) << 9; - bdev_size = i_size_read(bdev->bd_inode); - if (disk_size != bdev_size) { - if (verbose) { - printk(KERN_INFO - "%s: detected capacity change from %lld to %lld\n", - disk->disk_name, bdev_size, disk_size); - } - i_size_write(bdev->bd_inode, disk_size); - } - spin_unlock(&bdev->bd_size_lock); -} - -/** - * revalidate_disk_size - checks for disk size change and adjusts bdev size. - * @disk: struct gendisk to check - * @verbose: if %true log a message about a size change if there is any - * - * This routine checks to see if the bdev size does not match the disk size - * and adjusts it if it differs. When shrinking the bdev size, its all caches - * are freed. - */ -void revalidate_disk_size(struct gendisk *disk, bool verbose) -{ - struct block_device *bdev; - - /* - * Hidden disks don't have associated bdev so there's no point in - * revalidating them. - */ - if (disk->flags & GENHD_FL_HIDDEN) - return; - - bdev = bdget_disk(disk, 0); - if (bdev) { - check_disk_size_change(disk, bdev, verbose); - bdput(bdev); - } -} - -void bd_set_nr_sectors(struct block_device *bdev, sector_t sectors) -{ - spin_lock(&bdev->bd_size_lock); - i_size_write(bdev->bd_inode, (loff_t)sectors << SECTOR_SHIFT); - spin_unlock(&bdev->bd_size_lock); -} -EXPORT_SYMBOL(bd_set_nr_sectors); - static void __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part); int bdev_disk_changed(struct block_device *bdev, bool invalidate) @@ -1305,8 +1241,6 @@ rescan: disk->fops->revalidate_disk(disk); } - check_disk_size_change(disk, bdev, !invalidate); - if (get_capacity(disk)) { ret = blk_add_partitions(disk, bdev); if (ret == -EAGAIN) @@ -1349,10 +1283,8 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode) if (disk->fops->open) ret = disk->fops->open(bdev, mode); - if (!ret) { - bd_set_nr_sectors(bdev, get_capacity(disk)); + if (!ret) set_init_blocksize(bdev); - } /* * If the device is invalidated, rescan partition @@ -1381,13 +1313,12 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode) bdev->bd_part = disk_get_part(disk, bdev->bd_partno); if (!(disk->flags & GENHD_FL_UP) || - !bdev->bd_part || !bdev->bd_part->nr_sects) { + !bdev->bd_part || !bdev_nr_sectors(bdev)) { __blkdev_put(whole, mode, 1); bdput(whole); ret = -ENXIO; goto out_clear; } - bd_set_nr_sectors(bdev, bdev->bd_part->nr_sects); set_init_blocksize(bdev); } diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 00eff2f51807..d4e7fab352ba 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -3151,7 +3151,7 @@ static int f2fs_report_zone_cb(struct blk_zone *zone, unsigned int idx, static int init_blkz_info(struct f2fs_sb_info *sbi, int devi) { struct block_device *bdev = FDEV(devi).bdev; - sector_t nr_sectors = bdev->bd_part->nr_sects; + sector_t nr_sectors = bdev_nr_sectors(bdev); struct f2fs_report_zones_args rep_zone_arg; int ret; diff --git a/fs/pstore/blk.c b/fs/pstore/blk.c index fcd5563dde06..777a26f7bbe2 100644 --- a/fs/pstore/blk.c +++ b/fs/pstore/blk.c @@ -245,7 +245,7 @@ static struct block_device *psblk_get_bdev(void *holder, return bdev; } - nr_sects = part_nr_sects_read(bdev->bd_part); + nr_sects = bdev_nr_sectors(bdev); if (!nr_sects) { pr_err("not enough space for '%s'\n", blkdev); blkdev_put(bdev, mode); diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 6ba91ee54cb2..30d4785b7df8 
100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -52,15 +52,6 @@ struct partition_meta_info { struct hd_struct { sector_t start_sect; - /* - * nr_sects is protected by sequence counter. One might extend a - * partition while IO is happening to it and update of nr_sects - * can be non-atomic on 32bit machines with 64bit sector_t. - */ - sector_t nr_sects; -#if BITS_PER_LONG==32 && defined(CONFIG_SMP) - seqcount_t nr_sects_seq; -#endif unsigned long stamp; struct disk_stats __percpu *dkstats; struct percpu_ref ref; @@ -254,13 +245,6 @@ static inline void disk_put_part(struct hd_struct *part) put_device(part_to_dev(part)); } -static inline void hd_sects_seq_init(struct hd_struct *p) -{ -#if BITS_PER_LONG==32 && defined(CONFIG_SMP) - seqcount_init(&p->nr_sects_seq); -#endif -} - /* * Smarter partition iterator without context limits. */ @@ -318,13 +302,15 @@ static inline sector_t get_start_sect(struct block_device *bdev) { return bdev->bd_part->start_sect; } -static inline sector_t get_capacity(struct gendisk *disk) + +static inline sector_t bdev_nr_sectors(struct block_device *bdev) { - return disk->part0.nr_sects; + return i_size_read(bdev->bd_inode) >> 9; } -static inline void set_capacity(struct gendisk *disk, sector_t size) + +static inline sector_t get_capacity(struct gendisk *disk) { - disk->part0.nr_sects = size; + return bdev_nr_sectors(disk->part0.bdev); } int bdev_disk_changed(struct block_device *bdev, bool invalidate); @@ -358,10 +344,9 @@ int __register_blkdev(unsigned int major, const char *name, __register_blkdev(major, name, NULL) void unregister_blkdev(unsigned int major, const char *name); -void revalidate_disk_size(struct gendisk *disk, bool verbose); bool bdev_check_media_change(struct block_device *bdev); int __invalidate_device(struct block_device *bdev, bool kill_dirty); -void bd_set_nr_sectors(struct block_device *bdev, sector_t sectors); +void set_capacity(struct gendisk *disk, sector_t size); /* for drivers/char/raw.c: */ int blkdev_ioctl(struct block_device *, fmode_t, unsigned, unsigned long); diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index f1022945e346..7076d588a50d 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -465,7 +465,7 @@ static void blk_trace_setup_lba(struct blk_trace *bt, if (part) { bt->start_lba = part->start_sect; - bt->end_lba = part->start_sect + part->nr_sects; + bt->end_lba = part->start_sect + bdev_nr_sectors(bdev); } else { bt->start_lba = 0; bt->end_lba = -1ULL; -- cgit v1.2.3 From 29ff57c61094e7bbd921ab10b5a99dce9a0132e0 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 24 Nov 2020 09:34:24 +0100 Subject: block: move the start_sect field to struct block_device Move the start_sect field to struct block_device in preparation of killing struct hd_struct. 
Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- block/blk-core.c | 5 +++-- block/blk-lib.c | 2 +- block/genhd.c | 4 ++-- block/partitions/core.c | 17 +++++++++-------- include/linux/blk_types.h | 1 + include/linux/blkdev.h | 4 ++-- include/linux/genhd.h | 3 +-- kernel/trace/blktrace.c | 11 +++-------- 8 files changed, 22 insertions(+), 25 deletions(-) (limited to 'kernel') diff --git a/block/blk-core.c b/block/blk-core.c index d2c9cb24e087..9a3793d5ce38 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -757,9 +757,10 @@ static inline int blk_partition_remap(struct bio *bio) if (bio_sectors(bio)) { if (bio_check_eod(bio, bdev_nr_sectors(p->bdev))) goto out; - bio->bi_iter.bi_sector += p->start_sect; + bio->bi_iter.bi_sector += p->bdev->bd_start_sect; trace_block_bio_remap(bio->bi_disk->queue, bio, part_devt(p), - bio->bi_iter.bi_sector - p->start_sect); + bio->bi_iter.bi_sector - + p->bdev->bd_start_sect); } bio->bi_partno = 0; ret = 0; diff --git a/block/blk-lib.c b/block/blk-lib.c index e90614fd8d6a..752f9c722062 100644 --- a/block/blk-lib.c +++ b/block/blk-lib.c @@ -65,7 +65,7 @@ int __blkdev_issue_discard(struct block_device *bdev, sector_t sector, /* In case the discard request is in a partition */ if (bdev_is_partition(bdev)) - part_offset = bdev->bd_part->start_sect; + part_offset = bdev->bd_start_sect; while (nr_sects) { sector_t granularity_aligned_lba, req_sects; diff --git a/block/genhd.c b/block/genhd.c index 2cbda8139556..5efb2df1f079 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -305,8 +305,8 @@ EXPORT_SYMBOL_GPL(disk_part_iter_exit); static inline int sector_in_part(struct hd_struct *part, sector_t sector) { - return part->start_sect <= sector && - sector < part->start_sect + bdev_nr_sectors(part->bdev); + return part->bdev->bd_start_sect <= sector && + sector < part->bdev->bd_start_sect + bdev_nr_sectors(part->bdev); } /** diff --git a/block/partitions/core.c b/block/partitions/core.c index 8924e1ea8b2a..460a745812c6 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -192,7 +192,7 @@ static ssize_t part_start_show(struct device *dev, { struct hd_struct *p = dev_to_part(dev); - return sprintf(buf, "%llu\n",(unsigned long long)p->start_sect); + return sprintf(buf, "%llu\n", p->bdev->bd_start_sect); } static ssize_t part_ro_show(struct device *dev, @@ -209,7 +209,7 @@ static ssize_t part_alignment_offset_show(struct device *dev, return sprintf(buf, "%u\n", queue_limit_alignment_offset(&part_to_disk(p)->queue->limits, - p->start_sect)); + p->bdev->bd_start_sect)); } static ssize_t part_discard_alignment_show(struct device *dev, @@ -219,7 +219,7 @@ static ssize_t part_discard_alignment_show(struct device *dev, return sprintf(buf, "%u\n", queue_limit_discard_alignment(&part_to_disk(p)->queue->limits, - p->start_sect)); + p->bdev->bd_start_sect)); } static DEVICE_ATTR(partition, 0444, part_partition_show, NULL); @@ -301,7 +301,7 @@ static void hd_struct_free_work(struct work_struct *work) */ put_device(disk_to_dev(disk)); - part->start_sect = 0; + part->bdev->bd_start_sect = 0; bdev_set_nr_sectors(part->bdev, 0); part_stat_set_all(part, 0); put_device(part_to_dev(part)); @@ -416,7 +416,7 @@ static struct hd_struct *add_partition(struct gendisk *disk, int partno, pdev = part_to_dev(p); - p->start_sect = start; + bdev->bd_start_sect = start; bdev_set_nr_sectors(bdev, len); p->partno = partno; p->policy = get_disk_ro(disk); @@ -508,8 +508,9 @@ static bool partition_overlaps(struct gendisk 
*disk, sector_t start, disk_part_iter_init(&piter, disk, DISK_PITER_INCL_EMPTY); while ((part = disk_part_iter_next(&piter))) { if (part->partno == skip_partno || - start >= part->start_sect + bdev_nr_sectors(part->bdev) || - start + length <= part->start_sect) + start >= part->bdev->bd_start_sect + + bdev_nr_sectors(part->bdev) || + start + length <= part->bdev->bd_start_sect) continue; overlap = true; break; @@ -592,7 +593,7 @@ int bdev_resize_partition(struct block_device *bdev, int partno, mutex_lock_nested(&bdev->bd_mutex, 1); ret = -EINVAL; - if (start != part->start_sect) + if (start != part->bdev->bd_start_sect) goto out_unlock; ret = -EBUSY; diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 520011b95276..a690008f60cd 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -20,6 +20,7 @@ typedef void (bio_end_io_t) (struct bio *); struct bio_crypt_ctx; struct block_device { + sector_t bd_start_sect; struct disk_stats __percpu *bd_stats; unsigned long bd_stamp; dev_t bd_dev; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 43a25d855e04..619adea57098 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1488,7 +1488,7 @@ static inline int bdev_alignment_offset(struct block_device *bdev) return -1; if (bdev_is_partition(bdev)) return queue_limit_alignment_offset(&q->limits, - bdev->bd_part->start_sect); + bdev->bd_start_sect); return q->limits.alignment_offset; } @@ -1529,7 +1529,7 @@ static inline int bdev_discard_alignment(struct block_device *bdev) if (bdev_is_partition(bdev)) return queue_limit_discard_alignment(&q->limits, - bdev->bd_part->start_sect); + bdev->bd_start_sect); return q->limits.discard_alignment; } diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 804ac45fbfbc..50d27f5d38e2 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -51,7 +51,6 @@ struct partition_meta_info { }; struct hd_struct { - sector_t start_sect; struct percpu_ref ref; struct block_device *bdev; @@ -298,7 +297,7 @@ extern void rand_initialize_disk(struct gendisk *disk); static inline sector_t get_start_sect(struct block_device *bdev) { - return bdev->bd_part->start_sect; + return bdev->bd_start_sect; } static inline sector_t bdev_nr_sectors(struct block_device *bdev) diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 7076d588a50d..8a723a91ec5a 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -458,14 +458,9 @@ static struct rchan_callbacks blk_relay_callbacks = { static void blk_trace_setup_lba(struct blk_trace *bt, struct block_device *bdev) { - struct hd_struct *part = NULL; - - if (bdev) - part = bdev->bd_part; - - if (part) { - bt->start_lba = part->start_sect; - bt->end_lba = part->start_sect + bdev_nr_sectors(bdev); + if (bdev) { + bt->start_lba = bdev->bd_start_sect; + bt->end_lba = bdev->bd_start_sect + bdev_nr_sectors(bdev); } else { bt->start_lba = 0; bt->end_lba = -1ULL; -- cgit v1.2.3 From 0d02129e76edf91cf04fabf1efbc3a9a1f1d729a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 27 Nov 2020 16:43:51 +0100 Subject: block: merge struct block_device and struct hd_struct Instead of having two structures that represent each block device with different life time rules, merge them into a single one. This also greatly simplifies the reference counting rules, as we can use the inode reference count as the main reference count for the new struct block_device, with the device model reference front ending it for device model interaction. 
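As a hedged illustration of what the merge means for device-model users (dev_to_bdev() and bdev_nr_sectors() come from this series; the attribute shown is hypothetical): a sysfs show callback now recovers the block_device directly from the embedded bd_device, with no separate hd_struct lookup or extra reference.

static ssize_t example_size_show(struct device *dev,
				 struct device_attribute *attr, char *buf)
{
	/* dev is the bd_device embedded in struct block_device */
	struct block_device *bdev = dev_to_bdev(dev);

	return sprintf(buf, "%llu\n",
		       (unsigned long long)bdev_nr_sectors(bdev));
}

This is the same shape that part_size_show() takes in the patch.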
Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Reviewed-by: Hannes Reinecke Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 8 ++-- block/blk.h | 2 +- block/genhd.c | 90 +++++++++++------------------------ block/partitions/core.c | 116 ++++++++++++++++++---------------------------- fs/block_dev.c | 9 ---- include/linux/blk_types.h | 8 +++- include/linux/blkdev.h | 1 - include/linux/genhd.h | 40 ++++------------ init/do_mounts.c | 21 ++++----- kernel/trace/blktrace.c | 43 ++++------------- 10 files changed, 108 insertions(+), 230 deletions(-) (limited to 'kernel') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 79aa96240cec..031114d454a6 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -820,9 +820,9 @@ static void blkcg_fill_root_iostats(void) class_dev_iter_init(&iter, &block_class, NULL, &disk_type); while ((dev = class_dev_iter_next(&iter))) { - struct gendisk *disk = dev_to_disk(dev); - struct hd_struct *part = disk_get_part(disk, 0); - struct blkcg_gq *blkg = blk_queue_root_blkg(disk->queue); + struct block_device *bdev = dev_to_bdev(dev); + struct blkcg_gq *blkg = + blk_queue_root_blkg(bdev->bd_disk->queue); struct blkg_iostat tmp; int cpu; @@ -830,7 +830,7 @@ static void blkcg_fill_root_iostats(void) for_each_possible_cpu(cpu) { struct disk_stats *cpu_dkstats; - cpu_dkstats = per_cpu_ptr(part->bdev->bd_stats, cpu); + cpu_dkstats = per_cpu_ptr(bdev->bd_stats, cpu); tmp.ios[BLKG_IOSTAT_READ] += cpu_dkstats->ios[STAT_READ]; tmp.ios[BLKG_IOSTAT_WRITE] += diff --git a/block/blk.h b/block/blk.h index 9657c6da7c77..98f0b1ae2641 100644 --- a/block/blk.h +++ b/block/blk.h @@ -356,7 +356,7 @@ char *disk_name(struct gendisk *hd, int partno, char *buf); #define ADDPART_FLAG_NONE 0 #define ADDPART_FLAG_RAID 1 #define ADDPART_FLAG_WHOLEDISK 2 -void delete_partition(struct hd_struct *part); +void delete_partition(struct block_device *part); int bdev_add_partition(struct block_device *bdev, int partno, sector_t start, sector_t length); int bdev_del_partition(struct block_device *bdev, int partno); diff --git a/block/genhd.c b/block/genhd.c index 2d34dd2da4e9..0fabfc90b8e4 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -106,13 +106,14 @@ const char *bdevname(struct block_device *bdev, char *buf) } EXPORT_SYMBOL(bdevname); -static void part_stat_read_all(struct hd_struct *part, struct disk_stats *stat) +static void part_stat_read_all(struct block_device *part, + struct disk_stats *stat) { int cpu; memset(stat, 0, sizeof(struct disk_stats)); for_each_possible_cpu(cpu) { - struct disk_stats *ptr = per_cpu_ptr(part->bdev->bd_stats, cpu); + struct disk_stats *ptr = per_cpu_ptr(part->bd_stats, cpu); int group; for (group = 0; group < NR_STAT_GROUPS; group++) { @@ -167,39 +168,6 @@ struct block_device *__disk_get_part(struct gendisk *disk, int partno) return rcu_dereference(ptbl->part[partno]); } -/** - * disk_get_part - get partition - * @disk: disk to look partition from - * @partno: partition number - * - * Look for partition @partno from @disk. If found, increment - * reference count and return it. - * - * CONTEXT: - * Don't care. - * - * RETURNS: - * Pointer to the found partition on success, NULL if not found. 
- */ -struct hd_struct *disk_get_part(struct gendisk *disk, int partno) -{ - struct block_device *bdev; - struct hd_struct *part; - - rcu_read_lock(); - bdev = __disk_get_part(disk, partno); - if (!bdev) - goto fail; - part = bdev->bd_part; - if (!kobject_get_unless_zero(&part_to_dev(part)->kobj)) - goto fail; - rcu_read_unlock(); - return part; -fail: - rcu_read_unlock(); - return NULL; -} - /** * disk_part_iter_init - initialize partition iterator * @piter: iterator to initialize @@ -859,7 +827,7 @@ void del_gendisk(struct gendisk *disk) DISK_PITER_INCL_EMPTY | DISK_PITER_REVERSE); while ((part = disk_part_iter_next(&piter))) { invalidate_partition(part); - delete_partition(part->bd_part); + delete_partition(part); } disk_part_iter_exit(&piter); @@ -952,13 +920,13 @@ void blk_request_module(dev_t devt) */ struct block_device *bdget_disk(struct gendisk *disk, int partno) { - struct hd_struct *part; struct block_device *bdev = NULL; - part = disk_get_part(disk, partno); - if (part) - bdev = bdget_part(part); - disk_put_part(part); + rcu_read_lock(); + bdev = __disk_get_part(disk, partno); + if (bdev && !bdgrab(bdev)) + bdev = NULL; + rcu_read_unlock(); return bdev; } @@ -1175,24 +1143,22 @@ static ssize_t disk_ro_show(struct device *dev, ssize_t part_size_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct hd_struct *p = dev_to_part(dev); - - return sprintf(buf, "%llu\n", bdev_nr_sectors(p->bdev)); + return sprintf(buf, "%llu\n", bdev_nr_sectors(dev_to_bdev(dev))); } ssize_t part_stat_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct hd_struct *p = dev_to_part(dev); - struct request_queue *q = part_to_disk(p)->queue; + struct block_device *bdev = dev_to_bdev(dev); + struct request_queue *q = bdev->bd_disk->queue; struct disk_stats stat; unsigned int inflight; - part_stat_read_all(p, &stat); + part_stat_read_all(bdev, &stat); if (queue_is_mq(q)) - inflight = blk_mq_in_flight(q, p->bdev); + inflight = blk_mq_in_flight(q, bdev); else - inflight = part_in_flight(p->bdev); + inflight = part_in_flight(bdev); return sprintf(buf, "%8lu %8lu %8llu %8u " @@ -1227,14 +1193,14 @@ ssize_t part_stat_show(struct device *dev, ssize_t part_inflight_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct hd_struct *p = dev_to_part(dev); - struct request_queue *q = part_to_disk(p)->queue; + struct block_device *bdev = dev_to_bdev(dev); + struct request_queue *q = bdev->bd_disk->queue; unsigned int inflight[2]; if (queue_is_mq(q)) - blk_mq_in_flight_rw(q, p->bdev, inflight); + blk_mq_in_flight_rw(q, bdev, inflight); else - part_in_flight_rw(p->bdev, inflight); + part_in_flight_rw(bdev, inflight); return sprintf(buf, "%8u %8u\n", inflight[0], inflight[1]); } @@ -1282,20 +1248,17 @@ static DEVICE_ATTR(badblocks, 0644, disk_badblocks_show, disk_badblocks_store); ssize_t part_fail_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct hd_struct *p = dev_to_part(dev); - - return sprintf(buf, "%d\n", p->bdev->bd_make_it_fail); + return sprintf(buf, "%d\n", dev_to_bdev(dev)->bd_make_it_fail); } ssize_t part_fail_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { - struct hd_struct *p = dev_to_part(dev); int i; if (count > 0 && sscanf(buf, "%d", &i) > 0) - p->bdev->bd_make_it_fail = i; + dev_to_bdev(dev)->bd_make_it_fail = i; return count; } @@ -1505,7 +1468,7 @@ static int diskstats_show(struct seq_file *seqf, void *v) disk_part_iter_init(&piter, gp, DISK_PITER_INCL_EMPTY_PART0); 
while ((hd = disk_part_iter_next(&piter))) { - part_stat_read_all(hd->bd_part, &stat); + part_stat_read_all(hd, &stat); if (queue_is_mq(gp->queue)) inflight = blk_mq_in_flight(gp->queue, hd); else @@ -1577,7 +1540,7 @@ dev_t blk_lookup_devt(const char *name, int partno) class_dev_iter_init(&iter, &block_class, NULL, &disk_type); while ((dev = class_dev_iter_next(&iter))) { struct gendisk *disk = dev_to_disk(dev); - struct hd_struct *part; + struct block_device *part; if (strcmp(dev_name(dev), name)) continue; @@ -1590,13 +1553,12 @@ dev_t blk_lookup_devt(const char *name, int partno) MINOR(dev->devt) + partno); break; } - part = disk_get_part(disk, partno); + part = bdget_disk(disk, partno); if (part) { - devt = part_devt(part); - disk_put_part(part); + devt = part->bd_dev; + bdput(part); break; } - disk_put_part(part); } class_dev_iter_exit(&iter); return devt; diff --git a/block/partitions/core.c b/block/partitions/core.c index 4cb6df175f90..deca253583bd 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -182,44 +182,39 @@ static struct parsed_partitions *check_partition(struct gendisk *hd, static ssize_t part_partition_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct hd_struct *p = dev_to_part(dev); - - return sprintf(buf, "%d\n", p->bdev->bd_partno); + return sprintf(buf, "%d\n", dev_to_bdev(dev)->bd_partno); } static ssize_t part_start_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct hd_struct *p = dev_to_part(dev); - - return sprintf(buf, "%llu\n", p->bdev->bd_start_sect); + return sprintf(buf, "%llu\n", dev_to_bdev(dev)->bd_start_sect); } static ssize_t part_ro_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct hd_struct *p = dev_to_part(dev); - return sprintf(buf, "%d\n", p->bdev->bd_read_only); + return sprintf(buf, "%d\n", dev_to_bdev(dev)->bd_read_only); } static ssize_t part_alignment_offset_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct hd_struct *p = dev_to_part(dev); + struct block_device *bdev = dev_to_bdev(dev); return sprintf(buf, "%u\n", - queue_limit_alignment_offset(&part_to_disk(p)->queue->limits, - p->bdev->bd_start_sect)); + queue_limit_alignment_offset(&bdev->bd_disk->queue->limits, + bdev->bd_start_sect)); } static ssize_t part_discard_alignment_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct hd_struct *p = dev_to_part(dev); + struct block_device *bdev = dev_to_bdev(dev); return sprintf(buf, "%u\n", - queue_limit_discard_alignment(&part_to_disk(p)->queue->limits, - p->bdev->bd_start_sect)); + queue_limit_discard_alignment(&bdev->bd_disk->queue->limits, + bdev->bd_start_sect)); } static DEVICE_ATTR(partition, 0444, part_partition_show, NULL); @@ -264,20 +259,17 @@ static const struct attribute_group *part_attr_groups[] = { static void part_release(struct device *dev) { - struct hd_struct *p = dev_to_part(dev); - blk_free_devt(dev->devt); - bdput(p->bdev); + bdput(dev_to_bdev(dev)); } static int part_uevent(struct device *dev, struct kobj_uevent_env *env) { - struct hd_struct *part = dev_to_part(dev); + struct block_device *part = dev_to_bdev(dev); - add_uevent_var(env, "PARTN=%u", part->bdev->bd_partno); - if (part->bdev->bd_meta_info && part->bdev->bd_meta_info->volname[0]) - add_uevent_var(env, "PARTNAME=%s", - part->bdev->bd_meta_info->volname); + add_uevent_var(env, "PARTN=%u", part->bd_partno); + if (part->bd_meta_info && part->bd_meta_info->volname[0]) + add_uevent_var(env, "PARTNAME=%s", 
part->bd_meta_info->volname); return 0; } @@ -292,25 +284,25 @@ struct device_type part_type = { * Must be called either with bd_mutex held, before a disk can be opened or * after all disk users are gone. */ -void delete_partition(struct hd_struct *part) +void delete_partition(struct block_device *part) { - struct gendisk *disk = part_to_disk(part); + struct gendisk *disk = part->bd_disk; struct disk_part_tbl *ptbl = rcu_dereference_protected(disk->part_tbl, 1); - rcu_assign_pointer(ptbl->part[part->bdev->bd_partno], NULL); + rcu_assign_pointer(ptbl->part[part->bd_partno], NULL); rcu_assign_pointer(ptbl->last_lookup, NULL); - kobject_put(part->bdev->bd_holder_dir); - device_del(part_to_dev(part)); + kobject_put(part->bd_holder_dir); + device_del(&part->bd_device); /* * Remove the block device from the inode hash, so that it cannot be * looked up any more even when openers still hold references. */ - remove_inode_hash(part->bdev->bd_inode); + remove_inode_hash(part->bd_inode); - put_device(part_to_dev(part)); + put_device(&part->bd_device); } static ssize_t whole_disk_show(struct device *dev, @@ -324,11 +316,10 @@ static DEVICE_ATTR(whole_disk, 0444, whole_disk_show, NULL); * Must be called either with bd_mutex held, before a disk can be opened or * after all disk users are gone. */ -static struct hd_struct *add_partition(struct gendisk *disk, int partno, +static struct block_device *add_partition(struct gendisk *disk, int partno, sector_t start, sector_t len, int flags, struct partition_meta_info *info) { - struct hd_struct *p; dev_t devt = MKDEV(0, 0); struct device *ddev = disk_to_dev(disk); struct device *pdev; @@ -367,9 +358,6 @@ static struct hd_struct *add_partition(struct gendisk *disk, int partno, if (!bdev) return ERR_PTR(-ENOMEM); - p = bdev->bd_part; - pdev = part_to_dev(p); - bdev->bd_start_sect = start; bdev_set_nr_sectors(bdev, len); bdev->bd_read_only = get_disk_ro(disk); @@ -381,6 +369,7 @@ static struct hd_struct *add_partition(struct gendisk *disk, int partno, goto out_bdput; } + pdev = &bdev->bd_device; dname = dev_name(ddev); if (isdigit(dname[strlen(dname) - 1])) dev_set_name(pdev, "%sp%d", dname, partno); @@ -422,7 +411,7 @@ static struct hd_struct *add_partition(struct gendisk *disk, int partno, /* suppress uevent if the disk suppresses it */ if (!dev_get_uevent_suppress(ddev)) kobject_uevent(&pdev->kobj, KOBJ_ADD); - return p; + return bdev; out_bdput: bdput(bdev); @@ -459,7 +448,7 @@ static bool partition_overlaps(struct gendisk *disk, sector_t start, int bdev_add_partition(struct block_device *bdev, int partno, sector_t start, sector_t length) { - struct hd_struct *part; + struct block_device *part; mutex_lock(&bdev->bd_mutex); if (partition_overlaps(bdev->bd_disk, start, length, -1)) { @@ -475,76 +464,59 @@ int bdev_add_partition(struct block_device *bdev, int partno, int bdev_del_partition(struct block_device *bdev, int partno) { - struct block_device *bdevp; - struct hd_struct *part = NULL; + struct block_device *part; int ret; - bdevp = bdget_disk(bdev->bd_disk, partno); - if (!bdevp) + part = bdget_disk(bdev->bd_disk, partno); + if (!part) return -ENXIO; - mutex_lock(&bdevp->bd_mutex); + mutex_lock(&part->bd_mutex); mutex_lock_nested(&bdev->bd_mutex, 1); - ret = -ENXIO; - part = disk_get_part(bdev->bd_disk, partno); - if (!part) - goto out_unlock; - ret = -EBUSY; - if (bdevp->bd_openers) + if (part->bd_openers) goto out_unlock; - sync_blockdev(bdevp); - invalidate_bdev(bdevp); + sync_blockdev(part); + invalidate_bdev(part); delete_partition(part); ret = 0; 
out_unlock: mutex_unlock(&bdev->bd_mutex); - mutex_unlock(&bdevp->bd_mutex); - bdput(bdevp); - if (part) - disk_put_part(part); + mutex_unlock(&part->bd_mutex); + bdput(part); return ret; } int bdev_resize_partition(struct block_device *bdev, int partno, sector_t start, sector_t length) { - struct block_device *bdevp; - struct hd_struct *part; + struct block_device *part; int ret = 0; - part = disk_get_part(bdev->bd_disk, partno); + part = bdget_disk(bdev->bd_disk, partno); if (!part) return -ENXIO; - ret = -ENOMEM; - bdevp = bdget_part(part); - if (!bdevp) - goto out_put_part; - - mutex_lock(&bdevp->bd_mutex); + mutex_lock(&part->bd_mutex); mutex_lock_nested(&bdev->bd_mutex, 1); - ret = -EINVAL; - if (start != part->bdev->bd_start_sect) + if (start != part->bd_start_sect) goto out_unlock; ret = -EBUSY; if (partition_overlaps(bdev->bd_disk, start, length, partno)) goto out_unlock; - bdev_set_nr_sectors(bdevp, length); + bdev_set_nr_sectors(part, length); ret = 0; out_unlock: - mutex_unlock(&bdevp->bd_mutex); + mutex_unlock(&part->bd_mutex); mutex_unlock(&bdev->bd_mutex); - bdput(bdevp); -out_put_part: - disk_put_part(part); + bdput(part); return ret; } @@ -577,7 +549,7 @@ int blk_drop_partitions(struct block_device *bdev) disk_part_iter_init(&piter, bdev->bd_disk, DISK_PITER_INCL_EMPTY); while ((part = disk_part_iter_next(&piter))) - delete_partition(part->bd_part); + delete_partition(part); disk_part_iter_exit(&piter); return 0; @@ -592,7 +564,7 @@ static bool blk_add_partition(struct gendisk *disk, struct block_device *bdev, { sector_t size = state->parts[p].size; sector_t from = state->parts[p].from; - struct hd_struct *part; + struct block_device *part; if (!size) return true; @@ -632,7 +604,7 @@ static bool blk_add_partition(struct gendisk *disk, struct block_device *bdev, if (IS_BUILTIN(CONFIG_BLK_DEV_MD) && (state->parts[p].flags & ADDPART_FLAG_RAID)) - md_autodetect_dev(part_to_dev(part)->devt); + md_autodetect_dev(part->bd_dev); return true; } diff --git a/fs/block_dev.c b/fs/block_dev.c index 61cf33b6284f..a9905d8fd02b 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -39,7 +39,6 @@ struct bdev_inode { struct block_device bdev; - struct hd_struct hd; struct inode vfs_inode; }; @@ -887,9 +886,6 @@ struct block_device *bdev_alloc(struct gendisk *disk, u8 partno) iput(inode); return NULL; } - bdev->bd_part = &BDEV_I(inode)->hd; - memset(bdev->bd_part, 0, sizeof(*bdev->bd_part)); - bdev->bd_part->bdev = bdev; return bdev; } @@ -926,11 +922,6 @@ struct block_device *bdgrab(struct block_device *bdev) } EXPORT_SYMBOL(bdgrab); -struct block_device *bdget_part(struct hd_struct *part) -{ - return bdget(part_devt(part)); -} - long nr_blockdev_pages(void) { struct inode *inode; diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 6edea5c16259..866f74261b3b 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -8,6 +8,7 @@ #include #include +#include #include struct bio_set; @@ -30,6 +31,7 @@ struct block_device { struct super_block * bd_super; struct mutex bd_mutex; /* open/close mutex */ void * bd_claiming; + struct device bd_device; void * bd_holder; int bd_holders; bool bd_write_holder; @@ -38,7 +40,6 @@ struct block_device { #endif struct kobject *bd_holder_dir; u8 bd_partno; - struct hd_struct * bd_part; /* number of times partitions within this device have been opened. 
*/ unsigned bd_part_count; @@ -61,8 +62,11 @@ struct block_device { #define bdev_whole(_bdev) \ ((_bdev)->bd_disk->part0) +#define dev_to_bdev(device) \ + container_of((device), struct block_device, bd_device) + #define bdev_kobj(_bdev) \ - (&part_to_dev((_bdev)->bd_part)->kobj) + (&((_bdev)->bd_device.kobj)) /* * Block error status values. See block/blk-core:blk_errors for the details. diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 1d4be1fc6007..17cedf0dc83d 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1999,7 +1999,6 @@ void blkdev_put_no_open(struct block_device *bdev); struct block_device *bdev_alloc(struct gendisk *disk, u8 partno); void bdev_add(struct block_device *bdev, dev_t dev); struct block_device *I_BDEV(struct inode *inode); -struct block_device *bdget_part(struct hd_struct *part); struct block_device *bdgrab(struct block_device *bdev); void bdput(struct block_device *); diff --git a/include/linux/genhd.h b/include/linux/genhd.h index cd23c80265b2..809aaa32d53c 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -19,12 +19,6 @@ #include #include -#define dev_to_part(device) container_of((device), struct hd_struct, __dev) -#define part_to_dev(part) (&((part)->__dev)) - -#define dev_to_disk(device) (dev_to_part(device)->bdev->bd_disk) -#define disk_to_dev(disk) (part_to_dev((disk)->part0->bd_part)) - extern const struct device_type disk_type; extern struct device_type part_type; extern struct class block_class; @@ -51,11 +45,6 @@ struct partition_meta_info { u8 volname[PARTITION_META_INFO_VOLNAMELTH]; }; -struct hd_struct { - struct block_device *bdev; - struct device __dev; -}; - /** * DOC: genhd capability flags * @@ -190,19 +179,21 @@ struct gendisk { struct lockdep_map lockdep_map; }; +/* + * The gendisk is refcounted by the part0 block_device, and the bd_device + * therein is also used for device model presentation in sysfs. + */ +#define dev_to_disk(device) \ + (dev_to_bdev(device)->bd_disk) +#define disk_to_dev(disk) \ + (&((disk)->part0->bd_device)) + #if IS_REACHABLE(CONFIG_CDROM) #define disk_to_cdi(disk) ((disk)->cdi) #else #define disk_to_cdi(disk) NULL #endif -static inline struct gendisk *part_to_disk(struct hd_struct *part) -{ - if (unlikely(!part)) - return NULL; - return part->bdev->bd_disk; -} - static inline int disk_max_parts(struct gendisk *disk) { if (disk->flags & GENHD_FL_EXT_DEVT) @@ -221,19 +212,6 @@ static inline dev_t disk_devt(struct gendisk *disk) return MKDEV(disk->major, disk->first_minor); } -static inline dev_t part_devt(struct hd_struct *part) -{ - return part_to_dev(part)->devt; -} - -extern struct hd_struct *disk_get_part(struct gendisk *disk, int partno); - -static inline void disk_put_part(struct hd_struct *part) -{ - if (likely(part)) - put_device(part_to_dev(part)); -} - /* * Smarter partition iterator without context limits. 
*/ diff --git a/init/do_mounts.c b/init/do_mounts.c index 86bef93e72eb..a78e44ee6adb 100644 --- a/init/do_mounts.c +++ b/init/do_mounts.c @@ -76,11 +76,11 @@ struct uuidcmp { */ static int match_dev_by_uuid(struct device *dev, const void *data) { + struct block_device *bdev = dev_to_bdev(dev); const struct uuidcmp *cmp = data; - struct hd_struct *part = dev_to_part(dev); - if (!part->bdev->bd_meta_info || - strncasecmp(cmp->uuid, part->bdev->bd_meta_info->uuid, cmp->len)) + if (!bdev->bd_meta_info || + strncasecmp(cmp->uuid, bdev->bd_meta_info->uuid, cmp->len)) return 0; return 1; } @@ -133,13 +133,13 @@ static dev_t devt_from_partuuid(const char *uuid_str) * Attempt to find the requested partition by adding an offset * to the partition number found by UUID. */ - struct hd_struct *part; + struct block_device *part; - part = disk_get_part(dev_to_disk(dev), - dev_to_part(dev)->bdev->bd_partno + offset); + part = bdget_disk(dev_to_disk(dev), + dev_to_bdev(dev)->bd_partno + offset); if (part) { - devt = part_devt(part); - put_device(part_to_dev(part)); + devt = part->bd_dev; + bdput(part); } } else { devt = dev->devt; @@ -166,11 +166,10 @@ clear_root_wait: */ static int match_dev_by_label(struct device *dev, const void *data) { + struct block_device *bdev = dev_to_bdev(dev); const char *label = data; - struct hd_struct *part = dev_to_part(dev); - if (!part->bdev->bd_meta_info || - strcmp(label, part->bdev->bd_meta_info->volname)) + if (!bdev->bd_meta_info || strcmp(label, bdev->bd_meta_info->volname)) return 0; return 1; } diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 8a723a91ec5a..a482a37848bf 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -1810,30 +1810,15 @@ static ssize_t blk_trace_mask2str(char *buf, int mask) return p - buf; } -static struct request_queue *blk_trace_get_queue(struct block_device *bdev) -{ - if (bdev->bd_disk == NULL) - return NULL; - - return bdev_get_queue(bdev); -} - static ssize_t sysfs_blk_trace_attr_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct block_device *bdev = bdget_part(dev_to_part(dev)); - struct request_queue *q; + struct block_device *bdev = dev_to_bdev(dev); + struct request_queue *q = bdev_get_queue(bdev); struct blk_trace *bt; ssize_t ret = -ENXIO; - if (bdev == NULL) - goto out; - - q = blk_trace_get_queue(bdev); - if (q == NULL) - goto out_bdput; - mutex_lock(&q->debugfs_mutex); bt = rcu_dereference_protected(q->blk_trace, @@ -1856,9 +1841,6 @@ static ssize_t sysfs_blk_trace_attr_show(struct device *dev, out_unlock_bdev: mutex_unlock(&q->debugfs_mutex); -out_bdput: - bdput(bdev); -out: return ret; } @@ -1866,8 +1848,8 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { - struct block_device *bdev; - struct request_queue *q; + struct block_device *bdev = dev_to_bdev(dev); + struct request_queue *q = bdev_get_queue(bdev); struct blk_trace *bt; u64 value; ssize_t ret = -EINVAL; @@ -1883,17 +1865,10 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev, goto out; value = ret; } - } else if (kstrtoull(buf, 0, &value)) - goto out; - - ret = -ENXIO; - bdev = bdget_part(dev_to_part(dev)); - if (bdev == NULL) - goto out; - - q = blk_trace_get_queue(bdev); - if (q == NULL) - goto out_bdput; + } else { + if (kstrtoull(buf, 0, &value)) + goto out; + } mutex_lock(&q->debugfs_mutex); @@ -1931,8 +1906,6 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev, out_unlock_bdev: 
mutex_unlock(&q->debugfs_mutex); -out_bdput: - bdput(bdev); out: return ret ? ret : count; } -- cgit v1.2.3 From 5b7be9c709e10e88531f1f81e1150bbad65be1aa Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Mon, 30 Nov 2020 23:37:33 -0500 Subject: ring-buffer: Add test to validate the time stamp deltas While debugging a situation where a delta for an event was calucalted wrong, I realize there was nothing making sure that the delta of events are correct. If a single event has an incorrect delta, then all events after it will also have one. If the discrepency gets large enough, it could cause the time stamps to go backwards when crossing sub buffers, that record a full 64 bit time stamp, and the new deltas are added to that. Add a way to validate the events at most events and when crossing a buffer page. This will help make sure that the deltas are always correct. This test will detect if they are ever corrupted. The test adds a high overhead to the ring buffer recording, as it does the audit for almost every event, and should only be used for testing the ring buffer. This will catch the bug that is fixed by commit 55ea4cf40380 ("ring-buffer: Update write stamp with the correct ts"), which is not applied when this commit is applied. Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/Kconfig | 20 ++++++ kernel/trace/ring_buffer.c | 150 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 170 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index c9b64dea1216..fe60f9d7a0e6 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -845,6 +845,26 @@ config RING_BUFFER_STARTUP_TEST If unsure, say N +config RING_BUFFER_VALIDATE_TIME_DELTAS + bool "Verify ring buffer time stamp deltas" + depends on RING_BUFFER + help + This will audit the time stamps on the ring buffer sub + buffer to make sure that all the time deltas for the + events on a sub buffer matches the current time stamp. + This audit is performed for every event that is not + interrupted, or interrupting another event. A check + is also made when traversing sub buffers to make sure + that all the deltas on the previous sub buffer do not + add up to be greater than the current time stamp. + + NOTE: This adds significant overhead to recording of events, + and should only be used to test the logic of the ring buffer. + Do not use it on production systems. + + Only say Y if you understand what this does, and you + still want it enabled. Otherwise say N + config MMIOTRACE_TEST tristate "Test module for mmiotrace" depends on MMIOTRACE && m diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index ab68f28b8f4b..7cd888ee9ac7 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -3193,6 +3193,153 @@ int ring_buffer_unlock_commit(struct trace_buffer *buffer, } EXPORT_SYMBOL_GPL(ring_buffer_unlock_commit); +/* Special value to validate all deltas on a page. 
*/ +#define CHECK_FULL_PAGE 1L + +#ifdef CONFIG_RING_BUFFER_VALIDATE_TIME_DELTAS +static void dump_buffer_page(struct buffer_data_page *bpage, + struct rb_event_info *info, + unsigned long tail) +{ + struct ring_buffer_event *event; + u64 ts, delta; + int e; + + ts = bpage->time_stamp; + pr_warn(" [%lld] PAGE TIME STAMP\n", ts); + + for (e = 0; e < tail; e += rb_event_length(event)) { + + event = (struct ring_buffer_event *)(bpage->data + e); + + switch (event->type_len) { + + case RINGBUF_TYPE_TIME_EXTEND: + delta = ring_buffer_event_time_stamp(event); + ts += delta; + pr_warn(" [%lld] delta:%lld TIME EXTEND\n", ts, delta); + break; + + case RINGBUF_TYPE_TIME_STAMP: + delta = ring_buffer_event_time_stamp(event); + ts = delta; + pr_warn(" [%lld] absolute:%lld TIME STAMP\n", ts, delta); + break; + + case RINGBUF_TYPE_PADDING: + ts += event->time_delta; + pr_warn(" [%lld] delta:%d PADDING\n", ts, event->time_delta); + break; + + case RINGBUF_TYPE_DATA: + ts += event->time_delta; + pr_warn(" [%lld] delta:%d\n", ts, event->time_delta); + break; + + default: + break; + } + } +} + +static DEFINE_PER_CPU(atomic_t, checking); +static atomic_t ts_dump; + +/* + * Check if the current event time stamp matches the deltas on + * the buffer page. + */ +static void check_buffer(struct ring_buffer_per_cpu *cpu_buffer, + struct rb_event_info *info, + unsigned long tail) +{ + struct ring_buffer_event *event; + struct buffer_data_page *bpage; + u64 ts, delta; + bool full = false; + int e; + + bpage = info->tail_page->page; + + if (tail == CHECK_FULL_PAGE) { + full = true; + tail = local_read(&bpage->commit); + } else if (info->add_timestamp & + (RB_ADD_STAMP_FORCE | RB_ADD_STAMP_ABSOLUTE)) { + /* Ignore events with absolute time stamps */ + return; + } + + /* + * Do not check the first event (skip possible extends too). + * Also do not check if previous events have not been committed. 
+ */ + if (tail <= 8 || tail > local_read(&bpage->commit)) + return; + + /* + * If this interrupted another event, + */ + if (atomic_inc_return(this_cpu_ptr(&checking)) != 1) + goto out; + + ts = bpage->time_stamp; + + for (e = 0; e < tail; e += rb_event_length(event)) { + + event = (struct ring_buffer_event *)(bpage->data + e); + + switch (event->type_len) { + + case RINGBUF_TYPE_TIME_EXTEND: + delta = ring_buffer_event_time_stamp(event); + ts += delta; + break; + + case RINGBUF_TYPE_TIME_STAMP: + delta = ring_buffer_event_time_stamp(event); + ts = delta; + break; + + case RINGBUF_TYPE_PADDING: + if (event->time_delta == 1) + break; + /* fall through */ + case RINGBUF_TYPE_DATA: + ts += event->time_delta; + break; + + default: + RB_WARN_ON(cpu_buffer, 1); + } + } + if ((full && ts > info->ts) || + (!full && ts + info->delta != info->ts)) { + /* If another report is happening, ignore this one */ + if (atomic_inc_return(&ts_dump) != 1) { + atomic_dec(&ts_dump); + goto out; + } + atomic_inc(&cpu_buffer->record_disabled); + pr_warn("[CPU: %d]TIME DOES NOT MATCH expected:%lld actual:%lld delta:%lld after:%lld\n", + cpu_buffer->cpu, + ts + info->delta, info->ts, info->delta, info->after); + dump_buffer_page(bpage, info, tail); + atomic_dec(&ts_dump); + /* Do not re-enable checking */ + return; + } +out: + atomic_dec(this_cpu_ptr(&checking)); +} +#else +static inline void check_buffer(struct ring_buffer_per_cpu *cpu_buffer, + struct rb_event_info *info, + unsigned long tail) +{ +} +#endif /* CONFIG_RING_BUFFER_VALIDATE_TIME_DELTAS */ + static struct ring_buffer_event * __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, struct rb_event_info *info) @@ -3252,6 +3399,8 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, (void)rb_time_cmpxchg(&cpu_buffer->before_stamp, info->before, info->after); } + if (a_ok && b_ok) + check_buffer(cpu_buffer, info, CHECK_FULL_PAGE); return rb_move_tail(cpu_buffer, tail, info); } @@ -3272,6 +3421,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, /* Just use full timestamp for inerrupting event */ info->delta = info->ts; barrier(); + check_buffer(cpu_buffer, info, tail); if (unlikely(info->ts != save_before)) { /* SLOW PATH - Interrupted between C and E */ -- cgit v1.2.3 From 1446e1df9eb183fdf81c3f0715402f1d7595d4cb Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Fri, 27 Nov 2020 14:32:34 -0500 Subject: kernel: Implement selective syscall userspace redirection Introduce a mechanism to quickly disable/enable syscall handling for a specific process and redirect to userspace via SIGSYS. This is useful for processes with parts that require syscall redirection and parts that don't, but who need to perform this boundary crossing really fast, without paying the cost of a system call to reconfigure syscall handling on each boundary transition. This is particularly important for Windows games running over Wine. The proposed interface looks like this: prctl(PR_SET_SYSCALL_USER_DISPATCH, , , , [selector]) The range [,+) is a part of the process memory map that is allowed to by-pass the redirection code and dispatch syscalls directly, such that in fast paths a process doesn't need to disable the trap nor the kernel has to check the selector. This is essential to return from SIGSYS to a blocked area without triggering another SIGSYS from rt_sigreturn. selector is an optional pointer to a char-sized userspace memory region that has a key switch for the mechanism. 
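For illustration only (none of this is part of the series): a minimal userspace sketch of how the interface described here might be exercised, assuming an x86-64 glibc environment. The constants mirror the uapi additions in the diff below, and the selector values follow the PR_SYS_DISPATCH_ON/OFF convention this version of the patch expects.

	#include <signal.h>
	#include <stdio.h>
	#include <sys/prctl.h>
	#include <unistd.h>

	/* Mirrors the uapi additions below, for headers that predate them. */
	#ifndef PR_SET_SYSCALL_USER_DISPATCH
	# define PR_SET_SYSCALL_USER_DISPATCH	59
	# define PR_SYS_DISPATCH_OFF		0
	# define PR_SYS_DISPATCH_ON		1
	#endif

	/* The char-sized key switch the kernel reads on every syscall. */
	static volatile char selector = PR_SYS_DISPATCH_OFF;

	static void sigsys_handler(int sig, siginfo_t *info, void *ctx)
	{
		(void)sig; (void)info; (void)ctx;
		/* Allow direct syscalls again so the handler and sigreturn proceed. */
		selector = PR_SYS_DISPATCH_OFF;
	}

	int main(void)
	{
		struct sigaction act = { 0 };

		act.sa_sigaction = sigsys_handler;
		act.sa_flags = SA_SIGINFO;
		sigaction(SIGSYS, &act, NULL);

		/* No always-allowed region (offset = len = 0); rely on the selector. */
		if (prctl(PR_SET_SYSCALL_USER_DISPATCH, PR_SYS_DISPATCH_ON,
			  0, 0, &selector))
			return 1;

		selector = PR_SYS_DISPATCH_ON;	/* redirect syscalls from here on */
		getpid();			/* rolled back and delivered as SIGSYS */
		printf("redirection caught, direct syscalls restored\n");
		return 0;
	}

A real dispatcher would emulate the trapped syscall (its number arrives in info->si_syscall) before flipping the switch back; the sketch only demonstrates the round trip.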
This key switch is set to either PR_SYS_DISPATCH_ON, PR_SYS_DISPATCH_OFF to enable and disable the redirection without calling the kernel. The feature is meant to be set per-thread and it is disabled on fork/clone/execv. Internally, this doesn't add overhead to the syscall hot path, and it requires very little per-architecture support. I avoided using seccomp, even though it duplicates some functionality, due to previous feedback that maybe it shouldn't mix with seccomp since it is not a security mechanism. And obviously, this should never be considered a security mechanism, since any part of the program can by-pass it by using the syscall dispatcher. For the sysinfo benchmark, which measures the overhead added to executing a native syscall that doesn't require interception, the overhead using only the direct dispatcher region to issue syscalls is pretty much irrelevant. The overhead of using the selector goes around 40ns for a native (unredirected) syscall in my system, and it is (as expected) dominated by the supervisor-mode user-address access. In fact, with SMAP off, the overhead is consistently less than 5ns on my test box. Signed-off-by: Gabriel Krisman Bertazi Signed-off-by: Thomas Gleixner Reviewed-by: Andy Lutomirski Acked-by: Peter Zijlstra (Intel) Acked-by: Kees Cook Link: https://lore.kernel.org/r/20201127193238.821364-4-krisman@collabora.com --- fs/exec.c | 3 + include/linux/sched.h | 2 + include/linux/syscall_user_dispatch.h | 40 +++++++++++++ include/linux/thread_info.h | 2 + include/uapi/linux/prctl.h | 5 ++ kernel/entry/Makefile | 2 +- kernel/entry/common.h | 7 +++ kernel/entry/syscall_user_dispatch.c | 104 ++++++++++++++++++++++++++++++++++ kernel/fork.c | 1 + kernel/sys.c | 5 ++ 10 files changed, 170 insertions(+), 1 deletion(-) create mode 100644 include/linux/syscall_user_dispatch.h create mode 100644 kernel/entry/common.h create mode 100644 kernel/entry/syscall_user_dispatch.c (limited to 'kernel') diff --git a/fs/exec.c b/fs/exec.c index 547a2390baf5..aee36e5733ce 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -64,6 +64,7 @@ #include #include #include +#include #include #include @@ -1302,6 +1303,8 @@ int begin_new_exec(struct linux_binprm * bprm) flush_thread(); me->personality &= ~bprm->per_clear; + clear_syscall_work_syscall_user_dispatch(me); + /* * We have to apply CLOEXEC before we change whether the process is * dumpable (in setup_new_exec) to avoid a race with a process in userspace diff --git a/include/linux/sched.h b/include/linux/sched.h index 063cd120b459..5a24a033b3f8 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -28,6 +28,7 @@ #include #include #include +#include #include #include #include @@ -965,6 +966,7 @@ struct task_struct { unsigned int sessionid; #endif struct seccomp seccomp; + struct syscall_user_dispatch syscall_dispatch; /* Thread group tracking: */ u64 parent_exec_id; diff --git a/include/linux/syscall_user_dispatch.h b/include/linux/syscall_user_dispatch.h new file mode 100644 index 000000000000..a0ae443fb7df --- /dev/null +++ b/include/linux/syscall_user_dispatch.h @@ -0,0 +1,40 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2020 Collabora Ltd. 
+ */ +#ifndef _SYSCALL_USER_DISPATCH_H +#define _SYSCALL_USER_DISPATCH_H + +#include + +#ifdef CONFIG_GENERIC_ENTRY + +struct syscall_user_dispatch { + char __user *selector; + unsigned long offset; + unsigned long len; + bool on_dispatch; +}; + +int set_syscall_user_dispatch(unsigned long mode, unsigned long offset, + unsigned long len, char __user *selector); + +#define clear_syscall_work_syscall_user_dispatch(tsk) \ + clear_task_syscall_work(tsk, SYSCALL_USER_DISPATCH) + +#else +struct syscall_user_dispatch {}; + +static inline int set_syscall_user_dispatch(unsigned long mode, unsigned long offset, + unsigned long len, char __user *selector) +{ + return -EINVAL; +} + +static inline void clear_syscall_work_syscall_user_dispatch(struct task_struct *tsk) +{ +} + +#endif /* CONFIG_GENERIC_ENTRY */ + +#endif /* _SYSCALL_USER_DISPATCH_H */ diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h index ca80a214df09..c8a974cead73 100644 --- a/include/linux/thread_info.h +++ b/include/linux/thread_info.h @@ -42,6 +42,7 @@ enum syscall_work_bit { SYSCALL_WORK_BIT_SYSCALL_TRACE, SYSCALL_WORK_BIT_SYSCALL_EMU, SYSCALL_WORK_BIT_SYSCALL_AUDIT, + SYSCALL_WORK_BIT_SYSCALL_USER_DISPATCH, }; #define SYSCALL_WORK_SECCOMP BIT(SYSCALL_WORK_BIT_SECCOMP) @@ -49,6 +50,7 @@ enum syscall_work_bit { #define SYSCALL_WORK_SYSCALL_TRACE BIT(SYSCALL_WORK_BIT_SYSCALL_TRACE) #define SYSCALL_WORK_SYSCALL_EMU BIT(SYSCALL_WORK_BIT_SYSCALL_EMU) #define SYSCALL_WORK_SYSCALL_AUDIT BIT(SYSCALL_WORK_BIT_SYSCALL_AUDIT) +#define SYSCALL_WORK_SYSCALL_USER_DISPATCH BIT(SYSCALL_WORK_BIT_SYSCALL_USER_DISPATCH) #endif #include diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h index 7f0827705c9a..90deb41c8a34 100644 --- a/include/uapi/linux/prctl.h +++ b/include/uapi/linux/prctl.h @@ -247,4 +247,9 @@ struct prctl_mm_map { #define PR_SET_IO_FLUSHER 57 #define PR_GET_IO_FLUSHER 58 +/* Dispatch syscalls to a userspace handler */ +#define PR_SET_SYSCALL_USER_DISPATCH 59 +# define PR_SYS_DISPATCH_OFF 0 +# define PR_SYS_DISPATCH_ON 1 + #endif /* _LINUX_PRCTL_H */ diff --git a/kernel/entry/Makefile b/kernel/entry/Makefile index 34c8a3f1c735..095c775e001e 100644 --- a/kernel/entry/Makefile +++ b/kernel/entry/Makefile @@ -9,5 +9,5 @@ KCOV_INSTRUMENT := n CFLAGS_REMOVE_common.o = -fstack-protector -fstack-protector-strong CFLAGS_common.o += -fno-stack-protector -obj-$(CONFIG_GENERIC_ENTRY) += common.o +obj-$(CONFIG_GENERIC_ENTRY) += common.o syscall_user_dispatch.o obj-$(CONFIG_KVM_XFER_TO_GUEST_WORK) += kvm.o diff --git a/kernel/entry/common.h b/kernel/entry/common.h new file mode 100644 index 000000000000..f6e6d02f07fe --- /dev/null +++ b/kernel/entry/common.h @@ -0,0 +1,7 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _COMMON_H +#define _COMMON_H + +bool syscall_user_dispatch(struct pt_regs *regs); + +#endif diff --git a/kernel/entry/syscall_user_dispatch.c b/kernel/entry/syscall_user_dispatch.c new file mode 100644 index 000000000000..b0338a5625d9 --- /dev/null +++ b/kernel/entry/syscall_user_dispatch.c @@ -0,0 +1,104 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2020 Collabora Ltd. 
+ */ +#include +#include +#include +#include +#include +#include + +#include +#include + +#include + +#include "common.h" + +static void trigger_sigsys(struct pt_regs *regs) +{ + struct kernel_siginfo info; + + clear_siginfo(&info); + info.si_signo = SIGSYS; + info.si_code = SYS_USER_DISPATCH; + info.si_call_addr = (void __user *)KSTK_EIP(current); + info.si_errno = 0; + info.si_arch = syscall_get_arch(current); + info.si_syscall = syscall_get_nr(current, regs); + + force_sig_info(&info); +} + +bool syscall_user_dispatch(struct pt_regs *regs) +{ + struct syscall_user_dispatch *sd = ¤t->syscall_dispatch; + char state; + + if (likely(instruction_pointer(regs) - sd->offset < sd->len)) + return false; + + if (unlikely(arch_syscall_is_vdso_sigreturn(regs))) + return false; + + if (likely(sd->selector)) { + /* + * access_ok() is performed once, at prctl time, when + * the selector is loaded by userspace. + */ + if (unlikely(__get_user(state, sd->selector))) + do_exit(SIGSEGV); + + if (likely(state == PR_SYS_DISPATCH_OFF)) + return false; + + if (state != PR_SYS_DISPATCH_ON) + do_exit(SIGSYS); + } + + sd->on_dispatch = true; + syscall_rollback(current, regs); + trigger_sigsys(regs); + + return true; +} + +int set_syscall_user_dispatch(unsigned long mode, unsigned long offset, + unsigned long len, char __user *selector) +{ + switch (mode) { + case PR_SYS_DISPATCH_OFF: + if (offset || len || selector) + return -EINVAL; + break; + case PR_SYS_DISPATCH_ON: + /* + * Validate the direct dispatcher region just for basic + * sanity against overflow and a 0-sized dispatcher + * region. If the user is able to submit a syscall from + * an address, that address is obviously valid. + */ + if (offset && offset + len <= offset) + return -EINVAL; + + if (selector && !access_ok(selector, sizeof(*selector))) + return -EFAULT; + + break; + default: + return -EINVAL; + } + + current->syscall_dispatch.selector = selector; + current->syscall_dispatch.offset = offset; + current->syscall_dispatch.len = len; + current->syscall_dispatch.on_dispatch = false; + + if (mode == PR_SYS_DISPATCH_ON) + set_syscall_work(SYSCALL_USER_DISPATCH); + else + clear_syscall_work(SYSCALL_USER_DISPATCH); + + return 0; +} diff --git a/kernel/fork.c b/kernel/fork.c index 02b689a23457..4a5ecb41f440 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -906,6 +906,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) clear_user_return_notifier(tsk); clear_tsk_need_resched(tsk); set_task_stack_end_magic(tsk); + clear_syscall_work_syscall_user_dispatch(tsk); #ifdef CONFIG_STACKPROTECTOR tsk->stack_canary = get_random_canary(); diff --git a/kernel/sys.c b/kernel/sys.c index a730c03ee607..51f00fe20e4d 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -42,6 +42,7 @@ #include #include #include +#include #include #include @@ -2530,6 +2531,10 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, error = (current->flags & PR_IO_FLUSHER) == PR_IO_FLUSHER; break; + case PR_SET_SYSCALL_USER_DISPATCH: + error = set_syscall_user_dispatch(arg2, arg3, arg4, + (char __user *) arg5); + break; default: error = -EINVAL; break; -- cgit v1.2.3 From 11894468e39def270199f845b76df6c36d4ed133 Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Fri, 27 Nov 2020 14:32:35 -0500 Subject: entry: Support Syscall User Dispatch on common syscall entry Syscall User Dispatch (SUD) must take precedence over seccomp and ptrace, since the use case is emulation (it can be invoked with a different ABI) such that seccomp 
filtering by syscall number doesn't make sense in the first place. In addition, either the syscall is dispatched back to userspace, in which case there is no resource for to trace, or the syscall will be executed, and seccomp/ptrace will execute next. Since SUD runs before tracepoints, it needs to be a SYSCALL_WORK_EXIT as well, just to prevent a trace exit event when dispatch was triggered. For that, the on_syscall_dispatch() examines context to skip the tracepoint, audit and other work. [ tglx: Add a comment on the exit side ] Signed-off-by: Gabriel Krisman Bertazi Signed-off-by: Thomas Gleixner Reviewed-by: Andy Lutomirski Acked-by: Peter Zijlstra (Intel) Acked-by: Kees Cook Link: https://lore.kernel.org/r/20201127193238.821364-5-krisman@collabora.com --- include/linux/entry-common.h | 2 ++ kernel/entry/common.c | 25 +++++++++++++++++++++++++ 2 files changed, 27 insertions(+) (limited to 'kernel') diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h index 49b26b216e4e..a6e98b4ba8e9 100644 --- a/include/linux/entry-common.h +++ b/include/linux/entry-common.h @@ -44,10 +44,12 @@ SYSCALL_WORK_SYSCALL_TRACE | \ SYSCALL_WORK_SYSCALL_EMU | \ SYSCALL_WORK_SYSCALL_AUDIT | \ + SYSCALL_WORK_SYSCALL_USER_DISPATCH | \ ARCH_SYSCALL_WORK_ENTER) #define SYSCALL_WORK_EXIT (SYSCALL_WORK_SYSCALL_TRACEPOINT | \ SYSCALL_WORK_SYSCALL_TRACE | \ SYSCALL_WORK_SYSCALL_AUDIT | \ + SYSCALL_WORK_SYSCALL_USER_DISPATCH | \ ARCH_SYSCALL_WORK_EXIT) /* diff --git a/kernel/entry/common.c b/kernel/entry/common.c index 91e8fd50adf4..e661e70ffcf3 100644 --- a/kernel/entry/common.c +++ b/kernel/entry/common.c @@ -5,6 +5,8 @@ #include #include +#include "common.h" + #define CREATE_TRACE_POINTS #include @@ -46,6 +48,16 @@ static long syscall_trace_enter(struct pt_regs *regs, long syscall, { long ret = 0; + /* + * Handle Syscall User Dispatch. This must comes first, since + * the ABI here can be something that doesn't make sense for + * other syscall_work features. + */ + if (work & SYSCALL_WORK_SYSCALL_USER_DISPATCH) { + if (syscall_user_dispatch(regs)) + return -1L; + } + /* Handle ptrace */ if (work & (SYSCALL_WORK_SYSCALL_TRACE | SYSCALL_WORK_SYSCALL_EMU)) { ret = arch_syscall_enter_tracehook(regs); @@ -230,6 +242,19 @@ static void syscall_exit_work(struct pt_regs *regs, unsigned long work) { bool step; + /* + * If the syscall was rolled back due to syscall user dispatching, + * then the tracers below are not invoked for the same reason as + * the entry side was not invoked in syscall_trace_enter(): The ABI + * of these syscalls is unknown. + */ + if (work & SYSCALL_WORK_SYSCALL_USER_DISPATCH) { + if (unlikely(current->syscall_dispatch.on_dispatch)) { + current->syscall_dispatch.on_dispatch = false; + return; + } + } + audit_syscall_exit(regs); if (work & SYSCALL_WORK_SYSCALL_TRACEPOINT) -- cgit v1.2.3 From 6666bb714fb3bc7b2e8be72b9c92f2d8a89ea2dc Mon Sep 17 00:00:00 2001 From: Sven Schnelle Date: Tue, 1 Dec 2020 15:27:51 +0100 Subject: entry: Rename enter_from_user_mode() In order to make this function publicly available rename it so it can still be inlined. An additional enter_from_user_mode() function will be added with a later commit. 
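For readers following the series, this rename plus the wrapper commit two patches below converge on the following shape (a sketch of the end result, not the exact hunks):

	/*
	 * The worker stays static __always_inline, so its existing callers
	 * keep inlining it on the fast path.
	 */
	static __always_inline void __enter_from_user_mode(struct pt_regs *regs)
	{
		/* lockdep, context tracking and trace state, unchanged */
	}

	/* Added two commits later: an out-of-line, noinstr entry point for arch code. */
	void noinstr enter_from_user_mode(struct pt_regs *regs)
	{
		__enter_from_user_mode(regs);
	}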
Signed-off-by: Sven Schnelle Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20201201142755.31931-2-svens@linux.ibm.com --- kernel/entry/common.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/entry/common.c b/kernel/entry/common.c index e661e70ffcf3..8e294a794e93 100644 --- a/kernel/entry/common.c +++ b/kernel/entry/common.c @@ -11,7 +11,7 @@ #include /** - * enter_from_user_mode - Establish state when coming from user mode + * __enter_from_user_mode - Establish state when coming from user mode * * Syscall/interrupt entry disables interrupts, but user mode is traced as * interrupts enabled. Also with NO_HZ_FULL RCU might be idle. @@ -20,7 +20,7 @@ * 2) Invoke context tracking if enabled to reactivate RCU * 3) Trace interrupts off state */ -static __always_inline void enter_from_user_mode(struct pt_regs *regs) +static __always_inline void __enter_from_user_mode(struct pt_regs *regs) { arch_check_user_regs(regs); lockdep_hardirqs_off(CALLER_ADDR0); @@ -103,7 +103,7 @@ noinstr long syscall_enter_from_user_mode(struct pt_regs *regs, long syscall) { long ret; - enter_from_user_mode(regs); + __enter_from_user_mode(regs); instrumentation_begin(); local_irq_enable(); @@ -115,7 +115,7 @@ noinstr long syscall_enter_from_user_mode(struct pt_regs *regs, long syscall) noinstr void syscall_enter_from_user_mode_prepare(struct pt_regs *regs) { - enter_from_user_mode(regs); + __enter_from_user_mode(regs); instrumentation_begin(); local_irq_enable(); instrumentation_end(); @@ -304,7 +304,7 @@ __visible noinstr void syscall_exit_to_user_mode(struct pt_regs *regs) noinstr void irqentry_enter_from_user_mode(struct pt_regs *regs) { - enter_from_user_mode(regs); + __enter_from_user_mode(regs); } noinstr void irqentry_exit_to_user_mode(struct pt_regs *regs) -- cgit v1.2.3 From bb793562f0da7317adf6c456316bca651ff46f5d Mon Sep 17 00:00:00 2001 From: Sven Schnelle Date: Tue, 1 Dec 2020 15:27:52 +0100 Subject: entry: Rename exit_to_user_mode() In order to make this function publicly available rename it so it can still be inlined. An additional exit_to_user_mode() function will be added with a later commit. Signed-off-by: Sven Schnelle Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20201201142755.31931-3-svens@linux.ibm.com --- kernel/entry/common.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/entry/common.c b/kernel/entry/common.c index 8e294a794e93..dff07b4ce6ec 100644 --- a/kernel/entry/common.c +++ b/kernel/entry/common.c @@ -122,7 +122,7 @@ noinstr void syscall_enter_from_user_mode_prepare(struct pt_regs *regs) } /** - * exit_to_user_mode - Fixup state when exiting to user mode + * __exit_to_user_mode - Fixup state when exiting to user mode * * Syscall/interupt exit enables interrupts, but the kernel state is * interrupts disabled when this is invoked. Also tell RCU about it. @@ -133,7 +133,7 @@ noinstr void syscall_enter_from_user_mode_prepare(struct pt_regs *regs) * mitigations, etc. 
* 4) Tell lockdep that interrupts are enabled */ -static __always_inline void exit_to_user_mode(void) +static __always_inline void __exit_to_user_mode(void) { instrumentation_begin(); trace_hardirqs_on_prepare(); @@ -299,7 +299,7 @@ __visible noinstr void syscall_exit_to_user_mode(struct pt_regs *regs) local_irq_disable_exit_to_user(); exit_to_user_mode_prepare(regs); instrumentation_end(); - exit_to_user_mode(); + __exit_to_user_mode(); } noinstr void irqentry_enter_from_user_mode(struct pt_regs *regs) @@ -312,7 +312,7 @@ noinstr void irqentry_exit_to_user_mode(struct pt_regs *regs) instrumentation_begin(); exit_to_user_mode_prepare(regs); instrumentation_end(); - exit_to_user_mode(); + __exit_to_user_mode(); } noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs) -- cgit v1.2.3 From 96e2fbccd0fc806364a964fdf072bfc858a66109 Mon Sep 17 00:00:00 2001 From: Sven Schnelle Date: Tue, 1 Dec 2020 15:27:53 +0100 Subject: entry_Add_enter_from_user_mode_wrapper To be called from architecture specific code if the combo interfaces are not suitable. It simply calls __enter_from_user_mode(). This way __enter_from_user_mode will still be inlined because it is declared static __always_inline. [ tglx: Amend comments and move it to a different location in the header ] Signed-off-by: Sven Schnelle Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20201201142755.31931-4-svens@linux.ibm.com --- include/linux/entry-common.h | 24 +++++++++++++++++++++++- kernel/entry/common.c | 16 ++++++---------- 2 files changed, 29 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h index a6e98b4ba8e9..da60980a2e7b 100644 --- a/include/linux/entry-common.h +++ b/include/linux/entry-common.h @@ -101,6 +101,27 @@ static inline __must_check int arch_syscall_enter_tracehook(struct pt_regs *regs } #endif +/** + * enter_from_user_mode - Establish state when coming from user mode + * + * Syscall/interrupt entry disables interrupts, but user mode is traced as + * interrupts enabled. Also with NO_HZ_FULL RCU might be idle. + * + * 1) Tell lockdep that interrupts are disabled + * 2) Invoke context tracking if enabled to reactivate RCU + * 3) Trace interrupts off state + * + * Invoked from architecture specific syscall entry code with interrupts + * disabled. The calling code has to be non-instrumentable. When the + * function returns all state is correct and interrupts are still + * disabled. The subsequent functions can be instrumented. + * + * This is invoked when there is architecture specific functionality to be + * done between establishing state and enabling interrupts. The caller must + * enable interrupts before invoking syscall_enter_from_user_mode_work(). + */ +void enter_from_user_mode(struct pt_regs *regs); + /** * syscall_enter_from_user_mode_prepare - Establish state and enable interrupts * @regs: Pointer to currents pt_regs @@ -110,7 +131,8 @@ static inline __must_check int arch_syscall_enter_tracehook(struct pt_regs *regs * function returns all state is correct, interrupts are enabled and the * subsequent functions can be instrumented. * - * This handles lockdep, RCU (context tracking) and tracing state. + * This handles lockdep, RCU (context tracking) and tracing state, i.e. + * the functionality provided by enter_from_user_mode(). * * This is invoked when there is extra architecture specific functionality * to be done between establishing state and handling user mode entry work. 
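To make the calling convention described in the kernel-doc above concrete, a hypothetical architecture entry path could look roughly like this. The arch-side names (arch_syscall_entry(), arch_flush_lazy_state(), do_syscall()) are invented for the sketch; only the generic entry helpers (enter_from_user_mode(), syscall_enter_from_user_mode_work(), syscall_exit_to_user_mode()) are real.

	__visible noinstr void arch_syscall_entry(struct pt_regs *regs, long nr)
	{
		/* Establish lockdep/RCU/tracing state, interrupts still disabled. */
		enter_from_user_mode(regs);

		arch_flush_lazy_state();	/* the "extra work" slot described above */

		instrumentation_begin();
		local_irq_enable();
		/* ptrace, seccomp, audit, ...; may change or veto the syscall. */
		nr = syscall_enter_from_user_mode_work(regs, nr);
		if (nr != -1L)
			do_syscall(regs, nr);
		instrumentation_end();

		/* Exit work plus the final transition back to user mode. */
		syscall_exit_to_user_mode(regs);
	}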
diff --git a/kernel/entry/common.c b/kernel/entry/common.c index dff07b4ce6ec..17b1e032afe7 100644 --- a/kernel/entry/common.c +++ b/kernel/entry/common.c @@ -10,16 +10,7 @@ #define CREATE_TRACE_POINTS #include -/** - * __enter_from_user_mode - Establish state when coming from user mode - * - * Syscall/interrupt entry disables interrupts, but user mode is traced as - * interrupts enabled. Also with NO_HZ_FULL RCU might be idle. - * - * 1) Tell lockdep that interrupts are disabled - * 2) Invoke context tracking if enabled to reactivate RCU - * 3) Trace interrupts off state - */ +/* See comment for enter_from_user_mode() in entry-common.h */ static __always_inline void __enter_from_user_mode(struct pt_regs *regs) { arch_check_user_regs(regs); @@ -33,6 +24,11 @@ static __always_inline void __enter_from_user_mode(struct pt_regs *regs) instrumentation_end(); } +void noinstr enter_from_user_mode(struct pt_regs *regs) +{ + __enter_from_user_mode(regs); +} + static inline void syscall_enter_audit(struct pt_regs *regs, long syscall) { if (unlikely(audit_context())) { -- cgit v1.2.3 From 310de1a678b2184c078c593dae343cb79c807f8d Mon Sep 17 00:00:00 2001 From: Sven Schnelle Date: Tue, 1 Dec 2020 15:27:54 +0100 Subject: entry: Add exit_to_user_mode() wrapper Called from architecture specific code when syscall_exit_to_user_mode() is not suitable. It simply calls __exit_to_user_mode(). This way __exit_to_user_mode() can still be inlined because it is declared static __always_inline. [ tglx: Amended comments and moved it to a different place in the header ] Signed-off-by: Sven Schnelle Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20201201142755.31931-5-svens@linux.ibm.com --- include/linux/entry-common.h | 23 +++++++++++++++++++++-- kernel/entry/common.c | 18 ++++++------------ 2 files changed, 27 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h index da60980a2e7b..e370be8121aa 100644 --- a/include/linux/entry-common.h +++ b/include/linux/entry-common.h @@ -300,6 +300,25 @@ static inline void arch_syscall_exit_tracehook(struct pt_regs *regs, bool step) } #endif +/** + * exit_to_user_mode - Fixup state when exiting to user mode + * + * Syscall/interrupt exit enables interrupts, but the kernel state is + * interrupts disabled when this is invoked. Also tell RCU about it. + * + * 1) Trace interrupts on state + * 2) Invoke context tracking if enabled to adjust RCU state + * 3) Invoke architecture specific last minute exit code, e.g. speculation + * mitigations, etc.: arch_exit_to_user_mode() + * 4) Tell lockdep that interrupts are enabled + * + * Invoked from architecture specific code when syscall_exit_to_user_mode() + * is not suitable as the last step before returning to userspace. Must be + * invoked with interrupts disabled and the caller must be + * non-instrumentable. + */ +void exit_to_user_mode(void); + /** * syscall_exit_to_user_mode - Handle work before returning to user mode * @regs: Pointer to currents pt_regs @@ -322,8 +341,8 @@ static inline void arch_syscall_exit_tracehook(struct pt_regs *regs, bool step) * - Architecture specific one time work arch_exit_to_user_mode_prepare() * - Address limit and lockdep checks * - * 3) Final transition (lockdep, tracing, context tracking, RCU). Invokes - * arch_exit_to_user_mode() to handle e.g. speculation mitigations + * 3) Final transition (lockdep, tracing, context tracking, RCU), i.e. the + * functionality in exit_to_user_mode(). 
*/ void syscall_exit_to_user_mode(struct pt_regs *regs); diff --git a/kernel/entry/common.c b/kernel/entry/common.c index 17b1e032afe7..48d30ce2e00e 100644 --- a/kernel/entry/common.c +++ b/kernel/entry/common.c @@ -117,18 +117,7 @@ noinstr void syscall_enter_from_user_mode_prepare(struct pt_regs *regs) instrumentation_end(); } -/** - * __exit_to_user_mode - Fixup state when exiting to user mode - * - * Syscall/interupt exit enables interrupts, but the kernel state is - * interrupts disabled when this is invoked. Also tell RCU about it. - * - * 1) Trace interrupts on state - * 2) Invoke context tracking if enabled to adjust RCU state - * 3) Invoke architecture specific last minute exit code, e.g. speculation - * mitigations, etc. - * 4) Tell lockdep that interrupts are enabled - */ +/* See comment for exit_to_user_mode() in entry-common.h */ static __always_inline void __exit_to_user_mode(void) { instrumentation_begin(); @@ -141,6 +130,11 @@ static __always_inline void __exit_to_user_mode(void) lockdep_hardirqs_on(CALLER_ADDR0); } +void noinstr exit_to_user_mode(void) +{ + __exit_to_user_mode(); +} + /* Workaround to allow gradual conversion of architecture code */ void __weak arch_do_signal_or_restart(struct pt_regs *regs, bool has_signal) { } -- cgit v1.2.3 From c6156e1da633f241e132eaea3b676d674376d770 Mon Sep 17 00:00:00 2001 From: Sven Schnelle Date: Tue, 1 Dec 2020 15:27:55 +0100 Subject: entry: Add syscall_exit_to_user_mode_work() This is the same as syscall_exit_to_user_mode() but without calling exit_to_user_mode(). This can be used if there is an architectural reason to avoid the combo function, e.g. restarting a syscall without returning to userspace. Before returning to user space the caller has to invoke exit_to_user_mode(). [ tglx: Amended comments ] Signed-off-by: Sven Schnelle Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20201201142755.31931-6-svens@linux.ibm.com --- include/linux/entry-common.h | 20 ++++++++++++++++++++ kernel/entry/common.c | 14 ++++++++++++-- 2 files changed, 32 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h index e370be8121aa..7c581a4c3797 100644 --- a/include/linux/entry-common.h +++ b/include/linux/entry-common.h @@ -316,9 +316,25 @@ static inline void arch_syscall_exit_tracehook(struct pt_regs *regs, bool step) * is not suitable as the last step before returning to userspace. Must be * invoked with interrupts disabled and the caller must be * non-instrumentable. + * The caller has to invoke syscall_exit_to_user_mode_work() before this. */ void exit_to_user_mode(void); +/** + * syscall_exit_to_user_mode_work - Handle work before returning to user mode + * @regs: Pointer to currents pt_regs + * + * Same as step 1 and 2 of syscall_exit_to_user_mode() but without calling + * exit_to_user_mode() to perform the final transition to user mode. + * + * Calling convention is the same as for syscall_exit_to_user_mode() and it + * returns with all work handled and interrupts disabled. The caller must + * invoke exit_to_user_mode() before actually switching to user mode to + * make the final state transitions. Interrupts must stay disabled between + * return from this function and the invocation of exit_to_user_mode(). 
+ */ +void syscall_exit_to_user_mode_work(struct pt_regs *regs); + /** * syscall_exit_to_user_mode - Handle work before returning to user mode * @regs: Pointer to currents pt_regs @@ -343,6 +359,10 @@ void exit_to_user_mode(void); * * 3) Final transition (lockdep, tracing, context tracking, RCU), i.e. the * functionality in exit_to_user_mode(). + * + * This is a combination of syscall_exit_to_user_mode_work() (1,2) and + * exit_to_user_mode(). This function is preferred unless there is a + * compelling architectural reason to use the seperate functions. */ void syscall_exit_to_user_mode(struct pt_regs *regs); diff --git a/kernel/entry/common.c b/kernel/entry/common.c index 48d30ce2e00e..d6b73937dab3 100644 --- a/kernel/entry/common.c +++ b/kernel/entry/common.c @@ -282,12 +282,22 @@ static void syscall_exit_to_user_mode_prepare(struct pt_regs *regs) syscall_exit_work(regs, work); } -__visible noinstr void syscall_exit_to_user_mode(struct pt_regs *regs) +static __always_inline void __syscall_exit_to_user_mode_work(struct pt_regs *regs) { - instrumentation_begin(); syscall_exit_to_user_mode_prepare(regs); local_irq_disable_exit_to_user(); exit_to_user_mode_prepare(regs); +} + +void syscall_exit_to_user_mode_work(struct pt_regs *regs) +{ + __syscall_exit_to_user_mode_work(regs); +} + +__visible noinstr void syscall_exit_to_user_mode(struct pt_regs *regs) +{ + instrumentation_begin(); + __syscall_exit_to_user_mode_work(regs); instrumentation_end(); __exit_to_user_mode(); } -- cgit v1.2.3 From 7197688b2006357da75a014e0a76be89ca9c2d46 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 2 Dec 2020 12:57:28 +0100 Subject: sched/cputime: Remove symbol exports from IRQ time accounting account_irq_enter_time() and account_irq_exit_time() are not called from modules. EXPORT_SYMBOL_GPL() can be safely removed from the IRQ cputime accounting functions called from there. Signed-off-by: Frederic Weisbecker Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20201202115732.27827-2-frederic@kernel.org --- arch/s390/kernel/vtime.c | 10 +++++----- kernel/sched/cputime.c | 2 -- 2 files changed, 5 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/arch/s390/kernel/vtime.c b/arch/s390/kernel/vtime.c index 8df10d3c8f6c..f9f2a11958a5 100644 --- a/arch/s390/kernel/vtime.c +++ b/arch/s390/kernel/vtime.c @@ -226,7 +226,7 @@ void vtime_flush(struct task_struct *tsk) * Update process times based on virtual cpu times stored by entry.S * to the lowcore fields user_timer, system_timer & steal_clock. */ -void vtime_account_irq_enter(struct task_struct *tsk) +void vtime_account_kernel(struct task_struct *tsk) { u64 timer; @@ -245,12 +245,12 @@ void vtime_account_irq_enter(struct task_struct *tsk) virt_timer_forward(timer); } -EXPORT_SYMBOL_GPL(vtime_account_irq_enter); - -void vtime_account_kernel(struct task_struct *tsk) -__attribute__((alias("vtime_account_irq_enter"))); EXPORT_SYMBOL_GPL(vtime_account_kernel); +void vtime_account_irq_enter(struct task_struct *tsk) +__attribute__((alias("vtime_account_kernel"))); + + /* * Sorted add to a list. List is linear searched until first bigger * element is found. 
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 5a55d2300452..61ce9f9bf0a3 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -71,7 +71,6 @@ void irqtime_account_irq(struct task_struct *curr) else if (in_serving_softirq() && curr != this_cpu_ksoftirqd()) irqtime_account_delta(irqtime, delta, CPUTIME_SOFTIRQ); } -EXPORT_SYMBOL_GPL(irqtime_account_irq); static u64 irqtime_tick_accounted(u64 maxtime) { @@ -434,7 +433,6 @@ void vtime_account_irq_enter(struct task_struct *tsk) else vtime_account_kernel(tsk); } -EXPORT_SYMBOL_GPL(vtime_account_irq_enter); #endif /* __ARCH_HAS_VTIME_ACCOUNT */ void cputime_adjust(struct task_cputime *curr, struct prev_cputime *prev, -- cgit v1.2.3 From 2b91ec9f551b56751cde48792f1c0a1130358844 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 2 Dec 2020 12:57:29 +0100 Subject: s390/vtime: Use the generic IRQ entry accounting s390 has its own version of IRQ entry accounting because it doesn't account the idle time the same way the other architectures do. Only the actual idle sleep time is accounted as idle time, the rest of the idle task execution is accounted as system time. Make the generic IRQ entry accounting aware of architectures that have their own way of accounting idle time and convert s390 to use it. This prepares s390 to get involved in further consolidations of IRQ time accounting. Signed-off-by: Frederic Weisbecker Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20201202115732.27827-3-frederic@kernel.org --- arch/Kconfig | 7 ++++++- arch/s390/Kconfig | 1 + arch/s390/include/asm/vtime.h | 1 - arch/s390/kernel/vtime.c | 4 ---- kernel/sched/cputime.c | 13 ++----------- 5 files changed, 9 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/arch/Kconfig b/arch/Kconfig index 56b6ccc0e32d..0f151b49c7b7 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -627,6 +627,12 @@ config HAVE_TIF_NOHZ config HAVE_VIRT_CPU_ACCOUNTING bool +config HAVE_VIRT_CPU_ACCOUNTING_IDLE + bool + help + Architecture has its own way to account idle CPU time and therefore + doesn't implement vtime_account_idle(). + config ARCH_HAS_SCALED_CPUTIME bool @@ -641,7 +647,6 @@ config HAVE_VIRT_CPU_ACCOUNTING_GEN some 32-bit arches may require multiple accesses, so proper locking is needed to protect against concurrent accesses. - config HAVE_IRQ_TIME_ACCOUNTING bool help diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index 4a2a12be04c9..6f1fdcd3b5db 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -181,6 +181,7 @@ config S390 select HAVE_RSEQ select HAVE_SYSCALL_TRACEPOINTS select HAVE_VIRT_CPU_ACCOUNTING + select HAVE_VIRT_CPU_ACCOUNTING_IDLE select IOMMU_HELPER if PCI select IOMMU_SUPPORT if PCI select MODULES_USE_ELF_RELA diff --git a/arch/s390/include/asm/vtime.h b/arch/s390/include/asm/vtime.h index 3622d4ebc73a..fac6a67988eb 100644 --- a/arch/s390/include/asm/vtime.h +++ b/arch/s390/include/asm/vtime.h @@ -2,7 +2,6 @@ #ifndef _S390_VTIME_H #define _S390_VTIME_H -#define __ARCH_HAS_VTIME_ACCOUNT #define __ARCH_HAS_VTIME_TASK_SWITCH #endif /* _S390_VTIME_H */ diff --git a/arch/s390/kernel/vtime.c b/arch/s390/kernel/vtime.c index f9f2a11958a5..ebd8e5655789 100644 --- a/arch/s390/kernel/vtime.c +++ b/arch/s390/kernel/vtime.c @@ -247,10 +247,6 @@ void vtime_account_kernel(struct task_struct *tsk) } EXPORT_SYMBOL_GPL(vtime_account_kernel); -void vtime_account_irq_enter(struct task_struct *tsk) -__attribute__((alias("vtime_account_kernel"))); - - /* * Sorted add to a list. 
List is linear searched until first bigger * element is found. diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 61ce9f9bf0a3..2783162542b1 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -417,23 +417,14 @@ void vtime_task_switch(struct task_struct *prev) } # endif -/* - * Archs that account the whole time spent in the idle task - * (outside irq) as idle time can rely on this and just implement - * vtime_account_kernel() and vtime_account_idle(). Archs that - * have other meaning of the idle time (s390 only includes the - * time spent by the CPU when it's in low power mode) must override - * vtime_account(). - */ -#ifndef __ARCH_HAS_VTIME_ACCOUNT void vtime_account_irq_enter(struct task_struct *tsk) { - if (!in_interrupt() && is_idle_task(tsk)) + if (!IS_ENABLED(CONFIG_HAVE_VIRT_CPU_ACCOUNTING_IDLE) && + !in_interrupt() && is_idle_task(tsk)) vtime_account_idle(tsk); else vtime_account_kernel(tsk); } -#endif /* __ARCH_HAS_VTIME_ACCOUNT */ void cputime_adjust(struct task_cputime *curr, struct prev_cputime *prev, u64 *ut, u64 *st) -- cgit v1.2.3 From 8a6a5920d3286eb0eae9f36a4ec4fc9df511eccb Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 2 Dec 2020 12:57:30 +0100 Subject: sched/vtime: Consolidate IRQ time accounting The 3 architectures implementing CONFIG_VIRT_CPU_ACCOUNTING_NATIVE all have their own version of irq time accounting that dispatch the cputime to the appropriate index: hardirq, softirq, system, idle, guest... from an all-in-one function. Instead of having these ad-hoc versions, move the cputime destination dispatch decision to the core code and leave only the actual per-index cputime accounting to the architecture. Signed-off-by: Frederic Weisbecker Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20201202115732.27827-4-frederic@kernel.org --- arch/ia64/kernel/time.c | 20 ++++++++++++----- arch/powerpc/kernel/time.c | 56 +++++++++++++++++++++++++++++++++------------- arch/s390/kernel/vtime.c | 45 ++++++++++++++++++++++++++----------- include/linux/vtime.h | 16 +++++-------- kernel/sched/cputime.c | 13 +++++++---- 5 files changed, 102 insertions(+), 48 deletions(-) (limited to 'kernel') diff --git a/arch/ia64/kernel/time.c b/arch/ia64/kernel/time.c index 7abc5f37bfaf..733e0e3324b8 100644 --- a/arch/ia64/kernel/time.c +++ b/arch/ia64/kernel/time.c @@ -138,12 +138,8 @@ void vtime_account_kernel(struct task_struct *tsk) struct thread_info *ti = task_thread_info(tsk); __u64 stime = vtime_delta(tsk); - if ((tsk->flags & PF_VCPU) && !irq_count()) + if (tsk->flags & PF_VCPU) ti->gtime += stime; - else if (hardirq_count()) - ti->hardirq_time += stime; - else if (in_serving_softirq()) - ti->softirq_time += stime; else ti->stime += stime; } @@ -156,6 +152,20 @@ void vtime_account_idle(struct task_struct *tsk) ti->idle_time += vtime_delta(tsk); } +void vtime_account_softirq(struct task_struct *tsk) +{ + struct thread_info *ti = task_thread_info(tsk); + + ti->softirq_time += vtime_delta(tsk); +} + +void vtime_account_hardirq(struct task_struct *tsk) +{ + struct thread_info *ti = task_thread_info(tsk); + + ti->hardirq_time += vtime_delta(tsk); +} + #endif /* CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */ static irqreturn_t diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c index 74efe46f5532..cf3f8db7e0e3 100644 --- a/arch/powerpc/kernel/time.c +++ b/arch/powerpc/kernel/time.c @@ -311,12 +311,11 @@ static unsigned long vtime_delta_scaled(struct cpu_accounting_data *acct, return stime_scaled; } -static unsigned long 
vtime_delta(struct task_struct *tsk, +static unsigned long vtime_delta(struct cpu_accounting_data *acct, unsigned long *stime_scaled, unsigned long *steal_time) { unsigned long now, stime; - struct cpu_accounting_data *acct = get_accounting(tsk); WARN_ON_ONCE(!irqs_disabled()); @@ -331,29 +330,30 @@ static unsigned long vtime_delta(struct task_struct *tsk, return stime; } +static void vtime_delta_kernel(struct cpu_accounting_data *acct, + unsigned long *stime, unsigned long *stime_scaled) +{ + unsigned long steal_time; + + *stime = vtime_delta(acct, stime_scaled, &steal_time); + *stime -= min(*stime, steal_time); + acct->steal_time += steal_time; +} + void vtime_account_kernel(struct task_struct *tsk) { - unsigned long stime, stime_scaled, steal_time; struct cpu_accounting_data *acct = get_accounting(tsk); + unsigned long stime, stime_scaled; - stime = vtime_delta(tsk, &stime_scaled, &steal_time); - - stime -= min(stime, steal_time); - acct->steal_time += steal_time; + vtime_delta_kernel(acct, &stime, &stime_scaled); - if ((tsk->flags & PF_VCPU) && !irq_count()) { + if (tsk->flags & PF_VCPU) { acct->gtime += stime; #ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME acct->utime_scaled += stime_scaled; #endif } else { - if (hardirq_count()) - acct->hardirq_time += stime; - else if (in_serving_softirq()) - acct->softirq_time += stime; - else - acct->stime += stime; - + acct->stime += stime; #ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME acct->stime_scaled += stime_scaled; #endif @@ -366,10 +366,34 @@ void vtime_account_idle(struct task_struct *tsk) unsigned long stime, stime_scaled, steal_time; struct cpu_accounting_data *acct = get_accounting(tsk); - stime = vtime_delta(tsk, &stime_scaled, &steal_time); + stime = vtime_delta(acct, &stime_scaled, &steal_time); acct->idle_time += stime + steal_time; } +static void vtime_account_irq_field(struct cpu_accounting_data *acct, + unsigned long *field) +{ + unsigned long stime, stime_scaled; + + vtime_delta_kernel(acct, &stime, &stime_scaled); + *field += stime; +#ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME + acct->stime_scaled += stime_scaled; +#endif +} + +void vtime_account_softirq(struct task_struct *tsk) +{ + struct cpu_accounting_data *acct = get_accounting(tsk); + vtime_account_irq_field(acct, &acct->softirq_time); +} + +void vtime_account_hardirq(struct task_struct *tsk) +{ + struct cpu_accounting_data *acct = get_accounting(tsk); + vtime_account_irq_field(acct, &acct->hardirq_time); +} + static void vtime_flush_scaled(struct task_struct *tsk, struct cpu_accounting_data *acct) { diff --git a/arch/s390/kernel/vtime.c b/arch/s390/kernel/vtime.c index ebd8e5655789..5aaa2ca6a928 100644 --- a/arch/s390/kernel/vtime.c +++ b/arch/s390/kernel/vtime.c @@ -222,31 +222,50 @@ void vtime_flush(struct task_struct *tsk) S390_lowcore.avg_steal_timer = avg_steal; } +static u64 vtime_delta(void) +{ + u64 timer = S390_lowcore.last_update_timer; + + S390_lowcore.last_update_timer = get_vtimer(); + + return timer - S390_lowcore.last_update_timer; +} + /* * Update process times based on virtual cpu times stored by entry.S * to the lowcore fields user_timer, system_timer & steal_clock. 
*/ void vtime_account_kernel(struct task_struct *tsk) { - u64 timer; - - timer = S390_lowcore.last_update_timer; - S390_lowcore.last_update_timer = get_vtimer(); - timer -= S390_lowcore.last_update_timer; + u64 delta = vtime_delta(); - if ((tsk->flags & PF_VCPU) && (irq_count() == 0)) - S390_lowcore.guest_timer += timer; - else if (hardirq_count()) - S390_lowcore.hardirq_timer += timer; - else if (in_serving_softirq()) - S390_lowcore.softirq_timer += timer; + if (tsk->flags & PF_VCPU) + S390_lowcore.guest_timer += delta; else - S390_lowcore.system_timer += timer; + S390_lowcore.system_timer += delta; - virt_timer_forward(timer); + virt_timer_forward(delta); } EXPORT_SYMBOL_GPL(vtime_account_kernel); +void vtime_account_softirq(struct task_struct *tsk) +{ + u64 delta = vtime_delta(); + + S390_lowcore.softirq_timer += delta; + + virt_timer_forward(delta); +} + +void vtime_account_hardirq(struct task_struct *tsk) +{ + u64 delta = vtime_delta(); + + S390_lowcore.hardirq_timer += delta; + + virt_timer_forward(delta); +} + /* * Sorted add to a list. List is linear searched until first bigger * element is found. diff --git a/include/linux/vtime.h b/include/linux/vtime.h index 2cdeca062db3..6c9867419615 100644 --- a/include/linux/vtime.h +++ b/include/linux/vtime.h @@ -83,16 +83,12 @@ static inline void vtime_init_idle(struct task_struct *tsk, int cpu) { } #endif #ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE -extern void vtime_account_irq_enter(struct task_struct *tsk); -static inline void vtime_account_irq_exit(struct task_struct *tsk) -{ - /* On hard|softirq exit we always account to hard|softirq cputime */ - vtime_account_kernel(tsk); -} +extern void vtime_account_irq(struct task_struct *tsk); +extern void vtime_account_softirq(struct task_struct *tsk); +extern void vtime_account_hardirq(struct task_struct *tsk); extern void vtime_flush(struct task_struct *tsk); #else /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */ -static inline void vtime_account_irq_enter(struct task_struct *tsk) { } -static inline void vtime_account_irq_exit(struct task_struct *tsk) { } +static inline void vtime_account_irq(struct task_struct *tsk) { } static inline void vtime_flush(struct task_struct *tsk) { } #endif @@ -105,13 +101,13 @@ static inline void irqtime_account_irq(struct task_struct *tsk) { } static inline void account_irq_enter_time(struct task_struct *tsk) { - vtime_account_irq_enter(tsk); + vtime_account_irq(tsk); irqtime_account_irq(tsk); } static inline void account_irq_exit_time(struct task_struct *tsk) { - vtime_account_irq_exit(tsk); + vtime_account_irq(tsk); irqtime_account_irq(tsk); } diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 2783162542b1..02163d4260d7 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -417,13 +417,18 @@ void vtime_task_switch(struct task_struct *prev) } # endif -void vtime_account_irq_enter(struct task_struct *tsk) +void vtime_account_irq(struct task_struct *tsk) { - if (!IS_ENABLED(CONFIG_HAVE_VIRT_CPU_ACCOUNTING_IDLE) && - !in_interrupt() && is_idle_task(tsk)) + if (hardirq_count()) { + vtime_account_hardirq(tsk); + } else if (in_serving_softirq()) { + vtime_account_softirq(tsk); + } else if (!IS_ENABLED(CONFIG_HAVE_VIRT_CPU_ACCOUNTING_IDLE) && + is_idle_task(tsk)) { vtime_account_idle(tsk); - else + } else { vtime_account_kernel(tsk); + } } void cputime_adjust(struct task_cputime *curr, struct prev_cputime *prev, -- cgit v1.2.3 From d3759e7184f8f6187e62f8c4e7dcb1f6c47c075a Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 2 Dec 2020 
12:57:31 +0100 Subject: irqtime: Move irqtime entry accounting after irq offset incrementation IRQ time entry is currently accounted before HARDIRQ_OFFSET or SOFTIRQ_OFFSET are incremented. This is convenient to decide to which index the cputime to account is dispatched. Unfortunately it prevents tick_irq_enter() from being called under HARDIRQ_OFFSET because tick_irq_enter() has to be called before the IRQ entry accounting due to the necessary clock catch up. As a result we don't benefit from appropriate lockdep coverage on tick_irq_enter(). To prepare for fixing this, move the IRQ entry cputime accounting after the preempt offset is incremented. This requires the cputime dispatch code to handle the extra offset. Signed-off-by: Frederic Weisbecker Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20201202115732.27827-5-frederic@kernel.org --- include/linux/hardirq.h | 4 ++-- include/linux/vtime.h | 34 ++++++++++++++++++++++++---------- kernel/sched/cputime.c | 18 +++++++++++------- kernel/softirq.c | 6 +++--- 4 files changed, 40 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h index 754f67ac4326..7c9d6a2d7e90 100644 --- a/include/linux/hardirq.h +++ b/include/linux/hardirq.h @@ -32,9 +32,9 @@ static __always_inline void rcu_irq_enter_check_tick(void) */ #define __irq_enter() \ do { \ - account_irq_enter_time(current); \ preempt_count_add(HARDIRQ_OFFSET); \ lockdep_hardirq_enter(); \ + account_hardirq_enter(current); \ } while (0) /* @@ -62,8 +62,8 @@ void irq_enter_rcu(void); */ #define __irq_exit() \ do { \ + account_hardirq_exit(current); \ lockdep_hardirq_exit(); \ - account_irq_exit_time(current); \ preempt_count_sub(HARDIRQ_OFFSET); \ } while (0) diff --git a/include/linux/vtime.h b/include/linux/vtime.h index 6c9867419615..041d6524d144 100644 --- a/include/linux/vtime.h +++ b/include/linux/vtime.h @@ -83,32 +83,46 @@ static inline void vtime_init_idle(struct task_struct *tsk, int cpu) { } #endif #ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE -extern void vtime_account_irq(struct task_struct *tsk); +extern void vtime_account_irq(struct task_struct *tsk, unsigned int offset); extern void vtime_account_softirq(struct task_struct *tsk); extern void vtime_account_hardirq(struct task_struct *tsk); extern void vtime_flush(struct task_struct *tsk); #else /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */ -static inline void vtime_account_irq(struct task_struct *tsk) { } +static inline void vtime_account_irq(struct task_struct *tsk, unsigned int offset) { } +static inline void vtime_account_softirq(struct task_struct *tsk) { } +static inline void vtime_account_hardirq(struct task_struct *tsk) { } static inline void vtime_flush(struct task_struct *tsk) { } #endif #ifdef CONFIG_IRQ_TIME_ACCOUNTING -extern void irqtime_account_irq(struct task_struct *tsk); +extern void irqtime_account_irq(struct task_struct *tsk, unsigned int offset); #else -static inline void irqtime_account_irq(struct task_struct *tsk) { } +static inline void irqtime_account_irq(struct task_struct *tsk, unsigned int offset) { } #endif -static inline void account_irq_enter_time(struct task_struct *tsk) +static inline void account_softirq_enter(struct task_struct *tsk) { - vtime_account_irq(tsk); - irqtime_account_irq(tsk); + vtime_account_irq(tsk, SOFTIRQ_OFFSET); + irqtime_account_irq(tsk, SOFTIRQ_OFFSET); } -static inline void account_irq_exit_time(struct task_struct *tsk) +static inline void account_softirq_exit(struct 
task_struct *tsk) { - vtime_account_irq(tsk); - irqtime_account_irq(tsk); + vtime_account_softirq(tsk); + irqtime_account_irq(tsk, 0); +} + +static inline void account_hardirq_enter(struct task_struct *tsk) +{ + vtime_account_irq(tsk, HARDIRQ_OFFSET); + irqtime_account_irq(tsk, HARDIRQ_OFFSET); +} + +static inline void account_hardirq_exit(struct task_struct *tsk) +{ + vtime_account_hardirq(tsk); + irqtime_account_irq(tsk, 0); } #endif /* _LINUX_KERNEL_VTIME_H */ diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 02163d4260d7..5f611658eeab 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -44,12 +44,13 @@ static void irqtime_account_delta(struct irqtime *irqtime, u64 delta, } /* - * Called before incrementing preempt_count on {soft,}irq_enter + * Called after incrementing preempt_count on {soft,}irq_enter * and before decrementing preempt_count on {soft,}irq_exit. */ -void irqtime_account_irq(struct task_struct *curr) +void irqtime_account_irq(struct task_struct *curr, unsigned int offset) { struct irqtime *irqtime = this_cpu_ptr(&cpu_irqtime); + unsigned int pc; s64 delta; int cpu; @@ -59,6 +60,7 @@ void irqtime_account_irq(struct task_struct *curr) cpu = smp_processor_id(); delta = sched_clock_cpu(cpu) - irqtime->irq_start_time; irqtime->irq_start_time += delta; + pc = preempt_count() - offset; /* * We do not account for softirq time from ksoftirqd here. @@ -66,9 +68,9 @@ void irqtime_account_irq(struct task_struct *curr) * in that case, so as not to confuse scheduler with a special task * that do not consume any time, but still wants to run. */ - if (hardirq_count()) + if (pc & HARDIRQ_MASK) irqtime_account_delta(irqtime, delta, CPUTIME_IRQ); - else if (in_serving_softirq() && curr != this_cpu_ksoftirqd()) + else if ((pc & SOFTIRQ_OFFSET) && curr != this_cpu_ksoftirqd()) irqtime_account_delta(irqtime, delta, CPUTIME_SOFTIRQ); } @@ -417,11 +419,13 @@ void vtime_task_switch(struct task_struct *prev) } # endif -void vtime_account_irq(struct task_struct *tsk) +void vtime_account_irq(struct task_struct *tsk, unsigned int offset) { - if (hardirq_count()) { + unsigned int pc = preempt_count() - offset; + + if (pc & HARDIRQ_OFFSET) { vtime_account_hardirq(tsk); - } else if (in_serving_softirq()) { + } else if (pc & SOFTIRQ_OFFSET) { vtime_account_softirq(tsk); } else if (!IS_ENABLED(CONFIG_HAVE_VIRT_CPU_ACCOUNTING_IDLE) && is_idle_task(tsk)) { diff --git a/kernel/softirq.c b/kernel/softirq.c index 617009ccd82c..b8f42b3ba8ca 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -315,10 +315,10 @@ asmlinkage __visible void __softirq_entry __do_softirq(void) current->flags &= ~PF_MEMALLOC; pending = local_softirq_pending(); - account_irq_enter_time(current); __local_bh_disable_ip(_RET_IP_, SOFTIRQ_OFFSET); in_hardirq = lockdep_softirq_start(); + account_softirq_enter(current); restart: /* Reset the pending bitmask before enabling irqs */ @@ -365,8 +365,8 @@ restart: wakeup_softirqd(); } + account_softirq_exit(current); lockdep_softirq_end(in_hardirq); - account_irq_exit_time(current); __local_bh_enable(SOFTIRQ_OFFSET); WARN_ON_ONCE(in_interrupt()); current_restore_flags(old_flags, PF_MEMALLOC); @@ -418,7 +418,7 @@ static inline void __irq_exit_rcu(void) #else lockdep_assert_irqs_disabled(); #endif - account_irq_exit_time(current); + account_hardirq_exit(current); preempt_count_sub(HARDIRQ_OFFSET); if (!in_interrupt() && local_softirq_pending()) invoke_softirq(); -- cgit v1.2.3 From d14ce74f1fb376ccbbc0b05ded477ada51253729 Mon Sep 17 00:00:00 2001 From: Frederic 
Weisbecker Date: Wed, 2 Dec 2020 12:57:32 +0100 Subject: irq: Call tick_irq_enter() inside HARDIRQ_OFFSET Now that account_hardirq_enter() is called after HARDIRQ_OFFSET has been incremented, there is nothing left that prevents us from also moving tick_irq_enter() after HARDIRQ_OFFSET is incremented. The desired outcome is to remove the nasty hack that prevents softirqs from being raised through ksoftirqd instead of the hardirq bottom half. Also tick_irq_enter() then becomes appropriately covered by lockdep. Signed-off-by: Frederic Weisbecker Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20201202115732.27827-6-frederic@kernel.org --- kernel/softirq.c | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/softirq.c b/kernel/softirq.c index b8f42b3ba8ca..d5bfd5e661fc 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -377,16 +377,12 @@ restart: */ void irq_enter_rcu(void) { - if (is_idle_task(current) && !in_interrupt()) { - /* - * Prevent raise_softirq from needlessly waking up ksoftirqd - * here, as softirq will be serviced on return from interrupt. - */ - local_bh_disable(); + __irq_enter_raw(); + + if (is_idle_task(current) && (irq_count() == HARDIRQ_OFFSET)) tick_irq_enter(); - _local_bh_enable(); - } - __irq_enter(); + + account_hardirq_enter(current); } /** -- cgit v1.2.3 From bcfe06bf2622f7c4899468e427683aec49070687 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Tue, 1 Dec 2020 13:58:27 -0800 Subject: mm: memcontrol: Use helpers to read page's memcg data Patch series "mm: allow mapping accounted kernel pages to userspace", v6. Currently a non-slab kernel page which has been charged to a memory cgroup can't be mapped to userspace. The underlying reason is simple: PageKmemcg flag is defined as a page type (like buddy, offline, etc), so it takes a bit from a page->mapped counter. Pages with a type set can't be mapped to userspace. But in general the kmemcg flag has nothing to do with mapping to userspace. It only means that the page has been accounted by the page allocator, so it has to be properly uncharged on release. Some bpf maps are mapping the vmalloc-based memory to userspace, and their memory can't be accounted because of this implementation detail. This patchset removes this limitation by moving the PageKmemcg flag into one of the free bits of the page->mem_cgroup pointer. Also it formalizes accesses to the page->mem_cgroup and page->obj_cgroups using new helpers, adds several checks and removes a couple of obsolete functions. As the result the code became more robust with fewer open-coded bit tricks. This patch (of 4): Currently there are many open-coded reads of the page->mem_cgroup pointer, as well as a couple of read helpers, which are barely used. It creates an obstacle on a way to reuse some bits of the pointer for storing additional bits of information. In fact, we already do this for slab pages, where the last bit indicates that a pointer has an attached vector of objcg pointers instead of a regular memcg pointer. This commits uses 2 existing helpers and introduces a new helper to converts all read sides to calls of these helpers: struct mem_cgroup *page_memcg(struct page *page); struct mem_cgroup *page_memcg_rcu(struct page *page); struct mem_cgroup *page_memcg_check(struct page *page); page_memcg_check() is intended to be used in cases when the page can be a slab page and have a memcg pointer pointing at objcg vector. It does check the lowest bit, and if set, returns NULL. 
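A minimal caller sketch, for illustration only and not taken from this patch (the helper bodies are in the diff below): code that cannot rule out slab or ex-slab pages has to go through the checked accessor.

    /* Hypothetical caller, illustration only. */
    static bool page_charged_to(struct page *page, struct mem_cgroup *memcg)
    {
            /* The page may be a slab page: page_memcg_check() returns NULL
             * when the lowest bit marks an obj_cgroups vector. */
            return page_memcg_check(page) == memcg;
    }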
page_memcg() contains a VM_BUG_ON_PAGE() check for the page not being a slab page. To make sure nobody uses a direct access, struct page's mem_cgroup/obj_cgroups is converted to unsigned long memcg_data. Signed-off-by: Roman Gushchin Signed-off-by: Andrew Morton Signed-off-by: Alexei Starovoitov Reviewed-by: Shakeel Butt Acked-by: Johannes Weiner Acked-by: Michal Hocko Link: https://lkml.kernel.org/r/20201027001657.3398190-1-guro@fb.com Link: https://lkml.kernel.org/r/20201027001657.3398190-2-guro@fb.com Link: https://lore.kernel.org/bpf/20201201215900.3569844-2-guro@fb.com --- fs/buffer.c | 2 +- fs/iomap/buffered-io.c | 2 +- include/linux/memcontrol.h | 114 +++++++++++++++++++++++++++++++++--- include/linux/mm.h | 22 ------- include/linux/mm_types.h | 5 +- include/trace/events/writeback.h | 2 +- kernel/fork.c | 7 ++- mm/debug.c | 4 +- mm/huge_memory.c | 4 +- mm/memcontrol.c | 121 ++++++++++++++++++--------------------- mm/page_alloc.c | 4 +- mm/page_io.c | 6 +- mm/slab.h | 9 ++- mm/workingset.c | 2 +- 14 files changed, 184 insertions(+), 120 deletions(-) (limited to 'kernel') diff --git a/fs/buffer.c b/fs/buffer.c index 23f645657488..b56f99f82b5b 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -657,7 +657,7 @@ int __set_page_dirty_buffers(struct page *page) } while (bh != head); } /* - * Lock out page->mem_cgroup migration to keep PageDirty + * Lock out page's memcg migration to keep PageDirty * synchronized with per-memcg dirty page counters. */ lock_page_memcg(page); diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 10cc7979ce38..16a1e82e3aeb 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -650,7 +650,7 @@ iomap_set_page_dirty(struct page *page) return !TestSetPageDirty(page); /* - * Lock out page->mem_cgroup migration to keep PageDirty + * Lock out page's memcg migration to keep PageDirty * synchronized with per-memcg dirty page counters. */ lock_page_memcg(page); diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index e391e3c56de5..f95c1433461c 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -343,6 +343,79 @@ struct mem_cgroup { extern struct mem_cgroup *root_mem_cgroup; +/* + * page_memcg - get the memory cgroup associated with a page + * @page: a pointer to the page struct + * + * Returns a pointer to the memory cgroup associated with the page, + * or NULL. This function assumes that the page is known to have a + * proper memory cgroup pointer. It's not safe to call this function + * against some type of pages, e.g. slab pages or ex-slab pages. + * + * Any of the following ensures page and memcg binding stability: + * - the page lock + * - LRU isolation + * - lock_page_memcg() + * - exclusive reference + */ +static inline struct mem_cgroup *page_memcg(struct page *page) +{ + VM_BUG_ON_PAGE(PageSlab(page), page); + return (struct mem_cgroup *)page->memcg_data; +} + +/* + * page_memcg_rcu - locklessly get the memory cgroup associated with a page + * @page: a pointer to the page struct + * + * Returns a pointer to the memory cgroup associated with the page, + * or NULL. This function assumes that the page is known to have a + * proper memory cgroup pointer. It's not safe to call this function + * against some type of pages, e.g. slab pages or ex-slab pages. 
+ */ +static inline struct mem_cgroup *page_memcg_rcu(struct page *page) +{ + VM_BUG_ON_PAGE(PageSlab(page), page); + WARN_ON_ONCE(!rcu_read_lock_held()); + + return (struct mem_cgroup *)READ_ONCE(page->memcg_data); +} + +/* + * page_memcg_check - get the memory cgroup associated with a page + * @page: a pointer to the page struct + * + * Returns a pointer to the memory cgroup associated with the page, + * or NULL. This function unlike page_memcg() can take any page + * as an argument. It has to be used in cases when it's not known if a page + * has an associated memory cgroup pointer or an object cgroups vector. + * + * Any of the following ensures page and memcg binding stability: + * - the page lock + * - LRU isolation + * - lock_page_memcg() + * - exclusive reference + */ +static inline struct mem_cgroup *page_memcg_check(struct page *page) +{ + /* + * Because page->memcg_data might be changed asynchronously + * for slab pages, READ_ONCE() should be used here. + */ + unsigned long memcg_data = READ_ONCE(page->memcg_data); + + /* + * The lowest bit set means that memcg isn't a valid + * memcg pointer, but a obj_cgroups pointer. + * In this case the page is shared and doesn't belong + * to any specific memory cgroup. + */ + if (memcg_data & 0x1UL) + return NULL; + + return (struct mem_cgroup *)memcg_data; +} + static __always_inline bool memcg_stat_item_in_bytes(int idx) { if (idx == MEMCG_PERCPU_B) @@ -743,15 +816,19 @@ static inline void mod_memcg_state(struct mem_cgroup *memcg, static inline void __mod_memcg_page_state(struct page *page, int idx, int val) { - if (page->mem_cgroup) - __mod_memcg_state(page->mem_cgroup, idx, val); + struct mem_cgroup *memcg = page_memcg(page); + + if (memcg) + __mod_memcg_state(memcg, idx, val); } static inline void mod_memcg_page_state(struct page *page, int idx, int val) { - if (page->mem_cgroup) - mod_memcg_state(page->mem_cgroup, idx, val); + struct mem_cgroup *memcg = page_memcg(page); + + if (memcg) + mod_memcg_state(memcg, idx, val); } static inline unsigned long lruvec_page_state(struct lruvec *lruvec, @@ -834,16 +911,17 @@ static inline void __mod_lruvec_page_state(struct page *page, enum node_stat_item idx, int val) { struct page *head = compound_head(page); /* rmap on tail pages */ + struct mem_cgroup *memcg = page_memcg(head); pg_data_t *pgdat = page_pgdat(page); struct lruvec *lruvec; /* Untracked pages have no memcg, no lruvec. 
Update only the node */ - if (!head->mem_cgroup) { + if (!memcg) { __mod_node_page_state(pgdat, idx, val); return; } - lruvec = mem_cgroup_lruvec(head->mem_cgroup, pgdat); + lruvec = mem_cgroup_lruvec(memcg, pgdat); __mod_lruvec_state(lruvec, idx, val); } @@ -878,8 +956,10 @@ static inline void count_memcg_events(struct mem_cgroup *memcg, static inline void count_memcg_page_event(struct page *page, enum vm_event_item idx) { - if (page->mem_cgroup) - count_memcg_events(page->mem_cgroup, idx, 1); + struct mem_cgroup *memcg = page_memcg(page); + + if (memcg) + count_memcg_events(memcg, idx, 1); } static inline void count_memcg_event_mm(struct mm_struct *mm, @@ -941,6 +1021,22 @@ void mem_cgroup_split_huge_fixup(struct page *head); struct mem_cgroup; +static inline struct mem_cgroup *page_memcg(struct page *page) +{ + return NULL; +} + +static inline struct mem_cgroup *page_memcg_rcu(struct page *page) +{ + WARN_ON_ONCE(!rcu_read_lock_held()); + return NULL; +} + +static inline struct mem_cgroup *page_memcg_check(struct page *page) +{ + return NULL; +} + static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg) { return true; @@ -1430,7 +1526,7 @@ static inline void mem_cgroup_track_foreign_dirty(struct page *page, if (mem_cgroup_disabled()) return; - if (unlikely(&page->mem_cgroup->css != wb->memcg_css)) + if (unlikely(&page_memcg(page)->css != wb->memcg_css)) mem_cgroup_track_foreign_dirty_slowpath(page, wb); } diff --git a/include/linux/mm.h b/include/linux/mm.h index db6ae4d3fb4e..6b0c9d2c1d10 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1484,28 +1484,6 @@ static inline void set_page_links(struct page *page, enum zone_type zone, #endif } -#ifdef CONFIG_MEMCG -static inline struct mem_cgroup *page_memcg(struct page *page) -{ - return page->mem_cgroup; -} -static inline struct mem_cgroup *page_memcg_rcu(struct page *page) -{ - WARN_ON_ONCE(!rcu_read_lock_held()); - return READ_ONCE(page->mem_cgroup); -} -#else -static inline struct mem_cgroup *page_memcg(struct page *page) -{ - return NULL; -} -static inline struct mem_cgroup *page_memcg_rcu(struct page *page) -{ - WARN_ON_ONCE(!rcu_read_lock_held()); - return NULL; -} -#endif - /* * Some inline functions in vmstat.h depend on page_zone() */ diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 5a9238f6caad..80f5d755c037 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -199,10 +199,7 @@ struct page { atomic_t _refcount; #ifdef CONFIG_MEMCG - union { - struct mem_cgroup *mem_cgroup; - struct obj_cgroup **obj_cgroups; - }; + unsigned long memcg_data; #endif /* diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h index e7cbccc7c14c..39a40dfb578a 100644 --- a/include/trace/events/writeback.h +++ b/include/trace/events/writeback.h @@ -257,7 +257,7 @@ TRACE_EVENT(track_foreign_dirty, __entry->ino = inode ? 
inode->i_ino : 0; __entry->memcg_id = wb->memcg_css->id; __entry->cgroup_ino = __trace_wb_assign_cgroup(wb); - __entry->page_cgroup_ino = cgroup_ino(page->mem_cgroup->css.cgroup); + __entry->page_cgroup_ino = cgroup_ino(page_memcg(page)->css.cgroup); ), TP_printk("bdi %s[%llu]: ino=%lu memcg_id=%u cgroup_ino=%lu page_cgroup_ino=%lu", diff --git a/kernel/fork.c b/kernel/fork.c index 6d266388d380..cbd4f6f58409 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -404,9 +404,10 @@ static int memcg_charge_kernel_stack(struct task_struct *tsk) for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) { /* - * If memcg_kmem_charge_page() fails, page->mem_cgroup - * pointer is NULL, and memcg_kmem_uncharge_page() in - * free_thread_stack() will ignore this page. + * If memcg_kmem_charge_page() fails, page's + * memory cgroup pointer is NULL, and + * memcg_kmem_uncharge_page() in free_thread_stack() + * will ignore this page. */ ret = memcg_kmem_charge_page(vm->pages[i], GFP_KERNEL, 0); diff --git a/mm/debug.c b/mm/debug.c index ccca576b2899..8a40b3fefbeb 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -182,8 +182,8 @@ hex_only: pr_warn("page dumped because: %s\n", reason); #ifdef CONFIG_MEMCG - if (!page_poisoned && page->mem_cgroup) - pr_warn("page->mem_cgroup:%px\n", page->mem_cgroup); + if (!page_poisoned && page->memcg_data) + pr_warn("pages's memcg:%lx\n", page->memcg_data); #endif } diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 9474dbc150ed..cedfb3503411 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -470,7 +470,7 @@ pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma) #ifdef CONFIG_MEMCG static inline struct deferred_split *get_deferred_split_queue(struct page *page) { - struct mem_cgroup *memcg = compound_head(page)->mem_cgroup; + struct mem_cgroup *memcg = page_memcg(compound_head(page)); struct pglist_data *pgdat = NODE_DATA(page_to_nid(page)); if (memcg) @@ -2765,7 +2765,7 @@ void deferred_split_huge_page(struct page *page) { struct deferred_split *ds_queue = get_deferred_split_queue(page); #ifdef CONFIG_MEMCG - struct mem_cgroup *memcg = compound_head(page)->mem_cgroup; + struct mem_cgroup *memcg = page_memcg(compound_head(page)); #endif unsigned long flags; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 3dcbf24d2227..3968d68503cb 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -533,7 +533,7 @@ struct cgroup_subsys_state *mem_cgroup_css_from_page(struct page *page) { struct mem_cgroup *memcg; - memcg = page->mem_cgroup; + memcg = page_memcg(page); if (!memcg || !cgroup_subsys_on_dfl(memory_cgrp_subsys)) memcg = root_mem_cgroup; @@ -560,16 +560,7 @@ ino_t page_cgroup_ino(struct page *page) unsigned long ino = 0; rcu_read_lock(); - memcg = page->mem_cgroup; - - /* - * The lowest bit set means that memcg isn't a valid - * memcg pointer, but a obj_cgroups pointer. - * In this case the page is shared and doesn't belong - * to any specific memory cgroup. 
- */ - if ((unsigned long) memcg & 0x1UL) - memcg = NULL; + memcg = page_memcg_check(page); while (memcg && !(memcg->css.flags & CSS_ONLINE)) memcg = parent_mem_cgroup(memcg); @@ -1050,7 +1041,7 @@ EXPORT_SYMBOL(get_mem_cgroup_from_mm); */ struct mem_cgroup *get_mem_cgroup_from_page(struct page *page) { - struct mem_cgroup *memcg = page->mem_cgroup; + struct mem_cgroup *memcg = page_memcg(page); if (mem_cgroup_disabled()) return NULL; @@ -1349,7 +1340,7 @@ struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct pglist_data *pgd goto out; } - memcg = page->mem_cgroup; + memcg = page_memcg(page); /* * Swapcache readahead pages are added to the LRU - and * possibly migrated - before they are charged. @@ -2109,7 +2100,7 @@ void mem_cgroup_print_oom_group(struct mem_cgroup *memcg) } /** - * lock_page_memcg - lock a page->mem_cgroup binding + * lock_page_memcg - lock a page and memcg binding * @page: the page * * This function protects unlocked LRU pages from being moved to @@ -2141,7 +2132,7 @@ struct mem_cgroup *lock_page_memcg(struct page *page) if (mem_cgroup_disabled()) return NULL; again: - memcg = head->mem_cgroup; + memcg = page_memcg(head); if (unlikely(!memcg)) return NULL; @@ -2149,7 +2140,7 @@ again: return memcg; spin_lock_irqsave(&memcg->move_lock, flags); - if (memcg != head->mem_cgroup) { + if (memcg != page_memcg(head)) { spin_unlock_irqrestore(&memcg->move_lock, flags); goto again; } @@ -2187,14 +2178,14 @@ void __unlock_page_memcg(struct mem_cgroup *memcg) } /** - * unlock_page_memcg - unlock a page->mem_cgroup binding + * unlock_page_memcg - unlock a page and memcg binding * @page: the page */ void unlock_page_memcg(struct page *page) { struct page *head = compound_head(page); - __unlock_page_memcg(head->mem_cgroup); + __unlock_page_memcg(page_memcg(head)); } EXPORT_SYMBOL(unlock_page_memcg); @@ -2884,7 +2875,7 @@ static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages) static void commit_charge(struct page *page, struct mem_cgroup *memcg) { - VM_BUG_ON_PAGE(page->mem_cgroup, page); + VM_BUG_ON_PAGE(page_memcg(page), page); /* * Any of the following ensures page->mem_cgroup stability: * @@ -2893,7 +2884,7 @@ static void commit_charge(struct page *page, struct mem_cgroup *memcg) * - lock_page_memcg() * - exclusive reference */ - page->mem_cgroup = memcg; + page->memcg_data = (unsigned long)memcg; } #ifdef CONFIG_MEMCG_KMEM @@ -2908,8 +2899,7 @@ int memcg_alloc_page_obj_cgroups(struct page *page, struct kmem_cache *s, if (!vec) return -ENOMEM; - if (cmpxchg(&page->obj_cgroups, NULL, - (struct obj_cgroup **) ((unsigned long)vec | 0x1UL))) + if (cmpxchg(&page->memcg_data, 0, (unsigned long)vec | 0x1UL)) kfree(vec); else kmemleak_not_leak(vec); @@ -2920,6 +2910,12 @@ int memcg_alloc_page_obj_cgroups(struct page *page, struct kmem_cache *s, /* * Returns a pointer to the memory cgroup to which the kernel object is charged. * + * A passed kernel object can be a slab object or a generic kernel page, so + * different mechanisms for getting the memory cgroup pointer should be used. + * In certain cases (e.g. kernel stacks or large kmallocs with SLUB) the caller + * can not know for sure how the kernel object is implemented. + * mem_cgroup_from_obj() can be safely used in such cases. + * * The caller must ensure the memcg lifetime, e.g. by taking rcu_read_lock(), * cgroup_mutex, etc. 
*/ @@ -2932,17 +2928,6 @@ struct mem_cgroup *mem_cgroup_from_obj(void *p) page = virt_to_head_page(p); - /* - * If page->mem_cgroup is set, it's either a simple mem_cgroup pointer - * or a pointer to obj_cgroup vector. In the latter case the lowest - * bit of the pointer is set. - * The page->mem_cgroup pointer can be asynchronously changed - * from NULL to (obj_cgroup_vec | 0x1UL), but can't be changed - * from a valid memcg pointer to objcg vector or back. - */ - if (!page->mem_cgroup) - return NULL; - /* * Slab objects are accounted individually, not per-page. * Memcg membership data for each individual object is saved in @@ -2960,8 +2945,14 @@ struct mem_cgroup *mem_cgroup_from_obj(void *p) return NULL; } - /* All other pages use page->mem_cgroup */ - return page->mem_cgroup; + /* + * page_memcg_check() is used here, because page_has_obj_cgroups() + * check above could fail because the object cgroups vector wasn't set + * at that moment, but it can be set concurrently. + * page_memcg_check(page) will guarantee that a proper memory + * cgroup pointer or NULL will be returned. + */ + return page_memcg_check(page); } __always_inline struct obj_cgroup *get_obj_cgroup_from_current(void) @@ -3099,7 +3090,7 @@ int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order) if (memcg && !mem_cgroup_is_root(memcg)) { ret = __memcg_kmem_charge(memcg, gfp, 1 << order); if (!ret) { - page->mem_cgroup = memcg; + page->memcg_data = (unsigned long)memcg; __SetPageKmemcg(page); return 0; } @@ -3115,7 +3106,7 @@ int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order) */ void __memcg_kmem_uncharge_page(struct page *page, int order) { - struct mem_cgroup *memcg = page->mem_cgroup; + struct mem_cgroup *memcg = page_memcg(page); unsigned int nr_pages = 1 << order; if (!memcg) @@ -3123,7 +3114,7 @@ void __memcg_kmem_uncharge_page(struct page *page, int order) VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page); __memcg_kmem_uncharge(memcg, nr_pages); - page->mem_cgroup = NULL; + page->memcg_data = 0; css_put(&memcg->css); /* slab pages do not have PageKmemcg flag set */ @@ -3274,7 +3265,7 @@ void obj_cgroup_uncharge(struct obj_cgroup *objcg, size_t size) */ void mem_cgroup_split_huge_fixup(struct page *head) { - struct mem_cgroup *memcg = head->mem_cgroup; + struct mem_cgroup *memcg = page_memcg(head); int i; if (mem_cgroup_disabled()) @@ -3282,7 +3273,7 @@ void mem_cgroup_split_huge_fixup(struct page *head) for (i = 1; i < HPAGE_PMD_NR; i++) { css_get(&memcg->css); - head[i].mem_cgroup = memcg; + head[i].memcg_data = (unsigned long)memcg; } } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ @@ -4664,7 +4655,7 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages, void mem_cgroup_track_foreign_dirty_slowpath(struct page *page, struct bdi_writeback *wb) { - struct mem_cgroup *memcg = page->mem_cgroup; + struct mem_cgroup *memcg = page_memcg(page); struct memcg_cgwb_frn *frn; u64 now = get_jiffies_64(); u64 oldest_at = now; @@ -5641,14 +5632,14 @@ static int mem_cgroup_move_account(struct page *page, /* * Prevent mem_cgroup_migrate() from looking at - * page->mem_cgroup of its source page while we change it. + * page's memory cgroup of its source page while we change it. 
*/ ret = -EBUSY; if (!trylock_page(page)) goto out; ret = -EINVAL; - if (page->mem_cgroup != from) + if (page_memcg(page) != from) goto out_unlock; pgdat = page_pgdat(page); @@ -5703,13 +5694,13 @@ static int mem_cgroup_move_account(struct page *page, /* * All state has been migrated, let's switch to the new memcg. * - * It is safe to change page->mem_cgroup here because the page + * It is safe to change page's memcg here because the page * is referenced, charged, isolated, and locked: we can't race * with (un)charging, migration, LRU putback, or anything else - * that would rely on a stable page->mem_cgroup. + * that would rely on a stable page's memory cgroup. * * Note that lock_page_memcg is a memcg lock, not a page lock, - * to save space. As soon as we switch page->mem_cgroup to a + * to save space. As soon as we switch page's memory cgroup to a * new memcg that isn't locked, the above state can change * concurrently again. Make sure we're truly done with it. */ @@ -5718,7 +5709,7 @@ static int mem_cgroup_move_account(struct page *page, css_get(&to->css); css_put(&from->css); - page->mem_cgroup = to; + page->memcg_data = (unsigned long)to; __unlock_page_memcg(from); @@ -5784,7 +5775,7 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma, * mem_cgroup_move_account() checks the page is valid or * not under LRU exclusion. */ - if (page->mem_cgroup == mc.from) { + if (page_memcg(page) == mc.from) { ret = MC_TARGET_PAGE; if (is_device_private_page(page)) ret = MC_TARGET_DEVICE; @@ -5828,7 +5819,7 @@ static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma, VM_BUG_ON_PAGE(!page || !PageHead(page), page); if (!(mc.flags & MOVE_ANON)) return ret; - if (page->mem_cgroup == mc.from) { + if (page_memcg(page) == mc.from) { ret = MC_TARGET_PAGE; if (target) { get_page(page); @@ -6774,12 +6765,12 @@ int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask) /* * Every swap fault against a single page tries to charge the * page, bail as early as possible. shmem_unuse() encounters - * already charged pages, too. page->mem_cgroup is protected - * by the page lock, which serializes swap cache removal, which - * in turn serializes uncharging. + * already charged pages, too. page and memcg binding is + * protected by the page lock, which serializes swap cache + * removal, which in turn serializes uncharging. */ VM_BUG_ON_PAGE(!PageLocked(page), page); - if (compound_head(page)->mem_cgroup) + if (page_memcg(compound_head(page))) goto out; id = lookup_swap_cgroup_id(ent); @@ -6863,21 +6854,21 @@ static void uncharge_page(struct page *page, struct uncharge_gather *ug) VM_BUG_ON_PAGE(PageLRU(page), page); - if (!page->mem_cgroup) + if (!page_memcg(page)) return; /* * Nobody should be changing or seriously looking at - * page->mem_cgroup at this point, we have fully + * page_memcg(page) at this point, we have fully * exclusive access to the page. 
*/ - if (ug->memcg != page->mem_cgroup) { + if (ug->memcg != page_memcg(page)) { if (ug->memcg) { uncharge_batch(ug); uncharge_gather_clear(ug); } - ug->memcg = page->mem_cgroup; + ug->memcg = page_memcg(page); /* pairs with css_put in uncharge_batch */ css_get(&ug->memcg->css); @@ -6894,7 +6885,7 @@ static void uncharge_page(struct page *page, struct uncharge_gather *ug) } ug->dummy_page = page; - page->mem_cgroup = NULL; + page->memcg_data = 0; css_put(&ug->memcg->css); } @@ -6937,7 +6928,7 @@ void mem_cgroup_uncharge(struct page *page) return; /* Don't touch page->lru of any random page, pre-check: */ - if (!page->mem_cgroup) + if (!page_memcg(page)) return; uncharge_gather_clear(&ug); @@ -6987,11 +6978,11 @@ void mem_cgroup_migrate(struct page *oldpage, struct page *newpage) return; /* Page cache replacement: new page already charged? */ - if (newpage->mem_cgroup) + if (page_memcg(newpage)) return; /* Swapcache readahead pages can get replaced before being charged */ - memcg = oldpage->mem_cgroup; + memcg = page_memcg(oldpage); if (!memcg) return; @@ -7186,7 +7177,7 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry) if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) return; - memcg = page->mem_cgroup; + memcg = page_memcg(page); /* Readahead page, never charged */ if (!memcg) @@ -7207,7 +7198,7 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry) VM_BUG_ON_PAGE(oldid, page); mod_memcg_state(swap_memcg, MEMCG_SWAP, nr_entries); - page->mem_cgroup = NULL; + page->memcg_data = 0; if (!mem_cgroup_is_root(memcg)) page_counter_uncharge(&memcg->memory, nr_entries); @@ -7250,7 +7241,7 @@ int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry) if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) return 0; - memcg = page->mem_cgroup; + memcg = page_memcg(page); /* Readahead page, never charged */ if (!memcg) @@ -7331,7 +7322,7 @@ bool mem_cgroup_swap_full(struct page *page) if (cgroup_memory_noswap || !cgroup_subsys_on_dfl(memory_cgrp_subsys)) return false; - memcg = page->mem_cgroup; + memcg = page_memcg(page); if (!memcg) return false; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 23f5066bd4a5..271133b8243b 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1092,7 +1092,7 @@ static inline bool page_expected_state(struct page *page, if (unlikely((unsigned long)page->mapping | page_ref_count(page) | #ifdef CONFIG_MEMCG - (unsigned long)page->mem_cgroup | + (unsigned long)page_memcg(page) | #endif (page->flags & check_flags))) return false; @@ -1117,7 +1117,7 @@ static const char *page_bad_reason(struct page *page, unsigned long flags) bad_reason = "PAGE_FLAGS_CHECK_AT_FREE flag(s) set"; } #ifdef CONFIG_MEMCG - if (unlikely(page->mem_cgroup)) + if (unlikely(page_memcg(page))) bad_reason = "page still charged to cgroup"; #endif return bad_reason; diff --git a/mm/page_io.c b/mm/page_io.c index 433df1263349..9bca17ecc4df 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -291,12 +291,14 @@ static inline void count_swpout_vm_event(struct page *page) static void bio_associate_blkg_from_page(struct bio *bio, struct page *page) { struct cgroup_subsys_state *css; + struct mem_cgroup *memcg; - if (!page->mem_cgroup) + memcg = page_memcg(page); + if (!memcg) return; rcu_read_lock(); - css = cgroup_e_css(page->mem_cgroup->css.cgroup, &io_cgrp_subsys); + css = cgroup_e_css(memcg->css.cgroup, &io_cgrp_subsys); bio_associate_blkg_from_css(bio, css); rcu_read_unlock(); } diff --git a/mm/slab.h b/mm/slab.h index 6d7c6a5056ba..e2535cee0d33 100644 --- a/mm/slab.h +++ b/mm/slab.h 
@@ -242,18 +242,17 @@ static inline bool kmem_cache_debug_flags(struct kmem_cache *s, slab_flags_t fla static inline struct obj_cgroup **page_obj_cgroups(struct page *page) { /* - * page->mem_cgroup and page->obj_cgroups are sharing the same + * Page's memory cgroup and obj_cgroups vector are sharing the same * space. To distinguish between them in case we don't know for sure * that the page is a slab page (e.g. page_cgroup_ino()), let's * always set the lowest bit of obj_cgroups. */ - return (struct obj_cgroup **) - ((unsigned long)page->obj_cgroups & ~0x1UL); + return (struct obj_cgroup **)(page->memcg_data & ~0x1UL); } static inline bool page_has_obj_cgroups(struct page *page) { - return ((unsigned long)page->obj_cgroups & 0x1UL); + return page->memcg_data & 0x1UL; } int memcg_alloc_page_obj_cgroups(struct page *page, struct kmem_cache *s, @@ -262,7 +261,7 @@ int memcg_alloc_page_obj_cgroups(struct page *page, struct kmem_cache *s, static inline void memcg_free_page_obj_cgroups(struct page *page) { kfree(page_obj_cgroups(page)); - page->obj_cgroups = NULL; + page->memcg_data = 0; } static inline size_t obj_full_size(struct kmem_cache *s) diff --git a/mm/workingset.c b/mm/workingset.c index 975a4d2dd02e..130348cbf40a 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -257,7 +257,7 @@ void *workingset_eviction(struct page *page, struct mem_cgroup *target_memcg) struct lruvec *lruvec; int memcgid; - /* Page is fully exclusive and pins page->mem_cgroup */ + /* Page is fully exclusive and pins page's memory cgroup pointer */ VM_BUG_ON_PAGE(PageLRU(page), page); VM_BUG_ON_PAGE(page_count(page), page); VM_BUG_ON_PAGE(!PageLocked(page), page); -- cgit v1.2.3 From ddf8503c7c434374a1112e89bcedfe1ccb3057df Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Tue, 1 Dec 2020 13:58:31 -0800 Subject: bpf: Memcg-based memory accounting for bpf progs Include memory used by bpf programs into the memcg-based accounting. This includes the memory used by programs itself, auxiliary data, statistics and bpf line info. A memory cgroup containing the process which loads the program is getting charged. 
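For context (not part of the patch text): GFP_KERNEL_ACCOUNT is GFP_KERNEL | __GFP_ACCOUNT, so the conversion below charges the load-time allocations to the memory cgroup of the allocating task, e.g.:

    /* After this patch -- equivalent to GFP_KERNEL | __GFP_ACCOUNT | gfp_extra_flags. */
    aux = kzalloc(sizeof(*aux), GFP_KERNEL_ACCOUNT | gfp_extra_flags);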
Signed-off-by: Roman Gushchin Signed-off-by: Alexei Starovoitov Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20201201215900.3569844-6-guro@fb.com --- kernel/bpf/core.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index ff55cbcfbab4..2921f58c03a8 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -77,7 +77,7 @@ void *bpf_internal_load_pointer_neg_helper(const struct sk_buff *skb, int k, uns struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flags) { - gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | gfp_extra_flags; + gfp_t gfp_flags = GFP_KERNEL_ACCOUNT | __GFP_ZERO | gfp_extra_flags; struct bpf_prog_aux *aux; struct bpf_prog *fp; @@ -86,7 +86,7 @@ struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flag if (fp == NULL) return NULL; - aux = kzalloc(sizeof(*aux), GFP_KERNEL | gfp_extra_flags); + aux = kzalloc(sizeof(*aux), GFP_KERNEL_ACCOUNT | gfp_extra_flags); if (aux == NULL) { vfree(fp); return NULL; @@ -106,7 +106,7 @@ struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flag struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags) { - gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | gfp_extra_flags; + gfp_t gfp_flags = GFP_KERNEL_ACCOUNT | __GFP_ZERO | gfp_extra_flags; struct bpf_prog *prog; int cpu; @@ -138,7 +138,7 @@ int bpf_prog_alloc_jited_linfo(struct bpf_prog *prog) prog->aux->jited_linfo = kcalloc(prog->aux->nr_linfo, sizeof(*prog->aux->jited_linfo), - GFP_KERNEL | __GFP_NOWARN); + GFP_KERNEL_ACCOUNT | __GFP_NOWARN); if (!prog->aux->jited_linfo) return -ENOMEM; @@ -219,7 +219,7 @@ void bpf_prog_free_linfo(struct bpf_prog *prog) struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size, gfp_t gfp_extra_flags) { - gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | gfp_extra_flags; + gfp_t gfp_flags = GFP_KERNEL_ACCOUNT | __GFP_ZERO | gfp_extra_flags; struct bpf_prog *fp; u32 pages, delta; int ret; -- cgit v1.2.3 From 48edc1f78aabeba35ed00e40c36f211de89e0090 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Tue, 1 Dec 2020 13:58:32 -0800 Subject: bpf: Prepare for memcg-based memory accounting for bpf maps Bpf maps can be updated from an interrupt context and in such case there is no process which can be charged. It makes the memory accounting of bpf maps non-trivial. Fortunately, after commit 4127c6504f25 ("mm: kmem: enable kernel memcg accounting from interrupt contexts") and commit b87d8cefe43c ("mm, memcg: rework remote charging API to support nesting") it's finally possible. To make the ownership model simple and consistent, when the map is created, the memory cgroup of the current process is recorded. All subsequent allocations related to the bpf map are charged to the same memory cgroup. It includes allocations made by any processes (even if they do belong to a different cgroup) and from interrupts. This commit introduces 3 new helpers, which will be used by following commits to enable the accounting of bpf maps memory: - bpf_map_kmalloc_node() - bpf_map_kzalloc() - bpf_map_alloc_percpu() They are wrapping popular memory allocation functions. They set the active memory cgroup to the map's memory cgroup and add __GFP_ACCOUNT to the passed gfp flags. Then they call into the corresponding memory allocation function and restore the original active memory cgroup. These helpers are supposed to use everywhere except the map creation path. 
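A usage sketch, for illustration only (struct my_elem is a made-up type; the real conversions are done by the follow-up patches in this series): an element allocated on the update path goes through the map-aware wrapper, so the charge lands on the map's memory cgroup even when the update runs in interrupt context or from a task in a different cgroup:

    /* Illustrative only -- not part of this patch. */
    struct my_elem *e;

    e = bpf_map_kmalloc_node(map, sizeof(*e),
                             GFP_ATOMIC | __GFP_NOWARN, map->numa_node);
    if (!e)
            return -ENOMEM;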
During the map creation when the map structure is allocated by itself, it cannot be passed to those helpers. In those cases default memory allocation function will be used with the __GFP_ACCOUNT flag. Signed-off-by: Roman Gushchin Acked-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20201201215900.3569844-7-guro@fb.com --- include/linux/bpf.h | 34 ++++++++++++++++++++++++++++ kernel/bpf/syscall.c | 63 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 97 insertions(+) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index e1bcb6d7345c..e1f2c95c15ec 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -20,6 +20,8 @@ #include #include #include +#include +#include struct bpf_verifier_env; struct bpf_verifier_log; @@ -37,6 +39,7 @@ struct bpf_iter_aux_info; struct bpf_local_storage; struct bpf_local_storage_map; struct kobject; +struct mem_cgroup; extern struct idr btf_idr; extern spinlock_t btf_idr_lock; @@ -161,6 +164,9 @@ struct bpf_map { u32 btf_value_type_id; struct btf *btf; struct bpf_map_memory memory; +#ifdef CONFIG_MEMCG_KMEM + struct mem_cgroup *memcg; +#endif char name[BPF_OBJ_NAME_LEN]; u32 btf_vmlinux_value_type_id; bool bypass_spec_v1; @@ -1240,6 +1246,34 @@ int generic_map_delete_batch(struct bpf_map *map, struct bpf_map *bpf_map_get_curr_or_next(u32 *id); struct bpf_prog *bpf_prog_get_curr_or_next(u32 *id); +#ifdef CONFIG_MEMCG_KMEM +void *bpf_map_kmalloc_node(const struct bpf_map *map, size_t size, gfp_t flags, + int node); +void *bpf_map_kzalloc(const struct bpf_map *map, size_t size, gfp_t flags); +void __percpu *bpf_map_alloc_percpu(const struct bpf_map *map, size_t size, + size_t align, gfp_t flags); +#else +static inline void * +bpf_map_kmalloc_node(const struct bpf_map *map, size_t size, gfp_t flags, + int node) +{ + return kmalloc_node(size, flags, node); +} + +static inline void * +bpf_map_kzalloc(const struct bpf_map *map, size_t size, gfp_t flags) +{ + return kzalloc(size, flags); +} + +static inline void __percpu * +bpf_map_alloc_percpu(const struct bpf_map *map, size_t size, size_t align, + gfp_t flags) +{ + return __alloc_percpu_gfp(size, align, flags); +} +#endif + extern int sysctl_unprivileged_bpf_disabled; static inline bool bpf_allow_ptr_leaks(void) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index f3fe9f53f93c..dedbf6d4cd84 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -31,6 +31,7 @@ #include #include #include +#include #define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \ (map)->map_type == BPF_MAP_TYPE_CGROUP_ARRAY || \ @@ -456,6 +457,65 @@ void bpf_map_free_id(struct bpf_map *map, bool do_idr_lock) __release(&map_idr_lock); } +#ifdef CONFIG_MEMCG_KMEM +static void bpf_map_save_memcg(struct bpf_map *map) +{ + map->memcg = get_mem_cgroup_from_mm(current->mm); +} + +static void bpf_map_release_memcg(struct bpf_map *map) +{ + mem_cgroup_put(map->memcg); +} + +void *bpf_map_kmalloc_node(const struct bpf_map *map, size_t size, gfp_t flags, + int node) +{ + struct mem_cgroup *old_memcg; + void *ptr; + + old_memcg = set_active_memcg(map->memcg); + ptr = kmalloc_node(size, flags | __GFP_ACCOUNT, node); + set_active_memcg(old_memcg); + + return ptr; +} + +void *bpf_map_kzalloc(const struct bpf_map *map, size_t size, gfp_t flags) +{ + struct mem_cgroup *old_memcg; + void *ptr; + + old_memcg = set_active_memcg(map->memcg); + ptr = kzalloc(size, flags | __GFP_ACCOUNT); + set_active_memcg(old_memcg); + + return ptr; 
+} + +void __percpu *bpf_map_alloc_percpu(const struct bpf_map *map, size_t size, + size_t align, gfp_t flags) +{ + struct mem_cgroup *old_memcg; + void __percpu *ptr; + + old_memcg = set_active_memcg(map->memcg); + ptr = __alloc_percpu_gfp(size, align, flags | __GFP_ACCOUNT); + set_active_memcg(old_memcg); + + return ptr; +} + +#else +static void bpf_map_save_memcg(struct bpf_map *map) +{ +} + +static void bpf_map_release_memcg(struct bpf_map *map) +{ +} +#endif + /* called from workqueue */ static void bpf_map_free_deferred(struct work_struct *work) { @@ -464,6 +524,7 @@ static void bpf_map_free_deferred(struct work_struct *work) bpf_map_charge_move(&mem, &map->memory); security_bpf_map_free(map); + bpf_map_release_memcg(map); /* implementation dependent freeing */ map->ops->map_free(map); bpf_map_charge_finish(&mem); @@ -875,6 +936,8 @@ static int map_create(union bpf_attr *attr) if (err) goto free_map_sec; + bpf_map_save_memcg(map); + err = bpf_map_new_fd(map, f_flags); if (err < 0) { /* failed to allocate fd. -- cgit v1.2.3 From d5299b67dd59445902cd30cbc60a03c869cf1adb Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Tue, 1 Dec 2020 13:58:33 -0800 Subject: bpf: Memcg-based memory accounting for bpf maps This patch enables memcg-based memory accounting for memory allocated by __bpf_map_area_alloc(), which is used by many types of bpf maps for large initial memory allocations. Please note, that __bpf_map_area_alloc() should not be used outside of map creation paths without setting the active memory cgroup to the map's memory cgroup. Following patches in the series will refine the accounting for some of the map types. Signed-off-by: Roman Gushchin Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20201201215900.3569844-8-guro@fb.com --- kernel/bpf/syscall.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index dedbf6d4cd84..dff3a5f62d7a 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -268,6 +268,10 @@ static int bpf_map_copy_value(struct bpf_map *map, void *key, void *value, return err; } +/* Please, do not use this function outside from the map creation path + * (e.g. in map update path) without taking care of setting the active + * memory cgroup (see at bpf_map_kmalloc_node() for example). + */ static void *__bpf_map_area_alloc(u64 size, int numa_node, bool mmapable) { /* We really just want to fail instead of triggering OOM killer @@ -280,7 +284,7 @@ static void *__bpf_map_area_alloc(u64 size, int numa_node, bool mmapable) * __GFP_RETRY_MAYFAIL to avoid such situations. */ - const gfp_t gfp = __GFP_NOWARN | __GFP_ZERO; + const gfp_t gfp = __GFP_NOWARN | __GFP_ZERO | __GFP_ACCOUNT; unsigned int flags = 0; unsigned long align = 1; void *area; -- cgit v1.2.3 From 6d192c7938b7e53a6bb55b90b86bd02ea0153731 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Tue, 1 Dec 2020 13:58:34 -0800 Subject: bpf: Refine memcg-based memory accounting for arraymap maps Include percpu arrays and auxiliary data into the memcg-based memory accounting. 
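A simplified sketch of the pattern this and the following per-map-type patches apply (the concrete hunks are below): metadata allocated on the map creation path, before the map's memcg is recorded, takes __GFP_ACCOUNT directly, while element and percpu memory goes through the map-aware wrappers:

    /* map creation path: the map's memcg is not recorded yet */
    aux = kzalloc(sizeof(*aux), GFP_KERNEL_ACCOUNT);

    /* per-element percpu storage: charged to the map's memcg */
    ptr = bpf_map_alloc_percpu(&array->map, array->elem_size, 8,
                               GFP_USER | __GFP_NOWARN);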
Signed-off-by: Roman Gushchin Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20201201215900.3569844-9-guro@fb.com --- kernel/bpf/arraymap.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index c6c81eceb68f..d837e0603c89 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -34,8 +34,8 @@ static int bpf_array_alloc_percpu(struct bpf_array *array) int i; for (i = 0; i < array->map.max_entries; i++) { - ptr = __alloc_percpu_gfp(array->elem_size, 8, - GFP_USER | __GFP_NOWARN); + ptr = bpf_map_alloc_percpu(&array->map, array->elem_size, 8, + GFP_USER | __GFP_NOWARN); if (!ptr) { bpf_array_free_percpu(array); return -ENOMEM; @@ -1018,7 +1018,7 @@ static struct bpf_map *prog_array_map_alloc(union bpf_attr *attr) struct bpf_array_aux *aux; struct bpf_map *map; - aux = kzalloc(sizeof(*aux), GFP_KERNEL); + aux = kzalloc(sizeof(*aux), GFP_KERNEL_ACCOUNT); if (!aux) return ERR_PTR(-ENOMEM); -- cgit v1.2.3 From e88cc05b61f3fe8bd4bd8ce1a0a2d03357225305 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Tue, 1 Dec 2020 13:58:35 -0800 Subject: bpf: Refine memcg-based memory accounting for cpumap maps Include metadata and percpu data into the memcg-based memory accounting. Signed-off-by: Roman Gushchin Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20201201215900.3569844-10-guro@fb.com --- kernel/bpf/cpumap.c | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index c61a23b564aa..90b949666605 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -97,7 +97,7 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr) attr->map_flags & ~BPF_F_NUMA_NODE) return ERR_PTR(-EINVAL); - cmap = kzalloc(sizeof(*cmap), GFP_USER); + cmap = kzalloc(sizeof(*cmap), GFP_USER | __GFP_ACCOUNT); if (!cmap) return ERR_PTR(-ENOMEM); @@ -412,7 +412,8 @@ static int __cpu_map_load_bpf_program(struct bpf_cpu_map_entry *rcpu, int fd) } static struct bpf_cpu_map_entry * -__cpu_map_entry_alloc(struct bpf_cpumap_val *value, u32 cpu, int map_id) +__cpu_map_entry_alloc(struct bpf_map *map, struct bpf_cpumap_val *value, + u32 cpu) { int numa, err, i, fd = value->bpf_prog.fd; gfp_t gfp = GFP_KERNEL | __GFP_NOWARN; @@ -422,13 +423,13 @@ __cpu_map_entry_alloc(struct bpf_cpumap_val *value, u32 cpu, int map_id) /* Have map->numa_node, but choose node of redirect target CPU */ numa = cpu_to_node(cpu); - rcpu = kzalloc_node(sizeof(*rcpu), gfp, numa); + rcpu = bpf_map_kmalloc_node(map, sizeof(*rcpu), gfp | __GFP_ZERO, numa); if (!rcpu) return NULL; /* Alloc percpu bulkq */ - rcpu->bulkq = __alloc_percpu_gfp(sizeof(*rcpu->bulkq), - sizeof(void *), gfp); + rcpu->bulkq = bpf_map_alloc_percpu(map, sizeof(*rcpu->bulkq), + sizeof(void *), gfp); if (!rcpu->bulkq) goto free_rcu; @@ -438,7 +439,8 @@ __cpu_map_entry_alloc(struct bpf_cpumap_val *value, u32 cpu, int map_id) } /* Alloc queue */ - rcpu->queue = kzalloc_node(sizeof(*rcpu->queue), gfp, numa); + rcpu->queue = bpf_map_kmalloc_node(map, sizeof(*rcpu->queue), gfp, + numa); if (!rcpu->queue) goto free_bulkq; @@ -447,7 +449,7 @@ __cpu_map_entry_alloc(struct bpf_cpumap_val *value, u32 cpu, int map_id) goto free_queue; rcpu->cpu = cpu; - rcpu->map_id = map_id; + rcpu->map_id = map->id; rcpu->value.qsize = value->qsize; if (fd > 0 && __cpu_map_load_bpf_program(rcpu, fd)) @@ -455,7 +457,8 @@ __cpu_map_entry_alloc(struct bpf_cpumap_val *value, u32 cpu, 
int map_id) /* Setup kthread */ rcpu->kthread = kthread_create_on_node(cpu_map_kthread_run, rcpu, numa, - "cpumap/%d/map:%d", cpu, map_id); + "cpumap/%d/map:%d", cpu, + map->id); if (IS_ERR(rcpu->kthread)) goto free_prog; @@ -571,7 +574,7 @@ static int cpu_map_update_elem(struct bpf_map *map, void *key, void *value, rcpu = NULL; /* Same as deleting */ } else { /* Updating qsize cause re-allocation of bpf_cpu_map_entry */ - rcpu = __cpu_map_entry_alloc(&cpumap_value, key_cpu, map->id); + rcpu = __cpu_map_entry_alloc(map, &cpumap_value, key_cpu); if (!rcpu) return -ENOMEM; rcpu->cmap = cmap; -- cgit v1.2.3 From 3a61c7c58b3012ac28c166801842615ca99b49c5 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Tue, 1 Dec 2020 13:58:36 -0800 Subject: bpf: Memcg-based memory accounting for cgroup storage maps Account memory used by cgroup storage maps including metadata structures. Account the percpu memory for the percpu flavor of cgroup storage. Signed-off-by: Roman Gushchin Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20201201215900.3569844-11-guro@fb.com --- kernel/bpf/local_storage.c | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c index 571bb351ed3b..74dcee8926e5 100644 --- a/kernel/bpf/local_storage.c +++ b/kernel/bpf/local_storage.c @@ -164,10 +164,10 @@ static int cgroup_storage_update_elem(struct bpf_map *map, void *key, return 0; } - new = kmalloc_node(sizeof(struct bpf_storage_buffer) + - map->value_size, - __GFP_ZERO | GFP_ATOMIC | __GFP_NOWARN, - map->numa_node); + new = bpf_map_kmalloc_node(map, sizeof(struct bpf_storage_buffer) + + map->value_size, + __GFP_ZERO | GFP_ATOMIC | __GFP_NOWARN, + map->numa_node); if (!new) return -ENOMEM; @@ -313,7 +313,7 @@ static struct bpf_map *cgroup_storage_map_alloc(union bpf_attr *attr) return ERR_PTR(ret); map = kmalloc_node(sizeof(struct bpf_cgroup_storage_map), - __GFP_ZERO | GFP_USER, numa_node); + __GFP_ZERO | GFP_USER | __GFP_ACCOUNT, numa_node); if (!map) { bpf_map_charge_finish(&mem); return ERR_PTR(-ENOMEM); @@ -496,9 +496,9 @@ static size_t bpf_cgroup_storage_calculate_size(struct bpf_map *map, u32 *pages) struct bpf_cgroup_storage *bpf_cgroup_storage_alloc(struct bpf_prog *prog, enum bpf_cgroup_storage_type stype) { + const gfp_t gfp = __GFP_ZERO | GFP_USER; struct bpf_cgroup_storage *storage; struct bpf_map *map; - gfp_t flags; size_t size; u32 pages; @@ -511,20 +511,19 @@ struct bpf_cgroup_storage *bpf_cgroup_storage_alloc(struct bpf_prog *prog, if (bpf_map_charge_memlock(map, pages)) return ERR_PTR(-EPERM); - storage = kmalloc_node(sizeof(struct bpf_cgroup_storage), - __GFP_ZERO | GFP_USER, map->numa_node); + storage = bpf_map_kmalloc_node(map, sizeof(struct bpf_cgroup_storage), + gfp, map->numa_node); if (!storage) goto enomem; - flags = __GFP_ZERO | GFP_USER; - if (stype == BPF_CGROUP_STORAGE_SHARED) { - storage->buf = kmalloc_node(size, flags, map->numa_node); + storage->buf = bpf_map_kmalloc_node(map, size, gfp, + map->numa_node); if (!storage->buf) goto enomem; check_and_init_map_lock(map, storage->buf->data); } else { - storage->percpu_buf = __alloc_percpu_gfp(size, 8, flags); + storage->percpu_buf = bpf_map_alloc_percpu(map, size, 8, gfp); if (!storage->percpu_buf) goto enomem; } -- cgit v1.2.3 From 1440290adf7bb27602bbb7d8b2dc3d903ed3c6c9 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Tue, 1 Dec 2020 13:58:37 -0800 Subject: bpf: Refine memcg-based memory accounting for devmap maps 
Include map metadata and the node size (struct bpf_dtab_netdev) into the accounting. Signed-off-by: Roman Gushchin Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20201201215900.3569844-12-guro@fb.com --- kernel/bpf/devmap.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index 2b5ca93c17de..b43ab247302d 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -175,7 +175,7 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr) if (!capable(CAP_NET_ADMIN)) return ERR_PTR(-EPERM); - dtab = kzalloc(sizeof(*dtab), GFP_USER); + dtab = kzalloc(sizeof(*dtab), GFP_USER | __GFP_ACCOUNT); if (!dtab) return ERR_PTR(-ENOMEM); @@ -602,8 +602,9 @@ static struct bpf_dtab_netdev *__dev_map_alloc_node(struct net *net, struct bpf_prog *prog = NULL; struct bpf_dtab_netdev *dev; - dev = kmalloc_node(sizeof(*dev), GFP_ATOMIC | __GFP_NOWARN, - dtab->map.numa_node); + dev = bpf_map_kmalloc_node(&dtab->map, sizeof(*dev), + GFP_ATOMIC | __GFP_NOWARN, + dtab->map.numa_node); if (!dev) return ERR_PTR(-ENOMEM); -- cgit v1.2.3 From 881456811a33b9d3952897f4d01ee4d74fa2f30e Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Tue, 1 Dec 2020 13:58:38 -0800 Subject: bpf: Refine memcg-based memory accounting for hashtab maps Include percpu objects and the size of map metadata into the accounting. Signed-off-by: Roman Gushchin Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20201201215900.3569844-13-guro@fb.com --- kernel/bpf/hashtab.c | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index ec46266aaf1c..bf70fb3ed9c1 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -292,7 +292,8 @@ static int prealloc_init(struct bpf_htab *htab) u32 size = round_up(htab->map.value_size, 8); void __percpu *pptr; - pptr = __alloc_percpu_gfp(size, 8, GFP_USER | __GFP_NOWARN); + pptr = bpf_map_alloc_percpu(&htab->map, size, 8, + GFP_USER | __GFP_NOWARN); if (!pptr) goto free_elems; htab_elem_set_ptr(get_htab_elem(htab, i), htab->map.key_size, @@ -346,8 +347,8 @@ static int alloc_extra_elems(struct bpf_htab *htab) struct pcpu_freelist_node *l; int cpu; - pptr = __alloc_percpu_gfp(sizeof(struct htab_elem *), 8, - GFP_USER | __GFP_NOWARN); + pptr = bpf_map_alloc_percpu(&htab->map, sizeof(struct htab_elem *), 8, + GFP_USER | __GFP_NOWARN); if (!pptr) return -ENOMEM; @@ -444,7 +445,7 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) int err, i; u64 cost; - htab = kzalloc(sizeof(*htab), GFP_USER); + htab = kzalloc(sizeof(*htab), GFP_USER | __GFP_ACCOUNT); if (!htab) return ERR_PTR(-ENOMEM); @@ -502,8 +503,10 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) goto free_charge; for (i = 0; i < HASHTAB_MAP_LOCK_COUNT; i++) { - htab->map_locked[i] = __alloc_percpu_gfp(sizeof(int), - sizeof(int), GFP_USER); + htab->map_locked[i] = bpf_map_alloc_percpu(&htab->map, + sizeof(int), + sizeof(int), + GFP_USER); if (!htab->map_locked[i]) goto free_map_locked; } @@ -925,8 +928,9 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, l_new = ERR_PTR(-E2BIG); goto dec_count; } - l_new = kmalloc_node(htab->elem_size, GFP_ATOMIC | __GFP_NOWARN, - htab->map.numa_node); + l_new = bpf_map_kmalloc_node(&htab->map, htab->elem_size, + GFP_ATOMIC | __GFP_NOWARN, + htab->map.numa_node); if (!l_new) { l_new = ERR_PTR(-ENOMEM); goto dec_count; @@ -942,8 +946,8 @@ 
static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, pptr = htab_elem_get_ptr(l_new, key_size); } else { /* alloc_percpu zero-fills */ - pptr = __alloc_percpu_gfp(size, 8, - GFP_ATOMIC | __GFP_NOWARN); + pptr = bpf_map_alloc_percpu(&htab->map, size, 8, + GFP_ATOMIC | __GFP_NOWARN); if (!pptr) { kfree(l_new); l_new = ERR_PTR(-ENOMEM); -- cgit v1.2.3 From 353e7af4bf5e7247c35e9ba5beab42904f1b3499 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Tue, 1 Dec 2020 13:58:39 -0800 Subject: bpf: Memcg-based memory accounting for lpm_trie maps Include lpm trie and lpm trie node objects into the memcg-based memory accounting. Signed-off-by: Roman Gushchin Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20201201215900.3569844-14-guro@fb.com --- kernel/bpf/lpm_trie.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index 00e32f2ec3e6..1a6981203d7f 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -282,8 +282,8 @@ static struct lpm_trie_node *lpm_trie_node_alloc(const struct lpm_trie *trie, if (value) size += trie->map.value_size; - node = kmalloc_node(size, GFP_ATOMIC | __GFP_NOWARN, - trie->map.numa_node); + node = bpf_map_kmalloc_node(&trie->map, size, GFP_ATOMIC | __GFP_NOWARN, + trie->map.numa_node); if (!node) return NULL; @@ -557,7 +557,7 @@ static struct bpf_map *trie_alloc(union bpf_attr *attr) attr->value_size > LPM_VAL_SIZE_MAX) return ERR_PTR(-EINVAL); - trie = kzalloc(sizeof(*trie), GFP_USER | __GFP_NOWARN); + trie = kzalloc(sizeof(*trie), GFP_USER | __GFP_NOWARN | __GFP_ACCOUNT); if (!trie) return ERR_PTR(-ENOMEM); -- cgit v1.2.3 From be4035c734d12918866c5eb2c496d420aa80adeb Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Tue, 1 Dec 2020 13:58:40 -0800 Subject: bpf: Memcg-based memory accounting for bpf ringbuffer Enable the memcg-based memory accounting for the memory used by the bpf ringbuffer. Signed-off-by: Roman Gushchin Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20201201215900.3569844-15-guro@fb.com --- kernel/bpf/ringbuf.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/ringbuf.c b/kernel/bpf/ringbuf.c index 31cb04a4dd2d..8983a46f6580 100644 --- a/kernel/bpf/ringbuf.c +++ b/kernel/bpf/ringbuf.c @@ -60,8 +60,8 @@ struct bpf_ringbuf_hdr { static struct bpf_ringbuf *bpf_ringbuf_area_alloc(size_t data_sz, int numa_node) { - const gfp_t flags = GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_NOWARN | - __GFP_ZERO; + const gfp_t flags = GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL | + __GFP_NOWARN | __GFP_ZERO; int nr_meta_pages = RINGBUF_PGOFF + RINGBUF_POS_PAGES; int nr_data_pages = data_sz >> PAGE_SHIFT; int nr_pages = nr_meta_pages + nr_data_pages; @@ -88,10 +88,7 @@ static struct bpf_ringbuf *bpf_ringbuf_area_alloc(size_t data_sz, int numa_node) * user-space implementations significantly. 
*/ array_size = (nr_meta_pages + 2 * nr_data_pages) * sizeof(*pages); - if (array_size > PAGE_SIZE) - pages = vmalloc_node(array_size, numa_node); - else - pages = kmalloc_node(array_size, flags, numa_node); + pages = bpf_map_area_alloc(array_size, numa_node); if (!pages) return NULL; @@ -167,7 +164,7 @@ static struct bpf_map *ringbuf_map_alloc(union bpf_attr *attr) return ERR_PTR(-E2BIG); #endif - rb_map = kzalloc(sizeof(*rb_map), GFP_USER); + rb_map = kzalloc(sizeof(*rb_map), GFP_USER | __GFP_ACCOUNT); if (!rb_map) return ERR_PTR(-ENOMEM); -- cgit v1.2.3 From e9aae8beba825e4670463ddcf420b954f18d5ced Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Tue, 1 Dec 2020 13:58:41 -0800 Subject: bpf: Memcg-based memory accounting for bpf local storage maps Account memory used by bpf local storage maps: per-socket, per-inode and per-task storages. Signed-off-by: Roman Gushchin Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20201201215900.3569844-16-guro@fb.com --- kernel/bpf/bpf_local_storage.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c index 5d3a7af9ba9b..023a3eaa4d74 100644 --- a/kernel/bpf/bpf_local_storage.c +++ b/kernel/bpf/bpf_local_storage.c @@ -67,7 +67,8 @@ bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner, if (charge_mem && mem_charge(smap, owner, smap->elem_size)) return NULL; - selem = kzalloc(smap->elem_size, GFP_ATOMIC | __GFP_NOWARN); + selem = bpf_map_kzalloc(&smap->map, smap->elem_size, + GFP_ATOMIC | __GFP_NOWARN); if (selem) { if (value) memcpy(SDATA(selem)->data, value, smap->map.value_size); @@ -264,7 +265,8 @@ int bpf_local_storage_alloc(void *owner, if (err) return err; - storage = kzalloc(sizeof(*storage), GFP_ATOMIC | __GFP_NOWARN); + storage = bpf_map_kzalloc(&smap->map, sizeof(*storage), + GFP_ATOMIC | __GFP_NOWARN); if (!storage) { err = -ENOMEM; goto uncharge; @@ -546,7 +548,7 @@ struct bpf_local_storage_map *bpf_local_storage_map_alloc(union bpf_attr *attr) u64 cost; int ret; - smap = kzalloc(sizeof(*smap), GFP_USER | __GFP_NOWARN); + smap = kzalloc(sizeof(*smap), GFP_USER | __GFP_NOWARN | __GFP_ACCOUNT); if (!smap) return ERR_PTR(-ENOMEM); bpf_map_init_from_attr(&smap->map, attr); @@ -564,7 +566,7 @@ struct bpf_local_storage_map *bpf_local_storage_map_alloc(union bpf_attr *attr) } smap->buckets = kvcalloc(sizeof(*smap->buckets), nbuckets, - GFP_USER | __GFP_NOWARN); + GFP_USER | __GFP_NOWARN | __GFP_ACCOUNT); if (!smap->buckets) { bpf_map_charge_finish(&smap->map.memory); kfree(smap); -- cgit v1.2.3 From 1bc5975613ed155fc57ee321041d3463e580b4a3 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Tue, 1 Dec 2020 13:58:44 -0800 Subject: bpf: Eliminate rlimit-based memory accounting for arraymap maps Do not use rlimit-based memory accounting for arraymap maps. It has been replaced with the memcg-based memory accounting. 
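For context, the bpf_map_kmalloc_node()/bpf_map_kzalloc()/bpf_map_alloc_percpu() helpers that the conversions above switch to were introduced earlier in this series. Roughly, they wrap the ordinary allocators so that the charge lands on the memory cgroup recorded in the map at creation time rather than on whatever cgroup happens to be current when the allocation runs. A simplified sketch of the kmalloc_node variant, assuming a CONFIG_MEMCG_KMEM build (the in-tree version lives in kernel/bpf/syscall.c and handles the !MEMCG case separately):

#include <linux/bpf.h>
#include <linux/sched/mm.h>

/* Sketch only: charge the allocation to the map's owning memcg. */
void *bpf_map_kmalloc_node(const struct bpf_map *map, size_t size,
                           gfp_t flags, int node)
{
        struct mem_cgroup *old_memcg;
        void *ptr;

        /* temporarily make the map's memcg the active charge target */
        old_memcg = set_active_memcg(map->memcg);
        ptr = kmalloc_node(size, flags | __GFP_ACCOUNT, node);
        set_active_memcg(old_memcg);

        return ptr;
}

The kzalloc and percpu variants follow the same pattern around kzalloc() and __alloc_percpu_gfp() respectively.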
Signed-off-by: Roman Gushchin Signed-off-by: Alexei Starovoitov Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20201201215900.3569844-19-guro@fb.com --- kernel/bpf/arraymap.c | 24 ++++-------------------- 1 file changed, 4 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index d837e0603c89..1f8453343bf2 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -81,11 +81,10 @@ int array_map_alloc_check(union bpf_attr *attr) static struct bpf_map *array_map_alloc(union bpf_attr *attr) { bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY; - int ret, numa_node = bpf_map_attr_numa_node(attr); + int numa_node = bpf_map_attr_numa_node(attr); u32 elem_size, index_mask, max_entries; bool bypass_spec_v1 = bpf_bypass_spec_v1(); - u64 cost, array_size, mask64; - struct bpf_map_memory mem; + u64 array_size, mask64; struct bpf_array *array; elem_size = round_up(attr->value_size, 8); @@ -126,44 +125,29 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr) } } - /* make sure there is no u32 overflow later in round_up() */ - cost = array_size; - if (percpu) - cost += (u64)attr->max_entries * elem_size * num_possible_cpus(); - - ret = bpf_map_charge_init(&mem, cost); - if (ret < 0) - return ERR_PTR(ret); - /* allocate all map elements and zero-initialize them */ if (attr->map_flags & BPF_F_MMAPABLE) { void *data; /* kmalloc'ed memory can't be mmap'ed, use explicit vmalloc */ data = bpf_map_area_mmapable_alloc(array_size, numa_node); - if (!data) { - bpf_map_charge_finish(&mem); + if (!data) return ERR_PTR(-ENOMEM); - } array = data + PAGE_ALIGN(sizeof(struct bpf_array)) - offsetof(struct bpf_array, value); } else { array = bpf_map_area_alloc(array_size, numa_node); } - if (!array) { - bpf_map_charge_finish(&mem); + if (!array) return ERR_PTR(-ENOMEM); - } array->index_mask = index_mask; array->map.bypass_spec_v1 = bypass_spec_v1; /* copy mandatory map attributes */ bpf_map_init_from_attr(&array->map, attr); - bpf_map_charge_move(&array->map.memory, &mem); array->elem_size = elem_size; if (percpu && bpf_array_alloc_percpu(array)) { - bpf_map_charge_finish(&array->map.memory); bpf_map_area_free(array); return ERR_PTR(-ENOMEM); } -- cgit v1.2.3 From f043733f31e5e12c6254045a03e519290543fa1b Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Tue, 1 Dec 2020 13:58:45 -0800 Subject: bpf: Eliminate rlimit-based memory accounting for bpf_struct_ops maps Do not use rlimit-based memory accounting for bpf_struct_ops maps. It has been replaced with the memcg-based memory accounting. 
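A note on the GFP flags used across these conversions: GFP_KERNEL_ACCOUNT is simply GFP_KERNEL with the accounting bit set, and the GFP_USER-based call sites OR in the same bit explicitly, so both spellings opt the allocation into memcg charging; only the base flags differ (GFP_USER additionally sets __GFP_HARDWALL for cpuset enforcement). The variable names below are the ones from the patches above:

/* include/linux/gfp.h */
#define GFP_KERNEL_ACCOUNT      (GFP_KERNEL | __GFP_ACCOUNT)

/* both of these are charged to the active memory cgroup */
aux  = kzalloc(sizeof(*aux), GFP_KERNEL_ACCOUNT);
cmap = kzalloc(sizeof(*cmap), GFP_USER | __GFP_ACCOUNT);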
Signed-off-by: Roman Gushchin Signed-off-by: Alexei Starovoitov Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20201201215900.3569844-20-guro@fb.com --- kernel/bpf/bpf_struct_ops.c | 19 +++---------------- 1 file changed, 3 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c index 4c3b543bb33b..1a666a975416 100644 --- a/kernel/bpf/bpf_struct_ops.c +++ b/kernel/bpf/bpf_struct_ops.c @@ -548,12 +548,10 @@ static int bpf_struct_ops_map_alloc_check(union bpf_attr *attr) static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr) { const struct bpf_struct_ops *st_ops; - size_t map_total_size, st_map_size; + size_t st_map_size; struct bpf_struct_ops_map *st_map; const struct btf_type *t, *vt; - struct bpf_map_memory mem; struct bpf_map *map; - int err; if (!bpf_capable()) return ERR_PTR(-EPERM); @@ -573,20 +571,11 @@ static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr) * struct bpf_struct_ops_tcp_congestions_ops */ (vt->size - sizeof(struct bpf_struct_ops_value)); - map_total_size = st_map_size + - /* uvalue */ - sizeof(vt->size) + - /* struct bpf_progs **progs */ - btf_type_vlen(t) * sizeof(struct bpf_prog *); - err = bpf_map_charge_init(&mem, map_total_size); - if (err < 0) - return ERR_PTR(err); st_map = bpf_map_area_alloc(st_map_size, NUMA_NO_NODE); - if (!st_map) { - bpf_map_charge_finish(&mem); + if (!st_map) return ERR_PTR(-ENOMEM); - } + st_map->st_ops = st_ops; map = &st_map->map; @@ -597,14 +586,12 @@ static struct bpf_map *bpf_struct_ops_map_alloc(union bpf_attr *attr) st_map->image = bpf_jit_alloc_exec(PAGE_SIZE); if (!st_map->uvalue || !st_map->progs || !st_map->image) { bpf_struct_ops_map_free(map); - bpf_map_charge_finish(&mem); return ERR_PTR(-ENOMEM); } mutex_init(&st_map->lock); set_vm_flush_reset_perms(st_map->image); bpf_map_init_from_attr(map, attr); - bpf_map_charge_move(&map->memory, &mem); return map; } -- cgit v1.2.3 From 711cabaf1432fbec4a5f9ffcfbfe2ed7a78cd096 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Tue, 1 Dec 2020 13:58:46 -0800 Subject: bpf: Eliminate rlimit-based memory accounting for cpumap maps Do not use rlimit-based memory accounting for cpumap maps. It has been replaced with the memcg-based memory accounting. 
Signed-off-by: Roman Gushchin Signed-off-by: Alexei Starovoitov Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20201201215900.3569844-21-guro@fb.com --- kernel/bpf/cpumap.c | 16 +--------------- 1 file changed, 1 insertion(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index 90b949666605..747313698178 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -84,8 +84,6 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr) u32 value_size = attr->value_size; struct bpf_cpu_map *cmap; int err = -ENOMEM; - u64 cost; - int ret; if (!bpf_capable()) return ERR_PTR(-EPERM); @@ -109,26 +107,14 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr) goto free_cmap; } - /* make sure page count doesn't overflow */ - cost = (u64) cmap->map.max_entries * sizeof(struct bpf_cpu_map_entry *); - - /* Notice returns -EPERM on if map size is larger than memlock limit */ - ret = bpf_map_charge_init(&cmap->map.memory, cost); - if (ret) { - err = ret; - goto free_cmap; - } - /* Alloc array for possible remote "destination" CPUs */ cmap->cpu_map = bpf_map_area_alloc(cmap->map.max_entries * sizeof(struct bpf_cpu_map_entry *), cmap->map.numa_node); if (!cmap->cpu_map) - goto free_charge; + goto free_cmap; return &cmap->map; -free_charge: - bpf_map_charge_finish(&cmap->map.memory); free_cmap: kfree(cmap); return ERR_PTR(err); -- cgit v1.2.3 From 087b0d39fe22dcc2ddcef7ed699c658f0e725bac Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Tue, 1 Dec 2020 13:58:47 -0800 Subject: bpf: Eliminate rlimit-based memory accounting for cgroup storage maps Do not use rlimit-based memory accounting for cgroup storage maps. It has been replaced with the memcg-based memory accounting. Signed-off-by: Roman Gushchin Signed-off-by: Alexei Starovoitov Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20201201215900.3569844-22-guro@fb.com --- kernel/bpf/local_storage.c | 21 +-------------------- 1 file changed, 1 insertion(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c index 74dcee8926e5..2d4f9ac12377 100644 --- a/kernel/bpf/local_storage.c +++ b/kernel/bpf/local_storage.c @@ -287,8 +287,6 @@ static struct bpf_map *cgroup_storage_map_alloc(union bpf_attr *attr) { int numa_node = bpf_map_attr_numa_node(attr); struct bpf_cgroup_storage_map *map; - struct bpf_map_memory mem; - int ret; if (attr->key_size != sizeof(struct bpf_cgroup_storage_key) && attr->key_size != sizeof(__u64)) @@ -308,18 +306,10 @@ static struct bpf_map *cgroup_storage_map_alloc(union bpf_attr *attr) /* max_entries is not used and enforced to be 0 */ return ERR_PTR(-EINVAL); - ret = bpf_map_charge_init(&mem, sizeof(struct bpf_cgroup_storage_map)); - if (ret < 0) - return ERR_PTR(ret); - map = kmalloc_node(sizeof(struct bpf_cgroup_storage_map), __GFP_ZERO | GFP_USER | __GFP_ACCOUNT, numa_node); - if (!map) { - bpf_map_charge_finish(&mem); + if (!map) return ERR_PTR(-ENOMEM); - } - - bpf_map_charge_move(&map->map.memory, &mem); /* copy mandatory map attributes */ bpf_map_init_from_attr(&map->map, attr); @@ -508,9 +498,6 @@ struct bpf_cgroup_storage *bpf_cgroup_storage_alloc(struct bpf_prog *prog, size = bpf_cgroup_storage_calculate_size(map, &pages); - if (bpf_map_charge_memlock(map, pages)) - return ERR_PTR(-EPERM); - storage = bpf_map_kmalloc_node(map, sizeof(struct bpf_cgroup_storage), gfp, map->numa_node); if (!storage) @@ -533,7 +520,6 @@ struct bpf_cgroup_storage *bpf_cgroup_storage_alloc(struct bpf_prog *prog, return 
storage; enomem: - bpf_map_uncharge_memlock(map, pages); kfree(storage); return ERR_PTR(-ENOMEM); } @@ -560,16 +546,11 @@ void bpf_cgroup_storage_free(struct bpf_cgroup_storage *storage) { enum bpf_cgroup_storage_type stype; struct bpf_map *map; - u32 pages; if (!storage) return; map = &storage->map->map; - - bpf_cgroup_storage_calculate_size(map, &pages); - bpf_map_uncharge_memlock(map, pages); - stype = cgroup_storage_type(map); if (stype == BPF_CGROUP_STORAGE_SHARED) call_rcu(&storage->rcu, free_shared_cgroup_storage_rcu); -- cgit v1.2.3 From 844f157f6c0a905d039d2e20212ab3231f2e5eaf Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Tue, 1 Dec 2020 13:58:48 -0800 Subject: bpf: Eliminate rlimit-based memory accounting for devmap maps Do not use rlimit-based memory accounting for devmap maps. It has been replaced with the memcg-based memory accounting. Signed-off-by: Roman Gushchin Signed-off-by: Alexei Starovoitov Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20201201215900.3569844-23-guro@fb.com --- kernel/bpf/devmap.c | 18 ++---------------- 1 file changed, 2 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index b43ab247302d..f6e9c68afdd4 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -109,8 +109,6 @@ static inline struct hlist_head *dev_map_index_hash(struct bpf_dtab *dtab, static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr) { u32 valsize = attr->value_size; - u64 cost = 0; - int err; /* check sanity of attributes. 2 value sizes supported: * 4 bytes: ifindex @@ -135,21 +133,13 @@ static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr) if (!dtab->n_buckets) /* Overflow check */ return -EINVAL; - cost += (u64) sizeof(struct hlist_head) * dtab->n_buckets; - } else { - cost += (u64) dtab->map.max_entries * sizeof(struct bpf_dtab_netdev *); } - /* if map size is larger than memlock limit, reject it */ - err = bpf_map_charge_init(&dtab->map.memory, cost); - if (err) - return -EINVAL; - if (attr->map_type == BPF_MAP_TYPE_DEVMAP_HASH) { dtab->dev_index_head = dev_map_create_hash(dtab->n_buckets, dtab->map.numa_node); if (!dtab->dev_index_head) - goto free_charge; + return -ENOMEM; spin_lock_init(&dtab->index_lock); } else { @@ -157,14 +147,10 @@ static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr) sizeof(struct bpf_dtab_netdev *), dtab->map.numa_node); if (!dtab->netdev_map) - goto free_charge; + return -ENOMEM; } return 0; - -free_charge: - bpf_map_charge_finish(&dtab->map.memory); - return -ENOMEM; } static struct bpf_map *dev_map_alloc(union bpf_attr *attr) -- cgit v1.2.3 From 755e5d55367af5ff75a4db9b6cf439416878e2c7 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Tue, 1 Dec 2020 13:58:49 -0800 Subject: bpf: Eliminate rlimit-based memory accounting for hashtab maps Do not use rlimit-based memory accounting for hashtab maps. It has been replaced with the memcg-based memory accounting. 
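One user-visible side effect of dropping the memlock charge, worth keeping in mind while reading these conversions: a map whose estimated cost exceeded RLIMIT_MEMLOCK used to be rejected at creation time with -EPERM, whereas exceeding the memory cgroup limit now shows up as a failed allocation, i.e. -ENOMEM. A hypothetical caller-side check (create_big_hash() is an illustrative name; bpf_create_map() is the libbpf wrapper current at the time of this series):

#include <errno.h>
#include <stdio.h>
#include <bpf/bpf.h>

/* Illustrative only: a large hash map. Under memcg-based accounting a
 * limit violation is reported as ENOMEM by the failing allocation,
 * not as EPERM by the old memlock charge.
 */
static int create_big_hash(void)
{
        int fd = bpf_create_map(BPF_MAP_TYPE_HASH, sizeof(int), sizeof(long),
                                1 << 20, 0);

        if (fd < 0 && errno == ENOMEM)
                fprintf(stderr, "memcg limit hit while creating map\n");
        return fd;
}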
Signed-off-by: Roman Gushchin Signed-off-by: Alexei Starovoitov Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20201201215900.3569844-24-guro@fb.com --- kernel/bpf/hashtab.c | 19 +------------------ 1 file changed, 1 insertion(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index bf70fb3ed9c1..fe7a0733a63a 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -443,7 +443,6 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) bool prealloc = !(attr->map_flags & BPF_F_NO_PREALLOC); struct bpf_htab *htab; int err, i; - u64 cost; htab = kzalloc(sizeof(*htab), GFP_USER | __GFP_ACCOUNT); if (!htab) @@ -481,26 +480,12 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) htab->n_buckets > U32_MAX / sizeof(struct bucket)) goto free_htab; - cost = (u64) htab->n_buckets * sizeof(struct bucket) + - (u64) htab->elem_size * htab->map.max_entries; - - if (percpu) - cost += (u64) round_up(htab->map.value_size, 8) * - num_possible_cpus() * htab->map.max_entries; - else - cost += (u64) htab->elem_size * num_possible_cpus(); - - /* if map size is larger than memlock limit, reject it */ - err = bpf_map_charge_init(&htab->map.memory, cost); - if (err) - goto free_htab; - err = -ENOMEM; htab->buckets = bpf_map_area_alloc(htab->n_buckets * sizeof(struct bucket), htab->map.numa_node); if (!htab->buckets) - goto free_charge; + goto free_htab; for (i = 0; i < HASHTAB_MAP_LOCK_COUNT; i++) { htab->map_locked[i] = bpf_map_alloc_percpu(&htab->map, @@ -541,8 +526,6 @@ free_map_locked: for (i = 0; i < HASHTAB_MAP_LOCK_COUNT; i++) free_percpu(htab->map_locked[i]); bpf_map_area_free(htab->buckets); -free_charge: - bpf_map_charge_finish(&htab->map.memory); free_htab: lockdep_unregister_key(&htab->lockdep_key); kfree(htab); -- cgit v1.2.3 From cbddcb574d419fd5b70c5f87ba733feec6147aeb Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Tue, 1 Dec 2020 13:58:50 -0800 Subject: bpf: Eliminate rlimit-based memory accounting for lpm_trie maps Do not use rlimit-based memory accounting for lpm_trie maps. It has been replaced with the memcg-based memory accounting. 
Signed-off-by: Roman Gushchin Signed-off-by: Alexei Starovoitov Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20201201215900.3569844-25-guro@fb.com --- kernel/bpf/lpm_trie.c | 13 ------------- 1 file changed, 13 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index 1a6981203d7f..cec792a17e5f 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -540,8 +540,6 @@ out: static struct bpf_map *trie_alloc(union bpf_attr *attr) { struct lpm_trie *trie; - u64 cost = sizeof(*trie), cost_per_node; - int ret; if (!bpf_capable()) return ERR_PTR(-EPERM); @@ -567,20 +565,9 @@ static struct bpf_map *trie_alloc(union bpf_attr *attr) offsetof(struct bpf_lpm_trie_key, data); trie->max_prefixlen = trie->data_size * 8; - cost_per_node = sizeof(struct lpm_trie_node) + - attr->value_size + trie->data_size; - cost += (u64) attr->max_entries * cost_per_node; - - ret = bpf_map_charge_init(&trie->map.memory, cost); - if (ret) - goto out_err; - spin_lock_init(&trie->lock); return &trie->map; -out_err: - kfree(trie); - return ERR_PTR(ret); } static void trie_free(struct bpf_map *map) -- cgit v1.2.3 From a37fb7ef24a475012547fa28f0148d2e538ad5d4 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Tue, 1 Dec 2020 13:58:51 -0800 Subject: bpf: Eliminate rlimit-based memory accounting for queue_stack_maps maps Do not use rlimit-based memory accounting for queue_stack maps. It has been replaced with the memcg-based memory accounting. Signed-off-by: Roman Gushchin Signed-off-by: Alexei Starovoitov Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20201201215900.3569844-26-guro@fb.com --- kernel/bpf/queue_stack_maps.c | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/queue_stack_maps.c b/kernel/bpf/queue_stack_maps.c index 0ee2347ba510..f9c734aaa990 100644 --- a/kernel/bpf/queue_stack_maps.c +++ b/kernel/bpf/queue_stack_maps.c @@ -66,29 +66,21 @@ static int queue_stack_map_alloc_check(union bpf_attr *attr) static struct bpf_map *queue_stack_map_alloc(union bpf_attr *attr) { - int ret, numa_node = bpf_map_attr_numa_node(attr); - struct bpf_map_memory mem = {0}; + int numa_node = bpf_map_attr_numa_node(attr); struct bpf_queue_stack *qs; - u64 size, queue_size, cost; + u64 size, queue_size; size = (u64) attr->max_entries + 1; - cost = queue_size = sizeof(*qs) + size * attr->value_size; - - ret = bpf_map_charge_init(&mem, cost); - if (ret < 0) - return ERR_PTR(ret); + queue_size = sizeof(*qs) + size * attr->value_size; qs = bpf_map_area_alloc(queue_size, numa_node); - if (!qs) { - bpf_map_charge_finish(&mem); + if (!qs) return ERR_PTR(-ENOMEM); - } memset(qs, 0, sizeof(*qs)); bpf_map_init_from_attr(&qs->map, attr); - bpf_map_charge_move(&qs->map.memory, &mem); qs->size = size; raw_spin_lock_init(&qs->lock); -- cgit v1.2.3 From db54330d3e137c23bea26784cecf5ae17e72e4c6 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Tue, 1 Dec 2020 13:58:52 -0800 Subject: bpf: Eliminate rlimit-based memory accounting for reuseport_array maps Do not use rlimit-based memory accounting for reuseport_array maps. It has been replaced with the memcg-based memory accounting. 
Signed-off-by: Roman Gushchin Signed-off-by: Alexei Starovoitov Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20201201215900.3569844-27-guro@fb.com --- kernel/bpf/reuseport_array.c | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/reuseport_array.c b/kernel/bpf/reuseport_array.c index a55cd542f2ce..4838922f723d 100644 --- a/kernel/bpf/reuseport_array.c +++ b/kernel/bpf/reuseport_array.c @@ -150,9 +150,8 @@ static void reuseport_array_free(struct bpf_map *map) static struct bpf_map *reuseport_array_alloc(union bpf_attr *attr) { - int err, numa_node = bpf_map_attr_numa_node(attr); + int numa_node = bpf_map_attr_numa_node(attr); struct reuseport_array *array; - struct bpf_map_memory mem; u64 array_size; if (!bpf_capable()) @@ -161,20 +160,13 @@ static struct bpf_map *reuseport_array_alloc(union bpf_attr *attr) array_size = sizeof(*array); array_size += (u64)attr->max_entries * sizeof(struct sock *); - err = bpf_map_charge_init(&mem, array_size); - if (err) - return ERR_PTR(err); - /* allocate all map elements and zero-initialize them */ array = bpf_map_area_alloc(array_size, numa_node); - if (!array) { - bpf_map_charge_finish(&mem); + if (!array) return ERR_PTR(-ENOMEM); - } /* copy mandatory map attributes */ bpf_map_init_from_attr(&array->map, attr); - bpf_map_charge_move(&array->map.memory, &mem); return &array->map; } -- cgit v1.2.3 From abbdd0813f347f9d1eea376409a68295318b2ef5 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Tue, 1 Dec 2020 13:58:53 -0800 Subject: bpf: Eliminate rlimit-based memory accounting for bpf ringbuffer Do not use rlimit-based memory accounting for bpf ringbuffer. It has been replaced with the memcg-based memory accounting. bpf_ringbuf_alloc() can't return anything except ERR_PTR(-ENOMEM) and a valid pointer, so to simplify the code make it return NULL in the first case. This allows to drop a couple of lines in ringbuf_map_alloc() and also makes it look similar to other memory allocating function like kmalloc(). 
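For readers less used to the kernel's error-pointer idiom, the simplification described just above is purely a calling-convention change; foo_alloc() below is hypothetical:

struct foo *p;

/* before: the single failure mode is still encoded as an error pointer */
p = foo_alloc();
if (IS_ERR(p))
        return PTR_ERR(p);

/* after: NULL on failure, the same test callers already use for kmalloc() */
p = foo_alloc();
if (!p)
        return -ENOMEM;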
Signed-off-by: Roman Gushchin Signed-off-by: Alexei Starovoitov Acked-by: Song Liu Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20201201215900.3569844-28-guro@fb.com --- kernel/bpf/ringbuf.c | 24 ++++-------------------- 1 file changed, 4 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/ringbuf.c b/kernel/bpf/ringbuf.c index 8983a46f6580..f25b719ac786 100644 --- a/kernel/bpf/ringbuf.c +++ b/kernel/bpf/ringbuf.c @@ -48,7 +48,6 @@ struct bpf_ringbuf { struct bpf_ringbuf_map { struct bpf_map map; - struct bpf_map_memory memory; struct bpf_ringbuf *rb; }; @@ -131,7 +130,7 @@ static struct bpf_ringbuf *bpf_ringbuf_alloc(size_t data_sz, int numa_node) rb = bpf_ringbuf_area_alloc(data_sz, numa_node); if (!rb) - return ERR_PTR(-ENOMEM); + return NULL; spin_lock_init(&rb->spinlock); init_waitqueue_head(&rb->waitq); @@ -147,8 +146,6 @@ static struct bpf_ringbuf *bpf_ringbuf_alloc(size_t data_sz, int numa_node) static struct bpf_map *ringbuf_map_alloc(union bpf_attr *attr) { struct bpf_ringbuf_map *rb_map; - u64 cost; - int err; if (attr->map_flags & ~RINGBUF_CREATE_FLAG_MASK) return ERR_PTR(-EINVAL); @@ -170,26 +167,13 @@ static struct bpf_map *ringbuf_map_alloc(union bpf_attr *attr) bpf_map_init_from_attr(&rb_map->map, attr); - cost = sizeof(struct bpf_ringbuf_map) + - sizeof(struct bpf_ringbuf) + - attr->max_entries; - err = bpf_map_charge_init(&rb_map->map.memory, cost); - if (err) - goto err_free_map; - rb_map->rb = bpf_ringbuf_alloc(attr->max_entries, rb_map->map.numa_node); - if (IS_ERR(rb_map->rb)) { - err = PTR_ERR(rb_map->rb); - goto err_uncharge; + if (!rb_map->rb) { + kfree(rb_map); + return ERR_PTR(-ENOMEM); } return &rb_map->map; - -err_uncharge: - bpf_map_charge_finish(&rb_map->map.memory); -err_free_map: - kfree(rb_map); - return ERR_PTR(err); } static void bpf_ringbuf_free(struct bpf_ringbuf *rb) -- cgit v1.2.3 From 370868107bf6624cc104038bf38be2ca153eeb2e Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Tue, 1 Dec 2020 13:58:55 -0800 Subject: bpf: Eliminate rlimit-based memory accounting for stackmap maps Do not use rlimit-based memory accounting for stackmap maps. It has been replaced with the memcg-based memory accounting. 
Signed-off-by: Roman Gushchin Signed-off-by: Alexei Starovoitov Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20201201215900.3569844-30-guro@fb.com --- kernel/bpf/stackmap.c | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index 06065fa27124..3325add8e629 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -90,7 +90,6 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr) { u32 value_size = attr->value_size; struct bpf_stack_map *smap; - struct bpf_map_memory mem; u64 cost, n_buckets; int err; @@ -119,15 +118,9 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr) cost = n_buckets * sizeof(struct stack_map_bucket *) + sizeof(*smap); cost += n_buckets * (value_size + sizeof(struct stack_map_bucket)); - err = bpf_map_charge_init(&mem, cost); - if (err) - return ERR_PTR(err); - smap = bpf_map_area_alloc(cost, bpf_map_attr_numa_node(attr)); - if (!smap) { - bpf_map_charge_finish(&mem); + if (!smap) return ERR_PTR(-ENOMEM); - } bpf_map_init_from_attr(&smap->map, attr); smap->map.value_size = value_size; @@ -135,20 +128,17 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr) err = get_callchain_buffers(sysctl_perf_event_max_stack); if (err) - goto free_charge; + goto free_smap; err = prealloc_elems_and_freelist(smap); if (err) goto put_buffers; - bpf_map_charge_move(&smap->map.memory, &mem); - return &smap->map; put_buffers: put_callchain_buffers(); -free_charge: - bpf_map_charge_finish(&mem); +free_smap: bpf_map_area_free(smap); return ERR_PTR(err); } -- cgit v1.2.3 From ab31be378a63a8bc1868c9890d28b0206f81396e Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Tue, 1 Dec 2020 13:58:57 -0800 Subject: bpf: Eliminate rlimit-based memory accounting for bpf local storage maps Do not use rlimit-based memory accounting for bpf local storage maps. It has been replaced with the memcg-based memory accounting. 
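Taken together with the matching conversions elsewhere in the tree, these patches mean RLIMIT_MEMLOCK no longer gates BPF map creation, so the customary rlimit bump that loaders perform before creating maps becomes unnecessary on kernels carrying the full series; it stays harmless and is still required on older kernels. A sketch of that now-optional step (bump_memlock_rlimit() is an illustrative name):

#include <sys/resource.h>

/* Historically needed so map creation did not fail the memlock charge;
 * with memcg-based accounting this no longer matters to BPF.
 */
static int bump_memlock_rlimit(void)
{
        struct rlimit rl = { RLIM_INFINITY, RLIM_INFINITY };

        return setrlimit(RLIMIT_MEMLOCK, &rl);
}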
Signed-off-by: Roman Gushchin Signed-off-by: Alexei Starovoitov Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20201201215900.3569844-32-guro@fb.com --- kernel/bpf/bpf_local_storage.c | 10 ---------- 1 file changed, 10 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c index 023a3eaa4d74..dd5aedee99e7 100644 --- a/kernel/bpf/bpf_local_storage.c +++ b/kernel/bpf/bpf_local_storage.c @@ -545,8 +545,6 @@ struct bpf_local_storage_map *bpf_local_storage_map_alloc(union bpf_attr *attr) struct bpf_local_storage_map *smap; unsigned int i; u32 nbuckets; - u64 cost; - int ret; smap = kzalloc(sizeof(*smap), GFP_USER | __GFP_NOWARN | __GFP_ACCOUNT); if (!smap) @@ -557,18 +555,10 @@ struct bpf_local_storage_map *bpf_local_storage_map_alloc(union bpf_attr *attr) /* Use at least 2 buckets, select_bucket() is undefined behavior with 1 bucket */ nbuckets = max_t(u32, 2, nbuckets); smap->bucket_log = ilog2(nbuckets); - cost = sizeof(*smap->buckets) * nbuckets + sizeof(*smap); - - ret = bpf_map_charge_init(&smap->map.memory, cost); - if (ret < 0) { - kfree(smap); - return ERR_PTR(ret); - } smap->buckets = kvcalloc(sizeof(*smap->buckets), nbuckets, GFP_USER | __GFP_NOWARN | __GFP_ACCOUNT); if (!smap->buckets) { - bpf_map_charge_finish(&smap->map.memory); kfree(smap); return ERR_PTR(-ENOMEM); } -- cgit v1.2.3 From 80ee81e0403c48f4eb342f7c8d40477c89b8836a Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Tue, 1 Dec 2020 13:58:58 -0800 Subject: bpf: Eliminate rlimit-based memory accounting infra for bpf maps Remove rlimit-based accounting infrastructure code, which is not used anymore. To provide a backward compatibility, use an approximation of the bpf map memory footprint as a "memlock" value, available to a user via map info. The approximation is based on the maximal number of elements and key and value sizes. Signed-off-by: Roman Gushchin Signed-off-by: Alexei Starovoitov Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20201201215900.3569844-33-guro@fb.com --- include/linux/bpf.h | 12 --- kernel/bpf/syscall.c | 96 ++++------------------ .../testing/selftests/bpf/progs/bpf_iter_bpf_map.c | 2 +- tools/testing/selftests/bpf/progs/map_ptr_kern.c | 7 -- 4 files changed, 17 insertions(+), 100 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index e1f2c95c15ec..61331a148cde 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -138,11 +138,6 @@ struct bpf_map_ops { const struct bpf_iter_seq_info *iter_seq_info; }; -struct bpf_map_memory { - u32 pages; - struct user_struct *user; -}; - struct bpf_map { /* The first two cachelines with read-mostly members of which some * are also accessed in fast-path (e.g. ops, max_entries). 
@@ -163,7 +158,6 @@ struct bpf_map { u32 btf_key_type_id; u32 btf_value_type_id; struct btf *btf; - struct bpf_map_memory memory; #ifdef CONFIG_MEMCG_KMEM struct mem_cgroup *memcg; #endif @@ -1224,12 +1218,6 @@ void bpf_map_inc_with_uref(struct bpf_map *map); struct bpf_map * __must_check bpf_map_inc_not_zero(struct bpf_map *map); void bpf_map_put_with_uref(struct bpf_map *map); void bpf_map_put(struct bpf_map *map); -int bpf_map_charge_memlock(struct bpf_map *map, u32 pages); -void bpf_map_uncharge_memlock(struct bpf_map *map, u32 pages); -int bpf_map_charge_init(struct bpf_map_memory *mem, u64 size); -void bpf_map_charge_finish(struct bpf_map_memory *mem); -void bpf_map_charge_move(struct bpf_map_memory *dst, - struct bpf_map_memory *src); void *bpf_map_area_alloc(u64 size, int numa_node); void *bpf_map_area_mmapable_alloc(u64 size, int numa_node); void bpf_map_area_free(void *base); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index dff3a5f62d7a..29096d96d989 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -128,7 +128,7 @@ static struct bpf_map *find_and_alloc_map(union bpf_attr *attr) return map; } -static u32 bpf_map_value_size(struct bpf_map *map) +static u32 bpf_map_value_size(const struct bpf_map *map) { if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH || @@ -346,77 +346,6 @@ void bpf_map_init_from_attr(struct bpf_map *map, union bpf_attr *attr) map->numa_node = bpf_map_attr_numa_node(attr); } -static int bpf_charge_memlock(struct user_struct *user, u32 pages) -{ - unsigned long memlock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; - - if (atomic_long_add_return(pages, &user->locked_vm) > memlock_limit) { - atomic_long_sub(pages, &user->locked_vm); - return -EPERM; - } - return 0; -} - -static void bpf_uncharge_memlock(struct user_struct *user, u32 pages) -{ - if (user) - atomic_long_sub(pages, &user->locked_vm); -} - -int bpf_map_charge_init(struct bpf_map_memory *mem, u64 size) -{ - u32 pages = round_up(size, PAGE_SIZE) >> PAGE_SHIFT; - struct user_struct *user; - int ret; - - if (size >= U32_MAX - PAGE_SIZE) - return -E2BIG; - - user = get_current_user(); - ret = bpf_charge_memlock(user, pages); - if (ret) { - free_uid(user); - return ret; - } - - mem->pages = pages; - mem->user = user; - - return 0; -} - -void bpf_map_charge_finish(struct bpf_map_memory *mem) -{ - bpf_uncharge_memlock(mem->user, mem->pages); - free_uid(mem->user); -} - -void bpf_map_charge_move(struct bpf_map_memory *dst, - struct bpf_map_memory *src) -{ - *dst = *src; - - /* Make sure src will not be used for the redundant uncharging. 
*/ - memset(src, 0, sizeof(struct bpf_map_memory)); -} - -int bpf_map_charge_memlock(struct bpf_map *map, u32 pages) -{ - int ret; - - ret = bpf_charge_memlock(map->memory.user, pages); - if (ret) - return ret; - map->memory.pages += pages; - return ret; -} - -void bpf_map_uncharge_memlock(struct bpf_map *map, u32 pages) -{ - bpf_uncharge_memlock(map->memory.user, pages); - map->memory.pages -= pages; -} - static int bpf_map_alloc_id(struct bpf_map *map) { int id; @@ -524,14 +453,11 @@ static void bpf_map_release_memcg(struct bpf_map *map) static void bpf_map_free_deferred(struct work_struct *work) { struct bpf_map *map = container_of(work, struct bpf_map, work); - struct bpf_map_memory mem; - bpf_map_charge_move(&mem, &map->memory); security_bpf_map_free(map); bpf_map_release_memcg(map); /* implementation dependent freeing */ map->ops->map_free(map); - bpf_map_charge_finish(&mem); } static void bpf_map_put_uref(struct bpf_map *map) @@ -592,6 +518,19 @@ static fmode_t map_get_sys_perms(struct bpf_map *map, struct fd f) } #ifdef CONFIG_PROC_FS +/* Provides an approximation of the map's memory footprint. + * Used only to provide a backward compatibility and display + * a reasonable "memlock" info. + */ +static unsigned long bpf_map_memory_footprint(const struct bpf_map *map) +{ + unsigned long size; + + size = round_up(map->key_size + bpf_map_value_size(map), 8); + + return round_up(map->max_entries * size, PAGE_SIZE); +} + static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp) { const struct bpf_map *map = filp->private_data; @@ -610,7 +549,7 @@ static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp) "value_size:\t%u\n" "max_entries:\t%u\n" "map_flags:\t%#x\n" - "memlock:\t%llu\n" + "memlock:\t%lu\n" "map_id:\t%u\n" "frozen:\t%u\n", map->map_type, @@ -618,7 +557,7 @@ static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp) map->value_size, map->max_entries, map->map_flags, - map->memory.pages * 1ULL << PAGE_SHIFT, + bpf_map_memory_footprint(map), map->id, READ_ONCE(map->frozen)); if (type) { @@ -861,7 +800,6 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf, static int map_create(union bpf_attr *attr) { int numa_node = bpf_map_attr_numa_node(attr); - struct bpf_map_memory mem; struct bpf_map *map; int f_flags; int err; @@ -960,9 +898,7 @@ free_map_sec: security_bpf_map_free(map); free_map: btf_put(map->btf); - bpf_map_charge_move(&mem, &map->memory); map->ops->map_free(map); - bpf_map_charge_finish(&mem); return err; } diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_bpf_map.c b/tools/testing/selftests/bpf/progs/bpf_iter_bpf_map.c index 08651b23edba..b83b5d2e17dc 100644 --- a/tools/testing/selftests/bpf/progs/bpf_iter_bpf_map.c +++ b/tools/testing/selftests/bpf/progs/bpf_iter_bpf_map.c @@ -23,6 +23,6 @@ int dump_bpf_map(struct bpf_iter__bpf_map *ctx) BPF_SEQ_PRINTF(seq, "%8u %8ld %8ld %10lu\n", map->id, map->refcnt.counter, map->usercnt.counter, - map->memory.user->locked_vm.counter); + 0LLU); return 0; } diff --git a/tools/testing/selftests/bpf/progs/map_ptr_kern.c b/tools/testing/selftests/bpf/progs/map_ptr_kern.c index c325405751e2..d8850bc6a9f1 100644 --- a/tools/testing/selftests/bpf/progs/map_ptr_kern.c +++ b/tools/testing/selftests/bpf/progs/map_ptr_kern.c @@ -26,17 +26,12 @@ __u32 g_line = 0; return 0; \ }) -struct bpf_map_memory { - __u32 pages; -} __attribute__((preserve_access_index)); - struct bpf_map { enum bpf_map_type map_type; __u32 key_size; __u32 value_size; __u32 max_entries; __u32 id; - struct 
bpf_map_memory memory; } __attribute__((preserve_access_index)); static inline int check_bpf_map_fields(struct bpf_map *map, __u32 key_size, @@ -47,7 +42,6 @@ static inline int check_bpf_map_fields(struct bpf_map *map, __u32 key_size, VERIFY(map->value_size == value_size); VERIFY(map->max_entries == max_entries); VERIFY(map->id > 0); - VERIFY(map->memory.pages > 0); return 1; } @@ -60,7 +54,6 @@ static inline int check_bpf_map_ptr(struct bpf_map *indirect, VERIFY(indirect->value_size == direct->value_size); VERIFY(indirect->max_entries == direct->max_entries); VERIFY(indirect->id == direct->id); - VERIFY(indirect->memory.pages == direct->memory.pages); return 1; } -- cgit v1.2.3 From 3ac1f01b43b6e2759cc34d3a715ba5eed04c5805 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Tue, 1 Dec 2020 13:58:59 -0800 Subject: bpf: Eliminate rlimit-based memory accounting for bpf progs Do not use rlimit-based memory accounting for bpf progs. It has been replaced with memcg-based memory accounting. Signed-off-by: Roman Gushchin Signed-off-by: Alexei Starovoitov Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20201201215900.3569844-34-guro@fb.com --- include/linux/bpf.h | 11 --------- kernel/bpf/core.c | 12 ++------- kernel/bpf/syscall.c | 69 ++++++++-------------------------------------------- 3 files changed, 12 insertions(+), 80 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 61331a148cde..a9de5711b23f 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1202,8 +1202,6 @@ void bpf_prog_sub(struct bpf_prog *prog, int i); void bpf_prog_inc(struct bpf_prog *prog); struct bpf_prog * __must_check bpf_prog_inc_not_zero(struct bpf_prog *prog); void bpf_prog_put(struct bpf_prog *prog); -int __bpf_prog_charge(struct user_struct *user, u32 pages); -void __bpf_prog_uncharge(struct user_struct *user, u32 pages); void __bpf_free_used_maps(struct bpf_prog_aux *aux, struct bpf_map **used_maps, u32 len); @@ -1512,15 +1510,6 @@ bpf_prog_inc_not_zero(struct bpf_prog *prog) return ERR_PTR(-EOPNOTSUPP); } -static inline int __bpf_prog_charge(struct user_struct *user, u32 pages) -{ - return 0; -} - -static inline void __bpf_prog_uncharge(struct user_struct *user, u32 pages) -{ -} - static inline void bpf_link_init(struct bpf_link *link, enum bpf_link_type type, const struct bpf_link_ops *ops, struct bpf_prog *prog) diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 2921f58c03a8..261f8692d0d2 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -221,23 +221,15 @@ struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size, { gfp_t gfp_flags = GFP_KERNEL_ACCOUNT | __GFP_ZERO | gfp_extra_flags; struct bpf_prog *fp; - u32 pages, delta; - int ret; + u32 pages; size = round_up(size, PAGE_SIZE); pages = size / PAGE_SIZE; if (pages <= fp_old->pages) return fp_old; - delta = pages - fp_old->pages; - ret = __bpf_prog_charge(fp_old->aux->user, delta); - if (ret) - return NULL; - fp = __vmalloc(size, gfp_flags); - if (fp == NULL) { - __bpf_prog_uncharge(fp_old->aux->user, delta); - } else { + if (fp) { memcpy(fp, fp_old, fp_old->pages * PAGE_SIZE); fp->pages = pages; fp->aux->prog = fp; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 29096d96d989..d16dd4945100 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1632,51 +1632,6 @@ static void bpf_audit_prog(const struct bpf_prog *prog, unsigned int op) audit_log_end(ab); } -int __bpf_prog_charge(struct user_struct *user, u32 pages) -{ - unsigned long memlock_limit = 
rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; - unsigned long user_bufs; - - if (user) { - user_bufs = atomic_long_add_return(pages, &user->locked_vm); - if (user_bufs > memlock_limit) { - atomic_long_sub(pages, &user->locked_vm); - return -EPERM; - } - } - - return 0; -} - -void __bpf_prog_uncharge(struct user_struct *user, u32 pages) -{ - if (user) - atomic_long_sub(pages, &user->locked_vm); -} - -static int bpf_prog_charge_memlock(struct bpf_prog *prog) -{ - struct user_struct *user = get_current_user(); - int ret; - - ret = __bpf_prog_charge(user, prog->pages); - if (ret) { - free_uid(user); - return ret; - } - - prog->aux->user = user; - return 0; -} - -static void bpf_prog_uncharge_memlock(struct bpf_prog *prog) -{ - struct user_struct *user = prog->aux->user; - - __bpf_prog_uncharge(user, prog->pages); - free_uid(user); -} - static int bpf_prog_alloc_id(struct bpf_prog *prog) { int id; @@ -1726,7 +1681,7 @@ static void __bpf_prog_put_rcu(struct rcu_head *rcu) kvfree(aux->func_info); kfree(aux->func_info_aux); - bpf_prog_uncharge_memlock(aux->prog); + free_uid(aux->user); security_bpf_prog_free(aux); bpf_prog_free(aux->prog); } @@ -2164,7 +2119,7 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr) dst_prog = bpf_prog_get(attr->attach_prog_fd); if (IS_ERR(dst_prog)) { err = PTR_ERR(dst_prog); - goto free_prog_nouncharge; + goto free_prog; } prog->aux->dst_prog = dst_prog; } @@ -2174,18 +2129,15 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr) err = security_bpf_prog_alloc(prog->aux); if (err) - goto free_prog_nouncharge; - - err = bpf_prog_charge_memlock(prog); - if (err) - goto free_prog_sec; + goto free_prog; + prog->aux->user = get_current_user(); prog->len = attr->insn_cnt; err = -EFAULT; if (copy_from_user(prog->insns, u64_to_user_ptr(attr->insns), bpf_prog_insn_size(prog)) != 0) - goto free_prog; + goto free_prog_sec; prog->orig_prog = NULL; prog->jited = 0; @@ -2196,19 +2148,19 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr) if (bpf_prog_is_dev_bound(prog->aux)) { err = bpf_prog_offload_init(prog, attr); if (err) - goto free_prog; + goto free_prog_sec; } /* find program type: socket_filter vs tracing_filter */ err = find_prog_type(type, prog); if (err < 0) - goto free_prog; + goto free_prog_sec; prog->aux->load_time = ktime_get_boottime_ns(); err = bpf_obj_name_cpy(prog->aux->name, attr->prog_name, sizeof(attr->prog_name)); if (err < 0) - goto free_prog; + goto free_prog_sec; /* run eBPF verifier */ err = bpf_check(&prog, attr, uattr); @@ -2253,11 +2205,10 @@ free_used_maps: */ __bpf_prog_put_noref(prog, prog->aux->func_cnt); return err; -free_prog: - bpf_prog_uncharge_memlock(prog); free_prog_sec: + free_uid(prog->aux->user); security_bpf_prog_free(prog->aux); -free_prog_nouncharge: +free_prog: bpf_prog_free(prog); return err; } -- cgit v1.2.3 From 6b3211842a115d697fbf78d09f3e83852200e413 Mon Sep 17 00:00:00 2001 From: Yejune Deng Date: Mon, 30 Nov 2020 16:35:45 +0800 Subject: audit: replace atomic_add_return() atomic_inc_return() is a little neater Signed-off-by: Yejune Deng Signed-off-by: Paul Moore --- kernel/audit.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index e22f22bdc000..1ffc2e059027 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -1779,7 +1779,7 @@ unsigned int audit_serial(void) { static atomic_t serial = ATOMIC_INIT(0); - return atomic_add_return(1, &serial); + return atomic_inc_return(&serial); } 
static inline void audit_get_stamp(struct audit_context *ctx, -- cgit v1.2.3 From 8af26be062721e52eba1550caf50b712f774c5fd Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 11 Nov 2020 13:43:57 +0100 Subject: perf/core: Fix arch_perf_get_page_size() The (new) page-table walker in arch_perf_get_page_size() is broken in various ways. Specifically while it is used in a lockless manner, it doesn't depend on CONFIG_HAVE_FAST_GUP nor uses the proper _lockless offset methods, nor is careful to only read each entry only once. Also the hugetlb support is broken due to calling pte_page() without first checking pte_special(). Rewrite the whole thing to be a proper lockless page-table walker and employ the new pXX_leaf_size() pgtable functions to determine the pagetable size without looking at the page-frames. Fixes: 51b646b2d9f8 ("perf,mm: Handle non-page-table-aligned hugetlbfs") Fixes: 8d97e71811aa ("perf/core: Add PERF_SAMPLE_DATA_PAGE_SIZE") Signed-off-by: Peter Zijlstra (Intel) Tested-by: Kan Liang Link: https://lkml.kernel.org/r/20201126124207.GM3040@hirez.programming.kicks-ass.net --- kernel/events/core.c | 103 +++++++++++++++++++-------------------------------- 1 file changed, 38 insertions(+), 65 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index d2f3ca792936..a21b0be2f22c 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -52,6 +52,7 @@ #include #include #include +#include #include "internal.h" @@ -7001,90 +7002,62 @@ static u64 perf_virt_to_phys(u64 virt) return phys_addr; } -#ifdef CONFIG_MMU - /* - * Return the MMU page size of a given virtual address. - * - * This generic implementation handles page-table aligned huge pages, as well - * as non-page-table aligned hugetlbfs compound pages. - * - * If an architecture supports and uses non-page-table aligned pages in their - * kernel mapping it will need to provide it's own implementation of this - * function. + * Return the pagetable size of a given virtual address. 
*/ -__weak u64 arch_perf_get_page_size(struct mm_struct *mm, unsigned long addr) +static u64 perf_get_pgtable_size(struct mm_struct *mm, unsigned long addr) { - struct page *page; - pgd_t *pgd; - p4d_t *p4d; - pud_t *pud; - pmd_t *pmd; - pte_t *pte; + u64 size = 0; - pgd = pgd_offset(mm, addr); - if (pgd_none(*pgd)) - return 0; +#ifdef CONFIG_HAVE_FAST_GUP + pgd_t *pgdp, pgd; + p4d_t *p4dp, p4d; + pud_t *pudp, pud; + pmd_t *pmdp, pmd; + pte_t *ptep, pte; - p4d = p4d_offset(pgd, addr); - if (!p4d_present(*p4d)) + pgdp = pgd_offset(mm, addr); + pgd = READ_ONCE(*pgdp); + if (pgd_none(pgd)) return 0; - if (p4d_leaf(*p4d)) - return 1ULL << P4D_SHIFT; + if (pgd_leaf(pgd)) + return pgd_leaf_size(pgd); - pud = pud_offset(p4d, addr); - if (!pud_present(*pud)) + p4dp = p4d_offset_lockless(pgdp, pgd, addr); + p4d = READ_ONCE(*p4dp); + if (!p4d_present(p4d)) return 0; - if (pud_leaf(*pud)) { -#ifdef pud_page - page = pud_page(*pud); - if (PageHuge(page)) - return page_size(compound_head(page)); -#endif - return 1ULL << PUD_SHIFT; - } + if (p4d_leaf(p4d)) + return p4d_leaf_size(p4d); - pmd = pmd_offset(pud, addr); - if (!pmd_present(*pmd)) + pudp = pud_offset_lockless(p4dp, p4d, addr); + pud = READ_ONCE(*pudp); + if (!pud_present(pud)) return 0; - if (pmd_leaf(*pmd)) { -#ifdef pmd_page - page = pmd_page(*pmd); - if (PageHuge(page)) - return page_size(compound_head(page)); -#endif - return 1ULL << PMD_SHIFT; - } + if (pud_leaf(pud)) + return pud_leaf_size(pud); - pte = pte_offset_map(pmd, addr); - if (!pte_present(*pte)) { - pte_unmap(pte); + pmdp = pmd_offset_lockless(pudp, pud, addr); + pmd = READ_ONCE(*pmdp); + if (!pmd_present(pmd)) return 0; - } - page = pte_page(*pte); - if (PageHuge(page)) { - u64 size = page_size(compound_head(page)); - pte_unmap(pte); - return size; - } + if (pmd_leaf(pmd)) + return pmd_leaf_size(pmd); - pte_unmap(pte); - return PAGE_SIZE; -} + ptep = pte_offset_map(&pmd, addr); + pte = ptep_get_lockless(ptep); + if (pte_present(pte)) + size = pte_leaf_size(pte); + pte_unmap(ptep); +#endif /* CONFIG_HAVE_FAST_GUP */ -#else - -static u64 arch_perf_get_page_size(struct mm_struct *mm, unsigned long addr) -{ - return 0; + return size; } -#endif - static u64 perf_get_page_size(unsigned long addr) { struct mm_struct *mm; @@ -7109,7 +7082,7 @@ static u64 perf_get_page_size(unsigned long addr) mm = &init_mm; } - size = arch_perf_get_page_size(mm, addr); + size = perf_get_pgtable_size(mm, addr); local_irq_restore(flags); -- cgit v1.2.3 From 950cc0d2bef078e1f6459900ca4d4b2a2e0e3c37 Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Wed, 2 Dec 2020 14:07:07 +0200 Subject: fsnotify: generalize handle_inode_event() The handle_inode_event() interface was added as (quoting comment): "a simple variant of handle_event() for groups that only have inode marks and don't have ignore mask". In other words, all backends except fanotify. The inotify backend also falls under this category, but because it required extra arguments it was left out of the initial pass of backends conversion to the simple interface. This results in code duplication between the generic helper fsnotify_handle_event() and the inotify_handle_event() callback which also happen to be buggy code. Generalize the handle_inode_event() arguments and add the check for FS_EXCL_UNLINK flag to the generic helper, so inotify backend could be converted to use the simple interface. 
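With the generalized signature, a backend that only needs inode marks implements a single callback and can ignore the arguments it does not use; the sketch below is hypothetical (example_handle_inode_event() is not from the patch) and shows that the added u32 cookie simply rides along for backends that do not care about the inotify rename cookie:

#include <linux/fsnotify_backend.h>

/* Minimal hypothetical backend callback using the generalized signature. */
static int example_handle_inode_event(struct fsnotify_mark *mark, u32 mask,
                                      struct inode *inode, struct inode *dir,
                                      const struct qstr *file_name, u32 cookie)
{
        if (inode && (mask & FS_MODIFY))
                pr_debug("inode %lu modified\n", inode->i_ino);

        return 0;
}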
Link: https://lore.kernel.org/r/20201202120713.702387-2-amir73il@gmail.com CC: stable@vger.kernel.org Fixes: b9a1b9772509 ("fsnotify: create method handle_inode_event() in fsnotify_operations") Signed-off-by: Amir Goldstein Signed-off-by: Jan Kara --- fs/nfsd/filecache.c | 2 +- fs/notify/dnotify/dnotify.c | 2 +- fs/notify/fsnotify.c | 31 ++++++++++++++++++++++++------- include/linux/fsnotify_backend.h | 3 ++- kernel/audit_fsnotify.c | 2 +- kernel/audit_tree.c | 2 +- kernel/audit_watch.c | 2 +- 7 files changed, 31 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c index 3c6c2f7d1688..5849c1bd88f1 100644 --- a/fs/nfsd/filecache.c +++ b/fs/nfsd/filecache.c @@ -600,7 +600,7 @@ static struct notifier_block nfsd_file_lease_notifier = { static int nfsd_file_fsnotify_handle_event(struct fsnotify_mark *mark, u32 mask, struct inode *inode, struct inode *dir, - const struct qstr *name) + const struct qstr *name, u32 cookie) { trace_nfsd_file_fsnotify_handle_event(inode, mask); diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c index 5dcda8f20c04..e45ca6ecba95 100644 --- a/fs/notify/dnotify/dnotify.c +++ b/fs/notify/dnotify/dnotify.c @@ -72,7 +72,7 @@ static void dnotify_recalc_inode_mask(struct fsnotify_mark *fsn_mark) */ static int dnotify_handle_event(struct fsnotify_mark *inode_mark, u32 mask, struct inode *inode, struct inode *dir, - const struct qstr *name) + const struct qstr *name, u32 cookie) { struct dnotify_mark *dn_mark; struct dnotify_struct *dn; diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c index 8d3ad5ef2925..c5c68bcbaadf 100644 --- a/fs/notify/fsnotify.c +++ b/fs/notify/fsnotify.c @@ -232,6 +232,26 @@ notify: } EXPORT_SYMBOL_GPL(__fsnotify_parent); +static int fsnotify_handle_inode_event(struct fsnotify_group *group, + struct fsnotify_mark *inode_mark, + u32 mask, const void *data, int data_type, + struct inode *dir, const struct qstr *name, + u32 cookie) +{ + const struct path *path = fsnotify_data_path(data, data_type); + struct inode *inode = fsnotify_data_inode(data, data_type); + const struct fsnotify_ops *ops = group->ops; + + if (WARN_ON_ONCE(!ops->handle_inode_event)) + return 0; + + if ((inode_mark->mask & FS_EXCL_UNLINK) && + path && d_unlinked(path->dentry)) + return 0; + + return ops->handle_inode_event(inode_mark, mask, inode, dir, name, cookie); +} + static int fsnotify_handle_event(struct fsnotify_group *group, __u32 mask, const void *data, int data_type, struct inode *dir, const struct qstr *name, @@ -239,13 +259,8 @@ static int fsnotify_handle_event(struct fsnotify_group *group, __u32 mask, { struct fsnotify_mark *inode_mark = fsnotify_iter_inode_mark(iter_info); struct fsnotify_mark *child_mark = fsnotify_iter_child_mark(iter_info); - struct inode *inode = fsnotify_data_inode(data, data_type); - const struct fsnotify_ops *ops = group->ops; int ret; - if (WARN_ON_ONCE(!ops->handle_inode_event)) - return 0; - if (WARN_ON_ONCE(fsnotify_iter_sb_mark(iter_info)) || WARN_ON_ONCE(fsnotify_iter_vfsmount_mark(iter_info))) return 0; @@ -262,7 +277,8 @@ static int fsnotify_handle_event(struct fsnotify_group *group, __u32 mask, name = NULL; } - ret = ops->handle_inode_event(inode_mark, mask, inode, dir, name); + ret = fsnotify_handle_inode_event(group, inode_mark, mask, data, data_type, + dir, name, cookie); if (ret || !child_mark) return ret; @@ -272,7 +288,8 @@ static int fsnotify_handle_event(struct fsnotify_group *group, __u32 mask, * report the event once to parent dir with name and once 
to child * without name. */ - return ops->handle_inode_event(child_mark, mask, inode, NULL, NULL); + return fsnotify_handle_inode_event(group, child_mark, mask, data, data_type, + NULL, NULL, 0); } static int send_to_group(__u32 mask, const void *data, int data_type, diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h index f8529a3a2923..4ee3044eedd0 100644 --- a/include/linux/fsnotify_backend.h +++ b/include/linux/fsnotify_backend.h @@ -137,6 +137,7 @@ struct mem_cgroup; * if @file_name is not NULL, this is the directory that * @file_name is relative to. * @file_name: optional file name associated with event + * @cookie: inotify rename cookie * * free_group_priv - called when a group refcnt hits 0 to clean up the private union * freeing_mark - called when a mark is being destroyed for some reason. The group @@ -151,7 +152,7 @@ struct fsnotify_ops { struct fsnotify_iter_info *iter_info); int (*handle_inode_event)(struct fsnotify_mark *mark, u32 mask, struct inode *inode, struct inode *dir, - const struct qstr *file_name); + const struct qstr *file_name, u32 cookie); void (*free_group_priv)(struct fsnotify_group *group); void (*freeing_mark)(struct fsnotify_mark *mark, struct fsnotify_group *group); void (*free_event)(struct fsnotify_event *event); diff --git a/kernel/audit_fsnotify.c b/kernel/audit_fsnotify.c index bfcfcd61adb6..5b3f01da172b 100644 --- a/kernel/audit_fsnotify.c +++ b/kernel/audit_fsnotify.c @@ -154,7 +154,7 @@ static void audit_autoremove_mark_rule(struct audit_fsnotify_mark *audit_mark) /* Update mark data in audit rules based on fsnotify events. */ static int audit_mark_handle_event(struct fsnotify_mark *inode_mark, u32 mask, struct inode *inode, struct inode *dir, - const struct qstr *dname) + const struct qstr *dname, u32 cookie) { struct audit_fsnotify_mark *audit_mark; diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index 83e1c07fc99e..6c91902f4f45 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -1037,7 +1037,7 @@ static void evict_chunk(struct audit_chunk *chunk) static int audit_tree_handle_event(struct fsnotify_mark *mark, u32 mask, struct inode *inode, struct inode *dir, - const struct qstr *file_name) + const struct qstr *file_name, u32 cookie) { return 0; } diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index 246e5ba704c0..2acf7ca49154 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -466,7 +466,7 @@ void audit_remove_watch_rule(struct audit_krule *krule) /* Update watch data in audit rules based on fsnotify events. */ static int audit_watch_handle_event(struct fsnotify_mark *inode_mark, u32 mask, struct inode *inode, struct inode *dir, - const struct qstr *dname) + const struct qstr *dname, u32 cookie) { struct audit_parent *parent; -- cgit v1.2.3 From 12cc126df82c96c89706aa207ad27c56f219047c Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 3 Dec 2020 12:46:21 -0800 Subject: bpf: Fix bpf_put_raw_tracepoint()'s use of __module_address() __module_address() needs to be called with preemption disabled or with module_mutex taken. preempt_disable() is enough for read-only uses, which is what this fix does. Also, module_put() does internal check for NULL, so drop it as well. 
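For illustration only, a minimal sketch of the locking rule this fix relies on: a read-only __module_address() lookup must keep preemption disabled (or hold module_mutex) so the owning module cannot be unloaded during the walk. The helper name lookup_owning_module() is invented for this example and is not part of the kernel API.

#include <linux/module.h>
#include <linux/preempt.h>

/*
 * Illustrative sketch only: resolve the module that owns an address
 * without taking module_mutex. Preemption stays disabled across the
 * read-only lookup so the module cannot be unloaded underneath us;
 * try_module_get() then pins it before preemption is re-enabled.
 */
static struct module *lookup_owning_module(unsigned long addr)
{
	struct module *mod;

	preempt_disable();
	mod = __module_address(addr);	/* NULL for core kernel text */
	if (mod && !try_module_get(mod))
		mod = NULL;
	preempt_enable();

	return mod;	/* caller drops the reference with module_put() */
}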
Fixes: a38d1107f937 ("bpf: support raw tracepoints in modules") Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20201203204634.1325171-2-andrii@kernel.org --- kernel/trace/bpf_trace.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index d255bc9b2bfa..23a390aac524 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -2060,10 +2060,12 @@ struct bpf_raw_event_map *bpf_get_raw_tracepoint(const char *name) void bpf_put_raw_tracepoint(struct bpf_raw_event_map *btp) { - struct module *mod = __module_address((unsigned long)btp); + struct module *mod; - if (mod) - module_put(mod); + preempt_disable(); + mod = __module_address((unsigned long)btp); + module_put(mod); + preempt_enable(); } static __always_inline -- cgit v1.2.3 From 2fe8890848c799515a881502339a0a7b2b555988 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 3 Dec 2020 12:46:22 -0800 Subject: bpf: Keep module's btf_data_size intact after load Having the real btf_data_size stored in struct module is beneficial for quickly determining which kernel modules have an associated BTF object and which don't. There is no harm in keeping this info, as opposed to keeping an invalid pointer. Fixes: 607c543f939d ("bpf: Sanitize BTF data pointer after module is loaded") Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20201203204634.1325171-3-andrii@kernel.org --- kernel/module.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 18f259d61d14..c3a9e972d3b2 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -3712,7 +3712,6 @@ static noinline int do_init_module(struct module *mod) #ifdef CONFIG_DEBUG_INFO_BTF_MODULES /* .BTF is not SHF_ALLOC and will get removed, so sanitize pointer */ mod->btf_data = NULL; - mod->btf_data_size = 0; #endif /* * We want to free module_init, but be aware that kallsyms may be -- cgit v1.2.3 From 22dc4a0f5ed11b6dc8fd73a0892fa0ea1a4c3cdf Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 3 Dec 2020 12:46:29 -0800 Subject: bpf: Remove hard-coded btf_vmlinux assumption from BPF verifier Remove the pervasive assumption throughout the BPF verifier that all BTF is vmlinux BTF. Instead, wherever BTF type IDs are involved, also track the instance of struct btf that goes along with the type ID. This allows us to gradually add support for kernel module BTFs and for using/tracking module types across BPF helper calls and registers. This patch also renames the btf_id() function to btf_obj_id() to minimize the naming clash with using btf_id to denote a BTF *type* ID, rather than a BTF *object*'s ID. Also, although btf_vmlinux can't get destroyed and thus doesn't need refcounting, module BTFs do need it, so apply BTF refcounting universally when a BPF program is using BTF-powered attachment (tp_btf, fentry/fexit, etc.). This makes for simpler cleanup code. Now that a BTF type ID is not enough to uniquely identify a BTF type, extend the BPF trampoline key to include the BTF object ID. To differentiate that from a target program's BPF ID, set the 31st bit of the type ID. BTF type IDs (at least currently) are not allowed to take the full 32 bits, so there is no danger of confusing that bit with a valid BTF type ID.
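As a reading aid, a sketch of the key layout described above, mirroring bpf_trampoline_compute_key() from the hunk below; the function name here is invented for illustration.

#include <linux/types.h>

/*
 * Sketch of the trampoline key layout introduced by this patch.
 * Upper 32 bits: target prog ID (prog attach) or BTF object ID (BTF attach).
 * Bit 31:        set only for BTF attach so the two key spaces never collide.
 * Bits 0-30:     BTF type ID, which currently always fits in 31 bits.
 */
static inline u64 example_trampoline_key(u32 tgt_prog_id, u32 btf_obj_id,
					 u32 btf_type_id)
{
	if (tgt_prog_id)
		return ((u64)tgt_prog_id << 32) | btf_type_id;
	return ((u64)btf_obj_id << 32) | 0x80000000 | btf_type_id;
}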
Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20201203204634.1325171-10-andrii@kernel.org --- include/linux/bpf.h | 13 +++++--- include/linux/bpf_verifier.h | 28 ++++++++++++---- include/linux/btf.h | 5 ++- kernel/bpf/btf.c | 65 +++++++++++++++++++++++++------------ kernel/bpf/syscall.c | 24 ++++++++++++-- kernel/bpf/verifier.c | 77 +++++++++++++++++++++++++++----------------- net/ipv4/bpf_tcp_ca.c | 3 +- 7 files changed, 148 insertions(+), 67 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index a9de5711b23f..d05e75ed8c1b 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -421,7 +421,10 @@ struct bpf_insn_access_aux { enum bpf_reg_type reg_type; union { int ctx_field_size; - u32 btf_id; + struct { + struct btf *btf; + u32 btf_id; + }; }; struct bpf_verifier_log *log; /* for verbose logs */ }; @@ -458,6 +461,7 @@ struct bpf_verifier_ops { struct bpf_insn *dst, struct bpf_prog *prog, u32 *target_size); int (*btf_struct_access)(struct bpf_verifier_log *log, + const struct btf *btf, const struct btf_type *t, int off, int size, enum bpf_access_type atype, u32 *next_btf_id); @@ -771,6 +775,7 @@ struct bpf_prog_aux { u32 ctx_arg_info_size; u32 max_rdonly_access; u32 max_rdwr_access; + struct btf *attach_btf; const struct bpf_ctx_arg_aux *ctx_arg_info; struct mutex dst_mutex; /* protects dst_* pointers below, *after* prog becomes visible */ struct bpf_prog *dst_prog; @@ -1005,7 +1010,6 @@ struct bpf_event_entry { bool bpf_prog_array_compatible(struct bpf_array *array, const struct bpf_prog *fp); int bpf_prog_calc_tag(struct bpf_prog *fp); -const char *kernel_type_name(u32 btf_type_id); const struct bpf_func_proto *bpf_get_trace_printk_proto(void); @@ -1450,12 +1454,13 @@ int bpf_prog_test_run_raw_tp(struct bpf_prog *prog, bool btf_ctx_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, struct bpf_insn_access_aux *info); -int btf_struct_access(struct bpf_verifier_log *log, +int btf_struct_access(struct bpf_verifier_log *log, const struct btf *btf, const struct btf_type *t, int off, int size, enum bpf_access_type atype, u32 *next_btf_id); bool btf_struct_ids_match(struct bpf_verifier_log *log, - int off, u32 id, u32 need_type_id); + const struct btf *btf, u32 id, int off, + const struct btf *need_btf, u32 need_type_id); int btf_distill_func_proto(struct bpf_verifier_log *log, struct btf *btf, diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 306869d4743b..e941fe1484e5 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -5,6 +5,7 @@ #define _LINUX_BPF_VERIFIER_H 1 #include /* for enum bpf_reg_type */ +#include /* for struct btf and btf_id() */ #include /* for MAX_BPF_STACK */ #include @@ -43,6 +44,8 @@ enum bpf_reg_liveness { struct bpf_reg_state { /* Ordering of fields matters. See states_equal() */ enum bpf_reg_type type; + /* Fixed part of pointer offset, pointer types only */ + s32 off; union { /* valid when type == PTR_TO_PACKET */ int range; @@ -52,15 +55,20 @@ struct bpf_reg_state { */ struct bpf_map *map_ptr; - u32 btf_id; /* for PTR_TO_BTF_ID */ + /* for PTR_TO_BTF_ID */ + struct { + struct btf *btf; + u32 btf_id; + }; u32 mem_size; /* for PTR_TO_MEM | PTR_TO_MEM_OR_NULL */ /* Max size from any of the above. 
*/ - unsigned long raw; + struct { + unsigned long raw1; + unsigned long raw2; + } raw; }; - /* Fixed part of pointer offset, pointer types only */ - s32 off; /* For PTR_TO_PACKET, used to find other pointers with the same variable * offset, so they can share range knowledge. * For PTR_TO_MAP_VALUE_OR_NULL this is used to share which map value we @@ -311,7 +319,10 @@ struct bpf_insn_aux_data { struct { enum bpf_reg_type reg_type; /* type of pseudo_btf_id */ union { - u32 btf_id; /* btf_id for struct typed var */ + struct { + struct btf *btf; + u32 btf_id; /* btf_id for struct typed var */ + }; u32 mem_size; /* mem_size for non-struct typed var */ }; } btf_var; @@ -459,9 +470,12 @@ int check_ctx_reg(struct bpf_verifier_env *env, /* this lives here instead of in bpf.h because it needs to dereference tgt_prog */ static inline u64 bpf_trampoline_compute_key(const struct bpf_prog *tgt_prog, - u32 btf_id) + struct btf *btf, u32 btf_id) { - return tgt_prog ? (((u64)tgt_prog->aux->id) << 32 | btf_id) : btf_id; + if (tgt_prog) + return ((u64)tgt_prog->aux->id << 32) | btf_id; + else + return ((u64)btf_obj_id(btf) << 32) | 0x80000000 | btf_id; } int bpf_check_attach_target(struct bpf_verifier_log *log, diff --git a/include/linux/btf.h b/include/linux/btf.h index 2bf641829664..fb608e4de076 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -18,6 +18,7 @@ struct btf_show; extern const struct file_operations btf_fops; +void btf_get(struct btf *btf); void btf_put(struct btf *btf); int btf_new_fd(const union bpf_attr *attr); struct btf *btf_get_by_fd(int fd); @@ -88,7 +89,7 @@ int btf_type_snprintf_show(const struct btf *btf, u32 type_id, void *obj, char *buf, int len, u64 flags); int btf_get_fd_by_id(u32 id); -u32 btf_id(const struct btf *btf); +u32 btf_obj_id(const struct btf *btf); bool btf_member_is_reg_int(const struct btf *btf, const struct btf_type *s, const struct btf_member *m, u32 expected_offset, u32 expected_size); @@ -206,6 +207,8 @@ static inline const struct btf_var_secinfo *btf_type_var_secinfo( } #ifdef CONFIG_BPF_SYSCALL +struct bpf_prog; + const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id); const char *btf_name_by_offset(const struct btf *btf, u32 offset); struct btf *btf_parse_vmlinux(void); diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 6b2d508b33d4..7a19bf5bfe97 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -1524,6 +1524,11 @@ static void btf_free_rcu(struct rcu_head *rcu) btf_free(btf); } +void btf_get(struct btf *btf) +{ + refcount_inc(&btf->refcnt); +} + void btf_put(struct btf *btf) { if (btf && refcount_dec_and_test(&btf->refcnt)) { @@ -4555,11 +4560,10 @@ struct btf *bpf_prog_get_target_btf(const struct bpf_prog *prog) { struct bpf_prog *tgt_prog = prog->aux->dst_prog; - if (tgt_prog) { + if (tgt_prog) return tgt_prog->aux->btf; - } else { - return btf_vmlinux; - } + else + return prog->aux->attach_btf; } static bool is_string_ptr(struct btf *btf, const struct btf_type *t) @@ -4700,6 +4704,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, if (ctx_arg_info->offset == off) { info->reg_type = ctx_arg_info->reg_type; + info->btf = btf_vmlinux; info->btf_id = ctx_arg_info->btf_id; return true; } @@ -4716,6 +4721,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, ret = btf_translate_to_vmlinux(log, btf, t, tgt_type, arg); if (ret > 0) { + info->btf = btf_vmlinux; info->btf_id = ret; return true; } else { @@ -4723,6 +4729,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, 
} } + info->btf = btf; info->btf_id = t->type; t = btf_type_by_id(btf, t->type); /* skip modifiers */ @@ -4749,7 +4756,7 @@ enum bpf_struct_walk_result { WALK_STRUCT, }; -static int btf_struct_walk(struct bpf_verifier_log *log, +static int btf_struct_walk(struct bpf_verifier_log *log, const struct btf *btf, const struct btf_type *t, int off, int size, u32 *next_btf_id) { @@ -4760,7 +4767,7 @@ static int btf_struct_walk(struct bpf_verifier_log *log, u32 vlen, elem_id, mid; again: - tname = __btf_name_by_offset(btf_vmlinux, t->name_off); + tname = __btf_name_by_offset(btf, t->name_off); if (!btf_type_is_struct(t)) { bpf_log(log, "Type '%s' is not a struct\n", tname); return -EINVAL; @@ -4777,7 +4784,7 @@ again: goto error; member = btf_type_member(t) + vlen - 1; - mtype = btf_type_skip_modifiers(btf_vmlinux, member->type, + mtype = btf_type_skip_modifiers(btf, member->type, NULL); if (!btf_type_is_array(mtype)) goto error; @@ -4793,7 +4800,7 @@ again: /* Only allow structure for now, can be relaxed for * other types later. */ - t = btf_type_skip_modifiers(btf_vmlinux, array_elem->type, + t = btf_type_skip_modifiers(btf, array_elem->type, NULL); if (!btf_type_is_struct(t)) goto error; @@ -4851,10 +4858,10 @@ error: /* type of the field */ mid = member->type; - mtype = btf_type_by_id(btf_vmlinux, member->type); - mname = __btf_name_by_offset(btf_vmlinux, member->name_off); + mtype = btf_type_by_id(btf, member->type); + mname = __btf_name_by_offset(btf, member->name_off); - mtype = __btf_resolve_size(btf_vmlinux, mtype, &msize, + mtype = __btf_resolve_size(btf, mtype, &msize, &elem_type, &elem_id, &total_nelems, &mid); if (IS_ERR(mtype)) { @@ -4949,7 +4956,7 @@ error: mname, moff, tname, off, size); return -EACCES; } - stype = btf_type_skip_modifiers(btf_vmlinux, mtype->type, &id); + stype = btf_type_skip_modifiers(btf, mtype->type, &id); if (btf_type_is_struct(stype)) { *next_btf_id = id; return WALK_PTR; @@ -4975,7 +4982,7 @@ error: return -EINVAL; } -int btf_struct_access(struct bpf_verifier_log *log, +int btf_struct_access(struct bpf_verifier_log *log, const struct btf *btf, const struct btf_type *t, int off, int size, enum bpf_access_type atype __maybe_unused, u32 *next_btf_id) @@ -4984,7 +4991,7 @@ int btf_struct_access(struct bpf_verifier_log *log, u32 id; do { - err = btf_struct_walk(log, t, off, size, &id); + err = btf_struct_walk(log, btf, t, off, size, &id); switch (err) { case WALK_PTR: @@ -5000,7 +5007,7 @@ int btf_struct_access(struct bpf_verifier_log *log, * by diving in it. At this point the offset is * aligned with the new type, so set it to 0. */ - t = btf_type_by_id(btf_vmlinux, id); + t = btf_type_by_id(btf, id); off = 0; break; default: @@ -5016,21 +5023,37 @@ int btf_struct_access(struct bpf_verifier_log *log, return -EINVAL; } +/* Check that two BTF types, each specified as an BTF object + id, are exactly + * the same. Trivial ID check is not enough due to module BTFs, because we can + * end up with two different module BTFs, but IDs point to the common type in + * vmlinux BTF. + */ +static bool btf_types_are_same(const struct btf *btf1, u32 id1, + const struct btf *btf2, u32 id2) +{ + if (id1 != id2) + return false; + if (btf1 == btf2) + return true; + return btf_type_by_id(btf1, id1) == btf_type_by_id(btf2, id2); +} + bool btf_struct_ids_match(struct bpf_verifier_log *log, - int off, u32 id, u32 need_type_id) + const struct btf *btf, u32 id, int off, + const struct btf *need_btf, u32 need_type_id) { const struct btf_type *type; int err; /* Are we already done? 
*/ - if (need_type_id == id && off == 0) + if (off == 0 && btf_types_are_same(btf, id, need_btf, need_type_id)) return true; again: - type = btf_type_by_id(btf_vmlinux, id); + type = btf_type_by_id(btf, id); if (!type) return false; - err = btf_struct_walk(log, type, off, 1, &id); + err = btf_struct_walk(log, btf, type, off, 1, &id); if (err != WALK_STRUCT) return false; @@ -5039,7 +5062,7 @@ again: * continue the search with offset 0 in the new * type. */ - if (need_type_id != id) { + if (!btf_types_are_same(btf, id, need_btf, need_type_id)) { off = 0; goto again; } @@ -5710,7 +5733,7 @@ int btf_get_fd_by_id(u32 id) return fd; } -u32 btf_id(const struct btf *btf) +u32 btf_obj_id(const struct btf *btf) { return btf->id; } diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index d16dd4945100..184204169949 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1691,6 +1691,8 @@ static void __bpf_prog_put_noref(struct bpf_prog *prog, bool deferred) bpf_prog_kallsyms_del_all(prog); btf_put(prog->aux->btf); bpf_prog_free_linfo(prog); + if (prog->aux->attach_btf) + btf_put(prog->aux->attach_btf); if (deferred) { if (prog->aux->sleepable) @@ -2113,6 +2115,20 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr) prog->expected_attach_type = attr->expected_attach_type; prog->aux->attach_btf_id = attr->attach_btf_id; + + if (attr->attach_btf_id && !attr->attach_prog_fd) { + struct btf *btf; + + btf = bpf_get_btf_vmlinux(); + if (IS_ERR(btf)) + return PTR_ERR(btf); + if (!btf) + return -EINVAL; + + btf_get(btf); + prog->aux->attach_btf = btf; + } + if (attr->attach_prog_fd) { struct bpf_prog *dst_prog; @@ -2209,6 +2225,8 @@ free_prog_sec: free_uid(prog->aux->user); security_bpf_prog_free(prog->aux); free_prog: + if (prog->aux->attach_btf) + btf_put(prog->aux->attach_btf); bpf_prog_free(prog); return err; } @@ -2566,7 +2584,7 @@ static int bpf_tracing_prog_attach(struct bpf_prog *prog, goto out_put_prog; } - key = bpf_trampoline_compute_key(tgt_prog, btf_id); + key = bpf_trampoline_compute_key(tgt_prog, NULL, btf_id); } link = kzalloc(sizeof(*link), GFP_USER); @@ -3543,7 +3561,7 @@ static int bpf_prog_get_info_by_fd(struct file *file, } if (prog->aux->btf) - info.btf_id = btf_id(prog->aux->btf); + info.btf_id = btf_obj_id(prog->aux->btf); ulen = info.nr_func_info; info.nr_func_info = prog->aux->func_info_cnt; @@ -3646,7 +3664,7 @@ static int bpf_map_get_info_by_fd(struct file *file, memcpy(info.name, map->name, sizeof(map->name)); if (map->btf) { - info.btf_id = btf_id(map->btf); + info.btf_id = btf_obj_id(map->btf); info.btf_key_type_id = map->btf_key_type_id; info.btf_value_type_id = map->btf_value_type_id; } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index e333ce43f281..2f3950839b85 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -238,7 +238,9 @@ struct bpf_call_arg_meta { u64 msize_max_value; int ref_obj_id; int func_id; + struct btf *btf; u32 btf_id; + struct btf *ret_btf; u32 ret_btf_id; }; @@ -556,10 +558,9 @@ static struct bpf_func_state *func(struct bpf_verifier_env *env, return cur->frame[reg->frameno]; } -const char *kernel_type_name(u32 id) +static const char *kernel_type_name(const struct btf* btf, u32 id) { - return btf_name_by_offset(btf_vmlinux, - btf_type_by_id(btf_vmlinux, id)->name_off); + return btf_name_by_offset(btf, btf_type_by_id(btf, id)->name_off); } static void print_verifier_state(struct bpf_verifier_env *env, @@ -589,7 +590,7 @@ static void print_verifier_state(struct bpf_verifier_env *env, if (t 
== PTR_TO_BTF_ID || t == PTR_TO_BTF_ID_OR_NULL || t == PTR_TO_PERCPU_BTF_ID) - verbose(env, "%s", kernel_type_name(reg->btf_id)); + verbose(env, "%s", kernel_type_name(reg->btf, reg->btf_id)); verbose(env, "(id=%d", reg->id); if (reg_type_may_be_refcounted_or_null(t)) verbose(env, ",ref_obj_id=%d", reg->ref_obj_id); @@ -1383,7 +1384,8 @@ static void mark_reg_not_init(struct bpf_verifier_env *env, static void mark_btf_ld_reg(struct bpf_verifier_env *env, struct bpf_reg_state *regs, u32 regno, - enum bpf_reg_type reg_type, u32 btf_id) + enum bpf_reg_type reg_type, + struct btf *btf, u32 btf_id) { if (reg_type == SCALAR_VALUE) { mark_reg_unknown(env, regs, regno); @@ -1391,6 +1393,7 @@ static void mark_btf_ld_reg(struct bpf_verifier_env *env, } mark_reg_known_zero(env, regs, regno); regs[regno].type = PTR_TO_BTF_ID; + regs[regno].btf = btf; regs[regno].btf_id = btf_id; } @@ -2764,7 +2767,7 @@ static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off, /* check access to 'struct bpf_context' fields. Supports fixed offsets only */ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off, int size, enum bpf_access_type t, enum bpf_reg_type *reg_type, - u32 *btf_id) + struct btf **btf, u32 *btf_id) { struct bpf_insn_access_aux info = { .reg_type = *reg_type, @@ -2782,10 +2785,12 @@ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off, */ *reg_type = info.reg_type; - if (*reg_type == PTR_TO_BTF_ID || *reg_type == PTR_TO_BTF_ID_OR_NULL) + if (*reg_type == PTR_TO_BTF_ID || *reg_type == PTR_TO_BTF_ID_OR_NULL) { + *btf = info.btf; *btf_id = info.btf_id; - else + } else { env->insn_aux_data[insn_idx].ctx_field_size = info.ctx_field_size; + } /* remember the offset of last byte accessed in ctx */ if (env->prog->aux->max_ctx_offset < off + size) env->prog->aux->max_ctx_offset = off + size; @@ -3297,8 +3302,8 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env, int value_regno) { struct bpf_reg_state *reg = regs + regno; - const struct btf_type *t = btf_type_by_id(btf_vmlinux, reg->btf_id); - const char *tname = btf_name_by_offset(btf_vmlinux, t->name_off); + const struct btf_type *t = btf_type_by_id(reg->btf, reg->btf_id); + const char *tname = btf_name_by_offset(reg->btf, t->name_off); u32 btf_id; int ret; @@ -3319,23 +3324,23 @@ static int check_ptr_to_btf_access(struct bpf_verifier_env *env, } if (env->ops->btf_struct_access) { - ret = env->ops->btf_struct_access(&env->log, t, off, size, - atype, &btf_id); + ret = env->ops->btf_struct_access(&env->log, reg->btf, t, + off, size, atype, &btf_id); } else { if (atype != BPF_READ) { verbose(env, "only read is supported\n"); return -EACCES; } - ret = btf_struct_access(&env->log, t, off, size, atype, - &btf_id); + ret = btf_struct_access(&env->log, reg->btf, t, off, size, + atype, &btf_id); } if (ret < 0) return ret; if (atype == BPF_READ && value_regno >= 0) - mark_btf_ld_reg(env, regs, value_regno, ret, btf_id); + mark_btf_ld_reg(env, regs, value_regno, ret, reg->btf, btf_id); return 0; } @@ -3385,12 +3390,12 @@ static int check_ptr_to_map_access(struct bpf_verifier_env *env, return -EACCES; } - ret = btf_struct_access(&env->log, t, off, size, atype, &btf_id); + ret = btf_struct_access(&env->log, btf_vmlinux, t, off, size, atype, &btf_id); if (ret < 0) return ret; if (value_regno >= 0) - mark_btf_ld_reg(env, regs, value_regno, ret, btf_id); + mark_btf_ld_reg(env, regs, value_regno, ret, btf_vmlinux, btf_id); return 0; } @@ -3466,6 +3471,7 @@ static int 
check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn mark_reg_unknown(env, regs, value_regno); } else if (reg->type == PTR_TO_CTX) { enum bpf_reg_type reg_type = SCALAR_VALUE; + struct btf *btf = NULL; u32 btf_id = 0; if (t == BPF_WRITE && value_regno >= 0 && @@ -3478,7 +3484,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn if (err < 0) return err; - err = check_ctx_access(env, insn_idx, off, size, t, ®_type, &btf_id); + err = check_ctx_access(env, insn_idx, off, size, t, ®_type, &btf, &btf_id); if (err) verbose_linfo(env, insn_idx, "; "); if (!err && t == BPF_READ && value_regno >= 0) { @@ -3500,8 +3506,10 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn */ regs[value_regno].subreg_def = DEF_NOT_SUBREG; if (reg_type == PTR_TO_BTF_ID || - reg_type == PTR_TO_BTF_ID_OR_NULL) + reg_type == PTR_TO_BTF_ID_OR_NULL) { + regs[value_regno].btf = btf; regs[value_regno].btf_id = btf_id; + } } regs[value_regno].type = reg_type; } @@ -4118,11 +4126,11 @@ found: arg_btf_id = compatible->btf_id; } - if (!btf_struct_ids_match(&env->log, reg->off, reg->btf_id, - *arg_btf_id)) { + if (!btf_struct_ids_match(&env->log, reg->btf, reg->btf_id, reg->off, + btf_vmlinux, *arg_btf_id)) { verbose(env, "R%d is of type %s but %s is expected\n", - regno, kernel_type_name(reg->btf_id), - kernel_type_name(*arg_btf_id)); + regno, kernel_type_name(reg->btf, reg->btf_id), + kernel_type_name(btf_vmlinux, *arg_btf_id)); return -EACCES; } @@ -4244,6 +4252,7 @@ skip_type_check: verbose(env, "Helper has invalid btf_id in R%d\n", regno); return -EACCES; } + meta->ret_btf = reg->btf; meta->ret_btf_id = reg->btf_id; } else if (arg_type == ARG_PTR_TO_SPIN_LOCK) { if (meta->func_id == BPF_FUNC_spin_lock) { @@ -5190,16 +5199,16 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn const struct btf_type *t; mark_reg_known_zero(env, regs, BPF_REG_0); - t = btf_type_skip_modifiers(btf_vmlinux, meta.ret_btf_id, NULL); + t = btf_type_skip_modifiers(meta.ret_btf, meta.ret_btf_id, NULL); if (!btf_type_is_struct(t)) { u32 tsize; const struct btf_type *ret; const char *tname; /* resolve the type size of ksym. */ - ret = btf_resolve_size(btf_vmlinux, t, &tsize); + ret = btf_resolve_size(meta.ret_btf, t, &tsize); if (IS_ERR(ret)) { - tname = btf_name_by_offset(btf_vmlinux, t->name_off); + tname = btf_name_by_offset(meta.ret_btf, t->name_off); verbose(env, "unable to resolve the size of type '%s': %ld\n", tname, PTR_ERR(ret)); return -EINVAL; @@ -5212,6 +5221,7 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn regs[BPF_REG_0].type = fn->ret_type == RET_PTR_TO_MEM_OR_BTF_ID ? 
PTR_TO_BTF_ID : PTR_TO_BTF_ID_OR_NULL; + regs[BPF_REG_0].btf = meta.ret_btf; regs[BPF_REG_0].btf_id = meta.ret_btf_id; } } else if (fn->ret_type == RET_PTR_TO_BTF_ID_OR_NULL || @@ -5228,6 +5238,10 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn fn->ret_type, func_id_name(func_id), func_id); return -EINVAL; } + /* current BPF helper definitions are only coming from + * built-in code with type IDs from vmlinux BTF + */ + regs[BPF_REG_0].btf = btf_vmlinux; regs[BPF_REG_0].btf_id = ret_btf_id; } else { verbose(env, "unknown return type %d of func %s#%d\n", @@ -5627,7 +5641,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, if (reg_is_pkt_pointer(ptr_reg)) { dst_reg->id = ++env->id_gen; /* something was added to pkt_ptr, set range to zero */ - dst_reg->raw = 0; + memset(&dst_reg->raw, 0, sizeof(dst_reg->raw)); } break; case BPF_SUB: @@ -5692,7 +5706,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, dst_reg->id = ++env->id_gen; /* something was added to pkt_ptr, set range to zero */ if (smin_val < 0) - dst_reg->raw = 0; + memset(&dst_reg->raw, 0, sizeof(dst_reg->raw)); } break; case BPF_AND: @@ -7744,6 +7758,7 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn) break; case PTR_TO_BTF_ID: case PTR_TO_PERCPU_BTF_ID: + dst_reg->btf = aux->btf_var.btf; dst_reg->btf_id = aux->btf_var.btf_id; break; default: @@ -9739,6 +9754,7 @@ static int check_pseudo_btf_id(struct bpf_verifier_env *env, t = btf_type_skip_modifiers(btf_vmlinux, type, NULL); if (percpu) { aux->btf_var.reg_type = PTR_TO_PERCPU_BTF_ID; + aux->btf_var.btf = btf_vmlinux; aux->btf_var.btf_id = type; } else if (!btf_type_is_struct(t)) { const struct btf_type *ret; @@ -9757,6 +9773,7 @@ static int check_pseudo_btf_id(struct bpf_verifier_env *env, aux->btf_var.mem_size = tsize; } else { aux->btf_var.reg_type = PTR_TO_BTF_ID; + aux->btf_var.btf = btf_vmlinux; aux->btf_var.btf_id = type; } return 0; @@ -11609,7 +11626,7 @@ int bpf_check_attach_target(struct bpf_verifier_log *log, bpf_log(log, "Tracing programs must provide btf_id\n"); return -EINVAL; } - btf = tgt_prog ? tgt_prog->aux->btf : btf_vmlinux; + btf = tgt_prog ? 
tgt_prog->aux->btf : prog->aux->attach_btf; if (!btf) { bpf_log(log, "FENTRY/FEXIT program can only be attached to another program annotated with BTF\n"); @@ -11885,7 +11902,7 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) return ret; } - key = bpf_trampoline_compute_key(tgt_prog, btf_id); + key = bpf_trampoline_compute_key(tgt_prog, prog->aux->attach_btf, btf_id); tr = bpf_trampoline_get(key, &tgt_info); if (!tr) return -ENOMEM; diff --git a/net/ipv4/bpf_tcp_ca.c b/net/ipv4/bpf_tcp_ca.c index 618954f82764..d520e61649c8 100644 --- a/net/ipv4/bpf_tcp_ca.c +++ b/net/ipv4/bpf_tcp_ca.c @@ -95,6 +95,7 @@ static bool bpf_tcp_ca_is_valid_access(int off, int size, } static int bpf_tcp_ca_btf_struct_access(struct bpf_verifier_log *log, + const struct btf *btf, const struct btf_type *t, int off, int size, enum bpf_access_type atype, u32 *next_btf_id) @@ -102,7 +103,7 @@ static int bpf_tcp_ca_btf_struct_access(struct bpf_verifier_log *log, size_t end; if (atype == BPF_READ) - return btf_struct_access(log, t, off, size, atype, next_btf_id); + return btf_struct_access(log, btf, t, off, size, atype, next_btf_id); if (t != tcp_sock_type) { bpf_log(log, "only read is supported\n"); -- cgit v1.2.3 From 290248a5b7d829871b3ea3c62578613a580a1744 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 3 Dec 2020 12:46:30 -0800 Subject: bpf: Allow to specify kernel module BTFs when attaching BPF programs Add ability for user-space programs to specify non-vmlinux BTF when attaching BTF-powered BPF programs: raw_tp, fentry/fexit/fmod_ret, LSM, etc. For this, attach_prog_fd (now with the alias name attach_btf_obj_fd) should specify FD of a module or vmlinux BTF object. For backwards compatibility reasons, 0 denotes vmlinux BTF. Only kernel BTF (vmlinux or module) can be specified. 
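For illustration, a hedged user-space sketch of the new attach path using only the union bpf_attr fields shown in the UAPI hunk below. The fd and type id arguments are placeholders that would normally come from BPF_BTF_GET_FD_BY_ID on the module BTF and from resolving the target function inside it; in practice libbpf wraps all of this.

#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/bpf.h>

/*
 * Illustrative sketch only: load a BPF_PROG_TYPE_TRACING program whose
 * attach target lives in a kernel module's BTF rather than in vmlinux.
 * Passing attach_btf_obj_fd = 0 would keep the old vmlinux behaviour.
 */
static int load_prog_against_module_btf(const struct bpf_insn *insns,
					__u32 insn_cnt,
					int module_btf_fd, __u32 btf_type_id)
{
	union bpf_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.prog_type = BPF_PROG_TYPE_TRACING;
	attr.expected_attach_type = BPF_TRACE_FENTRY;
	attr.insns = (__u64)(unsigned long)insns;
	attr.insn_cnt = insn_cnt;
	attr.license = (__u64)(unsigned long)"GPL";
	attr.attach_btf_id = btf_type_id;
	attr.attach_btf_obj_fd = module_btf_fd;

	return syscall(__NR_bpf, BPF_PROG_LOAD, &attr, sizeof(attr));
}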
Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20201203204634.1325171-11-andrii@kernel.org --- include/linux/btf.h | 1 + include/uapi/linux/bpf.h | 7 +++- kernel/bpf/btf.c | 5 +++ kernel/bpf/syscall.c | 82 ++++++++++++++++++++++++++---------------- tools/include/uapi/linux/bpf.h | 7 +++- 5 files changed, 69 insertions(+), 33 deletions(-) (limited to 'kernel') diff --git a/include/linux/btf.h b/include/linux/btf.h index fb608e4de076..4c200f5d242b 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -90,6 +90,7 @@ int btf_type_snprintf_show(const struct btf *btf, u32 type_id, void *obj, int btf_get_fd_by_id(u32 id); u32 btf_obj_id(const struct btf *btf); +bool btf_is_kernel(const struct btf *btf); bool btf_member_is_reg_int(const struct btf *btf, const struct btf_type *s, const struct btf_member *m, u32 expected_offset, u32 expected_size); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index c3458ec1f30a..1233f14f659f 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -557,7 +557,12 @@ union bpf_attr { __aligned_u64 line_info; /* line info */ __u32 line_info_cnt; /* number of bpf_line_info records */ __u32 attach_btf_id; /* in-kernel BTF type id to attach to */ - __u32 attach_prog_fd; /* 0 to attach to vmlinux */ + union { + /* valid prog_fd to attach to bpf prog */ + __u32 attach_prog_fd; + /* or valid module BTF object fd or 0 to attach to vmlinux */ + __u32 attach_btf_obj_fd; + }; }; struct { /* anonymous struct used by BPF_OBJ_* commands */ diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 7a19bf5bfe97..8d6bdb4f4d61 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -5738,6 +5738,11 @@ u32 btf_obj_id(const struct btf *btf) return btf->id; } +bool btf_is_kernel(const struct btf *btf) +{ + return btf->kernel_btf; +} + static int btf_id_cmp_func(const void *a, const void *b) { const int *pa = a, *pb = b; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 184204169949..0cd3cc2af9c1 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1926,12 +1926,16 @@ static void bpf_prog_load_fixup_attach_type(union bpf_attr *attr) static int bpf_prog_load_check_attach(enum bpf_prog_type prog_type, enum bpf_attach_type expected_attach_type, - u32 btf_id, u32 prog_fd) + struct btf *attach_btf, u32 btf_id, + struct bpf_prog *dst_prog) { if (btf_id) { if (btf_id > BTF_MAX_TYPE) return -EINVAL; + if (!attach_btf && !dst_prog) + return -EINVAL; + switch (prog_type) { case BPF_PROG_TYPE_TRACING: case BPF_PROG_TYPE_LSM: @@ -1943,7 +1947,10 @@ bpf_prog_load_check_attach(enum bpf_prog_type prog_type, } } - if (prog_fd && prog_type != BPF_PROG_TYPE_TRACING && + if (attach_btf && (!btf_id || dst_prog)) + return -EINVAL; + + if (dst_prog && prog_type != BPF_PROG_TYPE_TRACING && prog_type != BPF_PROG_TYPE_EXT) return -EINVAL; @@ -2060,7 +2067,8 @@ static bool is_perfmon_prog_type(enum bpf_prog_type prog_type) static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr) { enum bpf_prog_type type = attr->prog_type; - struct bpf_prog *prog; + struct bpf_prog *prog, *dst_prog = NULL; + struct btf *attach_btf = NULL; int err; char license[128]; bool is_gpl; @@ -2102,44 +2110,56 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr) if (is_perfmon_prog_type(type) && !perfmon_capable()) return -EPERM; + /* attach_prog_fd/attach_btf_obj_fd can specify fd of either bpf_prog + * or btf, we need to check which one it is + */ + if (attr->attach_prog_fd) { + 
dst_prog = bpf_prog_get(attr->attach_prog_fd); + if (IS_ERR(dst_prog)) { + dst_prog = NULL; + attach_btf = btf_get_by_fd(attr->attach_btf_obj_fd); + if (IS_ERR(attach_btf)) + return -EINVAL; + if (!btf_is_kernel(attach_btf)) { + btf_put(attach_btf); + return -EINVAL; + } + } + } else if (attr->attach_btf_id) { + /* fall back to vmlinux BTF, if BTF type ID is specified */ + attach_btf = bpf_get_btf_vmlinux(); + if (IS_ERR(attach_btf)) + return PTR_ERR(attach_btf); + if (!attach_btf) + return -EINVAL; + btf_get(attach_btf); + } + bpf_prog_load_fixup_attach_type(attr); if (bpf_prog_load_check_attach(type, attr->expected_attach_type, - attr->attach_btf_id, - attr->attach_prog_fd)) + attach_btf, attr->attach_btf_id, + dst_prog)) { + if (dst_prog) + bpf_prog_put(dst_prog); + if (attach_btf) + btf_put(attach_btf); return -EINVAL; + } /* plain bpf_prog allocation */ prog = bpf_prog_alloc(bpf_prog_size(attr->insn_cnt), GFP_USER); - if (!prog) + if (!prog) { + if (dst_prog) + bpf_prog_put(dst_prog); + if (attach_btf) + btf_put(attach_btf); return -ENOMEM; + } prog->expected_attach_type = attr->expected_attach_type; + prog->aux->attach_btf = attach_btf; prog->aux->attach_btf_id = attr->attach_btf_id; - - if (attr->attach_btf_id && !attr->attach_prog_fd) { - struct btf *btf; - - btf = bpf_get_btf_vmlinux(); - if (IS_ERR(btf)) - return PTR_ERR(btf); - if (!btf) - return -EINVAL; - - btf_get(btf); - prog->aux->attach_btf = btf; - } - - if (attr->attach_prog_fd) { - struct bpf_prog *dst_prog; - - dst_prog = bpf_prog_get(attr->attach_prog_fd); - if (IS_ERR(dst_prog)) { - err = PTR_ERR(dst_prog); - goto free_prog; - } - prog->aux->dst_prog = dst_prog; - } - + prog->aux->dst_prog = dst_prog; prog->aux->offload_requested = !!attr->prog_ifindex; prog->aux->sleepable = attr->prog_flags & BPF_F_SLEEPABLE; diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index c3458ec1f30a..1233f14f659f 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -557,7 +557,12 @@ union bpf_attr { __aligned_u64 line_info; /* line info */ __u32 line_info_cnt; /* number of bpf_line_info records */ __u32 attach_btf_id; /* in-kernel BTF type id to attach to */ - __u32 attach_prog_fd; /* 0 to attach to vmlinux */ + union { + /* valid prog_fd to attach to bpf prog */ + __u32 attach_prog_fd; + /* or valid module BTF object fd or 0 to attach to vmlinux */ + __u32 attach_btf_obj_fd; + }; }; struct { /* anonymous struct used by BPF_OBJ_* commands */ -- cgit v1.2.3 From b81b8f40c5b43dcb2ff473236baccc421706435f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 3 Dec 2020 17:21:35 +0100 Subject: block: remove the unused block_sleeprq tracepoint The block_sleeprq tracepoint was only used by the legacy request code. Remove it now that the legacy request code is gone. 
Signed-off-by: Christoph Hellwig Reviewed-by: Damien Le Moal Reviewed-by: Hannes Reinecke Reviewed-by: Chaitanya Kulkarni Acked-by: Tejun Heo Signed-off-by: Jens Axboe --- include/trace/events/block.h | 18 ------------------ kernel/trace/blktrace.c | 22 ---------------------- 2 files changed, 40 deletions(-) (limited to 'kernel') diff --git a/include/trace/events/block.h b/include/trace/events/block.h index 34d64ca306b1..76459cf750e1 100644 --- a/include/trace/events/block.h +++ b/include/trace/events/block.h @@ -441,24 +441,6 @@ DEFINE_EVENT(block_get_rq, block_getrq, TP_ARGS(q, bio, rw) ); -/** - * block_sleeprq - waiting to get a free request entry in queue for block IO operation - * @q: queue for operation - * @bio: pending block IO operation (can be %NULL) - * @rw: low bit indicates a read (%0) or a write (%1) - * - * In the case where a request struct cannot be provided for queue @q - * the process needs to wait for an request struct to become - * available. This tracepoint event is generated each time the - * process goes to sleep waiting for request struct become available. - */ -DEFINE_EVENT(block_get_rq, block_sleeprq, - - TP_PROTO(struct request_queue *q, struct bio *bio, int rw), - - TP_ARGS(q, bio, rw) -); - /** * block_plug - keep operations requests in request queue * @q: request queue to plug diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index a482a37848bf..ced589df304b 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -959,25 +959,6 @@ static void blk_add_trace_getrq(void *ignore, } } - -static void blk_add_trace_sleeprq(void *ignore, - struct request_queue *q, - struct bio *bio, int rw) -{ - if (bio) - blk_add_trace_bio(q, bio, BLK_TA_SLEEPRQ, 0); - else { - struct blk_trace *bt; - - rcu_read_lock(); - bt = rcu_dereference(q->blk_trace); - if (bt) - __blk_add_trace(bt, 0, 0, rw, 0, BLK_TA_SLEEPRQ, - 0, 0, NULL, 0); - rcu_read_unlock(); - } -} - static void blk_add_trace_plug(void *ignore, struct request_queue *q) { struct blk_trace *bt; @@ -1164,8 +1145,6 @@ static void blk_register_tracepoints(void) WARN_ON(ret); ret = register_trace_block_getrq(blk_add_trace_getrq, NULL); WARN_ON(ret); - ret = register_trace_block_sleeprq(blk_add_trace_sleeprq, NULL); - WARN_ON(ret); ret = register_trace_block_plug(blk_add_trace_plug, NULL); WARN_ON(ret); ret = register_trace_block_unplug(blk_add_trace_unplug, NULL); @@ -1185,7 +1164,6 @@ static void blk_unregister_tracepoints(void) unregister_trace_block_split(blk_add_trace_split, NULL); unregister_trace_block_unplug(blk_add_trace_unplug, NULL); unregister_trace_block_plug(blk_add_trace_plug, NULL); - unregister_trace_block_sleeprq(blk_add_trace_sleeprq, NULL); unregister_trace_block_getrq(blk_add_trace_getrq, NULL); unregister_trace_block_bio_queue(blk_add_trace_bio_queue, NULL); unregister_trace_block_bio_frontmerge(blk_add_trace_bio_frontmerge, NULL); -- cgit v1.2.3 From e8a676d61c07eccfcd9d6fddfe4dcb630651c29a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 3 Dec 2020 17:21:36 +0100 Subject: block: simplify and extend the block_bio_merge tracepoint class The block_bio_merge tracepoint class can be reused for most bio-based tracepoints. For that it just needs to lose the superfluous q and rq parameters. 
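As a rough illustration of why the q and rq arguments are superfluous, a hedged sketch of a probe that recovers the same information from the bio alone; the function name and the pr_debug output are invented for this example.

#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/kdev_t.h>
#include <linux/printk.h>

/*
 * Illustrative sketch only: everything the old (q, rq, bio) prototypes
 * carried can be derived from the bio itself, which is what lets the
 * bio tracepoints collapse into the single block_bio event class.
 */
static void example_bio_probe(struct bio *bio)
{
	struct request_queue *q = bio->bi_disk->queue;	/* queue the bio targets */
	dev_t dev = bio_dev(bio);			/* device number */
	sector_t sector = bio->bi_iter.bi_sector;	/* starting sector */
	unsigned int nr_sector = bio_sectors(bio);	/* size in sectors */

	pr_debug("%d,%d %llu + %u on queue %p\n", MAJOR(dev), MINOR(dev),
		 (unsigned long long)sector, nr_sector, q);
}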
Signed-off-by: Christoph Hellwig Reviewed-by: Damien Le Moal Reviewed-by: Hannes Reinecke Reviewed-by: Chaitanya Kulkarni Acked-by: Tejun Heo Signed-off-by: Jens Axboe --- block/blk-core.c | 2 +- block/blk-merge.c | 4 +- block/blk-mq.c | 2 +- block/bounce.c | 2 +- include/trace/events/block.h | 158 +++++++++---------------------------------- kernel/trace/blktrace.c | 41 +++-------- 6 files changed, 48 insertions(+), 161 deletions(-) (limited to 'kernel') diff --git a/block/blk-core.c b/block/blk-core.c index cee568389b7e..cb24654983e1 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -907,7 +907,7 @@ static noinline_for_stack bool submit_bio_checks(struct bio *bio) blkcg_bio_issue_init(bio); if (!bio_flagged(bio, BIO_TRACE_COMPLETION)) { - trace_block_bio_queue(q, bio); + trace_block_bio_queue(bio); /* Now that enqueuing has been traced, we need to trace * completion as well. */ diff --git a/block/blk-merge.c b/block/blk-merge.c index cb351ab9b77d..1a46d5bbd399 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -922,7 +922,7 @@ static enum bio_merge_status bio_attempt_back_merge(struct request *req, if (!ll_back_merge_fn(req, bio, nr_segs)) return BIO_MERGE_FAILED; - trace_block_bio_backmerge(req->q, req, bio); + trace_block_bio_backmerge(bio); rq_qos_merge(req->q, req, bio); if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff) @@ -946,7 +946,7 @@ static enum bio_merge_status bio_attempt_front_merge(struct request *req, if (!ll_front_merge_fn(req, bio, nr_segs)) return BIO_MERGE_FAILED; - trace_block_bio_frontmerge(req->q, req, bio); + trace_block_bio_frontmerge(bio); rq_qos_merge(req->q, req, bio); if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff) diff --git a/block/blk-mq.c b/block/blk-mq.c index 37c682855a63..21e2b4b6b742 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2184,7 +2184,7 @@ blk_qc_t blk_mq_submit_bio(struct bio *bio) goto queue_exit; } - trace_block_getrq(q, bio, bio->bi_opf); + trace_block_getrq(bio); rq_qos_track(q, rq, bio); diff --git a/block/bounce.c b/block/bounce.c index 162a6eee8999..d3f51acd6e3b 100644 --- a/block/bounce.c +++ b/block/bounce.c @@ -340,7 +340,7 @@ static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig, } } - trace_block_bio_bounce(q, *bio_orig); + trace_block_bio_bounce(*bio_orig); bio->bi_flags |= (1 << BIO_BOUNCED); diff --git a/include/trace/events/block.h b/include/trace/events/block.h index 76459cf750e1..506c29dc7c76 100644 --- a/include/trace/events/block.h +++ b/include/trace/events/block.h @@ -226,45 +226,6 @@ DEFINE_EVENT(block_rq, block_rq_merge, TP_ARGS(q, rq) ); -/** - * block_bio_bounce - used bounce buffer when processing block operation - * @q: queue holding the block operation - * @bio: block operation - * - * A bounce buffer was used to handle the block operation @bio in @q. - * This occurs when hardware limitations prevent a direct transfer of - * data between the @bio data memory area and the IO device. Use of a - * bounce buffer requires extra copying of data and decreases - * performance. 
- */ -TRACE_EVENT(block_bio_bounce, - - TP_PROTO(struct request_queue *q, struct bio *bio), - - TP_ARGS(q, bio), - - TP_STRUCT__entry( - __field( dev_t, dev ) - __field( sector_t, sector ) - __field( unsigned int, nr_sector ) - __array( char, rwbs, RWBS_LEN ) - __array( char, comm, TASK_COMM_LEN ) - ), - - TP_fast_assign( - __entry->dev = bio_dev(bio); - __entry->sector = bio->bi_iter.bi_sector; - __entry->nr_sector = bio_sectors(bio); - blk_fill_rwbs(__entry->rwbs, bio->bi_opf, bio->bi_iter.bi_size); - memcpy(__entry->comm, current->comm, TASK_COMM_LEN); - ), - - TP_printk("%d,%d %s %llu + %u [%s]", - MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs, - (unsigned long long)__entry->sector, - __entry->nr_sector, __entry->comm) -); - /** * block_bio_complete - completed all work on the block operation * @q: queue holding the block operation @@ -301,11 +262,11 @@ TRACE_EVENT(block_bio_complete, __entry->nr_sector, __entry->error) ); -DECLARE_EVENT_CLASS(block_bio_merge, +DECLARE_EVENT_CLASS(block_bio, - TP_PROTO(struct request_queue *q, struct request *rq, struct bio *bio), + TP_PROTO(struct bio *bio), - TP_ARGS(q, rq, bio), + TP_ARGS(bio), TP_STRUCT__entry( __field( dev_t, dev ) @@ -329,116 +290,63 @@ DECLARE_EVENT_CLASS(block_bio_merge, __entry->nr_sector, __entry->comm) ); +/** + * block_bio_bounce - used bounce buffer when processing block operation + * @bio: block operation + * + * A bounce buffer was used to handle the block operation @bio in @q. + * This occurs when hardware limitations prevent a direct transfer of + * data between the @bio data memory area and the IO device. Use of a + * bounce buffer requires extra copying of data and decreases + * performance. + */ +DEFINE_EVENT(block_bio, block_bio_bounce, + TP_PROTO(struct bio *bio), + TP_ARGS(bio) +); + /** * block_bio_backmerge - merging block operation to the end of an existing operation - * @q: queue holding operation - * @rq: request bio is being merged into * @bio: new block operation to merge * - * Merging block request @bio to the end of an existing block request - * in queue @q. + * Merging block request @bio to the end of an existing block request. */ -DEFINE_EVENT(block_bio_merge, block_bio_backmerge, - - TP_PROTO(struct request_queue *q, struct request *rq, struct bio *bio), - - TP_ARGS(q, rq, bio) +DEFINE_EVENT(block_bio, block_bio_backmerge, + TP_PROTO(struct bio *bio), + TP_ARGS(bio) ); /** * block_bio_frontmerge - merging block operation to the beginning of an existing operation - * @q: queue holding operation - * @rq: request bio is being merged into * @bio: new block operation to merge * - * Merging block IO operation @bio to the beginning of an existing block - * operation in queue @q. + * Merging block IO operation @bio to the beginning of an existing block request. */ -DEFINE_EVENT(block_bio_merge, block_bio_frontmerge, - - TP_PROTO(struct request_queue *q, struct request *rq, struct bio *bio), - - TP_ARGS(q, rq, bio) +DEFINE_EVENT(block_bio, block_bio_frontmerge, + TP_PROTO(struct bio *bio), + TP_ARGS(bio) ); /** * block_bio_queue - putting new block IO operation in queue - * @q: queue holding operation * @bio: new block operation * * About to place the block IO operation @bio into queue @q. 
*/ -TRACE_EVENT(block_bio_queue, - - TP_PROTO(struct request_queue *q, struct bio *bio), - - TP_ARGS(q, bio), - - TP_STRUCT__entry( - __field( dev_t, dev ) - __field( sector_t, sector ) - __field( unsigned int, nr_sector ) - __array( char, rwbs, RWBS_LEN ) - __array( char, comm, TASK_COMM_LEN ) - ), - - TP_fast_assign( - __entry->dev = bio_dev(bio); - __entry->sector = bio->bi_iter.bi_sector; - __entry->nr_sector = bio_sectors(bio); - blk_fill_rwbs(__entry->rwbs, bio->bi_opf, bio->bi_iter.bi_size); - memcpy(__entry->comm, current->comm, TASK_COMM_LEN); - ), - - TP_printk("%d,%d %s %llu + %u [%s]", - MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs, - (unsigned long long)__entry->sector, - __entry->nr_sector, __entry->comm) -); - -DECLARE_EVENT_CLASS(block_get_rq, - - TP_PROTO(struct request_queue *q, struct bio *bio, int rw), - - TP_ARGS(q, bio, rw), - - TP_STRUCT__entry( - __field( dev_t, dev ) - __field( sector_t, sector ) - __field( unsigned int, nr_sector ) - __array( char, rwbs, RWBS_LEN ) - __array( char, comm, TASK_COMM_LEN ) - ), - - TP_fast_assign( - __entry->dev = bio ? bio_dev(bio) : 0; - __entry->sector = bio ? bio->bi_iter.bi_sector : 0; - __entry->nr_sector = bio ? bio_sectors(bio) : 0; - blk_fill_rwbs(__entry->rwbs, - bio ? bio->bi_opf : 0, __entry->nr_sector); - memcpy(__entry->comm, current->comm, TASK_COMM_LEN); - ), - - TP_printk("%d,%d %s %llu + %u [%s]", - MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs, - (unsigned long long)__entry->sector, - __entry->nr_sector, __entry->comm) +DEFINE_EVENT(block_bio, block_bio_queue, + TP_PROTO(struct bio *bio), + TP_ARGS(bio) ); /** * block_getrq - get a free request entry in queue for block IO operations - * @q: queue for operations * @bio: pending block IO operation (can be %NULL) - * @rw: low bit indicates a read (%0) or a write (%1) * - * A request struct for queue @q has been allocated to handle the - * block IO operation @bio. + * A request struct has been allocated to handle the block IO operation @bio. 
*/ -DEFINE_EVENT(block_get_rq, block_getrq, - - TP_PROTO(struct request_queue *q, struct bio *bio, int rw), - - TP_ARGS(q, bio, rw) +DEFINE_EVENT(block_bio, block_getrq, + TP_PROTO(struct bio *bio), + TP_ARGS(bio) ); /** diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index ced589df304b..7ab88e00c157 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -906,10 +906,9 @@ static void blk_add_trace_bio(struct request_queue *q, struct bio *bio, rcu_read_unlock(); } -static void blk_add_trace_bio_bounce(void *ignore, - struct request_queue *q, struct bio *bio) +static void blk_add_trace_bio_bounce(void *ignore, struct bio *bio) { - blk_add_trace_bio(q, bio, BLK_TA_BOUNCE, 0); + blk_add_trace_bio(bio->bi_disk->queue, bio, BLK_TA_BOUNCE, 0); } static void blk_add_trace_bio_complete(void *ignore, @@ -919,44 +918,24 @@ static void blk_add_trace_bio_complete(void *ignore, blk_status_to_errno(bio->bi_status)); } -static void blk_add_trace_bio_backmerge(void *ignore, - struct request_queue *q, - struct request *rq, - struct bio *bio) +static void blk_add_trace_bio_backmerge(void *ignore, struct bio *bio) { - blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE, 0); + blk_add_trace_bio(bio->bi_disk->queue, bio, BLK_TA_BACKMERGE, 0); } -static void blk_add_trace_bio_frontmerge(void *ignore, - struct request_queue *q, - struct request *rq, - struct bio *bio) +static void blk_add_trace_bio_frontmerge(void *ignore, struct bio *bio) { - blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE, 0); + blk_add_trace_bio(bio->bi_disk->queue, bio, BLK_TA_FRONTMERGE, 0); } -static void blk_add_trace_bio_queue(void *ignore, - struct request_queue *q, struct bio *bio) +static void blk_add_trace_bio_queue(void *ignore, struct bio *bio) { - blk_add_trace_bio(q, bio, BLK_TA_QUEUE, 0); + blk_add_trace_bio(bio->bi_disk->queue, bio, BLK_TA_QUEUE, 0); } -static void blk_add_trace_getrq(void *ignore, - struct request_queue *q, - struct bio *bio, int rw) +static void blk_add_trace_getrq(void *ignore, struct bio *bio) { - if (bio) - blk_add_trace_bio(q, bio, BLK_TA_GETRQ, 0); - else { - struct blk_trace *bt; - - rcu_read_lock(); - bt = rcu_dereference(q->blk_trace); - if (bt) - __blk_add_trace(bt, 0, 0, rw, 0, BLK_TA_GETRQ, 0, 0, - NULL, 0); - rcu_read_unlock(); - } + blk_add_trace_bio(bio->bi_disk->queue, bio, BLK_TA_GETRQ, 0); } static void blk_add_trace_plug(void *ignore, struct request_queue *q) -- cgit v1.2.3 From eb6f7f7cd3af0f67ce57b21fab1bc64beb643581 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 3 Dec 2020 17:21:37 +0100 Subject: block: remove the request_queue argument to the block_split tracepoint The request_queue can trivially be derived from the bio. 
Signed-off-by: Christoph Hellwig Reviewed-by: Damien Le Moal Reviewed-by: Hannes Reinecke Reviewed-by: Chaitanya Kulkarni Acked-by: Tejun Heo Signed-off-by: Jens Axboe --- block/blk-merge.c | 2 +- drivers/md/dm.c | 2 +- include/trace/events/block.h | 14 ++++++-------- kernel/trace/blktrace.c | 5 ++--- 4 files changed, 10 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/block/blk-merge.c b/block/blk-merge.c index 1a46d5bbd399..4071daa88a5e 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -338,7 +338,7 @@ void __blk_queue_split(struct bio **bio, unsigned int *nr_segs) split->bi_opf |= REQ_NOMERGE; bio_chain(split, *bio); - trace_block_split(q, split, (*bio)->bi_iter.bi_sector); + trace_block_split(split, (*bio)->bi_iter.bi_sector); submit_bio_noacct(*bio); *bio = split; } diff --git a/drivers/md/dm.c b/drivers/md/dm.c index ed7e836efbcd..9a5bd90779c7 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1612,7 +1612,7 @@ static blk_qc_t __split_and_process_bio(struct mapped_device *md, part_stat_unlock(); bio_chain(b, bio); - trace_block_split(md->queue, b, bio->bi_iter.bi_sector); + trace_block_split(b, bio->bi_iter.bi_sector); ret = submit_bio_noacct(bio); break; } diff --git a/include/trace/events/block.h b/include/trace/events/block.h index 506c29dc7c76..b415e4cba843 100644 --- a/include/trace/events/block.h +++ b/include/trace/events/block.h @@ -411,21 +411,19 @@ DEFINE_EVENT(block_unplug, block_unplug, /** * block_split - split a single bio struct into two bio structs - * @q: queue containing the bio * @bio: block operation being split * @new_sector: The starting sector for the new bio * - * The bio request @bio in request queue @q needs to be split into two - * bio requests. The newly created @bio request starts at - * @new_sector. This split may be required due to hardware limitation - * such as operation crossing device boundaries in a RAID system. + * The bio request @bio needs to be split into two bio requests. The newly + * created @bio request starts at @new_sector. This split may be required due to + * hardware limitations such as operation crossing device boundaries in a RAID + * system. */ TRACE_EVENT(block_split, - TP_PROTO(struct request_queue *q, struct bio *bio, - unsigned int new_sector), + TP_PROTO(struct bio *bio, unsigned int new_sector), - TP_ARGS(q, bio, new_sector), + TP_ARGS(bio, new_sector), TP_STRUCT__entry( __field( dev_t, dev ) diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 7ab88e00c157..3ca6d62114f4 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -970,10 +970,9 @@ static void blk_add_trace_unplug(void *ignore, struct request_queue *q, rcu_read_unlock(); } -static void blk_add_trace_split(void *ignore, - struct request_queue *q, struct bio *bio, - unsigned int pdu) +static void blk_add_trace_split(void *ignore, struct bio *bio, unsigned int pdu) { + struct request_queue *q = bio->bi_disk->queue; struct blk_trace *bt; rcu_read_lock(); -- cgit v1.2.3 From 1c02fca620f7273b597591065d366e2cca948d8f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 3 Dec 2020 17:21:38 +0100 Subject: block: remove the request_queue argument to the block_bio_remap tracepoint The request_queue can trivially be derived from the bio. 
Signed-off-by: Christoph Hellwig Reviewed-by: Damien Le Moal Reviewed-by: Hannes Reinecke Reviewed-by: Chaitanya Kulkarni Acked-by: Tejun Heo Signed-off-by: Jens Axboe --- block/blk-core.c | 2 +- drivers/md/dm.c | 3 +-- drivers/md/md-linear.c | 3 +-- drivers/md/md.c | 5 ++--- drivers/md/raid0.c | 4 ++-- drivers/md/raid1.c | 7 +++---- drivers/md/raid10.c | 6 ++---- drivers/md/raid5.c | 15 +++++++-------- drivers/nvme/host/multipath.c | 3 +-- include/trace/events/block.h | 8 +++----- kernel/trace/blktrace.c | 14 +++++--------- 11 files changed, 28 insertions(+), 42 deletions(-) (limited to 'kernel') diff --git a/block/blk-core.c b/block/blk-core.c index cb24654983e1..96e5fcd7f071 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -758,7 +758,7 @@ static inline int blk_partition_remap(struct bio *bio) if (bio_check_eod(bio, bdev_nr_sectors(p))) goto out; bio->bi_iter.bi_sector += p->bd_start_sect; - trace_block_bio_remap(bio->bi_disk->queue, bio, p->bd_dev, + trace_block_bio_remap(bio, p->bd_dev, bio->bi_iter.bi_sector - p->bd_start_sect); } diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 9a5bd90779c7..5181907dc595 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1276,8 +1276,7 @@ static blk_qc_t __map_bio(struct dm_target_io *tio) break; case DM_MAPIO_REMAPPED: /* the bio has been remapped so dispatch it */ - trace_block_bio_remap(clone->bi_disk->queue, clone, - bio_dev(io->orig_bio), sector); + trace_block_bio_remap(clone, bio_dev(io->orig_bio), sector); ret = submit_bio_noacct(clone); break; case DM_MAPIO_KILL: diff --git a/drivers/md/md-linear.c b/drivers/md/md-linear.c index 98f1b4b2bdce..68cac7d19278 100644 --- a/drivers/md/md-linear.c +++ b/drivers/md/md-linear.c @@ -257,8 +257,7 @@ static bool linear_make_request(struct mddev *mddev, struct bio *bio) bio_endio(bio); } else { if (mddev->gendisk) - trace_block_bio_remap(bio->bi_disk->queue, - bio, disk_devt(mddev->gendisk), + trace_block_bio_remap(bio, disk_devt(mddev->gendisk), bio_sector); mddev_check_writesame(mddev, bio); mddev_check_write_zeroes(mddev, bio); diff --git a/drivers/md/md.c b/drivers/md/md.c index 0065736f05b4..c555be0a8dce 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -8591,9 +8591,8 @@ void md_submit_discard_bio(struct mddev *mddev, struct md_rdev *rdev, bio_chain(discard_bio, bio); bio_clone_blkg_association(discard_bio, bio); if (mddev->gendisk) - trace_block_bio_remap(bdev_get_queue(rdev->bdev), - discard_bio, disk_devt(mddev->gendisk), - bio->bi_iter.bi_sector); + trace_block_bio_remap(discard_bio, disk_devt(mddev->gendisk), + bio->bi_iter.bi_sector); submit_bio_noacct(discard_bio); } EXPORT_SYMBOL(md_submit_discard_bio); diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index 6f44177593a5..e5d7411cba9b 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -571,8 +571,8 @@ static bool raid0_make_request(struct mddev *mddev, struct bio *bio) tmp_dev->data_offset; if (mddev->gendisk) - trace_block_bio_remap(bio->bi_disk->queue, bio, - disk_devt(mddev->gendisk), bio_sector); + trace_block_bio_remap(bio, disk_devt(mddev->gendisk), + bio_sector); mddev_check_writesame(mddev, bio); mddev_check_write_zeroes(mddev, bio); submit_bio_noacct(bio); diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 960d854c07f8..c0347997f6ff 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -1305,8 +1305,8 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio, read_bio->bi_private = r1_bio; if (mddev->gendisk) - trace_block_bio_remap(read_bio->bi_disk->queue, read_bio, - 
disk_devt(mddev->gendisk), r1_bio->sector); + trace_block_bio_remap(read_bio, disk_devt(mddev->gendisk), + r1_bio->sector); submit_bio_noacct(read_bio); } @@ -1517,8 +1517,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, atomic_inc(&r1_bio->remaining); if (mddev->gendisk) - trace_block_bio_remap(mbio->bi_disk->queue, - mbio, disk_devt(mddev->gendisk), + trace_block_bio_remap(mbio, disk_devt(mddev->gendisk), r1_bio->sector); /* flush_pending_writes() needs access to the rdev so...*/ mbio->bi_disk = (void *)conf->mirrors[i].rdev; diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index b7bca6703df8..a6f99fa0b32c 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -1200,8 +1200,7 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio, read_bio->bi_private = r10_bio; if (mddev->gendisk) - trace_block_bio_remap(read_bio->bi_disk->queue, - read_bio, disk_devt(mddev->gendisk), + trace_block_bio_remap(read_bio, disk_devt(mddev->gendisk), r10_bio->sector); submit_bio_noacct(read_bio); return; @@ -1250,8 +1249,7 @@ static void raid10_write_one_disk(struct mddev *mddev, struct r10bio *r10_bio, mbio->bi_private = r10_bio; if (conf->mddev->gendisk) - trace_block_bio_remap(mbio->bi_disk->queue, - mbio, disk_devt(conf->mddev->gendisk), + trace_block_bio_remap(mbio, disk_devt(conf->mddev->gendisk), r10_bio->sector); /* flush_pending_writes() needs access to the rdev so...*/ mbio->bi_disk = (void *)rdev; diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 39343479ac2a..3a90cc0e43ca 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -1222,9 +1222,9 @@ again: set_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags); if (conf->mddev->gendisk) - trace_block_bio_remap(bi->bi_disk->queue, - bi, disk_devt(conf->mddev->gendisk), - sh->dev[i].sector); + trace_block_bio_remap(bi, + disk_devt(conf->mddev->gendisk), + sh->dev[i].sector); if (should_defer && op_is_write(op)) bio_list_add(&pending_bios, bi); else @@ -1272,9 +1272,9 @@ again: if (op == REQ_OP_DISCARD) rbi->bi_vcnt = 0; if (conf->mddev->gendisk) - trace_block_bio_remap(rbi->bi_disk->queue, - rbi, disk_devt(conf->mddev->gendisk), - sh->dev[i].sector); + trace_block_bio_remap(rbi, + disk_devt(conf->mddev->gendisk), + sh->dev[i].sector); if (should_defer && op_is_write(op)) bio_list_add(&pending_bios, rbi); else @@ -5468,8 +5468,7 @@ static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio) spin_unlock_irq(&conf->device_lock); if (mddev->gendisk) - trace_block_bio_remap(align_bi->bi_disk->queue, - align_bi, disk_devt(mddev->gendisk), + trace_block_bio_remap(align_bi, disk_devt(mddev->gendisk), raid_bio->bi_iter.bi_sector); submit_bio_noacct(align_bi); return 1; diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index 74896be40c17..106cf5c44ee7 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -312,8 +312,7 @@ blk_qc_t nvme_ns_head_submit_bio(struct bio *bio) if (likely(ns)) { bio->bi_disk = ns->disk; bio->bi_opf |= REQ_NVME_MPATH; - trace_block_bio_remap(bio->bi_disk->queue, bio, - disk_devt(ns->head->disk), + trace_block_bio_remap(bio, disk_devt(ns->head->disk), bio->bi_iter.bi_sector); ret = submit_bio_noacct(bio); } else if (nvme_available_path(head)) { diff --git a/include/trace/events/block.h b/include/trace/events/block.h index b415e4cba843..8fb89574d867 100644 --- a/include/trace/events/block.h +++ b/include/trace/events/block.h @@ -450,9 +450,8 @@ TRACE_EVENT(block_split, /** * block_bio_remap - map request for a 
logical device to the raw device - * @q: queue holding the operation * @bio: revised operation - * @dev: device for the operation + * @dev: original device for the operation * @from: original sector for the operation * * An operation for a logical device has been mapped to the @@ -460,10 +459,9 @@ TRACE_EVENT(block_split, */ TRACE_EVENT(block_bio_remap, - TP_PROTO(struct request_queue *q, struct bio *bio, dev_t dev, - sector_t from), + TP_PROTO(struct bio *bio, dev_t dev, sector_t from), - TP_ARGS(q, bio, dev, from), + TP_ARGS(bio, dev, from), TP_STRUCT__entry( __field( dev_t, dev ) diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 3ca6d62114f4..405637144a03 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -993,20 +993,16 @@ static void blk_add_trace_split(void *ignore, struct bio *bio, unsigned int pdu) /** * blk_add_trace_bio_remap - Add a trace for a bio-remap operation * @ignore: trace callback data parameter (not used) - * @q: queue the io is for * @bio: the source bio - * @dev: target device + * @dev: source device * @from: source sector * - * Description: - * Device mapper or raid target sometimes need to split a bio because - * it spans a stripe (or similar). Add a trace for that action. - * + * Called after a bio is remapped to a different device and/or sector. **/ -static void blk_add_trace_bio_remap(void *ignore, - struct request_queue *q, struct bio *bio, - dev_t dev, sector_t from) +static void blk_add_trace_bio_remap(void *ignore, struct bio *bio, dev_t dev, + sector_t from) { + struct request_queue *q = bio->bi_disk->queue; struct blk_trace *bt; struct blk_io_trace_remap r; -- cgit v1.2.3 From a54895fa057c67700270777f7661d8d3c7fda88a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 3 Dec 2020 17:21:39 +0100 Subject: block: remove the request_queue to argument request based tracepoints The request_queue can trivially be derived from the request. 
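The request-based case is the same single pointer chase; here is a sketch (hypothetical probe name, not part of the patch) of how a handler with the new prototype recovers the queue, mirroring blk_trace_request_get_cgid() below:

static void my_rq_insert_probe(void *ignore, struct request *rq)
{
	/* Every request already carries its queue. */
	struct request_queue *q = rq->q;

	/* ... inspect q and rq ... */
}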
Signed-off-by: Christoph Hellwig Reviewed-by: Damien Le Moal Reviewed-by: Hannes Reinecke Reviewed-by: Chaitanya Kulkarni Acked-by: Tejun Heo Signed-off-by: Jens Axboe --- block/blk-merge.c | 2 +- block/blk-mq-sched.c | 2 +- block/blk-mq.c | 8 ++++---- drivers/md/dm-rq.c | 2 +- drivers/s390/scsi/zfcp_fsf.c | 3 +-- include/linux/blktrace_api.h | 5 ++--- include/trace/events/block.h | 30 ++++++++++++------------------ kernel/trace/blktrace.c | 44 +++++++++++++++++--------------------------- 8 files changed, 39 insertions(+), 57 deletions(-) (limited to 'kernel') diff --git a/block/blk-merge.c b/block/blk-merge.c index 4071daa88a5e..7497d86fff38 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -799,7 +799,7 @@ static struct request *attempt_merge(struct request_queue *q, */ blk_account_io_merge_request(next); - trace_block_rq_merge(q, next); + trace_block_rq_merge(next); /* * ownership of bio passed from next to req, return 'next' for diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index d1eafe2c045c..deff4e826e23 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -386,7 +386,7 @@ EXPORT_SYMBOL_GPL(blk_mq_sched_try_insert_merge); void blk_mq_sched_request_inserted(struct request *rq) { - trace_block_rq_insert(rq->q, rq); + trace_block_rq_insert(rq); } EXPORT_SYMBOL_GPL(blk_mq_sched_request_inserted); diff --git a/block/blk-mq.c b/block/blk-mq.c index 21e2b4b6b742..cf3916e2852f 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -733,7 +733,7 @@ void blk_mq_start_request(struct request *rq) { struct request_queue *q = rq->q; - trace_block_rq_issue(q, rq); + trace_block_rq_issue(rq); if (test_bit(QUEUE_FLAG_STATS, &q->queue_flags)) { rq->io_start_time_ns = ktime_get_ns(); @@ -760,7 +760,7 @@ static void __blk_mq_requeue_request(struct request *rq) blk_mq_put_driver_tag(rq); - trace_block_rq_requeue(q, rq); + trace_block_rq_requeue(rq); rq_qos_requeue(q, rq); if (blk_mq_request_started(rq)) { @@ -1821,7 +1821,7 @@ static inline void __blk_mq_insert_req_list(struct blk_mq_hw_ctx *hctx, lockdep_assert_held(&ctx->lock); - trace_block_rq_insert(hctx->queue, rq); + trace_block_rq_insert(rq); if (at_head) list_add(&rq->queuelist, &ctx->rq_lists[type]); @@ -1878,7 +1878,7 @@ void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx, */ list_for_each_entry(rq, list, queuelist) { BUG_ON(rq->mq_ctx != ctx); - trace_block_rq_insert(hctx->queue, rq); + trace_block_rq_insert(rq); } spin_lock(&ctx->lock); diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c index 729a72ec30cc..13b4385f4d5a 100644 --- a/drivers/md/dm-rq.c +++ b/drivers/md/dm-rq.c @@ -397,7 +397,7 @@ static int map_request(struct dm_rq_target_io *tio) } /* The target has remapped the I/O so dispatch it */ - trace_block_rq_remap(clone->q, clone, disk_devt(dm_disk(md)), + trace_block_rq_remap(clone, disk_devt(dm_disk(md)), blk_rq_pos(rq)); ret = dm_dispatch_clone_request(clone, rq); if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE) { diff --git a/drivers/s390/scsi/zfcp_fsf.c b/drivers/s390/scsi/zfcp_fsf.c index 6cb963a06777..37d450f46952 100644 --- a/drivers/s390/scsi/zfcp_fsf.c +++ b/drivers/s390/scsi/zfcp_fsf.c @@ -2359,8 +2359,7 @@ static void zfcp_fsf_req_trace(struct zfcp_fsf_req *req, struct scsi_cmnd *scsi) } } - blk_add_driver_data(scsi->request->q, scsi->request, &blktrc, - sizeof(blktrc)); + blk_add_driver_data(scsi->request, &blktrc, sizeof(blktrc)); } /** diff --git a/include/linux/blktrace_api.h b/include/linux/blktrace_api.h index 3b6ff5902edc..05556573b896 100644 --- 
a/include/linux/blktrace_api.h +++ b/include/linux/blktrace_api.h @@ -75,8 +75,7 @@ static inline bool blk_trace_note_message_enabled(struct request_queue *q) return ret; } -extern void blk_add_driver_data(struct request_queue *q, struct request *rq, - void *data, size_t len); +extern void blk_add_driver_data(struct request *rq, void *data, size_t len); extern int blk_trace_setup(struct request_queue *q, char *name, dev_t dev, struct block_device *bdev, char __user *arg); @@ -90,7 +89,7 @@ extern struct attribute_group blk_trace_attr_group; #else /* !CONFIG_BLK_DEV_IO_TRACE */ # define blk_trace_ioctl(bdev, cmd, arg) (-ENOTTY) # define blk_trace_shutdown(q) do { } while (0) -# define blk_add_driver_data(q, rq, data, len) do {} while (0) +# define blk_add_driver_data(rq, data, len) do {} while (0) # define blk_trace_setup(q, name, dev, bdev, arg) (-ENOTTY) # define blk_trace_startstop(q, start) (-ENOTTY) # define blk_trace_remove(q) (-ENOTTY) diff --git a/include/trace/events/block.h b/include/trace/events/block.h index 8fb89574d867..0d782663a005 100644 --- a/include/trace/events/block.h +++ b/include/trace/events/block.h @@ -64,7 +64,6 @@ DEFINE_EVENT(block_buffer, block_dirty_buffer, /** * block_rq_requeue - place block IO request back on a queue - * @q: queue holding operation * @rq: block IO operation request * * The block operation request @rq is being placed back into queue @@ -73,9 +72,9 @@ DEFINE_EVENT(block_buffer, block_dirty_buffer, */ TRACE_EVENT(block_rq_requeue, - TP_PROTO(struct request_queue *q, struct request *rq), + TP_PROTO(struct request *rq), - TP_ARGS(q, rq), + TP_ARGS(rq), TP_STRUCT__entry( __field( dev_t, dev ) @@ -147,9 +146,9 @@ TRACE_EVENT(block_rq_complete, DECLARE_EVENT_CLASS(block_rq, - TP_PROTO(struct request_queue *q, struct request *rq), + TP_PROTO(struct request *rq), - TP_ARGS(q, rq), + TP_ARGS(rq), TP_STRUCT__entry( __field( dev_t, dev ) @@ -181,7 +180,6 @@ DECLARE_EVENT_CLASS(block_rq, /** * block_rq_insert - insert block operation request into queue - * @q: target queue * @rq: block IO operation request * * Called immediately before block operation request @rq is inserted @@ -191,14 +189,13 @@ DECLARE_EVENT_CLASS(block_rq, */ DEFINE_EVENT(block_rq, block_rq_insert, - TP_PROTO(struct request_queue *q, struct request *rq), + TP_PROTO(struct request *rq), - TP_ARGS(q, rq) + TP_ARGS(rq) ); /** * block_rq_issue - issue pending block IO request operation to device driver - * @q: queue holding operation * @rq: block IO operation operation request * * Called when block operation request @rq from queue @q is sent to a @@ -206,14 +203,13 @@ DEFINE_EVENT(block_rq, block_rq_insert, */ DEFINE_EVENT(block_rq, block_rq_issue, - TP_PROTO(struct request_queue *q, struct request *rq), + TP_PROTO(struct request *rq), - TP_ARGS(q, rq) + TP_ARGS(rq) ); /** * block_rq_merge - merge request with another one in the elevator - * @q: queue holding operation * @rq: block IO operation operation request * * Called when block operation request @rq from queue @q is merged to another @@ -221,9 +217,9 @@ DEFINE_EVENT(block_rq, block_rq_issue, */ DEFINE_EVENT(block_rq, block_rq_merge, - TP_PROTO(struct request_queue *q, struct request *rq), + TP_PROTO(struct request *rq), - TP_ARGS(q, rq) + TP_ARGS(rq) ); /** @@ -491,7 +487,6 @@ TRACE_EVENT(block_bio_remap, /** * block_rq_remap - map request for a block operation request - * @q: queue holding the operation * @rq: block IO operation request * @dev: device for the operation * @from: original sector for the operation @@ -502,10 +497,9 @@ 
TRACE_EVENT(block_bio_remap, */ TRACE_EVENT(block_rq_remap, - TP_PROTO(struct request_queue *q, struct request *rq, dev_t dev, - sector_t from), + TP_PROTO(struct request *rq, dev_t dev, sector_t from), - TP_ARGS(q, rq, dev, from), + TP_ARGS(rq, dev, from), TP_STRUCT__entry( __field( dev_t, dev ) diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 405637144a03..7839a78205c2 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -795,12 +795,12 @@ static u64 blk_trace_bio_get_cgid(struct request_queue *q, struct bio *bio) #endif static u64 -blk_trace_request_get_cgid(struct request_queue *q, struct request *rq) +blk_trace_request_get_cgid(struct request *rq) { if (!rq->bio) return 0; /* Use the first bio */ - return blk_trace_bio_get_cgid(q, rq->bio); + return blk_trace_bio_get_cgid(rq->q, rq->bio); } /* @@ -841,40 +841,35 @@ static void blk_add_trace_rq(struct request *rq, int error, rcu_read_unlock(); } -static void blk_add_trace_rq_insert(void *ignore, - struct request_queue *q, struct request *rq) +static void blk_add_trace_rq_insert(void *ignore, struct request *rq) { blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_INSERT, - blk_trace_request_get_cgid(q, rq)); + blk_trace_request_get_cgid(rq)); } -static void blk_add_trace_rq_issue(void *ignore, - struct request_queue *q, struct request *rq) +static void blk_add_trace_rq_issue(void *ignore, struct request *rq) { blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_ISSUE, - blk_trace_request_get_cgid(q, rq)); + blk_trace_request_get_cgid(rq)); } -static void blk_add_trace_rq_merge(void *ignore, - struct request_queue *q, struct request *rq) +static void blk_add_trace_rq_merge(void *ignore, struct request *rq) { blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_BACKMERGE, - blk_trace_request_get_cgid(q, rq)); + blk_trace_request_get_cgid(rq)); } -static void blk_add_trace_rq_requeue(void *ignore, - struct request_queue *q, - struct request *rq) +static void blk_add_trace_rq_requeue(void *ignore, struct request *rq) { blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_REQUEUE, - blk_trace_request_get_cgid(q, rq)); + blk_trace_request_get_cgid(rq)); } static void blk_add_trace_rq_complete(void *ignore, struct request *rq, int error, unsigned int nr_bytes) { blk_add_trace_rq(rq, error, nr_bytes, BLK_TA_COMPLETE, - blk_trace_request_get_cgid(rq->q, rq)); + blk_trace_request_get_cgid(rq)); } /** @@ -1037,16 +1032,14 @@ static void blk_add_trace_bio_remap(void *ignore, struct bio *bio, dev_t dev, * Add a trace for that action. 
* **/ -static void blk_add_trace_rq_remap(void *ignore, - struct request_queue *q, - struct request *rq, dev_t dev, +static void blk_add_trace_rq_remap(void *ignore, struct request *rq, dev_t dev, sector_t from) { struct blk_trace *bt; struct blk_io_trace_remap r; rcu_read_lock(); - bt = rcu_dereference(q->blk_trace); + bt = rcu_dereference(rq->q->blk_trace); if (likely(!bt)) { rcu_read_unlock(); return; @@ -1058,13 +1051,12 @@ static void blk_add_trace_rq_remap(void *ignore, __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq), rq_data_dir(rq), 0, BLK_TA_REMAP, 0, - sizeof(r), &r, blk_trace_request_get_cgid(q, rq)); + sizeof(r), &r, blk_trace_request_get_cgid(rq)); rcu_read_unlock(); } /** * blk_add_driver_data - Add binary message with driver-specific data - * @q: queue the io is for * @rq: io request * @data: driver-specific data * @len: length of driver-specific data @@ -1073,14 +1065,12 @@ static void blk_add_trace_rq_remap(void *ignore, * Some drivers might want to write driver-specific data per request. * **/ -void blk_add_driver_data(struct request_queue *q, - struct request *rq, - void *data, size_t len) +void blk_add_driver_data(struct request *rq, void *data, size_t len) { struct blk_trace *bt; rcu_read_lock(); - bt = rcu_dereference(q->blk_trace); + bt = rcu_dereference(rq->q->blk_trace); if (likely(!bt)) { rcu_read_unlock(); return; @@ -1088,7 +1078,7 @@ void blk_add_driver_data(struct request_queue *q, __blk_add_trace(bt, blk_rq_trace_sector(rq), blk_rq_bytes(rq), 0, 0, BLK_TA_DRV_DATA, 0, len, data, - blk_trace_request_get_cgid(q, rq)); + blk_trace_request_get_cgid(rq)); rcu_read_unlock(); } EXPORT_SYMBOL_GPL(blk_add_driver_data); -- cgit v1.2.3 From 4f19cab76136e800a3f04d8c9aa4d8e770e3d3d8 Mon Sep 17 00:00:00 2001 From: Florent Revest Date: Fri, 4 Dec 2020 12:36:05 +0100 Subject: bpf: Add a bpf_sock_from_file helper While eBPF programs can check whether a file is a socket by file->f_op == &socket_file_ops, they cannot convert the void private_data pointer to a struct socket BTF pointer. In order to do this a new helper wrapping sock_from_file is added. This is useful to tracing programs but also other program types inheriting this set of helpers such as iterators or LSM programs. Signed-off-by: Florent Revest Signed-off-by: Daniel Borkmann Acked-by: KP Singh Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20201204113609.1850150-2-revest@google.com --- include/uapi/linux/bpf.h | 9 +++++++++ kernel/trace/bpf_trace.c | 20 ++++++++++++++++++++ scripts/bpf_helpers_doc.py | 4 ++++ tools/include/uapi/linux/bpf.h | 9 +++++++++ 4 files changed, 42 insertions(+) (limited to 'kernel') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 1233f14f659f..30b477a26482 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3822,6 +3822,14 @@ union bpf_attr { * The **hash_algo** is returned on success, * **-EOPNOTSUP** if IMA is disabled or **-EINVAL** if * invalid arguments are passed. + * + * struct socket *bpf_sock_from_file(struct file *file) + * Description + * If the given file represents a socket, returns the associated + * socket. + * Return + * A pointer to a struct socket on success or NULL if the file is + * not a socket. 
*/ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3986,6 +3994,7 @@ union bpf_attr { FN(bprm_opts_set), \ FN(ktime_get_coarse_ns), \ FN(ima_inode_hash), \ + FN(sock_from_file), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index cb9d7478ef0c..0cf0a6331482 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1270,6 +1270,24 @@ const struct bpf_func_proto bpf_snprintf_btf_proto = { .arg5_type = ARG_ANYTHING, }; +BPF_CALL_1(bpf_sock_from_file, struct file *, file) +{ + return (unsigned long) sock_from_file(file); +} + +BTF_ID_LIST(bpf_sock_from_file_btf_ids) +BTF_ID(struct, socket) +BTF_ID(struct, file) + +static const struct bpf_func_proto bpf_sock_from_file_proto = { + .func = bpf_sock_from_file, + .gpl_only = false, + .ret_type = RET_PTR_TO_BTF_ID_OR_NULL, + .ret_btf_id = &bpf_sock_from_file_btf_ids[0], + .arg1_type = ARG_PTR_TO_BTF_ID, + .arg1_btf_id = &bpf_sock_from_file_btf_ids[1], +}; + const struct bpf_func_proto * bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { @@ -1366,6 +1384,8 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_per_cpu_ptr_proto; case BPF_FUNC_bpf_this_cpu_ptr: return &bpf_this_cpu_ptr_proto; + case BPF_FUNC_sock_from_file: + return &bpf_sock_from_file_proto; default: return NULL; } diff --git a/scripts/bpf_helpers_doc.py b/scripts/bpf_helpers_doc.py index 8b829748d488..867ada23281c 100755 --- a/scripts/bpf_helpers_doc.py +++ b/scripts/bpf_helpers_doc.py @@ -437,6 +437,8 @@ class PrinterHelpers(Printer): 'struct path', 'struct btf_ptr', 'struct inode', + 'struct socket', + 'struct file', ] known_types = { '...', @@ -482,6 +484,8 @@ class PrinterHelpers(Printer): 'struct path', 'struct btf_ptr', 'struct inode', + 'struct socket', + 'struct file', } mapped_types = { 'u8': '__u8', diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 1233f14f659f..30b477a26482 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -3822,6 +3822,14 @@ union bpf_attr { * The **hash_algo** is returned on success, * **-EOPNOTSUP** if IMA is disabled or **-EINVAL** if * invalid arguments are passed. + * + * struct socket *bpf_sock_from_file(struct file *file) + * Description + * If the given file represents a socket, returns the associated + * socket. + * Return + * A pointer to a struct socket on success or NULL if the file is + * not a socket. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -3986,6 +3994,7 @@ union bpf_attr { FN(bprm_opts_set), \ FN(ktime_get_coarse_ns), \ FN(ima_inode_hash), \ + FN(sock_from_file), \ /* */ /* integer value in 'imm' field of BPF_CALL instruction selects which helper -- cgit v1.2.3 From bcee5278958802b40ee8b26679155a6d9231783e Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Fri, 4 Dec 2020 16:36:16 -0500 Subject: tracing: Fix userstacktrace option for instances When the instances were able to use their own options, the userstacktrace option was left hardcoded for the top level. This made the instance userstacktrace option bascially into a nop, and will confuse users that set it, but nothing happens (I was confused when it happened to me!) 
Cc: stable@vger.kernel.org Fixes: 16270145ce6b ("tracing: Add trace options for core options to instances") Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 7d53c5bdea3e..06134189e9a7 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -163,7 +163,8 @@ static union trace_eval_map_item *trace_eval_maps; #endif /* CONFIG_TRACE_EVAL_MAP_FILE */ int tracing_set_tracer(struct trace_array *tr, const char *buf); -static void ftrace_trace_userstack(struct trace_buffer *buffer, +static void ftrace_trace_userstack(struct trace_array *tr, + struct trace_buffer *buffer, unsigned long flags, int pc); #define MAX_TRACER_SIZE 100 @@ -2870,7 +2871,7 @@ void trace_buffer_unlock_commit_regs(struct trace_array *tr, * two. They are not that meaningful. */ ftrace_trace_stack(tr, buffer, flags, regs ? 0 : STACK_SKIP, pc, regs); - ftrace_trace_userstack(buffer, flags, pc); + ftrace_trace_userstack(tr, buffer, flags, pc); } /* @@ -3056,13 +3057,14 @@ EXPORT_SYMBOL_GPL(trace_dump_stack); static DEFINE_PER_CPU(int, user_stack_count); static void -ftrace_trace_userstack(struct trace_buffer *buffer, unsigned long flags, int pc) +ftrace_trace_userstack(struct trace_array *tr, + struct trace_buffer *buffer, unsigned long flags, int pc) { struct trace_event_call *call = &event_user_stack; struct ring_buffer_event *event; struct userstack_entry *entry; - if (!(global_trace.trace_flags & TRACE_ITER_USERSTACKTRACE)) + if (!(tr->trace_flags & TRACE_ITER_USERSTACKTRACE)) return; /* @@ -3101,7 +3103,8 @@ ftrace_trace_userstack(struct trace_buffer *buffer, unsigned long flags, int pc) preempt_enable(); } #else /* CONFIG_USER_STACKTRACE_SUPPORT */ -static void ftrace_trace_userstack(struct trace_buffer *buffer, +static void ftrace_trace_userstack(struct trace_array *tr, + struct trace_buffer *buffer, unsigned long flags, int pc) { } -- cgit v1.2.3 From 45dc656aeb4d50e6a4b2ca110345fb0c96cf1189 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 7 Dec 2020 14:40:48 +0100 Subject: blktrace: fix up a kerneldoc comment Fixes: a54895fa057c ("block: remove the request_queue to argument request based tracepoints") Reported-by: Stephen Rothwell Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- kernel/trace/blktrace.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 7839a78205c2..2c5b3c5317c2 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -1022,7 +1022,6 @@ static void blk_add_trace_bio_remap(void *ignore, struct bio *bio, dev_t dev, /** * blk_add_trace_rq_remap - Add a trace for a request-remap operation * @ignore: trace callback data parameter (not used) - * @q: queue the io is for * @rq: the source request * @dev: target device * @from: source sector -- cgit v1.2.3 From e1868b9e36d0ca52e4e7c6c06953f191446e44df Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 7 Dec 2020 10:28:21 -0800 Subject: bpf: Avoid overflows involving hash elem_size Use of bpf_map_charge_init() was making sure hash tables would not use more than 4GB of memory. Since the implicit check disappeared, we have to be more careful about overflows, to support big hash tables. 
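To make the overflow concrete (a user-space illustration with an approximated element size, not kernel code): with the key_size and max_entries from the syzbot reproducer quoted next, the 32-bit product wraps well before the allocation size is computed, which is what the (u64) promotion in this patch avoids.

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/* Roughly what the reproducer implies: ~16 KiB per element
	 * (struct htab_elem + 16384-byte key + 8-byte value, assumed)
	 * and 262200 preallocated entries. */
	uint32_t elem_size   = 16440;
	uint32_t num_entries = 262200;

	uint32_t wrapped = elem_size * num_entries;           /* 32-bit multiply wraps */
	uint64_t widened = (uint64_t)elem_size * num_entries; /* what the patch allocates */

	printf("wrapped=%u widened=%llu\n",
	       wrapped, (unsigned long long)widened);
	return 0;
}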
syzbot triggers a panic using : bpf(BPF_MAP_CREATE, {map_type=BPF_MAP_TYPE_LRU_HASH, key_size=16384, value_size=8, max_entries=262200, map_flags=0, inner_map_fd=-1, map_name="", map_ifindex=0, btf_fd=-1, btf_key_type_id=0, btf_value_type_id=0, btf_vmlinux_value_type_id=0}, 64) = ... BUG: KASAN: vmalloc-out-of-bounds in bpf_percpu_lru_populate kernel/bpf/bpf_lru_list.c:594 [inline] BUG: KASAN: vmalloc-out-of-bounds in bpf_lru_populate+0x4ef/0x5e0 kernel/bpf/bpf_lru_list.c:611 Write of size 2 at addr ffffc90017e4a020 by task syz-executor.5/19786 CPU: 0 PID: 19786 Comm: syz-executor.5 Not tainted 5.10.0-rc3-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:77 [inline] dump_stack+0x107/0x163 lib/dump_stack.c:118 print_address_description.constprop.0.cold+0x5/0x4c8 mm/kasan/report.c:385 __kasan_report mm/kasan/report.c:545 [inline] kasan_report.cold+0x1f/0x37 mm/kasan/report.c:562 bpf_percpu_lru_populate kernel/bpf/bpf_lru_list.c:594 [inline] bpf_lru_populate+0x4ef/0x5e0 kernel/bpf/bpf_lru_list.c:611 prealloc_init kernel/bpf/hashtab.c:319 [inline] htab_map_alloc+0xf6e/0x1230 kernel/bpf/hashtab.c:507 find_and_alloc_map kernel/bpf/syscall.c:123 [inline] map_create kernel/bpf/syscall.c:829 [inline] __do_sys_bpf+0xa81/0x5170 kernel/bpf/syscall.c:4336 do_syscall_64+0x2d/0x70 arch/x86/entry/common.c:46 entry_SYSCALL_64_after_hwframe+0x44/0xa9 RIP: 0033:0x45deb9 Code: 0d b4 fb ff c3 66 2e 0f 1f 84 00 00 00 00 00 66 90 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 0f 83 db b3 fb ff c3 66 2e 0f 1f 84 00 00 00 00 RSP: 002b:00007fd93fbc0c78 EFLAGS: 00000246 ORIG_RAX: 0000000000000141 RAX: ffffffffffffffda RBX: 0000000000001a40 RCX: 000000000045deb9 RDX: 0000000000000040 RSI: 0000000020000280 RDI: 0000000000000000 RBP: 000000000119bf60 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000246 R12: 000000000119bf2c R13: 00007ffc08a7be8f R14: 00007fd93fbc19c0 R15: 000000000119bf2c Fixes: 755e5d55367a ("bpf: Eliminate rlimit-based memory accounting for hashtab maps") Reported-by: syzbot Signed-off-by: Eric Dumazet Signed-off-by: Alexei Starovoitov Acked-by: Roman Gushchin Link: https://lore.kernel.org/bpf/20201207182821.3940306-1-eric.dumazet@gmail.com --- kernel/bpf/hashtab.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index fe7a0733a63a..f53cca70e215 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -224,7 +224,7 @@ static void *fd_htab_map_get_ptr(const struct bpf_map *map, struct htab_elem *l) static struct htab_elem *get_htab_elem(struct bpf_htab *htab, int i) { - return (struct htab_elem *) (htab->elems + i * htab->elem_size); + return (struct htab_elem *) (htab->elems + i * (u64)htab->elem_size); } static void htab_free_elems(struct bpf_htab *htab) @@ -280,7 +280,7 @@ static int prealloc_init(struct bpf_htab *htab) if (!htab_is_percpu(htab) && !htab_is_lru(htab)) num_entries += num_possible_cpus(); - htab->elems = bpf_map_area_alloc(htab->elem_size * num_entries, + htab->elems = bpf_map_area_alloc((u64)htab->elem_size * num_entries, htab->map.numa_node); if (!htab->elems) return -ENOMEM; -- cgit v1.2.3 From a32ded3389abcc51a39fc7cb5f1793f7e5abaa88 Mon Sep 17 00:00:00 2001 From: Lukas Bulwahn Date: Tue, 17 Nov 2020 06:37:03 +0100 Subject: ring-buffer: Remove obsolete rb_event_is_commit() Commit a389d86f7fd0 ("ring-buffer: Have 
nested events still record running time stamp") removed the only uses of rb_event_is_commit() in rb_update_event() and rb_update_write_stamp(). Hence, since then, make CC=clang W=1 warns: kernel/trace/ring_buffer.c:2763:1: warning: unused function 'rb_event_is_commit' [-Wunused-function] Remove this obsolete function. Link: https://lkml.kernel.org/r/20201117053703.11275-1-lukas.bulwahn@gmail.com Reviewed-by: Nathan Chancellor Signed-off-by: Lukas Bulwahn Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ring_buffer.c | 17 ----------------- 1 file changed, 17 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 7cd888ee9ac7..1b9da155ff00 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -2629,9 +2629,6 @@ rb_add_time_stamp(struct ring_buffer_event *event, u64 delta, bool abs) return skip_time_extend(event); } -static inline bool rb_event_is_commit(struct ring_buffer_per_cpu *cpu_buffer, - struct ring_buffer_event *event); - #ifndef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK static inline bool sched_clock_stable(void) { @@ -2759,20 +2756,6 @@ static unsigned rb_calculate_event_length(unsigned length) return length; } -static __always_inline bool -rb_event_is_commit(struct ring_buffer_per_cpu *cpu_buffer, - struct ring_buffer_event *event) -{ - unsigned long addr = (unsigned long)event; - unsigned long index; - - index = rb_event_index(event); - addr &= PAGE_MASK; - - return cpu_buffer->commit_page->page == (void *)addr && - rb_commit_index(cpu_buffer) == index; -} - static u64 rb_time_delta(struct ring_buffer_event *event) { switch (event->type_len) { -- cgit v1.2.3 From 888834903d362b48c879ce8ab9966428367360c9 Mon Sep 17 00:00:00 2001 From: Qiujun Huang Date: Thu, 12 Nov 2020 23:18:00 +0800 Subject: ring-buffer: Fix a typo in function description s/ring_buffer_commit_discard/ring_buffer_discard_commit/ Link: https://lkml.kernel.org/r/20201112151800.14382-1-hqjagain@gmail.com Signed-off-by: Qiujun Huang Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ring_buffer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 1b9da155ff00..f09d3f5911cb 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -3645,7 +3645,7 @@ rb_decrement_entry(struct ring_buffer_per_cpu *cpu_buffer, } /** - * ring_buffer_commit_discard - discard an event that has not been committed + * ring_buffer_discard_commit - discard an event that has not been committed * @buffer: the ring buffer * @event: non committed event to discard * -- cgit v1.2.3 From 2f4b03195fe8ed3b1e213f4a6cfe14cfc109d829 Mon Sep 17 00:00:00 2001 From: Lukas Bulwahn Date: Mon, 7 Dec 2020 13:37:20 +0100 Subject: bpf: Propagate __user annotations properly __htab_map_lookup_and_delete_batch() stores a user pointer in the local variable ubatch and uses that in copy_{from,to}_user(), but ubatch misses a __user annotation. 
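A minimal sketch of what the annotation buys (illustrative, not from the patch): sparse tracks the user address space as part of the pointer type, and dropping it at the assignment is what produces the diagnostics quoted next.

	u64 in_batch = attr->batch.in_batch;

	void __user *ubatch = u64_to_user_ptr(in_batch); /* address space preserved */
	void        *plain  = u64_to_user_ptr(in_batch); /* sparse: different address spaces */

	/* copy_from_user() takes a 'const void __user *' source, so every
	 * later use of the unannotated pointer warns as well. */
	if (copy_from_user(&batch, ubatch, sizeof(batch)))
		return -EFAULT;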
So, sparse warns in the various assignments and uses of ubatch: kernel/bpf/hashtab.c:1415:24: warning: incorrect type in initializer (different address spaces) kernel/bpf/hashtab.c:1415:24: expected void *ubatch kernel/bpf/hashtab.c:1415:24: got void [noderef] __user * kernel/bpf/hashtab.c:1444:46: warning: incorrect type in argument 2 (different address spaces) kernel/bpf/hashtab.c:1444:46: expected void const [noderef] __user *from kernel/bpf/hashtab.c:1444:46: got void *ubatch kernel/bpf/hashtab.c:1608:16: warning: incorrect type in assignment (different address spaces) kernel/bpf/hashtab.c:1608:16: expected void *ubatch kernel/bpf/hashtab.c:1608:16: got void [noderef] __user * kernel/bpf/hashtab.c:1609:26: warning: incorrect type in argument 1 (different address spaces) kernel/bpf/hashtab.c:1609:26: expected void [noderef] __user *to kernel/bpf/hashtab.c:1609:26: got void *ubatch Add the __user annotation to repair this chain of propagating __user annotations in __htab_map_lookup_and_delete_batch(). Signed-off-by: Lukas Bulwahn Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20201207123720.19111-1-lukas.bulwahn@gmail.com --- kernel/bpf/hashtab.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index f53cca70e215..7e848200cd26 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -1412,7 +1412,7 @@ __htab_map_lookup_and_delete_batch(struct bpf_map *map, void *keys = NULL, *values = NULL, *value, *dst_key, *dst_val; void __user *uvalues = u64_to_user_ptr(attr->batch.values); void __user *ukeys = u64_to_user_ptr(attr->batch.keys); - void *ubatch = u64_to_user_ptr(attr->batch.in_batch); + void __user *ubatch = u64_to_user_ptr(attr->batch.in_batch); u32 batch, max_count, size, bucket_size; struct htab_elem *node_to_free = NULL; u64 elem_map_flags, map_flags; -- cgit v1.2.3 From 8d143c610b62f2820fbc97dc441d54ac326abe1a Mon Sep 17 00:00:00 2001 From: Lukas Bulwahn Date: Mon, 30 Nov 2020 13:49:15 +0100 Subject: printk: remove obsolete dead assignment Commit 849f3127bb46 ("switch /dev/kmsg to ->write_iter()") refactored devkmsg_write() and left over a dead assignment on the variable 'len'. Hence, make clang-analyzer warns: kernel/printk/printk.c:744:4: warning: Value stored to 'len' is never read [clang-analyzer-deadcode.DeadStores] len -= endp - line; ^ Simply remove this obsolete dead assignment here. Link: https://lore.kernel.org/r/20201130124915.7573-1-lukas.bulwahn@gmail.com Signed-off-by: Lukas Bulwahn Reviewed-by: Sergey Senozhatsky Reviewed-by: Petr Mladek Signed-off-by: Petr Mladek --- kernel/printk/printk.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index fe64a49344bf..6bffb01431dd 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -741,7 +741,6 @@ static ssize_t devkmsg_write(struct kiocb *iocb, struct iov_iter *from) if (LOG_FACILITY(u) != 0) facility = LOG_FACILITY(u); endp++; - len -= endp - line; line = endp; } } -- cgit v1.2.3 From 8bdd8e275ede9786d845b3ec952836e61fd824e9 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 7 Dec 2020 22:43:26 -0800 Subject: bpf: Return -ENOTSUPP when attaching to non-kernel BTF Return -ENOTSUPP if tracing BPF program is attempted to be attached with specified attach_btf_obj_fd pointing to non-kernel (neither vmlinux nor module) BTF object. 
This scenario might be supported in the future and isn't outright invalid, so -EINVAL isn't the most appropriate error code. Suggested-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20201208064326.667389-1-andrii@kernel.org --- kernel/bpf/syscall.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 0cd3cc2af9c1..287be337d5f6 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2121,8 +2121,11 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr) if (IS_ERR(attach_btf)) return -EINVAL; if (!btf_is_kernel(attach_btf)) { + /* attaching through specifying bpf_prog's BTF + * objects directly might be supported eventually + */ btf_put(attach_btf); - return -EINVAL; + return -ENOTSUPP; } } } else if (attr->attach_btf_id) { -- cgit v1.2.3 From b60da4955f53d1f50e44351a9c3a37a92503079e Mon Sep 17 00:00:00 2001 From: Florent Revest Date: Tue, 8 Dec 2020 18:36:23 +0100 Subject: bpf: Only provide bpf_sock_from_file with CONFIG_NET This moves the bpf_sock_from_file definition into net/core/filter.c which only gets compiled with CONFIG_NET and also moves the helper proto usage next to other tracing helpers that are conditional on CONFIG_NET. This avoids ld: kernel/trace/bpf_trace.o: in function `bpf_sock_from_file': bpf_trace.c:(.text+0xe23): undefined reference to `sock_from_file' When compiling a kernel with BPF and without NET. Reported-by: kernel test robot Reported-by: Randy Dunlap Signed-off-by: Florent Revest Signed-off-by: Alexei Starovoitov Acked-by: Randy Dunlap Acked-by: Martin KaFai Lau Acked-by: KP Singh Link: https://lore.kernel.org/bpf/20201208173623.1136863-1-revest@chromium.org --- include/linux/bpf.h | 1 + kernel/trace/bpf_trace.c | 22 ++-------------------- net/core/filter.c | 18 ++++++++++++++++++ 3 files changed, 21 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index d05e75ed8c1b..07cb5d15e743 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1859,6 +1859,7 @@ extern const struct bpf_func_proto bpf_snprintf_btf_proto; extern const struct bpf_func_proto bpf_per_cpu_ptr_proto; extern const struct bpf_func_proto bpf_this_cpu_ptr_proto; extern const struct bpf_func_proto bpf_ktime_get_coarse_ns_proto; +extern const struct bpf_func_proto bpf_sock_from_file_proto; const struct bpf_func_proto *bpf_tracing_func_proto( enum bpf_func_id func_id, const struct bpf_prog *prog); diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 0cf0a6331482..52ddd217d6a1 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1270,24 +1270,6 @@ const struct bpf_func_proto bpf_snprintf_btf_proto = { .arg5_type = ARG_ANYTHING, }; -BPF_CALL_1(bpf_sock_from_file, struct file *, file) -{ - return (unsigned long) sock_from_file(file); -} - -BTF_ID_LIST(bpf_sock_from_file_btf_ids) -BTF_ID(struct, socket) -BTF_ID(struct, file) - -static const struct bpf_func_proto bpf_sock_from_file_proto = { - .func = bpf_sock_from_file, - .gpl_only = false, - .ret_type = RET_PTR_TO_BTF_ID_OR_NULL, - .ret_btf_id = &bpf_sock_from_file_btf_ids[0], - .arg1_type = ARG_PTR_TO_BTF_ID, - .arg1_btf_id = &bpf_sock_from_file_btf_ids[1], -}; - const struct bpf_func_proto * bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { @@ -1384,8 +1366,6 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct 
bpf_prog *prog) return &bpf_per_cpu_ptr_proto; case BPF_FUNC_bpf_this_cpu_ptr: return &bpf_this_cpu_ptr_proto; - case BPF_FUNC_sock_from_file: - return &bpf_sock_from_file_proto; default: return NULL; } @@ -1778,6 +1758,8 @@ tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_sk_storage_get_tracing_proto; case BPF_FUNC_sk_storage_delete: return &bpf_sk_storage_delete_tracing_proto; + case BPF_FUNC_sock_from_file: + return &bpf_sock_from_file_proto; #endif case BPF_FUNC_seq_printf: return prog->expected_attach_type == BPF_TRACE_ITER ? diff --git a/net/core/filter.c b/net/core/filter.c index 77001a35768f..255aeee72402 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -10413,6 +10413,24 @@ const struct bpf_func_proto bpf_skc_to_udp6_sock_proto = { .ret_btf_id = &btf_sock_ids[BTF_SOCK_TYPE_UDP6], }; +BPF_CALL_1(bpf_sock_from_file, struct file *, file) +{ + return (unsigned long)sock_from_file(file); +} + +BTF_ID_LIST(bpf_sock_from_file_btf_ids) +BTF_ID(struct, socket) +BTF_ID(struct, file) + +const struct bpf_func_proto bpf_sock_from_file_proto = { + .func = bpf_sock_from_file, + .gpl_only = false, + .ret_type = RET_PTR_TO_BTF_ID_OR_NULL, + .ret_btf_id = &bpf_sock_from_file_btf_ids[0], + .arg1_type = ARG_PTR_TO_BTF_ID, + .arg1_btf_id = &bpf_sock_from_file_btf_ids[1], +}; + static const struct bpf_func_proto * bpf_sk_base_func_proto(enum bpf_func_id func_id) { -- cgit v1.2.3 From 2ecedd7569080fd05c1a457e8af2165afecfa29f Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 3 Dec 2020 21:07:04 -0800 Subject: membarrier: Add an actual barrier before rseq_preempt() It seems that most RSEQ membarrier users will expect any stores done before the membarrier() syscall to be visible to the target task(s). While this is extremely likely to be true in practice, nothing actually guarantees it by a strict reading of the x86 manuals. Rather than providing this guarantee by accident and potentially causing a problem down the road, just add an explicit barrier. Fixes: 70216e18e519 ("membarrier: Provide core serializing command, *_SYNC_CORE") Signed-off-by: Andy Lutomirski Signed-off-by: Thomas Gleixner Reviewed-by: Mathieu Desnoyers Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/d3e7197e034fa4852afcf370ca49c30496e58e40.1607058304.git.luto@kernel.org --- kernel/sched/membarrier.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c index e23e74d52db5..7d98ef5d3bcd 100644 --- a/kernel/sched/membarrier.c +++ b/kernel/sched/membarrier.c @@ -40,6 +40,14 @@ static void ipi_mb(void *info) static void ipi_rseq(void *info) { + /* + * Ensure that all stores done by the calling thread are visible + * to the current task before the current task resumes. We could + * probably optimize this away on most architectures, but by the + * time we've already sent an IPI, the cost of the extra smp_mb() + * is negligible. + */ + smp_mb(); rseq_preempt(current); } -- cgit v1.2.3 From 758c9373d84168dc7d039cf85a0e920046b17b41 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 3 Dec 2020 21:07:05 -0800 Subject: membarrier: Explicitly sync remote cores when SYNC_CORE is requested membarrier() does not explicitly sync_core() remote CPUs; instead, it relies on the assumption that an IPI will result in a core sync. On x86, this may be true in practice, but it's not architecturally reliable. In particular, the SDM and APM do not appear to guarantee that interrupt delivery is serializing. 
While IRET does serialize, IPI return can schedule, thereby switching to another task in the same mm that was sleeping in a syscall. The new task could then SYSRET back to usermode without ever executing IRET. Make this more robust by explicitly calling sync_core_before_usermode() on remote cores. (This also helps people who search the kernel tree for instances of sync_core() and sync_core_before_usermode() -- one might be surprised that the core membarrier code doesn't currently show up in a such a search.) Fixes: 70216e18e519 ("membarrier: Provide core serializing command, *_SYNC_CORE") Signed-off-by: Andy Lutomirski Signed-off-by: Thomas Gleixner Reviewed-by: Mathieu Desnoyers Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/776b448d5f7bd6b12690707f5ed67bcda7f1d427.1607058304.git.luto@kernel.org --- kernel/sched/membarrier.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c index 7d98ef5d3bcd..1c278dff4f2d 100644 --- a/kernel/sched/membarrier.c +++ b/kernel/sched/membarrier.c @@ -38,6 +38,23 @@ static void ipi_mb(void *info) smp_mb(); /* IPIs should be serializing but paranoid. */ } +static void ipi_sync_core(void *info) +{ + /* + * The smp_mb() in membarrier after all the IPIs is supposed to + * ensure that memory on remote CPUs that occur before the IPI + * become visible to membarrier()'s caller -- see scenario B in + * the big comment at the top of this file. + * + * A sync_core() would provide this guarantee, but + * sync_core_before_usermode() might end up being deferred until + * after membarrier()'s smp_mb(). + */ + smp_mb(); /* IPIs should be serializing but paranoid. */ + + sync_core_before_usermode(); +} + static void ipi_rseq(void *info) { /* @@ -162,6 +179,7 @@ static int membarrier_private_expedited(int flags, int cpu_id) if (!(atomic_read(&mm->membarrier_state) & MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY)) return -EPERM; + ipi_func = ipi_sync_core; } else if (flags == MEMBARRIER_FLAG_RSEQ) { if (!IS_ENABLED(CONFIG_RSEQ)) return -EINVAL; -- cgit v1.2.3 From e45cdc71d1fa5ac3a57b23acc31eb959e4f60135 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 3 Dec 2020 21:07:06 -0800 Subject: membarrier: Execute SYNC_CORE on the calling thread membarrier()'s MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE is documented as syncing the core on all sibling threads but not necessarily the calling thread. This behavior is fundamentally buggy and cannot be used safely. Suppose a user program has two threads. Thread A is on CPU 0 and thread B is on CPU 1. Thread A modifies some text and calls membarrier(MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE). Then thread B executes the modified code. If, at any point after membarrier() decides which CPUs to target, thread A could be preempted and replaced by thread B on CPU 0. This could even happen on exit from the membarrier() syscall. If this happens, thread B will end up running on CPU 0 without having synced. In principle, this could be fixed by arranging for the scheduler to issue sync_core_before_usermode() whenever switching between two threads in the same mm if there is any possibility of a concurrent membarrier() call, but this would have considerable overhead. Instead, make membarrier() sync the calling CPU as well. As an optimization, this avoids an extra smp_mb() in the default barrier-only mode and an extra rseq preempt on the caller. 
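A user-space sketch of the usage pattern the scenario above is about (hypothetical JIT-style code publishing, not taken from this patch); with the calling CPU now included, thread B can never land on CPU 0 without the core sync having run there:

#define _GNU_SOURCE
#include <linux/membarrier.h>
#include <sys/syscall.h>
#include <unistd.h>

static int membarrier_cmd(int cmd, unsigned int flags)
{
	return syscall(__NR_membarrier, cmd, flags, 0);
}

/* Once, before any thread relies on SYNC_CORE. */
static int code_patch_init(void)
{
	return membarrier_cmd(MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE, 0);
}

/* Thread A: after writing the new instructions, sync every core in the
 * mm (now including the calling one) and only then publish them. */
static void publish_code(void (**entry)(void), void (*fresh)(void))
{
	membarrier_cmd(MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE, 0);
	__atomic_store_n(entry, fresh, __ATOMIC_RELEASE);
}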
Fixes: 70216e18e519 ("membarrier: Provide core serializing command, *_SYNC_CORE") Signed-off-by: Andy Lutomirski Signed-off-by: Thomas Gleixner Reviewed-by: Mathieu Desnoyers Link: https://lore.kernel.org/r/250ded637696d490c69bef1877148db86066881c.1607058304.git.luto@kernel.org --- kernel/sched/membarrier.c | 51 ++++++++++++++++++++++++++++++----------------- 1 file changed, 33 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c index 1c278dff4f2d..9d8df34bea75 100644 --- a/kernel/sched/membarrier.c +++ b/kernel/sched/membarrier.c @@ -194,7 +194,8 @@ static int membarrier_private_expedited(int flags, int cpu_id) return -EPERM; } - if (atomic_read(&mm->mm_users) == 1 || num_online_cpus() == 1) + if (flags != MEMBARRIER_FLAG_SYNC_CORE && + (atomic_read(&mm->mm_users) == 1 || num_online_cpus() == 1)) return 0; /* @@ -213,8 +214,6 @@ static int membarrier_private_expedited(int flags, int cpu_id) if (cpu_id >= nr_cpu_ids || !cpu_online(cpu_id)) goto out; - if (cpu_id == raw_smp_processor_id()) - goto out; rcu_read_lock(); p = rcu_dereference(cpu_rq(cpu_id)->curr); if (!p || p->mm != mm) { @@ -229,16 +228,6 @@ static int membarrier_private_expedited(int flags, int cpu_id) for_each_online_cpu(cpu) { struct task_struct *p; - /* - * Skipping the current CPU is OK even through we can be - * migrated at any point. The current CPU, at the point - * where we read raw_smp_processor_id(), is ensured to - * be in program order with respect to the caller - * thread. Therefore, we can skip this CPU from the - * iteration. - */ - if (cpu == raw_smp_processor_id()) - continue; p = rcu_dereference(cpu_rq(cpu)->curr); if (p && p->mm == mm) __cpumask_set_cpu(cpu, tmpmask); @@ -246,12 +235,38 @@ static int membarrier_private_expedited(int flags, int cpu_id) rcu_read_unlock(); } - preempt_disable(); - if (cpu_id >= 0) + if (cpu_id >= 0) { + /* + * smp_call_function_single() will call ipi_func() if cpu_id + * is the calling CPU. + */ smp_call_function_single(cpu_id, ipi_func, NULL, 1); - else - smp_call_function_many(tmpmask, ipi_func, NULL, 1); - preempt_enable(); + } else { + /* + * For regular membarrier, we can save a few cycles by + * skipping the current cpu -- we're about to do smp_mb() + * below, and if we migrate to a different cpu, this cpu + * and the new cpu will execute a full barrier in the + * scheduler. + * + * For SYNC_CORE, we do need a barrier on the current cpu -- + * otherwise, if we are migrated and replaced by a different + * task in the same mm just before, during, or after + * membarrier, we will end up with some thread in the mm + * running without a core sync. + * + * For RSEQ, don't rseq_preempt() the caller. User code + * is not supposed to issue syscalls at all from inside an + * rseq critical section. + */ + if (flags != MEMBARRIER_FLAG_SYNC_CORE) { + preempt_disable(); + smp_call_function_many(tmpmask, ipi_func, NULL, true); + preempt_enable(); + } else { + on_each_cpu_mask(tmpmask, ipi_func, NULL, true); + } + } out: if (cpu_id < 0) -- cgit v1.2.3 From 38dc717e97153e46375ee21797aa54777e5498f3 Mon Sep 17 00:00:00 2001 From: Jessica Yu Date: Fri, 27 Nov 2020 10:09:39 +0100 Subject: module: delay kobject uevent until after module init call Apparently there has been a longstanding race between udev/systemd and the module loader. Currently, the module loader sends a uevent right after sysfs initialization, but before the module calls its init function. 
However, some udev rules expect that the module has initialized already upon receiving the uevent. This race has been triggered recently (see link in references) in some systemd mount unit files. For instance, the configfs module creates the /sys/kernel/config mount point in its init function, however the module loader issues the uevent before this happens. sys-kernel-config.mount expects to be able to mount /sys/kernel/config upon receipt of the module loading uevent, but if the configfs module has not called its init function yet, then this directory will not exist and the mount unit fails. A similar situation exists for sys-fs-fuse-connections.mount, as the fuse sysfs mount point is created during the fuse module's init function. If udev is faster than module initialization then the mount unit would fail in a similar fashion. To fix this race, delay the module KOBJ_ADD uevent until after the module has finished calling its init routine. References: https://github.com/systemd/systemd/issues/17586 Reviewed-by: Greg Kroah-Hartman Tested-By: Nicolas Morey-Chaisemartin Signed-off-by: Jessica Yu --- kernel/module.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index a40ec708f8f2..e1dd0df57244 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1897,7 +1897,6 @@ static int mod_sysfs_init(struct module *mod) if (err) mod_kobject_put(mod); - /* delay uevent until full sysfs population */ out: return err; } @@ -1934,7 +1933,6 @@ static int mod_sysfs_setup(struct module *mod, add_sect_attrs(mod, info); add_notes_attrs(mod, info); - kobject_uevent(&mod->mkobj.kobj, KOBJ_ADD); return 0; out_unreg_modinfo_attrs: @@ -3656,6 +3654,9 @@ static noinline int do_init_module(struct module *mod) blocking_notifier_call_chain(&module_notify_list, MODULE_STATE_LIVE, mod); + /* Delay uevent until module has finished its init routine */ + kobject_uevent(&mod->mkobj.kobj, KOBJ_ADD); + /* * We need to finish all async code before the module init sequence * is done. This has potential to deadlock. For example, a newly -- cgit v1.2.3 From 6b916706f8f09348cfa4fdd3642ebf87d6a2a26b Mon Sep 17 00:00:00 2001 From: John Ogness Date: Wed, 9 Dec 2020 01:50:52 +0106 Subject: printk: inline log_output(),log_store() in vprintk_store() In preparation for removing logbuf_lock, inline log_output() and log_store() into vprintk_store(). This will simplify dealing with the various code branches and fallbacks that are possible. 
Signed-off-by: John Ogness Reviewed-by: Petr Mladek Reviewed-by: Sergey Senozhatsky Signed-off-by: Petr Mladek Link: https://lore.kernel.org/r/20201209004453.17720-2-john.ogness@linutronix.de --- kernel/printk/printk.c | 145 +++++++++++++++++++++++-------------------------- 1 file changed, 67 insertions(+), 78 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index bc1e3b5a97bd..150bfde41ba1 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -491,52 +491,6 @@ static void truncate_msg(u16 *text_len, u16 *trunc_msg_len) *trunc_msg_len = 0; } -/* insert record into the buffer, discard old ones, update heads */ -static int log_store(u32 caller_id, int facility, int level, - enum log_flags flags, u64 ts_nsec, - const struct dev_printk_info *dev_info, - const char *text, u16 text_len) -{ - struct prb_reserved_entry e; - struct printk_record r; - u16 trunc_msg_len = 0; - - prb_rec_init_wr(&r, text_len); - - if (!prb_reserve(&e, prb, &r)) { - /* truncate the message if it is too long for empty buffer */ - truncate_msg(&text_len, &trunc_msg_len); - prb_rec_init_wr(&r, text_len + trunc_msg_len); - /* survive when the log buffer is too small for trunc_msg */ - if (!prb_reserve(&e, prb, &r)) - return 0; - } - - /* fill message */ - memcpy(&r.text_buf[0], text, text_len); - if (trunc_msg_len) - memcpy(&r.text_buf[text_len], trunc_msg, trunc_msg_len); - r.info->text_len = text_len + trunc_msg_len; - r.info->facility = facility; - r.info->level = level & 7; - r.info->flags = flags & 0x1f; - if (ts_nsec > 0) - r.info->ts_nsec = ts_nsec; - else - r.info->ts_nsec = local_clock(); - r.info->caller_id = caller_id; - if (dev_info) - memcpy(&r.info->dev_info, dev_info, sizeof(r.info->dev_info)); - - /* A message without a trailing newline can be continued. */ - if (!(flags & LOG_NEWLINE)) - prb_commit(&e); - else - prb_final_commit(&e); - - return (text_len + trunc_msg_len); -} - int dmesg_restrict = IS_ENABLED(CONFIG_SECURITY_DMESG_RESTRICT); static int syslog_action_restricted(int type) @@ -1907,44 +1861,28 @@ static inline u32 printk_caller_id(void) 0x80000000 + raw_smp_processor_id(); } -static size_t log_output(int facility, int level, enum log_flags lflags, - const struct dev_printk_info *dev_info, - char *text, size_t text_len) -{ - const u32 caller_id = printk_caller_id(); - - if (lflags & LOG_CONT) { - struct prb_reserved_entry e; - struct printk_record r; - - prb_rec_init_wr(&r, text_len); - if (prb_reserve_in_last(&e, prb, &r, caller_id, LOG_LINE_MAX)) { - memcpy(&r.text_buf[r.info->text_len], text, text_len); - r.info->text_len += text_len; - if (lflags & LOG_NEWLINE) { - r.info->flags |= LOG_NEWLINE; - prb_final_commit(&e); - } else { - prb_commit(&e); - } - return text_len; - } - } - - /* Store it in the record log */ - return log_store(caller_id, facility, level, lflags, 0, - dev_info, text, text_len); -} - /* Must be called under logbuf_lock. */ int vprintk_store(int facility, int level, const struct dev_printk_info *dev_info, const char *fmt, va_list args) { + const u32 caller_id = printk_caller_id(); static char textbuf[LOG_LINE_MAX]; - char *text = textbuf; - size_t text_len; + struct prb_reserved_entry e; enum log_flags lflags = 0; + struct printk_record r; + u16 trunc_msg_len = 0; + char *text = textbuf; + u16 text_len; + u64 ts_nsec; + + /* + * Since the duration of printk() can vary depending on the message + * and state of the ringbuffer, grab the timestamp now so that it is + * close to the call of printk(). 
This provides a more deterministic + * timestamp with respect to the caller. + */ + ts_nsec = local_clock(); /* * The printf needs to come first; we need the syslog @@ -1983,7 +1921,58 @@ int vprintk_store(int facility, int level, if (dev_info) lflags |= LOG_NEWLINE; - return log_output(facility, level, lflags, dev_info, text, text_len); + if (lflags & LOG_CONT) { + prb_rec_init_wr(&r, text_len); + if (prb_reserve_in_last(&e, prb, &r, caller_id, LOG_LINE_MAX)) { + memcpy(&r.text_buf[r.info->text_len], text, text_len); + r.info->text_len += text_len; + + if (lflags & LOG_NEWLINE) { + r.info->flags |= LOG_NEWLINE; + prb_final_commit(&e); + } else { + prb_commit(&e); + } + + return text_len; + } + } + + /* + * Explicitly initialize the record before every prb_reserve() call. + * prb_reserve_in_last() and prb_reserve() purposely invalidate the + * structure when they fail. + */ + prb_rec_init_wr(&r, text_len); + if (!prb_reserve(&e, prb, &r)) { + /* truncate the message if it is too long for empty buffer */ + truncate_msg(&text_len, &trunc_msg_len); + + prb_rec_init_wr(&r, text_len + trunc_msg_len); + if (!prb_reserve(&e, prb, &r)) + return 0; + } + + /* fill message */ + memcpy(&r.text_buf[0], text, text_len); + if (trunc_msg_len) + memcpy(&r.text_buf[text_len], trunc_msg, trunc_msg_len); + r.info->text_len = text_len + trunc_msg_len; + r.info->facility = facility; + r.info->level = level & 7; + r.info->flags = lflags & 0x1f; + r.info->ts_nsec = ts_nsec; + r.info->caller_id = caller_id; + if (dev_info) + memcpy(&r.info->dev_info, dev_info, sizeof(r.info->dev_info)); + + /* A message without a trailing newline can be continued. */ + if (!(lflags & LOG_NEWLINE)) + prb_commit(&e); + else + prb_final_commit(&e); + + return (text_len + trunc_msg_len); } asmlinkage int vprintk_emit(int facility, int level, -- cgit v1.2.3 From b031a684bfd01d633c79d281bd0cf11c2f834ada Mon Sep 17 00:00:00 2001 From: John Ogness Date: Wed, 9 Dec 2020 01:50:53 +0106 Subject: printk: remove logbuf_lock writer-protection of ringbuffer Since the ringbuffer is lockless, there is no need for it to be protected by @logbuf_lock. Remove @logbuf_lock writer-protection of the ringbuffer. The reader-protection is not removed because some variables, used by readers, are using @logbuf_lock for synchronization: @syslog_seq, @syslog_time, @syslog_partial, @console_seq, struct kmsg_dumper. For PRINTK_NMI_DIRECT_CONTEXT_MASK, @logbuf_lock usage is not removed because it may be used for dumper synchronization. Without @logbuf_lock synchronization of vprintk_store() it is no longer possible to use the single static buffer for temporarily sprint'ing the message. Instead, use vsnprintf() to determine the length and perform the real vscnprintf() using the area reserved from the ringbuffer. This leads to suboptimal packing of the message data, but will result in less wasted storage than multiple per-cpu buffers to support lockless temporary sprint'ing. 
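The size-then-format step reads roughly like this user-space analogue (a sketch using stdio and malloc(); the code below instead sizes the reservation with vsnprintf() and fills the reserved ringbuffer record with vscnprintf()):

#include <stdarg.h>
#include <stdio.h>
#include <stdlib.h>

static char *format_into_reservation(const char *fmt, ...)
{
	va_list args, args2;
	char *buf;
	int len;

	va_start(args, fmt);
	va_copy(args2, args);

	/* Pass 1: measure only. vsnprintf() returns the length it would
	 * have written, not counting the terminating '\0'. */
	len = vsnprintf(NULL, 0, fmt, args2);
	va_end(args2);

	if (len < 0) {
		va_end(args);
		return NULL;
	}

	/* "Reserve" the destination; vprintk_store() reserves ringbuffer
	 * space here instead of allocating. */
	buf = malloc(len + 1);
	if (buf) {
		/* Pass 2: format into the reserved area. */
		vsnprintf(buf, len + 1, fmt, args);
	}

	va_end(args);
	return buf;
}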
Signed-off-by: John Ogness Reviewed-by: Sergey Senozhatsky Reviewed-by: Petr Mladek Signed-off-by: Petr Mladek Link: https://lore.kernel.org/r/20201209004453.17720-3-john.ogness@linutronix.de --- kernel/printk/printk.c | 138 +++++++++++++++++++++++++++++++++++-------------- 1 file changed, 98 insertions(+), 40 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 150bfde41ba1..c8847ee571f0 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -1126,7 +1126,7 @@ void __init setup_log_buf(int early) new_descs, ilog2(new_descs_count), new_infos); - logbuf_lock_irqsave(flags); + printk_safe_enter_irqsave(flags); log_buf_len = new_log_buf_len; log_buf = new_log_buf; @@ -1143,7 +1143,7 @@ void __init setup_log_buf(int early) */ prb = &printk_rb_dynamic; - logbuf_unlock_irqrestore(flags); + printk_safe_exit_irqrestore(flags); if (seq != prb_next_seq(&printk_rb_static)) { pr_err("dropped %llu messages\n", @@ -1861,18 +1861,90 @@ static inline u32 printk_caller_id(void) 0x80000000 + raw_smp_processor_id(); } -/* Must be called under logbuf_lock. */ +/** + * parse_prefix - Parse level and control flags. + * + * @text: The terminated text message. + * @level: A pointer to the current level value, will be updated. + * @lflags: A pointer to the current log flags, will be updated. + * + * @level may be NULL if the caller is not interested in the parsed value. + * Otherwise the variable pointed to by @level must be set to + * LOGLEVEL_DEFAULT in order to be updated with the parsed value. + * + * @lflags may be NULL if the caller is not interested in the parsed value. + * Otherwise the variable pointed to by @lflags will be OR'd with the parsed + * value. + * + * Return: The length of the parsed level and control flags. + */ +static u16 parse_prefix(char *text, int *level, enum log_flags *lflags) +{ + u16 prefix_len = 0; + int kern_level; + + while (*text) { + kern_level = printk_get_level(text); + if (!kern_level) + break; + + switch (kern_level) { + case '0' ... '7': + if (level && *level == LOGLEVEL_DEFAULT) + *level = kern_level - '0'; + break; + case 'c': /* KERN_CONT */ + if (lflags) + *lflags |= LOG_CONT; + } + + prefix_len += 2; + text += 2; + } + + return prefix_len; +} + +static u16 printk_sprint(char *text, u16 size, int facility, enum log_flags *lflags, + const char *fmt, va_list args) +{ + u16 text_len; + + text_len = vscnprintf(text, size, fmt, args); + + /* Mark and strip a trailing newline. */ + if (text_len && text[text_len - 1] == '\n') { + text_len--; + *lflags |= LOG_NEWLINE; + } + + /* Strip log level and control flags. */ + if (facility == 0) { + u16 prefix_len; + + prefix_len = parse_prefix(text, NULL, NULL); + if (prefix_len) { + text_len -= prefix_len; + memmove(text, text + prefix_len, text_len); + } + } + + return text_len; +} + +__printf(4, 0) int vprintk_store(int facility, int level, const struct dev_printk_info *dev_info, const char *fmt, va_list args) { const u32 caller_id = printk_caller_id(); - static char textbuf[LOG_LINE_MAX]; struct prb_reserved_entry e; enum log_flags lflags = 0; struct printk_record r; u16 trunc_msg_len = 0; - char *text = textbuf; + char prefix_buf[8]; + u16 reserve_size; + va_list args2; u16 text_len; u64 ts_nsec; @@ -1885,35 +1957,21 @@ int vprintk_store(int facility, int level, ts_nsec = local_clock(); /* - * The printf needs to come first; we need the syslog - * prefix which might be passed-in as a parameter. 
+ * The sprintf needs to come first since the syslog prefix might be + * passed in as a parameter. An extra byte must be reserved so that + * later the vscnprintf() into the reserved buffer has room for the + * terminating '\0', which is not counted by vsnprintf(). */ - text_len = vscnprintf(text, sizeof(textbuf), fmt, args); - - /* mark and strip a trailing newline */ - if (text_len && text[text_len-1] == '\n') { - text_len--; - lflags |= LOG_NEWLINE; - } - - /* strip kernel syslog prefix and extract log level or control flags */ - if (facility == 0) { - int kern_level; + va_copy(args2, args); + reserve_size = vsnprintf(&prefix_buf[0], sizeof(prefix_buf), fmt, args2) + 1; + va_end(args2); - while ((kern_level = printk_get_level(text)) != 0) { - switch (kern_level) { - case '0' ... '7': - if (level == LOGLEVEL_DEFAULT) - level = kern_level - '0'; - break; - case 'c': /* KERN_CONT */ - lflags |= LOG_CONT; - } + if (reserve_size > LOG_LINE_MAX) + reserve_size = LOG_LINE_MAX; - text_len -= 2; - text += 2; - } - } + /* Extract log level or control flags. */ + if (facility == 0) + parse_prefix(&prefix_buf[0], &level, &lflags); if (level == LOGLEVEL_DEFAULT) level = default_message_loglevel; @@ -1922,9 +1980,10 @@ int vprintk_store(int facility, int level, lflags |= LOG_NEWLINE; if (lflags & LOG_CONT) { - prb_rec_init_wr(&r, text_len); + prb_rec_init_wr(&r, reserve_size); if (prb_reserve_in_last(&e, prb, &r, caller_id, LOG_LINE_MAX)) { - memcpy(&r.text_buf[r.info->text_len], text, text_len); + text_len = printk_sprint(&r.text_buf[r.info->text_len], reserve_size, + facility, &lflags, fmt, args); r.info->text_len += text_len; if (lflags & LOG_NEWLINE) { @@ -1943,18 +2002,18 @@ int vprintk_store(int facility, int level, * prb_reserve_in_last() and prb_reserve() purposely invalidate the * structure when they fail. */ - prb_rec_init_wr(&r, text_len); + prb_rec_init_wr(&r, reserve_size); if (!prb_reserve(&e, prb, &r)) { /* truncate the message if it is too long for empty buffer */ - truncate_msg(&text_len, &trunc_msg_len); + truncate_msg(&reserve_size, &trunc_msg_len); - prb_rec_init_wr(&r, text_len + trunc_msg_len); + prb_rec_init_wr(&r, reserve_size + trunc_msg_len); if (!prb_reserve(&e, prb, &r)) return 0; } /* fill message */ - memcpy(&r.text_buf[0], text, text_len); + text_len = printk_sprint(&r.text_buf[0], reserve_size, facility, &lflags, fmt, args); if (trunc_msg_len) memcpy(&r.text_buf[text_len], trunc_msg, trunc_msg_len); r.info->text_len = text_len + trunc_msg_len; @@ -1995,10 +2054,9 @@ asmlinkage int vprintk_emit(int facility, int level, boot_delay_msec(level); printk_delay(); - /* This stops the holder of console_sem just where we want him */ - logbuf_lock_irqsave(flags); + printk_safe_enter_irqsave(flags); printed_len = vprintk_store(facility, level, dev_info, fmt, args); - logbuf_unlock_irqrestore(flags); + printk_safe_exit_irqrestore(flags); /* If called from the scheduler, we can not call up(). */ if (!in_sched) { -- cgit v1.2.3 From 0f9368b5bf6db0c04afc5454b1be79022a681615 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Thu, 3 Dec 2020 14:10:32 -0600 Subject: rwsem: Implement down_read_killable_nested In preparation for converting exec_update_mutex to a rwsem so that multiple readers can execute in parallel and not deadlock, add down_read_killable_nested. This is needed so that kcmp_lock can be converted from working on a mutexes to working on rw_semaphores. Signed-off-by: Eric W. 
Biederman Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/87o8jabqh3.fsf@x220.int.ebiederm.org --- include/linux/rwsem.h | 2 ++ kernel/locking/rwsem.c | 14 ++++++++++++++ 2 files changed, 16 insertions(+) (limited to 'kernel') diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h index 25e3fde85617..13021b08b2ed 100644 --- a/include/linux/rwsem.h +++ b/include/linux/rwsem.h @@ -171,6 +171,7 @@ extern void downgrade_write(struct rw_semaphore *sem); * See Documentation/locking/lockdep-design.rst for more details.) */ extern void down_read_nested(struct rw_semaphore *sem, int subclass); +extern int __must_check down_read_killable_nested(struct rw_semaphore *sem, int subclass); extern void down_write_nested(struct rw_semaphore *sem, int subclass); extern int down_write_killable_nested(struct rw_semaphore *sem, int subclass); extern void _down_write_nest_lock(struct rw_semaphore *sem, struct lockdep_map *nest_lock); @@ -191,6 +192,7 @@ extern void down_read_non_owner(struct rw_semaphore *sem); extern void up_read_non_owner(struct rw_semaphore *sem); #else # define down_read_nested(sem, subclass) down_read(sem) +# define down_read_killable_nested(sem, subclass) down_read_killable(sem) # define down_write_nest_lock(sem, nest_lock) down_write(sem) # define down_write_nested(sem, subclass) down_write(sem) # define down_write_killable_nested(sem, subclass) down_write_killable(sem) diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index f11b9bd3431d..54d11cb97551 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -1605,6 +1605,20 @@ void down_read_nested(struct rw_semaphore *sem, int subclass) } EXPORT_SYMBOL(down_read_nested); +int down_read_killable_nested(struct rw_semaphore *sem, int subclass) +{ + might_sleep(); + rwsem_acquire_read(&sem->dep_map, subclass, 0, _RET_IP_); + + if (LOCK_CONTENDED_RETURN(sem, __down_read_trylock, __down_read_killable)) { + rwsem_release(&sem->dep_map, _RET_IP_); + return -EINTR; + } + + return 0; +} +EXPORT_SYMBOL(down_read_killable_nested); + void _down_write_nest_lock(struct rw_semaphore *sem, struct lockdep_map *nest) { might_sleep(); -- cgit v1.2.3 From 31784cff7ee073b34d6eddabb95e3be2880a425c Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Thu, 3 Dec 2020 14:11:13 -0600 Subject: rwsem: Implement down_read_interruptible In preparation for converting exec_update_mutex to a rwsem so that multiple readers can execute in parallel and not deadlock, add down_read_interruptible. This is needed for perf_event_open to be converted (with no semantic changes) from working on a mutex to wroking on a rwsem. Signed-off-by: Eric W. 
Biederman Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/87k0tybqfy.fsf@x220.int.ebiederm.org --- include/linux/rwsem.h | 1 + kernel/locking/rwsem.c | 26 ++++++++++++++++++++++++++ 2 files changed, 27 insertions(+) (limited to 'kernel') diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h index 13021b08b2ed..4c715be48717 100644 --- a/include/linux/rwsem.h +++ b/include/linux/rwsem.h @@ -123,6 +123,7 @@ static inline int rwsem_is_contended(struct rw_semaphore *sem) * lock for reading */ extern void down_read(struct rw_semaphore *sem); +extern int __must_check down_read_interruptible(struct rw_semaphore *sem); extern int __must_check down_read_killable(struct rw_semaphore *sem); /* diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index 54d11cb97551..a163542d178e 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -1345,6 +1345,18 @@ static inline void __down_read(struct rw_semaphore *sem) } } +static inline int __down_read_interruptible(struct rw_semaphore *sem) +{ + if (!rwsem_read_trylock(sem)) { + if (IS_ERR(rwsem_down_read_slowpath(sem, TASK_INTERRUPTIBLE))) + return -EINTR; + DEBUG_RWSEMS_WARN_ON(!is_rwsem_reader_owned(sem), sem); + } else { + rwsem_set_reader_owned(sem); + } + return 0; +} + static inline int __down_read_killable(struct rw_semaphore *sem) { if (!rwsem_read_trylock(sem)) { @@ -1495,6 +1507,20 @@ void __sched down_read(struct rw_semaphore *sem) } EXPORT_SYMBOL(down_read); +int __sched down_read_interruptible(struct rw_semaphore *sem) +{ + might_sleep(); + rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_); + + if (LOCK_CONTENDED_RETURN(sem, __down_read_trylock, __down_read_interruptible)) { + rwsem_release(&sem->dep_map, _RET_IP_); + return -EINTR; + } + + return 0; +} +EXPORT_SYMBOL(down_read_interruptible); + int __sched down_read_killable(struct rw_semaphore *sem) { might_sleep(); -- cgit v1.2.3 From 3379116a0ca965b00e6522c7ea3f16c9dbd8f9f9 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 8 Dec 2020 10:22:16 +0100 Subject: locking/rwsem: Better collate rwsem_read_trylock() All users of rwsem_read_trylock() do rwsem_set_reader_owned(sem) on success, move it into rwsem_read_trylock() proper. 
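For orientation, a hypothetical caller-side sketch of the interruptible read-lock entry point added a couple of patches above; example_rwsem and example_read_path are invented names and this snippet is not part of the series:

  #include <linux/rwsem.h>

  static DECLARE_RWSEM(example_rwsem);

  static int example_read_path(void)
  {
          int err;

          /*
           * Sleep interruptibly while waiting for the read lock; a signal
           * aborts the wait and the resulting -EINTR is passed on.
           */
          err = down_read_interruptible(&example_rwsem);
          if (err)
                  return err;

          /* ... read-side critical section ... */

          up_read(&example_rwsem);
          return 0;
  }

down_read_killable_nested() is used the same way, with an extra lockdep subclass argument for callers that take two rwsems of the same lock class in a fixed order.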
Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20201207090243.GE3040@hirez.programming.kicks-ass.net --- kernel/locking/rwsem.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index a163542d178e..5c0dc7ebace9 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -273,9 +273,16 @@ static inline void rwsem_set_nonspinnable(struct rw_semaphore *sem) static inline bool rwsem_read_trylock(struct rw_semaphore *sem) { long cnt = atomic_long_add_return_acquire(RWSEM_READER_BIAS, &sem->count); + if (WARN_ON_ONCE(cnt < 0)) rwsem_set_nonspinnable(sem); - return !(cnt & RWSEM_READ_FAILED_MASK); + + if (!(cnt & RWSEM_READ_FAILED_MASK)) { + rwsem_set_reader_owned(sem); + return true; + } + + return false; } /* @@ -1340,8 +1347,6 @@ static inline void __down_read(struct rw_semaphore *sem) if (!rwsem_read_trylock(sem)) { rwsem_down_read_slowpath(sem, TASK_UNINTERRUPTIBLE); DEBUG_RWSEMS_WARN_ON(!is_rwsem_reader_owned(sem), sem); - } else { - rwsem_set_reader_owned(sem); } } @@ -1351,8 +1356,6 @@ static inline int __down_read_interruptible(struct rw_semaphore *sem) if (IS_ERR(rwsem_down_read_slowpath(sem, TASK_INTERRUPTIBLE))) return -EINTR; DEBUG_RWSEMS_WARN_ON(!is_rwsem_reader_owned(sem), sem); - } else { - rwsem_set_reader_owned(sem); } return 0; } @@ -1363,8 +1366,6 @@ static inline int __down_read_killable(struct rw_semaphore *sem) if (IS_ERR(rwsem_down_read_slowpath(sem, TASK_KILLABLE))) return -EINTR; DEBUG_RWSEMS_WARN_ON(!is_rwsem_reader_owned(sem), sem); - } else { - rwsem_set_reader_owned(sem); } return 0; } -- cgit v1.2.3 From 285c61aedf6bc5d81b37e4dc48c19012e8ff9836 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 8 Dec 2020 10:25:06 +0100 Subject: locking/rwsem: Introduce rwsem_write_trylock() One copy of this logic is better than three. 
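The helper is the familiar try-cmpxchg-then-record-owner shape. A standalone C11 analogue of that shape, with toy types and invented names rather than the kernel's:

  #include <stdatomic.h>
  #include <stdbool.h>

  #define TOY_UNLOCKED       0L
  #define TOY_WRITER_LOCKED  1L

  struct toy_rwsem {
          atomic_long count;
          atomic_long owner;      /* opaque id of the writing thread, 0 if none */
  };

  /*
   * One place attempts UNLOCKED -> WRITER_LOCKED and, on success, records
   * the owner; the lock, killable-lock and trylock paths just branch on
   * the return value.
   */
  static bool toy_write_trylock(struct toy_rwsem *sem, long self)
  {
          long expected = TOY_UNLOCKED;

          if (atomic_compare_exchange_strong_explicit(&sem->count, &expected,
                                                      TOY_WRITER_LOCKED,
                                                      memory_order_acquire,
                                                      memory_order_relaxed)) {
                  atomic_store_explicit(&sem->owner, self, memory_order_relaxed);
                  return true;
          }
          return false;
  }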
Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20201207090243.GE3040@hirez.programming.kicks-ass.net --- kernel/locking/rwsem.c | 38 ++++++++++++++++---------------------- 1 file changed, 16 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index 5c0dc7ebace9..7915456b9dfa 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -285,6 +285,18 @@ static inline bool rwsem_read_trylock(struct rw_semaphore *sem) return false; } +static inline bool rwsem_write_trylock(struct rw_semaphore *sem) +{ + long tmp = RWSEM_UNLOCKED_VALUE; + + if (atomic_long_try_cmpxchg_acquire(&sem->count, &tmp, RWSEM_WRITER_LOCKED)) { + rwsem_set_owner(sem); + return true; + } + + return false; +} + /* * Return just the real task structure pointer of the owner */ @@ -1395,42 +1407,24 @@ static inline int __down_read_trylock(struct rw_semaphore *sem) */ static inline void __down_write(struct rw_semaphore *sem) { - long tmp = RWSEM_UNLOCKED_VALUE; - - if (unlikely(!atomic_long_try_cmpxchg_acquire(&sem->count, &tmp, - RWSEM_WRITER_LOCKED))) + if (unlikely(!rwsem_write_trylock(sem))) rwsem_down_write_slowpath(sem, TASK_UNINTERRUPTIBLE); - else - rwsem_set_owner(sem); } static inline int __down_write_killable(struct rw_semaphore *sem) { - long tmp = RWSEM_UNLOCKED_VALUE; - - if (unlikely(!atomic_long_try_cmpxchg_acquire(&sem->count, &tmp, - RWSEM_WRITER_LOCKED))) { + if (unlikely(!rwsem_write_trylock(sem))) { if (IS_ERR(rwsem_down_write_slowpath(sem, TASK_KILLABLE))) return -EINTR; - } else { - rwsem_set_owner(sem); } + return 0; } static inline int __down_write_trylock(struct rw_semaphore *sem) { - long tmp; - DEBUG_RWSEMS_WARN_ON(sem->magic != sem, sem); - - tmp = RWSEM_UNLOCKED_VALUE; - if (atomic_long_try_cmpxchg_acquire(&sem->count, &tmp, - RWSEM_WRITER_LOCKED)) { - rwsem_set_owner(sem); - return true; - } - return false; + return rwsem_write_trylock(sem); } /* -- cgit v1.2.3 From c995e638ccbbc65a76d1713c4fdcf927e7e2cb83 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 8 Dec 2020 10:27:41 +0100 Subject: locking/rwsem: Fold __down_{read,write}*() There's a lot needless duplication in __down_{read,write}*(), cure that with a helper. 
Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20201207090243.GE3040@hirez.programming.kicks-ass.net --- kernel/locking/rwsem.c | 45 +++++++++++++++++++++++---------------------- 1 file changed, 23 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index 7915456b9dfa..67ae366d08dd 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -1354,32 +1354,29 @@ static struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem) /* * lock for reading */ -static inline void __down_read(struct rw_semaphore *sem) +static inline int __down_read_common(struct rw_semaphore *sem, int state) { if (!rwsem_read_trylock(sem)) { - rwsem_down_read_slowpath(sem, TASK_UNINTERRUPTIBLE); + if (IS_ERR(rwsem_down_read_slowpath(sem, state))) + return -EINTR; DEBUG_RWSEMS_WARN_ON(!is_rwsem_reader_owned(sem), sem); } + return 0; +} + +static inline void __down_read(struct rw_semaphore *sem) +{ + __down_read_common(sem, TASK_UNINTERRUPTIBLE); } static inline int __down_read_interruptible(struct rw_semaphore *sem) { - if (!rwsem_read_trylock(sem)) { - if (IS_ERR(rwsem_down_read_slowpath(sem, TASK_INTERRUPTIBLE))) - return -EINTR; - DEBUG_RWSEMS_WARN_ON(!is_rwsem_reader_owned(sem), sem); - } - return 0; + return __down_read_common(sem, TASK_INTERRUPTIBLE); } static inline int __down_read_killable(struct rw_semaphore *sem) { - if (!rwsem_read_trylock(sem)) { - if (IS_ERR(rwsem_down_read_slowpath(sem, TASK_KILLABLE))) - return -EINTR; - DEBUG_RWSEMS_WARN_ON(!is_rwsem_reader_owned(sem), sem); - } - return 0; + return __down_read_common(sem, TASK_KILLABLE); } static inline int __down_read_trylock(struct rw_semaphore *sem) @@ -1405,22 +1402,26 @@ static inline int __down_read_trylock(struct rw_semaphore *sem) /* * lock for writing */ -static inline void __down_write(struct rw_semaphore *sem) -{ - if (unlikely(!rwsem_write_trylock(sem))) - rwsem_down_write_slowpath(sem, TASK_UNINTERRUPTIBLE); -} - -static inline int __down_write_killable(struct rw_semaphore *sem) +static inline int __down_write_common(struct rw_semaphore *sem, int state) { if (unlikely(!rwsem_write_trylock(sem))) { - if (IS_ERR(rwsem_down_write_slowpath(sem, TASK_KILLABLE))) + if (IS_ERR(rwsem_down_write_slowpath(sem, state))) return -EINTR; } return 0; } +static inline void __down_write(struct rw_semaphore *sem) +{ + __down_write_common(sem, TASK_UNINTERRUPTIBLE); +} + +static inline int __down_write_killable(struct rw_semaphore *sem) +{ + return __down_write_common(sem, TASK_KILLABLE); +} + static inline int __down_write_trylock(struct rw_semaphore *sem) { DEBUG_RWSEMS_WARN_ON(sem->magic != sem, sem); -- cgit v1.2.3 From c8fe8b0564388f41147326f31e4587171aacccd4 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Fri, 20 Nov 2020 23:14:12 -0500 Subject: locking/rwsem: Pass the current atomic count to rwsem_down_read_slowpath() The atomic count value right after reader count increment can be useful to determine the rwsem state at trylock time. So the count value is passed down to rwsem_down_read_slowpath() to be used when appropriate. 
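The change amounts to handing the freshly observed counter value back through an out parameter, so the slow path reasons about the same snapshot the trylock saw instead of re-reading the atomic. A compact C11 sketch of that shape, with toy constants and invented names rather than the kernel's layout:

  #include <stdatomic.h>
  #include <stdbool.h>

  #define TOY_READER_BIAS       (1L << 8)
  #define TOY_READ_FAILED_MASK  0xffL     /* writer/handoff bits in this toy layout */

  /* Placeholder slow path: the real one queues the reader and may sleep. */
  static void toy_down_read_slowpath(atomic_long *count, long snapshot)
  {
          (void)count;
          (void)snapshot;
  }

  /* Add the reader bias and hand the observed count back via *cntp. */
  static bool toy_read_trylock(atomic_long *count, long *cntp)
  {
          *cntp = atomic_fetch_add_explicit(count, TOY_READER_BIAS,
                                            memory_order_acquire) + TOY_READER_BIAS;
          return !(*cntp & TOY_READ_FAILED_MASK);
  }

  static void toy_down_read(atomic_long *count)
  {
          long cnt;

          if (!toy_read_trylock(count, &cnt))
                  toy_down_read_slowpath(count, cnt);     /* reuses the snapshot */
  }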
Signed-off-by: Waiman Long Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Davidlohr Bueso Link: https://lkml.kernel.org/r/20201121041416.12285-2-longman@redhat.com --- kernel/locking/rwsem.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index 67ae366d08dd..5768b90223c0 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -270,14 +270,14 @@ static inline void rwsem_set_nonspinnable(struct rw_semaphore *sem) owner | RWSEM_NONSPINNABLE)); } -static inline bool rwsem_read_trylock(struct rw_semaphore *sem) +static inline bool rwsem_read_trylock(struct rw_semaphore *sem, long *cntp) { - long cnt = atomic_long_add_return_acquire(RWSEM_READER_BIAS, &sem->count); + *cntp = atomic_long_add_return_acquire(RWSEM_READER_BIAS, &sem->count); - if (WARN_ON_ONCE(cnt < 0)) + if (WARN_ON_ONCE(*cntp < 0)) rwsem_set_nonspinnable(sem); - if (!(cnt & RWSEM_READ_FAILED_MASK)) { + if (!(*cntp & RWSEM_READ_FAILED_MASK)) { rwsem_set_reader_owned(sem); return true; } @@ -1008,9 +1008,9 @@ rwsem_spin_on_owner(struct rw_semaphore *sem, unsigned long nonspinnable) * Wait for the read lock to be granted */ static struct rw_semaphore __sched * -rwsem_down_read_slowpath(struct rw_semaphore *sem, int state) +rwsem_down_read_slowpath(struct rw_semaphore *sem, long count, int state) { - long count, adjustment = -RWSEM_READER_BIAS; + long adjustment = -RWSEM_READER_BIAS; struct rwsem_waiter waiter; DEFINE_WAKE_Q(wake_q); bool wake = false; @@ -1356,8 +1356,10 @@ static struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem) */ static inline int __down_read_common(struct rw_semaphore *sem, int state) { - if (!rwsem_read_trylock(sem)) { - if (IS_ERR(rwsem_down_read_slowpath(sem, state))) + long count; + + if (!rwsem_read_trylock(sem, &count)) { + if (IS_ERR(rwsem_down_read_slowpath(sem, count, state))) return -EINTR; DEBUG_RWSEMS_WARN_ON(!is_rwsem_reader_owned(sem), sem); } -- cgit v1.2.3 From 2f06f702925b512a95b95dca3855549c047eef58 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Fri, 20 Nov 2020 23:14:13 -0500 Subject: locking/rwsem: Prevent potential lock starvation The lock handoff bit is added in commit 4f23dbc1e657 ("locking/rwsem: Implement lock handoff to prevent lock starvation") to avoid lock starvation. However, allowing readers to do optimistic spinning does introduce an unlikely scenario where lock starvation can happen. The lock handoff bit may only be set when a waiter is being woken up. In the case of reader unlock, wakeup happens only when the reader count reaches 0. If there is a continuous stream of incoming readers acquiring read lock via optimistic spinning, it is possible that the reader count may never reach 0 and so the handoff bit will never be asserted. One way to prevent this scenario from happening is to disallow optimistic spinning if the rwsem is currently owned by readers. If the previous or current owner is a writer, optimistic spinning will be allowed. If the previous owner is a reader but the reader count has reached 0 before, a wakeup should have been issued. So the handoff mechanism will be kicked in to prevent lock starvation. As a result, it should be OK to do optimistic spinning in this case. This patch may have some impact on reader performance as it reduces reader optimistic spinning especially if the lock critical sections are short the number of contending readers are small. 
Signed-off-by: Waiman Long Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Davidlohr Bueso Link: https://lkml.kernel.org/r/20201121041416.12285-3-longman@redhat.com --- kernel/locking/rwsem.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index 5768b90223c0..c055f4b28b23 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -1010,16 +1010,27 @@ rwsem_spin_on_owner(struct rw_semaphore *sem, unsigned long nonspinnable) static struct rw_semaphore __sched * rwsem_down_read_slowpath(struct rw_semaphore *sem, long count, int state) { - long adjustment = -RWSEM_READER_BIAS; + long owner, adjustment = -RWSEM_READER_BIAS; + long rcnt = (count >> RWSEM_READER_SHIFT); struct rwsem_waiter waiter; DEFINE_WAKE_Q(wake_q); bool wake = false; + /* + * To prevent a constant stream of readers from starving a sleeping + * waiter, don't attempt optimistic spinning if the lock is currently + * owned by readers. + */ + owner = atomic_long_read(&sem->owner); + if ((owner & RWSEM_READER_OWNED) && (rcnt > 1) && + !(count & RWSEM_WRITER_LOCKED)) + goto queue; + /* * Save the current read-owner of rwsem, if available, and the * reader nonspinnable bit. */ - waiter.last_rowner = atomic_long_read(&sem->owner); + waiter.last_rowner = owner; if (!(waiter.last_rowner & RWSEM_READER_OWNED)) waiter.last_rowner &= RWSEM_RD_NONSPINNABLE; -- cgit v1.2.3 From 1a728dff855a318bb58bcc1259b1826a7ad9f0bd Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Fri, 20 Nov 2020 23:14:14 -0500 Subject: locking/rwsem: Enable reader optimistic lock stealing If the optimistic spinning queue is empty and the rwsem does not have the handoff or write-lock bits set, it is actually not necessary to call rwsem_optimistic_spin() to spin on it. Instead, it can steal the lock directly as its reader bias is in the count already. If it is the first reader in this state, it will try to wake up other readers in the wait queue. With this patch applied, the following were the lock event counts after rebooting a 2-socket system and a "make -j96" kernel rebuild. rwsem_opt_rlock=4437 rwsem_rlock=29 rwsem_rlock_steal=19 So lock stealing represents about 0.4% of all the read locks acquired in the slow path. 
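Expressed as code, the stealing decision is a couple of bit tests on the count snapshot taken when the reader bias was added; the additional requirement that the optimistic-spin queue be empty is omitted here, and the bit positions are toy assumptions rather than the kernel's layout:

  #include <stdbool.h>

  #define TOY_WRITER_LOCKED  (1L << 0)
  #define TOY_FLAG_HANDOFF   (1L << 1)

  /*
   * The reader bias is already in the count, so if neither a writer nor a
   * pending handoff is visible in that snapshot, the reader simply keeps
   * the lock instead of spinning or queueing.
   */
  static bool toy_reader_can_steal(long count_snapshot)
  {
          return !(count_snapshot & (TOY_WRITER_LOCKED | TOY_FLAG_HANDOFF));
  }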
Signed-off-by: Waiman Long Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Davidlohr Bueso Link: https://lkml.kernel.org/r/20201121041416.12285-4-longman@redhat.com --- kernel/locking/lock_events_list.h | 1 + kernel/locking/rwsem.c | 28 ++++++++++++++++++++++++++++ 2 files changed, 29 insertions(+) (limited to 'kernel') diff --git a/kernel/locking/lock_events_list.h b/kernel/locking/lock_events_list.h index 239039d0ce21..270a0d351932 100644 --- a/kernel/locking/lock_events_list.h +++ b/kernel/locking/lock_events_list.h @@ -63,6 +63,7 @@ LOCK_EVENT(rwsem_opt_nospin) /* # of disabled optspins */ LOCK_EVENT(rwsem_opt_norspin) /* # of disabled reader-only optspins */ LOCK_EVENT(rwsem_opt_rlock2) /* # of opt-acquired 2ndary read locks */ LOCK_EVENT(rwsem_rlock) /* # of read locks acquired */ +LOCK_EVENT(rwsem_rlock_steal) /* # of read locks by lock stealing */ LOCK_EVENT(rwsem_rlock_fast) /* # of fast read locks acquired */ LOCK_EVENT(rwsem_rlock_fail) /* # of failed read lock acquisitions */ LOCK_EVENT(rwsem_rlock_handoff) /* # of read lock handoffs */ diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index c055f4b28b23..ba5e239d08e7 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -976,6 +976,12 @@ static inline bool rwsem_reader_phase_trylock(struct rw_semaphore *sem, } return false; } + +static inline bool rwsem_no_spinners(struct rw_semaphore *sem) +{ + return !osq_is_locked(&sem->osq); +} + #else static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem, unsigned long nonspinnable) @@ -996,6 +1002,11 @@ static inline bool rwsem_reader_phase_trylock(struct rw_semaphore *sem, return false; } +static inline bool rwsem_no_spinners(sem) +{ + return false; +} + static inline int rwsem_spin_on_owner(struct rw_semaphore *sem, unsigned long nonspinnable) { @@ -1026,6 +1037,22 @@ rwsem_down_read_slowpath(struct rw_semaphore *sem, long count, int state) !(count & RWSEM_WRITER_LOCKED)) goto queue; + /* + * Reader optimistic lock stealing + * + * We can take the read lock directly without doing + * rwsem_optimistic_spin() if the conditions are right. + * Also wake up other readers if it is the first reader. + */ + if (!(count & (RWSEM_WRITER_LOCKED | RWSEM_FLAG_HANDOFF)) && + rwsem_no_spinners(sem)) { + rwsem_set_reader_owned(sem); + lockevent_inc(rwsem_rlock_steal); + if (rcnt == 1) + goto wake_readers; + return sem; + } + /* * Save the current read-owner of rwsem, if available, and the * reader nonspinnable bit. @@ -1048,6 +1075,7 @@ rwsem_down_read_slowpath(struct rw_semaphore *sem, long count, int state) * Wake up other readers in the wait list if the front * waiter is a reader. */ +wake_readers: if ((atomic_long_read(&sem->count) & RWSEM_FLAG_WAITERS)) { raw_spin_lock_irq(&sem->wait_lock); if (!list_empty(&sem->wait_list)) -- cgit v1.2.3 From 617f3ef95177840c77f59c2aec1029d27d5547d6 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Fri, 20 Nov 2020 23:14:16 -0500 Subject: locking/rwsem: Remove reader optimistic spinning Reader optimistic spinning is helpful when the reader critical section is short and there aren't that many readers around. It also improves the chance that a reader can get the lock as writer optimistic spinning disproportionally favors writers much more than readers. Since commit d3681e269fff ("locking/rwsem: Wake up almost all readers in wait queue"), all the waiting readers are woken up so that they can all get the read lock and run in parallel. 
When the number of contending readers is large, allowing reader optimistic spinning will likely cause reader fragmentation where multiple smaller groups of readers can get the read lock in a sequential manner separated by writers. That reduces reader parallelism. One possible way to address that drawback is to limit the number of readers (preferably one) that can do optimistic spinning. These readers act as representatives of all the waiting readers in the wait queue as they will wake up all those waiting readers once they get the lock. Alternatively, as reader optimistic lock stealing has already enhanced fairness to readers, it may be easier to just remove reader optimistic spinning and simplify the optimistic spinning code as a result. Performance measurements (locking throughput kops/s) using a locking microbenchmark with 50/50 reader/writer distribution and turbo-boost disabled were done on a 2-socket Cascade Lake system (48-core 96-thread) to see the impacts of these changes:

 1) Vanilla     - 5.10-rc3 kernel
 2) Before      - 5.10-rc3 kernel with previous patches in this series
 3) limit-rspin - 5.10-rc3 kernel with limited reader spinning patch
 4) no-rspin    - 5.10-rc3 kernel with reader spinning disabled

  # of threads  CS Load  Vanilla  Before  limit-rspin  no-rspin
  ------------  -------  -------  ------  -----------  --------
        2          1      5,185    5,662     5,214       5,077
        4          1      5,107    4,983     5,188       4,760
        8          1      4,782    4,564     4,720       4,628
       16          1      4,680    4,053     4,567       3,402
       32          1      4,299    1,115     1,118       1,098
       64          1      3,218      983     1,001         957
       96          1      1,938      944       957         930
        2         20      2,008    2,128     2,264       1,665
        4         20      1,390    1,033     1,046       1,101
        8         20      1,472    1,155     1,098       1,213
       16         20      1,332    1,077     1,089       1,122
       32         20        967      914       917         980
       64         20        787      874       891         858
       96         20        730      836       847         844
        2        100        372      356       360         355
        4        100        492      425       434         392
        8        100        533      537       529         538
       16        100        548      572       568         598
       32        100        499      520       527         537
       64        100        466      517       526         512
       96        100        406      497       506         509

The column "CS Load" represents the number of pause instructions issued in the locking critical section. A CS load of 1 is extremely short and is not likely in real situations. A load of 20 (moderate) and 100 (long) are more realistic. It can be seen that the previous patches in this series have reduced performance in general, except in highly contended cases with moderate or long critical sections where performance improves a bit. This change is mostly caused by the "Prevent potential lock starvation" patch, which reduces reader optimistic spinning and hence reduces reader fragmentation. The patch that further limits reader optimistic spinning doesn't seem to have much impact on overall performance, as shown in the benchmark data. The patch that disables reader optimistic spinning shows reduced performance in lightly loaded cases, but comparable or slightly better performance with heavier contention. This patch just removes reader optimistic spinning for now. As readers are not going to do optimistic spinning anymore, we don't need to consider if the OSQ is empty or not when doing lock stealing. 
Signed-off-by: Waiman Long Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Davidlohr Bueso Link: https://lkml.kernel.org/r/20201121041416.12285-6-longman@redhat.com --- kernel/locking/lock_events_list.h | 5 +- kernel/locking/rwsem.c | 284 +++++++------------------------------- 2 files changed, 49 insertions(+), 240 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/lock_events_list.h b/kernel/locking/lock_events_list.h index 270a0d351932..97fb6f3f840a 100644 --- a/kernel/locking/lock_events_list.h +++ b/kernel/locking/lock_events_list.h @@ -56,12 +56,9 @@ LOCK_EVENT(rwsem_sleep_reader) /* # of reader sleeps */ LOCK_EVENT(rwsem_sleep_writer) /* # of writer sleeps */ LOCK_EVENT(rwsem_wake_reader) /* # of reader wakeups */ LOCK_EVENT(rwsem_wake_writer) /* # of writer wakeups */ -LOCK_EVENT(rwsem_opt_rlock) /* # of opt-acquired read locks */ -LOCK_EVENT(rwsem_opt_wlock) /* # of opt-acquired write locks */ +LOCK_EVENT(rwsem_opt_lock) /* # of opt-acquired write locks */ LOCK_EVENT(rwsem_opt_fail) /* # of failed optspins */ LOCK_EVENT(rwsem_opt_nospin) /* # of disabled optspins */ -LOCK_EVENT(rwsem_opt_norspin) /* # of disabled reader-only optspins */ -LOCK_EVENT(rwsem_opt_rlock2) /* # of opt-acquired 2ndary read locks */ LOCK_EVENT(rwsem_rlock) /* # of read locks acquired */ LOCK_EVENT(rwsem_rlock_steal) /* # of read locks by lock stealing */ LOCK_EVENT(rwsem_rlock_fast) /* # of fast read locks acquired */ diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index ba5e239d08e7..ba67600c7b2c 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -31,19 +31,13 @@ #include "lock_events.h" /* - * The least significant 3 bits of the owner value has the following + * The least significant 2 bits of the owner value has the following * meanings when set. * - Bit 0: RWSEM_READER_OWNED - The rwsem is owned by readers - * - Bit 1: RWSEM_RD_NONSPINNABLE - Readers cannot spin on this lock. - * - Bit 2: RWSEM_WR_NONSPINNABLE - Writers cannot spin on this lock. + * - Bit 1: RWSEM_NONSPINNABLE - Cannot spin on a reader-owned lock * - * When the rwsem is either owned by an anonymous writer, or it is - * reader-owned, but a spinning writer has timed out, both nonspinnable - * bits will be set to disable optimistic spinning by readers and writers. - * In the later case, the last unlocking reader should then check the - * writer nonspinnable bit and clear it only to give writers preference - * to acquire the lock via optimistic spinning, but not readers. Similar - * action is also done in the reader slowpath. + * When the rwsem is reader-owned and a spinning writer has timed out, + * the nonspinnable bit will be set to disable optimistic spinning. * When a writer acquires a rwsem, it puts its task_struct pointer * into the owner field. It is cleared after an unlock. @@ -59,46 +53,14 @@ * is involved. Ideally we would like to track all the readers that own * a rwsem, but the overhead is simply too big. * - * Reader optimistic spinning is helpful when the reader critical section - * is short and there aren't that many readers around. It makes readers - * relatively more preferred than writers. When a writer times out spinning - * on a reader-owned lock and set the nospinnable bits, there are two main - * reasons for that. - * - * 1) The reader critical section is long, perhaps the task sleeps after - * acquiring the read lock. - * 2) There are just too many readers contending the lock causing it to - * take a while to service all of them. 
- * - * In the former case, long reader critical section will impede the progress - * of writers which is usually more important for system performance. In - * the later case, reader optimistic spinning tends to make the reader - * groups that contain readers that acquire the lock together smaller - * leading to more of them. That may hurt performance in some cases. In - * other words, the setting of nonspinnable bits indicates that reader - * optimistic spinning may not be helpful for those workloads that cause - * it. - * - * Therefore, any writers that had observed the setting of the writer - * nonspinnable bit for a given rwsem after they fail to acquire the lock - * via optimistic spinning will set the reader nonspinnable bit once they - * acquire the write lock. Similarly, readers that observe the setting - * of reader nonspinnable bit at slowpath entry will set the reader - * nonspinnable bits when they acquire the read lock via the wakeup path. - * - * Once the reader nonspinnable bit is on, it will only be reset when - * a writer is able to acquire the rwsem in the fast path or somehow a - * reader or writer in the slowpath doesn't observe the nonspinable bit. - * - * This is to discourage reader optmistic spinning on that particular - * rwsem and make writers more preferred. This adaptive disabling of reader - * optimistic spinning will alleviate the negative side effect of this - * feature. + * A fast path reader optimistic lock stealing is supported when the rwsem + * is previously owned by a writer and the following conditions are met: + * - OSQ is empty + * - rwsem is not currently writer owned + * - the handoff isn't set. */ #define RWSEM_READER_OWNED (1UL << 0) -#define RWSEM_RD_NONSPINNABLE (1UL << 1) -#define RWSEM_WR_NONSPINNABLE (1UL << 2) -#define RWSEM_NONSPINNABLE (RWSEM_RD_NONSPINNABLE | RWSEM_WR_NONSPINNABLE) +#define RWSEM_NONSPINNABLE (1UL << 1) #define RWSEM_OWNER_FLAGS_MASK (RWSEM_READER_OWNED | RWSEM_NONSPINNABLE) #ifdef CONFIG_DEBUG_RWSEMS @@ -203,7 +165,7 @@ static inline void __rwsem_set_reader_owned(struct rw_semaphore *sem, struct task_struct *owner) { unsigned long val = (unsigned long)owner | RWSEM_READER_OWNED | - (atomic_long_read(&sem->owner) & RWSEM_RD_NONSPINNABLE); + (atomic_long_read(&sem->owner) & RWSEM_NONSPINNABLE); atomic_long_set(&sem->owner, val); } @@ -372,7 +334,6 @@ struct rwsem_waiter { struct task_struct *task; enum rwsem_waiter_type type; unsigned long timeout; - unsigned long last_rowner; }; #define rwsem_first_waiter(sem) \ list_first_entry(&sem->wait_list, struct rwsem_waiter, list) @@ -486,10 +447,6 @@ static void rwsem_mark_wake(struct rw_semaphore *sem, * the reader is copied over. */ owner = waiter->task; - if (waiter->last_rowner & RWSEM_RD_NONSPINNABLE) { - owner = (void *)((unsigned long)owner | RWSEM_RD_NONSPINNABLE); - lockevent_inc(rwsem_opt_norspin); - } __rwsem_set_reader_owned(sem, owner); } @@ -620,30 +577,6 @@ static inline bool rwsem_try_write_lock(struct rw_semaphore *sem, } #ifdef CONFIG_RWSEM_SPIN_ON_OWNER -/* - * Try to acquire read lock before the reader is put on wait queue. - * Lock acquisition isn't allowed if the rwsem is locked or a writer handoff - * is ongoing. 
- */ -static inline bool rwsem_try_read_lock_unqueued(struct rw_semaphore *sem) -{ - long count = atomic_long_read(&sem->count); - - if (count & (RWSEM_WRITER_MASK | RWSEM_FLAG_HANDOFF)) - return false; - - count = atomic_long_fetch_add_acquire(RWSEM_READER_BIAS, &sem->count); - if (!(count & (RWSEM_WRITER_MASK | RWSEM_FLAG_HANDOFF))) { - rwsem_set_reader_owned(sem); - lockevent_inc(rwsem_opt_rlock); - return true; - } - - /* Back out the change */ - atomic_long_add(-RWSEM_READER_BIAS, &sem->count); - return false; -} - /* * Try to acquire write lock before the writer has been put on wait queue. */ @@ -655,7 +588,7 @@ static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem) if (atomic_long_try_cmpxchg_acquire(&sem->count, &count, count | RWSEM_WRITER_LOCKED)) { rwsem_set_owner(sem); - lockevent_inc(rwsem_opt_wlock); + lockevent_inc(rwsem_opt_lock); return true; } } @@ -671,8 +604,7 @@ static inline bool owner_on_cpu(struct task_struct *owner) return owner->on_cpu && !vcpu_is_preempted(task_cpu(owner)); } -static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem, - unsigned long nonspinnable) +static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem) { struct task_struct *owner; unsigned long flags; @@ -689,7 +621,7 @@ static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem, /* * Don't check the read-owner as the entry may be stale. */ - if ((flags & nonspinnable) || + if ((flags & RWSEM_NONSPINNABLE) || (owner && !(flags & RWSEM_READER_OWNED) && !owner_on_cpu(owner))) ret = false; rcu_read_unlock(); @@ -719,9 +651,9 @@ enum owner_state { #define OWNER_SPINNABLE (OWNER_NULL | OWNER_WRITER | OWNER_READER) static inline enum owner_state -rwsem_owner_state(struct task_struct *owner, unsigned long flags, unsigned long nonspinnable) +rwsem_owner_state(struct task_struct *owner, unsigned long flags) { - if (flags & nonspinnable) + if (flags & RWSEM_NONSPINNABLE) return OWNER_NONSPINNABLE; if (flags & RWSEM_READER_OWNED) @@ -731,14 +663,14 @@ rwsem_owner_state(struct task_struct *owner, unsigned long flags, unsigned long } static noinline enum owner_state -rwsem_spin_on_owner(struct rw_semaphore *sem, unsigned long nonspinnable) +rwsem_spin_on_owner(struct rw_semaphore *sem) { struct task_struct *new, *owner; unsigned long flags, new_flags; enum owner_state state; owner = rwsem_owner_flags(sem, &flags); - state = rwsem_owner_state(owner, flags, nonspinnable); + state = rwsem_owner_state(owner, flags); if (state != OWNER_WRITER) return state; @@ -752,7 +684,7 @@ rwsem_spin_on_owner(struct rw_semaphore *sem, unsigned long nonspinnable) */ new = rwsem_owner_flags(sem, &new_flags); if ((new != owner) || (new_flags != flags)) { - state = rwsem_owner_state(new, new_flags, nonspinnable); + state = rwsem_owner_state(new, new_flags); break; } @@ -801,14 +733,12 @@ static inline u64 rwsem_rspin_threshold(struct rw_semaphore *sem) return sched_clock() + delta; } -static bool rwsem_optimistic_spin(struct rw_semaphore *sem, bool wlock) +static bool rwsem_optimistic_spin(struct rw_semaphore *sem) { bool taken = false; int prev_owner_state = OWNER_NULL; int loop = 0; u64 rspin_threshold = 0; - unsigned long nonspinnable = wlock ? 
RWSEM_WR_NONSPINNABLE - : RWSEM_RD_NONSPINNABLE; preempt_disable(); @@ -825,15 +755,14 @@ static bool rwsem_optimistic_spin(struct rw_semaphore *sem, bool wlock) for (;;) { enum owner_state owner_state; - owner_state = rwsem_spin_on_owner(sem, nonspinnable); + owner_state = rwsem_spin_on_owner(sem); if (!(owner_state & OWNER_SPINNABLE)) break; /* * Try to acquire the lock */ - taken = wlock ? rwsem_try_write_lock_unqueued(sem) - : rwsem_try_read_lock_unqueued(sem); + taken = rwsem_try_write_lock_unqueued(sem); if (taken) break; @@ -841,7 +770,7 @@ static bool rwsem_optimistic_spin(struct rw_semaphore *sem, bool wlock) /* * Time-based reader-owned rwsem optimistic spinning */ - if (wlock && (owner_state == OWNER_READER)) { + if (owner_state == OWNER_READER) { /* * Re-initialize rspin_threshold every time when * the owner state changes from non-reader to reader. @@ -850,7 +779,7 @@ static bool rwsem_optimistic_spin(struct rw_semaphore *sem, bool wlock) * the beginning of the 2nd reader phase. */ if (prev_owner_state != OWNER_READER) { - if (rwsem_test_oflags(sem, nonspinnable)) + if (rwsem_test_oflags(sem, RWSEM_NONSPINNABLE)) break; rspin_threshold = rwsem_rspin_threshold(sem); loop = 0; @@ -926,89 +855,30 @@ done: } /* - * Clear the owner's RWSEM_WR_NONSPINNABLE bit if it is set. This should + * Clear the owner's RWSEM_NONSPINNABLE bit if it is set. This should * only be called when the reader count reaches 0. - * - * This give writers better chance to acquire the rwsem first before - * readers when the rwsem was being held by readers for a relatively long - * period of time. Race can happen that an optimistic spinner may have - * just stolen the rwsem and set the owner, but just clearing the - * RWSEM_WR_NONSPINNABLE bit will do no harm anyway. - */ -static inline void clear_wr_nonspinnable(struct rw_semaphore *sem) -{ - if (rwsem_test_oflags(sem, RWSEM_WR_NONSPINNABLE)) - atomic_long_andnot(RWSEM_WR_NONSPINNABLE, &sem->owner); -} - -/* - * This function is called when the reader fails to acquire the lock via - * optimistic spinning. In this case we will still attempt to do a trylock - * when comparing the rwsem state right now with the state when entering - * the slowpath indicates that the reader is still in a valid reader phase. - * This happens when the following conditions are true: - * - * 1) The lock is currently reader owned, and - * 2) The lock is previously not reader-owned or the last read owner changes. - * - * In the former case, we have transitioned from a writer phase to a - * reader-phase while spinning. In the latter case, it means the reader - * phase hasn't ended when we entered the optimistic spinning loop. In - * both cases, the reader is eligible to acquire the lock. This is the - * secondary path where a read lock is acquired optimistically. - * - * The reader non-spinnable bit wasn't set at time of entry or it will - * not be here at all. 
*/ -static inline bool rwsem_reader_phase_trylock(struct rw_semaphore *sem, - unsigned long last_rowner) +static inline void clear_nonspinnable(struct rw_semaphore *sem) { - unsigned long owner = atomic_long_read(&sem->owner); - - if (!(owner & RWSEM_READER_OWNED)) - return false; - - if (((owner ^ last_rowner) & ~RWSEM_OWNER_FLAGS_MASK) && - rwsem_try_read_lock_unqueued(sem)) { - lockevent_inc(rwsem_opt_rlock2); - lockevent_add(rwsem_opt_fail, -1); - return true; - } - return false; -} - -static inline bool rwsem_no_spinners(struct rw_semaphore *sem) -{ - return !osq_is_locked(&sem->osq); + if (rwsem_test_oflags(sem, RWSEM_NONSPINNABLE)) + atomic_long_andnot(RWSEM_NONSPINNABLE, &sem->owner); } #else -static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem, - unsigned long nonspinnable) +static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem) { return false; } -static inline bool rwsem_optimistic_spin(struct rw_semaphore *sem, bool wlock) +static inline bool rwsem_optimistic_spin(struct rw_semaphore *sem) { return false; } -static inline void clear_wr_nonspinnable(struct rw_semaphore *sem) { } - -static inline bool rwsem_reader_phase_trylock(struct rw_semaphore *sem, - unsigned long last_rowner) -{ - return false; -} - -static inline bool rwsem_no_spinners(sem) -{ - return false; -} +static inline void clear_nonspinnable(struct rw_semaphore *sem) { } static inline int -rwsem_spin_on_owner(struct rw_semaphore *sem, unsigned long nonspinnable) +rwsem_spin_on_owner(struct rw_semaphore *sem) { return 0; } @@ -1021,7 +891,7 @@ rwsem_spin_on_owner(struct rw_semaphore *sem, unsigned long nonspinnable) static struct rw_semaphore __sched * rwsem_down_read_slowpath(struct rw_semaphore *sem, long count, int state) { - long owner, adjustment = -RWSEM_READER_BIAS; + long adjustment = -RWSEM_READER_BIAS; long rcnt = (count >> RWSEM_READER_SHIFT); struct rwsem_waiter waiter; DEFINE_WAKE_Q(wake_q); @@ -1029,54 +899,25 @@ rwsem_down_read_slowpath(struct rw_semaphore *sem, long count, int state) /* * To prevent a constant stream of readers from starving a sleeping - * waiter, don't attempt optimistic spinning if the lock is currently - * owned by readers. + * waiter, don't attempt optimistic lock stealing if the lock is + * currently owned by readers. */ - owner = atomic_long_read(&sem->owner); - if ((owner & RWSEM_READER_OWNED) && (rcnt > 1) && - !(count & RWSEM_WRITER_LOCKED)) + if ((atomic_long_read(&sem->owner) & RWSEM_READER_OWNED) && + (rcnt > 1) && !(count & RWSEM_WRITER_LOCKED)) goto queue; /* - * Reader optimistic lock stealing - * - * We can take the read lock directly without doing - * rwsem_optimistic_spin() if the conditions are right. - * Also wake up other readers if it is the first reader. + * Reader optimistic lock stealing. */ - if (!(count & (RWSEM_WRITER_LOCKED | RWSEM_FLAG_HANDOFF)) && - rwsem_no_spinners(sem)) { + if (!(count & (RWSEM_WRITER_LOCKED | RWSEM_FLAG_HANDOFF))) { rwsem_set_reader_owned(sem); lockevent_inc(rwsem_rlock_steal); - if (rcnt == 1) - goto wake_readers; - return sem; - } - /* - * Save the current read-owner of rwsem, if available, and the - * reader nonspinnable bit. - */ - waiter.last_rowner = owner; - if (!(waiter.last_rowner & RWSEM_READER_OWNED)) - waiter.last_rowner &= RWSEM_RD_NONSPINNABLE; - - if (!rwsem_can_spin_on_owner(sem, RWSEM_RD_NONSPINNABLE)) - goto queue; - - /* - * Undo read bias from down_read() and do optimistic spinning. 
- */ - atomic_long_add(-RWSEM_READER_BIAS, &sem->count); - adjustment = 0; - if (rwsem_optimistic_spin(sem, false)) { - /* rwsem_optimistic_spin() implies ACQUIRE on success */ /* - * Wake up other readers in the wait list if the front - * waiter is a reader. + * Wake up other readers in the wait queue if it is + * the first reader. */ -wake_readers: - if ((atomic_long_read(&sem->count) & RWSEM_FLAG_WAITERS)) { + if ((rcnt == 1) && (count & RWSEM_FLAG_WAITERS)) { raw_spin_lock_irq(&sem->wait_lock); if (!list_empty(&sem->wait_list)) rwsem_mark_wake(sem, RWSEM_WAKE_READ_OWNED, @@ -1085,9 +926,6 @@ wake_readers: wake_up_q(&wake_q); } return sem; - } else if (rwsem_reader_phase_trylock(sem, waiter.last_rowner)) { - /* rwsem_reader_phase_trylock() implies ACQUIRE on success */ - return sem; } queue: @@ -1103,7 +941,7 @@ queue: * exit the slowpath and return immediately as its * RWSEM_READER_BIAS has already been set in the count. */ - if (adjustment && !(atomic_long_read(&sem->count) & + if (!(atomic_long_read(&sem->count) & (RWSEM_WRITER_MASK | RWSEM_FLAG_HANDOFF))) { /* Provide lock ACQUIRE */ smp_acquire__after_ctrl_dep(); @@ -1117,10 +955,7 @@ queue: list_add_tail(&waiter.list, &sem->wait_list); /* we're now waiting on the lock, but no longer actively locking */ - if (adjustment) - count = atomic_long_add_return(adjustment, &sem->count); - else - count = atomic_long_read(&sem->count); + count = atomic_long_add_return(adjustment, &sem->count); /* * If there are no active locks, wake the front queued process(es). @@ -1129,7 +964,7 @@ queue: * wake our own waiter to join the existing active readers ! */ if (!(count & RWSEM_LOCK_MASK)) { - clear_wr_nonspinnable(sem); + clear_nonspinnable(sem); wake = true; } if (wake || (!(count & RWSEM_WRITER_MASK) && @@ -1174,19 +1009,6 @@ out_nolock: return ERR_PTR(-EINTR); } -/* - * This function is called by the a write lock owner. So the owner value - * won't get changed by others. - */ -static inline void rwsem_disable_reader_optspin(struct rw_semaphore *sem, - bool disable) -{ - if (unlikely(disable)) { - atomic_long_or(RWSEM_RD_NONSPINNABLE, &sem->owner); - lockevent_inc(rwsem_opt_norspin); - } -} - /* * Wait until we successfully acquire the write lock */ @@ -1194,26 +1016,17 @@ static struct rw_semaphore * rwsem_down_write_slowpath(struct rw_semaphore *sem, int state) { long count; - bool disable_rspin; enum writer_wait_state wstate; struct rwsem_waiter waiter; struct rw_semaphore *ret = sem; DEFINE_WAKE_Q(wake_q); /* do optimistic spinning and steal lock if possible */ - if (rwsem_can_spin_on_owner(sem, RWSEM_WR_NONSPINNABLE) && - rwsem_optimistic_spin(sem, true)) { + if (rwsem_can_spin_on_owner(sem) && rwsem_optimistic_spin(sem)) { /* rwsem_optimistic_spin() implies ACQUIRE on success */ return sem; } - /* - * Disable reader optimistic spinning for this rwsem after - * acquiring the write lock when the setting of the nonspinnable - * bits are observed. - */ - disable_rspin = atomic_long_read(&sem->owner) & RWSEM_NONSPINNABLE; - /* * Optimistic spinning failed, proceed to the slowpath * and block until we can acquire the sem. @@ -1282,7 +1095,7 @@ wait: * without sleeping. */ if (wstate == WRITER_HANDOFF && - rwsem_spin_on_owner(sem, RWSEM_NONSPINNABLE) == OWNER_NULL) + rwsem_spin_on_owner(sem) == OWNER_NULL) goto trylock_again; /* Block until there are no active lockers. 
*/ @@ -1324,7 +1137,6 @@ trylock_again: } __set_current_state(TASK_RUNNING); list_del(&waiter.list); - rwsem_disable_reader_optspin(sem, disable_rspin); raw_spin_unlock_irq(&sem->wait_lock); lockevent_inc(rwsem_wlock); @@ -1484,7 +1296,7 @@ static inline void __up_read(struct rw_semaphore *sem) DEBUG_RWSEMS_WARN_ON(tmp < 0, sem); if (unlikely((tmp & (RWSEM_LOCK_MASK|RWSEM_FLAG_WAITERS)) == RWSEM_FLAG_WAITERS)) { - clear_wr_nonspinnable(sem); + clear_nonspinnable(sem); rwsem_wake(sem, tmp); } } -- cgit v1.2.3 From 78af4dc949daaa37b3fcd5f348f373085b4e858f Mon Sep 17 00:00:00 2001 From: "peterz@infradead.org" Date: Fri, 28 Aug 2020 14:37:20 +0200 Subject: perf: Break deadlock involving exec_update_mutex Syzbot reported a lock inversion involving perf. The sore point being perf holding exec_update_mutex() for a very long time, specifically across a whole bunch of filesystem ops in pmu::event_init() (uprobes) and anon_inode_getfile(). This then inverts against procfs code trying to take exec_update_mutex. Move the permission checks later, such that we need to hold the mutex over less code. Reported-by: syzbot+db9cdf3dd1f64252c6ef@syzkaller.appspotmail.com Signed-off-by: Peter Zijlstra (Intel) --- kernel/events/core.c | 46 +++++++++++++++++++++++----------------------- 1 file changed, 23 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index a21b0be2f22c..19ae6c931c52 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -11832,24 +11832,6 @@ SYSCALL_DEFINE5(perf_event_open, goto err_task; } - if (task) { - err = mutex_lock_interruptible(&task->signal->exec_update_mutex); - if (err) - goto err_task; - - /* - * Preserve ptrace permission check for backwards compatibility. - * - * We must hold exec_update_mutex across this and any potential - * perf_install_in_context() call for this new event to - * serialize against exec() altering our credentials (and the - * perf_event_exit_task() that could imply). - */ - err = -EACCES; - if (!perfmon_capable() && !ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) - goto err_cred; - } - if (flags & PERF_FLAG_PID_CGROUP) cgroup_fd = pid; @@ -11857,7 +11839,7 @@ SYSCALL_DEFINE5(perf_event_open, NULL, NULL, cgroup_fd); if (IS_ERR(event)) { err = PTR_ERR(event); - goto err_cred; + goto err_task; } if (is_sampling_event(event)) { @@ -11976,6 +11958,24 @@ SYSCALL_DEFINE5(perf_event_open, goto err_context; } + if (task) { + err = mutex_lock_interruptible(&task->signal->exec_update_mutex); + if (err) + goto err_file; + + /* + * Preserve ptrace permission check for backwards compatibility. + * + * We must hold exec_update_mutex across this and any potential + * perf_install_in_context() call for this new event to + * serialize against exec() altering our credentials (and the + * perf_event_exit_task() that could imply). 
+ */ + err = -EACCES; + if (!perfmon_capable() && !ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) + goto err_cred; + } + if (move_group) { gctx = __perf_event_ctx_lock_double(group_leader, ctx); @@ -12151,7 +12151,10 @@ err_locked: if (move_group) perf_event_ctx_unlock(group_leader, gctx); mutex_unlock(&ctx->mutex); -/* err_file: */ +err_cred: + if (task) + mutex_unlock(&task->signal->exec_update_mutex); +err_file: fput(event_file); err_context: perf_unpin_context(ctx); @@ -12163,9 +12166,6 @@ err_alloc: */ if (!event_file) free_event(event); -err_cred: - if (task) - mutex_unlock(&task->signal->exec_update_mutex); err_task: if (task) put_task_struct(task); -- cgit v1.2.3 From 01bb86b380a306bd937c96da36f66429f3362137 Mon Sep 17 00:00:00 2001 From: Saravana Kannan Date: Fri, 20 Nov 2020 18:02:22 -0800 Subject: driver core: Add fwnode_init() There are multiple locations in the kernel where a struct fwnode_handle is initialized. Add fwnode_init() so that we have one way of initializing a fwnode_handle. Acked-by: Rob Herring Signed-off-by: Saravana Kannan Link: https://lore.kernel.org/r/20201121020232.908850-8-saravanak@google.com Signed-off-by: Greg Kroah-Hartman --- drivers/acpi/property.c | 2 +- drivers/acpi/scan.c | 2 +- drivers/base/swnode.c | 2 +- drivers/firmware/efi/efi-init.c | 8 ++++---- include/linux/fwnode.h | 6 ++++++ include/linux/of.h | 2 +- kernel/irq/irqdomain.c | 2 +- 7 files changed, 15 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/drivers/acpi/property.c b/drivers/acpi/property.c index d04de10a63e4..24e87b630573 100644 --- a/drivers/acpi/property.c +++ b/drivers/acpi/property.c @@ -76,7 +76,7 @@ static bool acpi_nondev_subnode_extract(const union acpi_object *desc, return false; dn->name = link->package.elements[0].string.pointer; - dn->fwnode.ops = &acpi_data_fwnode_ops; + fwnode_init(&dn->fwnode, &acpi_data_fwnode_ops); dn->parent = parent; INIT_LIST_HEAD(&dn->data.properties); INIT_LIST_HEAD(&dn->data.subnodes); diff --git a/drivers/acpi/scan.c b/drivers/acpi/scan.c index a896e5e87c93..0ac19f9274b8 100644 --- a/drivers/acpi/scan.c +++ b/drivers/acpi/scan.c @@ -1589,7 +1589,7 @@ void acpi_init_device_object(struct acpi_device *device, acpi_handle handle, device->device_type = type; device->handle = handle; device->parent = acpi_bus_get_parent(handle); - device->fwnode.ops = &acpi_device_fwnode_ops; + fwnode_init(&device->fwnode, &acpi_device_fwnode_ops); acpi_set_device_status(device, sta); acpi_device_get_busid(device); acpi_set_pnp_ids(handle, &device->pnp, type); diff --git a/drivers/base/swnode.c b/drivers/base/swnode.c index 010828fc785b..4a4b2008fbc2 100644 --- a/drivers/base/swnode.c +++ b/drivers/base/swnode.c @@ -653,7 +653,7 @@ swnode_register(const struct software_node *node, struct swnode *parent, swnode->parent = parent; swnode->allocated = allocated; swnode->kobj.kset = swnode_kset; - swnode->fwnode.ops = &software_node_ops; + fwnode_init(&swnode->fwnode, &software_node_ops); ida_init(&swnode->child_ids); INIT_LIST_HEAD(&swnode->entry); diff --git a/drivers/firmware/efi/efi-init.c b/drivers/firmware/efi/efi-init.c index f55a92ff12c0..b148f1459fb3 100644 --- a/drivers/firmware/efi/efi-init.c +++ b/drivers/firmware/efi/efi-init.c @@ -359,9 +359,7 @@ static const struct fwnode_operations efifb_fwnode_ops = { .add_links = efifb_add_links, }; -static struct fwnode_handle efifb_fwnode = { - .ops = &efifb_fwnode_ops, -}; +static struct fwnode_handle efifb_fwnode; static int __init register_gop_device(void) { @@ -375,8 +373,10 @@ static int 
__init register_gop_device(void) if (!pd) return -ENOMEM; - if (IS_ENABLED(CONFIG_PCI)) + if (IS_ENABLED(CONFIG_PCI)) { + fwnode_init(&efifb_fwnode, &efifb_fwnode_ops); pd->dev.fwnode = &efifb_fwnode; + } err = platform_device_add_data(pd, &screen_info, sizeof(screen_info)); if (err) diff --git a/include/linux/fwnode.h b/include/linux/fwnode.h index e0abafbb17f8..5589799708b5 100644 --- a/include/linux/fwnode.h +++ b/include/linux/fwnode.h @@ -170,6 +170,12 @@ struct fwnode_operations { } while (false) #define get_dev_from_fwnode(fwnode) get_device((fwnode)->dev) +static inline void fwnode_init(struct fwnode_handle *fwnode, + const struct fwnode_operations *ops) +{ + fwnode->ops = ops; +} + extern u32 fw_devlink_get_flags(void); #endif diff --git a/include/linux/of.h b/include/linux/of.h index af655d264f10..df71a3a1bb8d 100644 --- a/include/linux/of.h +++ b/include/linux/of.h @@ -108,7 +108,7 @@ static inline void of_node_init(struct device_node *node) #if defined(CONFIG_OF_KOBJ) kobject_init(&node->kobj, &of_node_ktype); #endif - node->fwnode.ops = &of_fwnode_ops; + fwnode_init(&node->fwnode, &of_fwnode_ops); } #if defined(CONFIG_OF_KOBJ) diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index cf8b374b892d..06fce7e39033 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -91,7 +91,7 @@ struct fwnode_handle *__irq_domain_alloc_fwnode(unsigned int type, int id, fwid->type = type; fwid->name = n; fwid->pa = pa; - fwid->fwnode.ops = &irqchip_fwnode_ops; + fwnode_init(&fwid->fwnode, &irqchip_fwnode_ops); return &fwid->fwnode; } EXPORT_SYMBOL_GPL(__irq_domain_alloc_fwnode); -- cgit v1.2.3 From b2058cd93d930d7b9f76f34590c0d432cd6470c7 Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Wed, 9 Dec 2020 17:26:35 -0800 Subject: Input: gtco - remove driver The driver has its own HID descriptor parsing code, that had and still has several issues discovered by syzbot and other tools. Ideally we should move the driver over to the HID subsystem, so that it uses proven parsing code. However the devices in question are EOL, and GTCO is not willing to extend resources for that, so let's simply remove the driver. Note that our HID support has greatly improved over the last 10 years, we may also consider reverting 6f8d9e26e7de ("hid-core.c: Adds all GTCO CalComp Digitizers and InterWrite School Products to blacklist") and see if GTCO devices actually work with normal HID drivers. 
Link: https://lore.kernel.org/r/X8wbBtO5KidME17K@google.com Signed-off-by: Dmitry Torokhov --- arch/powerpc/configs/ppc6xx_defconfig | 1 - drivers/input/tablet/Kconfig | 12 - drivers/input/tablet/Makefile | 1 - drivers/input/tablet/gtco.c | 1043 ----------------------------- kernel/configs/android-recommended.config | 1 - 5 files changed, 1058 deletions(-) delete mode 100644 drivers/input/tablet/gtco.c (limited to 'kernel') diff --git a/arch/powerpc/configs/ppc6xx_defconfig b/arch/powerpc/configs/ppc6xx_defconfig index 66e9a0fd64ff..0ad0f6d41980 100644 --- a/arch/powerpc/configs/ppc6xx_defconfig +++ b/arch/powerpc/configs/ppc6xx_defconfig @@ -579,7 +579,6 @@ CONFIG_JOYSTICK_XPAD_LEDS=y CONFIG_INPUT_TABLET=y CONFIG_TABLET_USB_ACECAD=m CONFIG_TABLET_USB_AIPTEK=m -CONFIG_TABLET_USB_GTCO=m CONFIG_TABLET_USB_KBTAB=m CONFIG_INPUT_MISC=y CONFIG_INPUT_PCSPKR=m diff --git a/drivers/input/tablet/Kconfig b/drivers/input/tablet/Kconfig index 51c339182017..ec27eff6ae37 100644 --- a/drivers/input/tablet/Kconfig +++ b/drivers/input/tablet/Kconfig @@ -38,18 +38,6 @@ config TABLET_USB_AIPTEK To compile this driver as a module, choose M here: the module will be called aiptek. -config TABLET_USB_GTCO - tristate "GTCO CalComp/InterWrite USB Support" - depends on USB && INPUT - help - Say Y here if you want to use the USB version of the GTCO - CalComp/InterWrite Tablet. Make sure to say Y to "Mouse support" - (CONFIG_INPUT_MOUSEDEV) and/or "Event interface support" - (CONFIG_INPUT_EVDEV) as well. - - To compile this driver as a module, choose M here: the - module will be called gtco. - config TABLET_USB_HANWANG tristate "Hanwang Art Master III tablet support (USB)" depends on USB_ARCH_HAS_HCD diff --git a/drivers/input/tablet/Makefile b/drivers/input/tablet/Makefile index 8279ccc18b0a..adb636430717 100644 --- a/drivers/input/tablet/Makefile +++ b/drivers/input/tablet/Makefile @@ -6,7 +6,6 @@ obj-$(CONFIG_TABLET_USB_ACECAD) += acecad.o obj-$(CONFIG_TABLET_USB_AIPTEK) += aiptek.o -obj-$(CONFIG_TABLET_USB_GTCO) += gtco.o obj-$(CONFIG_TABLET_USB_HANWANG) += hanwang.o obj-$(CONFIG_TABLET_USB_KBTAB) += kbtab.o obj-$(CONFIG_TABLET_USB_PEGASUS) += pegasus_notetaker.o diff --git a/drivers/input/tablet/gtco.c b/drivers/input/tablet/gtco.c deleted file mode 100644 index 44bb1f69b4b2..000000000000 --- a/drivers/input/tablet/gtco.c +++ /dev/null @@ -1,1043 +0,0 @@ -/* -*- linux-c -*- - -GTCO digitizer USB driver - -TO CHECK: Is pressure done right on report 5? - -Copyright (C) 2006 GTCO CalComp - -This program is free software; you can redistribute it and/or -modify it under the terms of the GNU General Public License -as published by the Free Software Foundation; version 2 -of the License. - -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. - -You should have received a copy of the GNU General Public License -along with this program; if not, write to the Free Software -Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
- -Permission to use, copy, modify, distribute, and sell this software and its -documentation for any purpose is hereby granted without fee, provided that -the above copyright notice appear in all copies and that both that -copyright notice and this permission notice appear in supporting -documentation, and that the name of GTCO-CalComp not be used in advertising -or publicity pertaining to distribution of the software without specific, -written prior permission. GTCO-CalComp makes no representations about the -suitability of this software for any purpose. It is provided "as is" -without express or implied warranty. - -GTCO-CALCOMP DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, -INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO -EVENT SHALL GTCO-CALCOMP BE LIABLE FOR ANY SPECIAL, INDIRECT OR -CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, -DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER -TORTIOUS ACTIONS, ARISING OUT OF OR IN CONNECTION WITH THE USE OR -PERFORMANCE OF THIS SOFTWARE. - -GTCO CalComp, Inc. -7125 Riverwood Drive -Columbia, MD 21046 - -Jeremy Roberson jroberson@gtcocalcomp.com -Scott Hill shill@gtcocalcomp.com -*/ - - - -/*#define DEBUG*/ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -/* Version with a Major number of 2 is for kernel inclusion only. */ -#define GTCO_VERSION "2.00.0006" - - -/* MACROS */ - -#define VENDOR_ID_GTCO 0x078C -#define PID_400 0x400 -#define PID_401 0x401 -#define PID_1000 0x1000 -#define PID_1001 0x1001 -#define PID_1002 0x1002 - -/* Max size of a single report */ -#define REPORT_MAX_SIZE 10 -#define MAX_COLLECTION_LEVELS 10 - - -/* Bitmask whether pen is in range */ -#define MASK_INRANGE 0x20 -#define MASK_BUTTON 0x01F - -#define PATHLENGTH 64 - -/* DATA STRUCTURES */ - -/* Device table */ -static const struct usb_device_id gtco_usbid_table[] = { - { USB_DEVICE(VENDOR_ID_GTCO, PID_400) }, - { USB_DEVICE(VENDOR_ID_GTCO, PID_401) }, - { USB_DEVICE(VENDOR_ID_GTCO, PID_1000) }, - { USB_DEVICE(VENDOR_ID_GTCO, PID_1001) }, - { USB_DEVICE(VENDOR_ID_GTCO, PID_1002) }, - { } -}; -MODULE_DEVICE_TABLE (usb, gtco_usbid_table); - - -/* Structure to hold all of our device specific stuff */ -struct gtco { - - struct input_dev *inputdevice; /* input device struct pointer */ - struct usb_interface *intf; /* the usb interface for this device */ - struct urb *urbinfo; /* urb for incoming reports */ - dma_addr_t buf_dma; /* dma addr of the data buffer*/ - unsigned char * buffer; /* databuffer for reports */ - - char usbpath[PATHLENGTH]; - int openCount; - - /* Information pulled from Report Descriptor */ - u32 usage; - u32 min_X; - u32 max_X; - u32 min_Y; - u32 max_Y; - s8 mintilt_X; - s8 maxtilt_X; - s8 mintilt_Y; - s8 maxtilt_Y; - u32 maxpressure; - u32 minpressure; -}; - - - -/* Code for parsing the HID REPORT DESCRIPTOR */ - -/* From HID1.11 spec */ -struct hid_descriptor -{ - struct usb_descriptor_header header; - __le16 bcdHID; - u8 bCountryCode; - u8 bNumDescriptors; - u8 bDescriptorType; - __le16 wDescriptorLength; -} __attribute__ ((packed)); - - -#define HID_DESCRIPTOR_SIZE 9 -#define HID_DEVICE_TYPE 33 -#define REPORT_DEVICE_TYPE 34 - - -#define PREF_TAG(x) ((x)>>4) -#define PREF_TYPE(x) ((x>>2)&0x03) -#define PREF_SIZE(x) ((x)&0x03) - -#define TYPE_MAIN 0 -#define TYPE_GLOBAL 1 -#define TYPE_LOCAL 2 -#define TYPE_RESERVED 3 - -#define TAG_MAIN_INPUT 0x8 -#define TAG_MAIN_OUTPUT 0x9 -#define TAG_MAIN_FEATURE 
0xB -#define TAG_MAIN_COL_START 0xA -#define TAG_MAIN_COL_END 0xC - -#define TAG_GLOB_USAGE 0 -#define TAG_GLOB_LOG_MIN 1 -#define TAG_GLOB_LOG_MAX 2 -#define TAG_GLOB_PHYS_MIN 3 -#define TAG_GLOB_PHYS_MAX 4 -#define TAG_GLOB_UNIT_EXP 5 -#define TAG_GLOB_UNIT 6 -#define TAG_GLOB_REPORT_SZ 7 -#define TAG_GLOB_REPORT_ID 8 -#define TAG_GLOB_REPORT_CNT 9 -#define TAG_GLOB_PUSH 10 -#define TAG_GLOB_POP 11 - -#define TAG_GLOB_MAX 12 - -#define DIGITIZER_USAGE_TIP_PRESSURE 0x30 -#define DIGITIZER_USAGE_TILT_X 0x3D -#define DIGITIZER_USAGE_TILT_Y 0x3E - - -/* - * This is an abbreviated parser for the HID Report Descriptor. We - * know what devices we are talking to, so this is by no means meant - * to be generic. We can make some safe assumptions: - * - * - We know there are no LONG tags, all short - * - We know that we have no MAIN Feature and MAIN Output items - * - We know what the IRQ reports are supposed to look like. - * - * The main purpose of this is to use the HID report desc to figure - * out the mins and maxs of the fields in the IRQ reports. The IRQ - * reports for 400/401 change slightly if the max X is bigger than 64K. - * - */ -static void parse_hid_report_descriptor(struct gtco *device, char * report, - int length) -{ - struct device *ddev = &device->intf->dev; - int x, i = 0; - - /* Tag primitive vars */ - __u8 prefix; - __u8 size; - __u8 tag; - __u8 type; - __u8 data = 0; - __u16 data16 = 0; - __u32 data32 = 0; - - /* For parsing logic */ - int inputnum = 0; - __u32 usage = 0; - - /* Global Values, indexed by TAG */ - __u32 globalval[TAG_GLOB_MAX]; - __u32 oldval[TAG_GLOB_MAX]; - - /* Debug stuff */ - char maintype = 'x'; - char globtype[12]; - int indent = 0; - char indentstr[MAX_COLLECTION_LEVELS + 1] = { 0 }; - - dev_dbg(ddev, "======>>>>>>PARSE<<<<<<======\n"); - - /* Walk this report and pull out the info we need */ - while (i < length) { - prefix = report[i++]; - - /* Determine data size and save the data in the proper variable */ - size = (1U << PREF_SIZE(prefix)) >> 1; - if (i + size > length) { - dev_err(ddev, - "Not enough data (need %d, have %d)\n", - i + size, length); - break; - } - - switch (size) { - case 1: - data = report[i]; - break; - case 2: - data16 = get_unaligned_le16(&report[i]); - break; - case 4: - data32 = get_unaligned_le32(&report[i]); - break; - } - - /* Skip size of data */ - i += size; - - /* What we do depends on the tag type */ - tag = PREF_TAG(prefix); - type = PREF_TYPE(prefix); - switch (type) { - case TYPE_MAIN: - strcpy(globtype, ""); - switch (tag) { - - case TAG_MAIN_INPUT: - /* - * The INPUT MAIN tag signifies this is - * information from a report. We need to - * figure out what it is and store the - * min/max values - */ - - maintype = 'I'; - if (data == 2) - strcpy(globtype, "Variable"); - else if (data == 3) - strcpy(globtype, "Var|Const"); - - dev_dbg(ddev, "::::: Saving Report: %d input #%d Max: 0x%X(%d) Min:0x%X(%d) of %d bits\n", - globalval[TAG_GLOB_REPORT_ID], inputnum, - globalval[TAG_GLOB_LOG_MAX], globalval[TAG_GLOB_LOG_MAX], - globalval[TAG_GLOB_LOG_MIN], globalval[TAG_GLOB_LOG_MIN], - globalval[TAG_GLOB_REPORT_SZ] * globalval[TAG_GLOB_REPORT_CNT]); - - - /* - We can assume that the first two input items - are always the X and Y coordinates. 
After - that, we look for everything else by - local usage value - */ - switch (inputnum) { - case 0: /* X coord */ - dev_dbg(ddev, "GER: X Usage: 0x%x\n", usage); - if (device->max_X == 0) { - device->max_X = globalval[TAG_GLOB_LOG_MAX]; - device->min_X = globalval[TAG_GLOB_LOG_MIN]; - } - break; - - case 1: /* Y coord */ - dev_dbg(ddev, "GER: Y Usage: 0x%x\n", usage); - if (device->max_Y == 0) { - device->max_Y = globalval[TAG_GLOB_LOG_MAX]; - device->min_Y = globalval[TAG_GLOB_LOG_MIN]; - } - break; - - default: - /* Tilt X */ - if (usage == DIGITIZER_USAGE_TILT_X) { - if (device->maxtilt_X == 0) { - device->maxtilt_X = globalval[TAG_GLOB_LOG_MAX]; - device->mintilt_X = globalval[TAG_GLOB_LOG_MIN]; - } - } - - /* Tilt Y */ - if (usage == DIGITIZER_USAGE_TILT_Y) { - if (device->maxtilt_Y == 0) { - device->maxtilt_Y = globalval[TAG_GLOB_LOG_MAX]; - device->mintilt_Y = globalval[TAG_GLOB_LOG_MIN]; - } - } - - /* Pressure */ - if (usage == DIGITIZER_USAGE_TIP_PRESSURE) { - if (device->maxpressure == 0) { - device->maxpressure = globalval[TAG_GLOB_LOG_MAX]; - device->minpressure = globalval[TAG_GLOB_LOG_MIN]; - } - } - - break; - } - - inputnum++; - break; - - case TAG_MAIN_OUTPUT: - maintype = 'O'; - break; - - case TAG_MAIN_FEATURE: - maintype = 'F'; - break; - - case TAG_MAIN_COL_START: - maintype = 'S'; - - if (indent == MAX_COLLECTION_LEVELS) { - dev_err(ddev, "Collection level %d would exceed limit of %d\n", - indent + 1, - MAX_COLLECTION_LEVELS); - break; - } - - if (data == 0) { - dev_dbg(ddev, "======>>>>>> Physical\n"); - strcpy(globtype, "Physical"); - } else - dev_dbg(ddev, "======>>>>>>\n"); - - /* Indent the debug output */ - indent++; - for (x = 0; x < indent; x++) - indentstr[x] = '-'; - indentstr[x] = 0; - - /* Save global tags */ - for (x = 0; x < TAG_GLOB_MAX; x++) - oldval[x] = globalval[x]; - - break; - - case TAG_MAIN_COL_END: - maintype = 'E'; - - if (indent == 0) { - dev_err(ddev, "Collection level already at zero\n"); - break; - } - - dev_dbg(ddev, "<<<<<<======\n"); - - indent--; - for (x = 0; x < indent; x++) - indentstr[x] = '-'; - indentstr[x] = 0; - - /* Copy global tags back */ - for (x = 0; x < TAG_GLOB_MAX; x++) - globalval[x] = oldval[x]; - - break; - } - - switch (size) { - case 1: - dev_dbg(ddev, "%sMAINTAG:(%d) %c SIZE: %d Data: %s 0x%x\n", - indentstr, tag, maintype, size, globtype, data); - break; - - case 2: - dev_dbg(ddev, "%sMAINTAG:(%d) %c SIZE: %d Data: %s 0x%x\n", - indentstr, tag, maintype, size, globtype, data16); - break; - - case 4: - dev_dbg(ddev, "%sMAINTAG:(%d) %c SIZE: %d Data: %s 0x%x\n", - indentstr, tag, maintype, size, globtype, data32); - break; - } - break; - - case TYPE_GLOBAL: - switch (tag) { - case TAG_GLOB_USAGE: - /* - * First time we hit the global usage tag, - * it should tell us the type of device - */ - if (device->usage == 0) - device->usage = data; - - strcpy(globtype, "USAGE"); - break; - - case TAG_GLOB_LOG_MIN: - strcpy(globtype, "LOG_MIN"); - break; - - case TAG_GLOB_LOG_MAX: - strcpy(globtype, "LOG_MAX"); - break; - - case TAG_GLOB_PHYS_MIN: - strcpy(globtype, "PHYS_MIN"); - break; - - case TAG_GLOB_PHYS_MAX: - strcpy(globtype, "PHYS_MAX"); - break; - - case TAG_GLOB_UNIT_EXP: - strcpy(globtype, "EXP"); - break; - - case TAG_GLOB_UNIT: - strcpy(globtype, "UNIT"); - break; - - case TAG_GLOB_REPORT_SZ: - strcpy(globtype, "REPORT_SZ"); - break; - - case TAG_GLOB_REPORT_ID: - strcpy(globtype, "REPORT_ID"); - /* New report, restart numbering */ - inputnum = 0; - break; - - case TAG_GLOB_REPORT_CNT: - strcpy(globtype, 
"REPORT_CNT"); - break; - - case TAG_GLOB_PUSH: - strcpy(globtype, "PUSH"); - break; - - case TAG_GLOB_POP: - strcpy(globtype, "POP"); - break; - } - - /* Check to make sure we have a good tag number - so we don't overflow array */ - if (tag < TAG_GLOB_MAX) { - switch (size) { - case 1: - dev_dbg(ddev, "%sGLOBALTAG:%s(%d) SIZE: %d Data: 0x%x\n", - indentstr, globtype, tag, size, data); - globalval[tag] = data; - break; - - case 2: - dev_dbg(ddev, "%sGLOBALTAG:%s(%d) SIZE: %d Data: 0x%x\n", - indentstr, globtype, tag, size, data16); - globalval[tag] = data16; - break; - - case 4: - dev_dbg(ddev, "%sGLOBALTAG:%s(%d) SIZE: %d Data: 0x%x\n", - indentstr, globtype, tag, size, data32); - globalval[tag] = data32; - break; - } - } else { - dev_dbg(ddev, "%sGLOBALTAG: ILLEGAL TAG:%d SIZE: %d\n", - indentstr, tag, size); - } - break; - - case TYPE_LOCAL: - switch (tag) { - case TAG_GLOB_USAGE: - strcpy(globtype, "USAGE"); - /* Always 1 byte */ - usage = data; - break; - - case TAG_GLOB_LOG_MIN: - strcpy(globtype, "MIN"); - break; - - case TAG_GLOB_LOG_MAX: - strcpy(globtype, "MAX"); - break; - - default: - strcpy(globtype, "UNKNOWN"); - break; - } - - switch (size) { - case 1: - dev_dbg(ddev, "%sLOCALTAG:(%d) %s SIZE: %d Data: 0x%x\n", - indentstr, tag, globtype, size, data); - break; - - case 2: - dev_dbg(ddev, "%sLOCALTAG:(%d) %s SIZE: %d Data: 0x%x\n", - indentstr, tag, globtype, size, data16); - break; - - case 4: - dev_dbg(ddev, "%sLOCALTAG:(%d) %s SIZE: %d Data: 0x%x\n", - indentstr, tag, globtype, size, data32); - break; - } - - break; - } - } -} - -/* INPUT DRIVER Routines */ - -/* - * Called when opening the input device. This will submit the URB to - * the usb system so we start getting reports - */ -static int gtco_input_open(struct input_dev *inputdev) -{ - struct gtco *device = input_get_drvdata(inputdev); - - device->urbinfo->dev = interface_to_usbdev(device->intf); - if (usb_submit_urb(device->urbinfo, GFP_KERNEL)) - return -EIO; - - return 0; -} - -/* - * Called when closing the input device. This will unlink the URB - */ -static void gtco_input_close(struct input_dev *inputdev) -{ - struct gtco *device = input_get_drvdata(inputdev); - - usb_kill_urb(device->urbinfo); -} - - -/* - * Setup input device capabilities. Tell the input system what this - * device is capable of generating. - * - * This information is based on what is read from the HID report and - * placed in the struct gtco structure - * - */ -static void gtco_setup_caps(struct input_dev *inputdev) -{ - struct gtco *device = input_get_drvdata(inputdev); - - /* Which events */ - inputdev->evbit[0] = BIT_MASK(EV_KEY) | BIT_MASK(EV_ABS) | - BIT_MASK(EV_MSC); - - /* Misc event menu block */ - inputdev->mscbit[0] = BIT_MASK(MSC_SCAN) | BIT_MASK(MSC_SERIAL) | - BIT_MASK(MSC_RAW); - - /* Absolute values based on HID report info */ - input_set_abs_params(inputdev, ABS_X, device->min_X, device->max_X, - 0, 0); - input_set_abs_params(inputdev, ABS_Y, device->min_Y, device->max_Y, - 0, 0); - - /* Proximity */ - input_set_abs_params(inputdev, ABS_DISTANCE, 0, 1, 0, 0); - - /* Tilt & pressure */ - input_set_abs_params(inputdev, ABS_TILT_X, device->mintilt_X, - device->maxtilt_X, 0, 0); - input_set_abs_params(inputdev, ABS_TILT_Y, device->mintilt_Y, - device->maxtilt_Y, 0, 0); - input_set_abs_params(inputdev, ABS_PRESSURE, device->minpressure, - device->maxpressure, 0, 0); - - /* Transducer */ - input_set_abs_params(inputdev, ABS_MISC, 0, 0xFF, 0, 0); -} - -/* USB Routines */ - -/* - * URB callback routine. 
Called when we get IRQ reports from the - * digitizer. - * - * This bridges the USB and input device worlds. It generates events - * on the input device based on the USB reports. - */ -static void gtco_urb_callback(struct urb *urbinfo) -{ - struct gtco *device = urbinfo->context; - struct input_dev *inputdev; - int rc; - u32 val = 0; - char le_buffer[2]; - - inputdev = device->inputdevice; - - /* Was callback OK? */ - if (urbinfo->status == -ECONNRESET || - urbinfo->status == -ENOENT || - urbinfo->status == -ESHUTDOWN) { - - /* Shutdown is occurring. Return and don't queue up any more */ - return; - } - - if (urbinfo->status != 0) { - /* - * Some unknown error. Hopefully temporary. Just go and - * requeue an URB - */ - goto resubmit; - } - - /* - * Good URB, now process - */ - - /* PID dependent when we interpret the report */ - if (inputdev->id.product == PID_1000 || - inputdev->id.product == PID_1001 || - inputdev->id.product == PID_1002) { - - /* - * Switch on the report ID - * Conveniently, the reports have more information, the higher - * the report number. We can just fall through the case - * statements if we start with the highest number report - */ - switch (device->buffer[0]) { - case 5: - /* Pressure is 9 bits */ - val = ((u16)(device->buffer[8]) << 1); - val |= (u16)(device->buffer[7] >> 7); - input_report_abs(inputdev, ABS_PRESSURE, - device->buffer[8]); - - /* Mask out the Y tilt value used for pressure */ - device->buffer[7] = (u8)((device->buffer[7]) & 0x7F); - fallthrough; - - case 4: - /* Tilt */ - input_report_abs(inputdev, ABS_TILT_X, - sign_extend32(device->buffer[6], 6)); - - input_report_abs(inputdev, ABS_TILT_Y, - sign_extend32(device->buffer[7], 6)); - fallthrough; - - case 2: - case 3: - /* Convert buttons, only 5 bits possible */ - val = (device->buffer[5]) & MASK_BUTTON; - - /* We don't apply any meaning to the bitmask, - just report */ - input_event(inputdev, EV_MSC, MSC_SERIAL, val); - fallthrough; - - case 1: - /* All reports have X and Y coords in the same place */ - val = get_unaligned_le16(&device->buffer[1]); - input_report_abs(inputdev, ABS_X, val); - - val = get_unaligned_le16(&device->buffer[3]); - input_report_abs(inputdev, ABS_Y, val); - - /* Ditto for proximity bit */ - val = device->buffer[5] & MASK_INRANGE ? 
1 : 0; - input_report_abs(inputdev, ABS_DISTANCE, val); - - /* Report 1 is an exception to how we handle buttons */ - /* Buttons are an index, not a bitmask */ - if (device->buffer[0] == 1) { - - /* - * Convert buttons, 5 bit index - * Report value of index set as one, - * the rest as 0 - */ - val = device->buffer[5] & MASK_BUTTON; - dev_dbg(&device->intf->dev, - "======>>>>>>REPORT 1: val 0x%X(%d)\n", - val, val); - - /* - * We don't apply any meaning to the button - * index, just report it - */ - input_event(inputdev, EV_MSC, MSC_SERIAL, val); - } - break; - - case 7: - /* Menu blocks */ - input_event(inputdev, EV_MSC, MSC_SCAN, - device->buffer[1]); - break; - } - } - - /* Other pid class */ - if (inputdev->id.product == PID_400 || - inputdev->id.product == PID_401) { - - /* Report 2 */ - if (device->buffer[0] == 2) { - /* Menu blocks */ - input_event(inputdev, EV_MSC, MSC_SCAN, device->buffer[1]); - } - - /* Report 1 */ - if (device->buffer[0] == 1) { - char buttonbyte; - - /* IF X max > 64K, we still a bit from the y report */ - if (device->max_X > 0x10000) { - - val = (u16)(((u16)(device->buffer[2] << 8)) | (u8)device->buffer[1]); - val |= (u32)(((u8)device->buffer[3] & 0x1) << 16); - - input_report_abs(inputdev, ABS_X, val); - - le_buffer[0] = (u8)((u8)(device->buffer[3]) >> 1); - le_buffer[0] |= (u8)((device->buffer[3] & 0x1) << 7); - - le_buffer[1] = (u8)(device->buffer[4] >> 1); - le_buffer[1] |= (u8)((device->buffer[5] & 0x1) << 7); - - val = get_unaligned_le16(le_buffer); - input_report_abs(inputdev, ABS_Y, val); - - /* - * Shift the button byte right by one to - * make it look like the standard report - */ - buttonbyte = device->buffer[5] >> 1; - } else { - - val = get_unaligned_le16(&device->buffer[1]); - input_report_abs(inputdev, ABS_X, val); - - val = get_unaligned_le16(&device->buffer[3]); - input_report_abs(inputdev, ABS_Y, val); - - buttonbyte = device->buffer[5]; - } - - /* BUTTONS and PROXIMITY */ - val = buttonbyte & MASK_INRANGE ? 1 : 0; - input_report_abs(inputdev, ABS_DISTANCE, val); - - /* Convert buttons, only 4 bits possible */ - val = buttonbyte & 0x0F; -#ifdef USE_BUTTONS - for (i = 0; i < 5; i++) - input_report_key(inputdev, BTN_DIGI + i, val & (1 << i)); -#else - /* We don't apply any meaning to the bitmask, just report */ - input_event(inputdev, EV_MSC, MSC_SERIAL, val); -#endif - - /* TRANSDUCER */ - input_report_abs(inputdev, ABS_MISC, device->buffer[6]); - } - } - - /* Everybody gets report ID's */ - input_event(inputdev, EV_MSC, MSC_RAW, device->buffer[0]); - - /* Sync it up */ - input_sync(inputdev); - - resubmit: - rc = usb_submit_urb(urbinfo, GFP_ATOMIC); - if (rc != 0) - dev_err(&device->intf->dev, - "usb_submit_urb failed rc=0x%x\n", rc); -} - -/* - * The probe routine. This is called when the kernel find the matching USB - * vendor/product. 
We do the following: - * - * - Allocate mem for a local structure to manage the device - * - Request a HID Report Descriptor from the device and parse it to - * find out the device parameters - * - Create an input device and assign it attributes - * - Allocate an URB so the device can talk to us when the input - * queue is open - */ -static int gtco_probe(struct usb_interface *usbinterface, - const struct usb_device_id *id) -{ - - struct gtco *gtco; - struct input_dev *input_dev; - struct hid_descriptor *hid_desc; - char *report; - int result = 0, retry; - int error; - struct usb_endpoint_descriptor *endpoint; - struct usb_device *udev = interface_to_usbdev(usbinterface); - - /* Allocate memory for device structure */ - gtco = kzalloc(sizeof(struct gtco), GFP_KERNEL); - input_dev = input_allocate_device(); - if (!gtco || !input_dev) { - dev_err(&usbinterface->dev, "No more memory\n"); - error = -ENOMEM; - goto err_free_devs; - } - - /* Set pointer to the input device */ - gtco->inputdevice = input_dev; - - /* Save interface information */ - gtco->intf = usbinterface; - - /* Allocate some data for incoming reports */ - gtco->buffer = usb_alloc_coherent(udev, REPORT_MAX_SIZE, - GFP_KERNEL, >co->buf_dma); - if (!gtco->buffer) { - dev_err(&usbinterface->dev, "No more memory for us buffers\n"); - error = -ENOMEM; - goto err_free_devs; - } - - /* Allocate URB for reports */ - gtco->urbinfo = usb_alloc_urb(0, GFP_KERNEL); - if (!gtco->urbinfo) { - dev_err(&usbinterface->dev, "Failed to allocate URB\n"); - error = -ENOMEM; - goto err_free_buf; - } - - /* Sanity check that a device has an endpoint */ - if (usbinterface->cur_altsetting->desc.bNumEndpoints < 1) { - dev_err(&usbinterface->dev, - "Invalid number of endpoints\n"); - error = -EINVAL; - goto err_free_urb; - } - - endpoint = &usbinterface->cur_altsetting->endpoint[0].desc; - - /* Some debug */ - dev_dbg(&usbinterface->dev, "gtco # interfaces: %d\n", usbinterface->num_altsetting); - dev_dbg(&usbinterface->dev, "num endpoints: %d\n", usbinterface->cur_altsetting->desc.bNumEndpoints); - dev_dbg(&usbinterface->dev, "interface class: %d\n", usbinterface->cur_altsetting->desc.bInterfaceClass); - dev_dbg(&usbinterface->dev, "endpoint: attribute:0x%x type:0x%x\n", endpoint->bmAttributes, endpoint->bDescriptorType); - if (usb_endpoint_xfer_int(endpoint)) - dev_dbg(&usbinterface->dev, "endpoint: we have interrupt endpoint\n"); - - dev_dbg(&usbinterface->dev, "interface extra len:%d\n", - usbinterface->cur_altsetting->extralen); - - /* - * Find the HID descriptor so we can find out the size of the - * HID report descriptor - */ - if (usb_get_extra_descriptor(usbinterface->cur_altsetting, - HID_DEVICE_TYPE, &hid_desc) != 0) { - dev_err(&usbinterface->dev, - "Can't retrieve exta USB descriptor to get hid report descriptor length\n"); - error = -EIO; - goto err_free_urb; - } - - dev_dbg(&usbinterface->dev, - "Extra descriptor success: type:%d len:%d\n", - hid_desc->bDescriptorType, hid_desc->wDescriptorLength); - - report = kzalloc(le16_to_cpu(hid_desc->wDescriptorLength), GFP_KERNEL); - if (!report) { - dev_err(&usbinterface->dev, "No more memory for report\n"); - error = -ENOMEM; - goto err_free_urb; - } - - /* Couple of tries to get reply */ - for (retry = 0; retry < 3; retry++) { - result = usb_control_msg(udev, - usb_rcvctrlpipe(udev, 0), - USB_REQ_GET_DESCRIPTOR, - USB_RECIP_INTERFACE | USB_DIR_IN, - REPORT_DEVICE_TYPE << 8, - 0, /* interface */ - report, - le16_to_cpu(hid_desc->wDescriptorLength), - 5000); /* 5 secs */ - - 
dev_dbg(&usbinterface->dev, "usb_control_msg result: %d\n", result); - if (result == le16_to_cpu(hid_desc->wDescriptorLength)) { - parse_hid_report_descriptor(gtco, report, result); - break; - } - } - - kfree(report); - - /* If we didn't get the report, fail */ - if (result != le16_to_cpu(hid_desc->wDescriptorLength)) { - dev_err(&usbinterface->dev, - "Failed to get HID Report Descriptor of size: %d\n", - hid_desc->wDescriptorLength); - error = -EIO; - goto err_free_urb; - } - - /* Create a device file node */ - usb_make_path(udev, gtco->usbpath, sizeof(gtco->usbpath)); - strlcat(gtco->usbpath, "/input0", sizeof(gtco->usbpath)); - - /* Set Input device functions */ - input_dev->open = gtco_input_open; - input_dev->close = gtco_input_close; - - /* Set input device information */ - input_dev->name = "GTCO_CalComp"; - input_dev->phys = gtco->usbpath; - - input_set_drvdata(input_dev, gtco); - - /* Now set up all the input device capabilities */ - gtco_setup_caps(input_dev); - - /* Set input device required ID information */ - usb_to_input_id(udev, &input_dev->id); - input_dev->dev.parent = &usbinterface->dev; - - /* Setup the URB, it will be posted later on open of input device */ - usb_fill_int_urb(gtco->urbinfo, - udev, - usb_rcvintpipe(udev, - endpoint->bEndpointAddress), - gtco->buffer, - REPORT_MAX_SIZE, - gtco_urb_callback, - gtco, - endpoint->bInterval); - - gtco->urbinfo->transfer_dma = gtco->buf_dma; - gtco->urbinfo->transfer_flags |= URB_NO_TRANSFER_DMA_MAP; - - /* Save gtco pointer in USB interface gtco */ - usb_set_intfdata(usbinterface, gtco); - - /* All done, now register the input device */ - error = input_register_device(input_dev); - if (error) - goto err_free_urb; - - return 0; - - err_free_urb: - usb_free_urb(gtco->urbinfo); - err_free_buf: - usb_free_coherent(udev, REPORT_MAX_SIZE, - gtco->buffer, gtco->buf_dma); - err_free_devs: - input_free_device(input_dev); - kfree(gtco); - return error; -} - -/* - * This function is a standard USB function called when the USB device - * is disconnected. 
We will get rid of the URV, de-register the input - * device, and free up allocated memory - */ -static void gtco_disconnect(struct usb_interface *interface) -{ - /* Grab private device ptr */ - struct gtco *gtco = usb_get_intfdata(interface); - struct usb_device *udev = interface_to_usbdev(interface); - - /* Now reverse all the registration stuff */ - if (gtco) { - input_unregister_device(gtco->inputdevice); - usb_kill_urb(gtco->urbinfo); - usb_free_urb(gtco->urbinfo); - usb_free_coherent(udev, REPORT_MAX_SIZE, - gtco->buffer, gtco->buf_dma); - kfree(gtco); - } - - dev_info(&interface->dev, "gtco driver disconnected\n"); -} - -/* STANDARD MODULE LOAD ROUTINES */ - -static struct usb_driver gtco_driverinfo_table = { - .name = "gtco", - .id_table = gtco_usbid_table, - .probe = gtco_probe, - .disconnect = gtco_disconnect, -}; - -module_usb_driver(gtco_driverinfo_table); - -MODULE_DESCRIPTION("GTCO digitizer USB driver"); -MODULE_LICENSE("GPL"); diff --git a/kernel/configs/android-recommended.config b/kernel/configs/android-recommended.config index 81e9af7dcec2..53d688bdd894 100644 --- a/kernel/configs/android-recommended.config +++ b/kernel/configs/android-recommended.config @@ -111,7 +111,6 @@ CONFIG_STRICT_KERNEL_RWX=y CONFIG_SUSPEND_TIME=y CONFIG_TABLET_USB_ACECAD=y CONFIG_TABLET_USB_AIPTEK=y -CONFIG_TABLET_USB_GTCO=y CONFIG_TABLET_USB_HANWANG=y CONFIG_TABLET_USB_KBTAB=y CONFIG_TASKSTATS=y -- cgit v1.2.3 From 1f702603e7125a390b5cdf5ce00539781cfcc86a Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 20 Nov 2020 17:14:19 -0600 Subject: exec: Simplify unshare_files Now that exec no longer needs to return the unshared files to their previous value there is no reason to return displaced. Instead when unshare_fd creates a copy of the file table, call put_files_struct before returning from unshare_files. Acked-by: Christian Brauner v1: https://lkml.kernel.org/r/20200817220425.9389-2-ebiederm@xmission.com Link: https://lkml.kernel.org/r/20201120231441.29911-2-ebiederm@xmission.com Signed-off-by: Eric W. Biederman --- fs/coredump.c | 5 +---- fs/exec.c | 5 +---- include/linux/fdtable.h | 2 +- kernel/fork.c | 12 ++++++------ 4 files changed, 9 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/fs/coredump.c b/fs/coredump.c index 0cd9056d79cc..abf807235262 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -585,7 +585,6 @@ void do_coredump(const kernel_siginfo_t *siginfo) int ispipe; size_t *argv = NULL; int argc = 0; - struct files_struct *displaced; /* require nonrelative corefile path and be extra careful */ bool need_suid_safe = false; bool core_dumped = false; @@ -791,11 +790,9 @@ void do_coredump(const kernel_siginfo_t *siginfo) } /* get us an unshared descriptor table; almost always a no-op */ - retval = unshare_files(&displaced); + retval = unshare_files(); if (retval) goto close_fail; - if (displaced) - put_files_struct(displaced); if (!dump_interrupted()) { /* * umh disabled with CONFIG_STATIC_USERMODEHELPER_PATH="" would diff --git a/fs/exec.c b/fs/exec.c index 0d6533ab1c97..48fa4fc1b116 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1238,7 +1238,6 @@ void __set_task_comm(struct task_struct *tsk, const char *buf, bool exec) int begin_new_exec(struct linux_binprm * bprm) { struct task_struct *me = current; - struct files_struct *displaced; int retval; /* Once we are committed compute the creds */ @@ -1259,11 +1258,9 @@ int begin_new_exec(struct linux_binprm * bprm) goto out; /* Ensure the files table is not shared. 
*/ - retval = unshare_files(&displaced); + retval = unshare_files(); if (retval) goto out; - if (displaced) - put_files_struct(displaced); /* * Must be called _before_ exec_mmap() as bprm->mm is diff --git a/include/linux/fdtable.h b/include/linux/fdtable.h index a32bf47c593e..f46a084b60b2 100644 --- a/include/linux/fdtable.h +++ b/include/linux/fdtable.h @@ -109,7 +109,7 @@ struct task_struct; struct files_struct *get_files_struct(struct task_struct *); void put_files_struct(struct files_struct *fs); void reset_files_struct(struct files_struct *); -int unshare_files(struct files_struct **); +int unshare_files(void); struct files_struct *dup_fd(struct files_struct *, unsigned, int *) __latent_entropy; void do_close_on_exec(struct files_struct *); int iterate_fd(struct files_struct *, unsigned, diff --git a/kernel/fork.c b/kernel/fork.c index 32083db7a2a2..837b546528c8 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -3023,21 +3023,21 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags) * the exec layer of the kernel. */ -int unshare_files(struct files_struct **displaced) +int unshare_files(void) { struct task_struct *task = current; - struct files_struct *copy = NULL; + struct files_struct *old, *copy = NULL; int error; error = unshare_fd(CLONE_FILES, NR_OPEN_MAX, ©); - if (error || !copy) { - *displaced = NULL; + if (error || !copy) return error; - } - *displaced = task->files; + + old = task->files; task_lock(task); task->files = copy; task_unlock(task); + put_files_struct(old); return 0; } -- cgit v1.2.3 From f43c283a89a7dc531a47d4b1e001503cf3dc3234 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 20 Nov 2020 17:14:21 -0600 Subject: kcmp: In kcmp_epoll_target use fget_task Use the helper fget_task and simplify the code. As well as simplifying the code this removes one unnecessary increment of struct files_struct. This unnecessary increment of files_struct.count can result in exec unnecessarily unsharing files_struct and breaking posix locks, and it can result in fget_light having to fallback to fget reducing performance. Suggested-by: Oleg Nesterov Reviewed-by: Cyrill Gorcunov v1: https://lkml.kernel.org/r/20200817220425.9389-4-ebiederm@xmission.com Link: https://lkml.kernel.org/r/20201120231441.29911-4-ebiederm@xmission.com Signed-off-by: Eric W. 
Biederman --- kernel/kcmp.c | 20 ++++---------------- 1 file changed, 4 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/kcmp.c b/kernel/kcmp.c index b3ff9288c6cc..87c48c0104ad 100644 --- a/kernel/kcmp.c +++ b/kernel/kcmp.c @@ -107,7 +107,6 @@ static int kcmp_epoll_target(struct task_struct *task1, { struct file *filp, *filp_epoll, *filp_tgt; struct kcmp_epoll_slot slot; - struct files_struct *files; if (copy_from_user(&slot, uslot, sizeof(slot))) return -EFAULT; @@ -116,23 +115,12 @@ static int kcmp_epoll_target(struct task_struct *task1, if (!filp) return -EBADF; - files = get_files_struct(task2); - if (!files) + filp_epoll = fget_task(task2, slot.efd); + if (!filp_epoll) return -EBADF; - spin_lock(&files->file_lock); - filp_epoll = fcheck_files(files, slot.efd); - if (filp_epoll) - get_file(filp_epoll); - else - filp_tgt = ERR_PTR(-EBADF); - spin_unlock(&files->file_lock); - put_files_struct(files); - - if (filp_epoll) { - filp_tgt = get_epoll_tfile_raw_ptr(filp_epoll, slot.tfd, slot.toff); - fput(filp_epoll); - } + filp_tgt = get_epoll_tfile_raw_ptr(filp_epoll, slot.tfd, slot.toff); + fput(filp_epoll); if (IS_ERR(filp_tgt)) return PTR_ERR(filp_tgt); -- cgit v1.2.3 From b48845af0152d790a54b8ab78cc2b7c07485fc98 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 20 Nov 2020 17:14:22 -0600 Subject: bpf: In bpf_task_fd_query use fget_task Use the helper fget_task to simplify bpf_task_fd_query. As well as simplifying the code this removes one unnecessary increment of struct files_struct. This unnecessary increment of files_struct.count can result in exec unnecessarily unsharing files_struct and breaking posix locks, and it can result in fget_light having to fallback to fget reducing performance. This simplification comes from the observation that none of the callers of get_files_struct actually need to call get_files_struct that was made when discussing[1] exec and posix file locks. [1] https://lkml.kernel.org/r/20180915160423.GA31461@redhat.com Suggested-by: Oleg Nesterov v1: https://lkml.kernel.org/r/20200817220425.9389-5-ebiederm@xmission.com Link: https://lkml.kernel.org/r/20201120231441.29911-5-ebiederm@xmission.com Signed-off-by: Eric W. Biederman --- kernel/bpf/syscall.c | 20 +++----------------- 1 file changed, 3 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 8f50c9c19f1b..6d49c2e1634c 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -3878,7 +3878,6 @@ static int bpf_task_fd_query(const union bpf_attr *attr, pid_t pid = attr->task_fd_query.pid; u32 fd = attr->task_fd_query.fd; const struct perf_event *event; - struct files_struct *files; struct task_struct *task; struct file *file; int err; @@ -3896,23 +3895,11 @@ static int bpf_task_fd_query(const union bpf_attr *attr, if (!task) return -ENOENT; - files = get_files_struct(task); - put_task_struct(task); - if (!files) - return -ENOENT; - err = 0; - spin_lock(&files->file_lock); - file = fcheck_files(files, fd); + file = fget_task(task, fd); + put_task_struct(task); if (!file) - err = -EBADF; - else - get_file(file); - spin_unlock(&files->file_lock); - put_files_struct(files); - - if (err) - goto out; + return -EBADF; if (file->f_op == &bpf_link_fops) { struct bpf_link *link = file->private_data; @@ -3952,7 +3939,6 @@ out_not_supp: err = -ENOTSUPP; put_file: fput(file); -out: return err; } -- cgit v1.2.3 From f36c2943274199cb8aef32ac96531ffb7c4b43d0 Mon Sep 17 00:00:00 2001 From: "Eric W. 
Biederman" Date: Fri, 20 Nov 2020 17:14:26 -0600 Subject: file: Replace fcheck_files with files_lookup_fd_rcu This change renames fcheck_files to files_lookup_fd_rcu. All of the remaining callers take the rcu_read_lock before calling this function so the _rcu suffix is appropriate. This change also tightens up the debug check to verify that all callers hold the rcu_read_lock. All callers that used to call files_check with the files->file_lock held have now been changed to call files_lookup_fd_locked. This change of name has helped remind me of which locks and which guarantees are in place helping me to catch bugs later in the patchset. The need for better names became apparent in the last round of discussion of this set of changes[1]. [1] https://lkml.kernel.org/r/CAHk-=wj8BQbgJFLa+J0e=iT-1qpmCRTbPAJ8gd6MJQ=kbRPqyQ@mail.gmail.com Link: https://lkml.kernel.org/r/20201120231441.29911-9-ebiederm@xmission.com Signed-off-by: Eric W. Biederman --- Documentation/filesystems/files.rst | 6 +++--- fs/file.c | 4 ++-- fs/proc/fd.c | 4 ++-- include/linux/fdtable.h | 7 +++---- kernel/bpf/task_iter.c | 2 +- kernel/kcmp.c | 2 +- 6 files changed, 12 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/Documentation/filesystems/files.rst b/Documentation/filesystems/files.rst index cbf8e57376bf..ea75acdb632c 100644 --- a/Documentation/filesystems/files.rst +++ b/Documentation/filesystems/files.rst @@ -62,7 +62,7 @@ the fdtable structure - be held. 4. To look up the file structure given an fd, a reader - must use either fcheck() or fcheck_files() APIs. These + must use either fcheck() or files_lookup_fd_rcu() APIs. These take care of barrier requirements due to lock-free lookup. An example:: @@ -84,7 +84,7 @@ the fdtable structure - on ->f_count:: rcu_read_lock(); - file = fcheck_files(files, fd); + file = files_lookup_fd_rcu(files, fd); if (file) { if (atomic_long_inc_not_zero(&file->f_count)) *fput_needed = 1; @@ -104,7 +104,7 @@ the fdtable structure - lock-free, they must be installed using rcu_assign_pointer() API. If they are looked up lock-free, rcu_dereference() must be used. However it is advisable to use files_fdtable() - and fcheck()/fcheck_files() which take care of these issues. + and fcheck()/files_lookup_fd_rcu() which take care of these issues. 7. While updating, the fdtable pointer must be looked up while holding files->file_lock. If ->file_lock is dropped, then diff --git a/fs/file.c b/fs/file.c index 9d0e91168be1..5861c4f89419 100644 --- a/fs/file.c +++ b/fs/file.c @@ -814,7 +814,7 @@ static struct file *__fget_files(struct files_struct *files, unsigned int fd, rcu_read_lock(); loop: - file = fcheck_files(files, fd); + file = files_lookup_fd_rcu(files, fd); if (file) { /* File object ref couldn't be taken. 
* dup2() atomicity guarantee is the reason @@ -1127,7 +1127,7 @@ SYSCALL_DEFINE2(dup2, unsigned int, oldfd, unsigned int, newfd) int retval = oldfd; rcu_read_lock(); - if (!fcheck_files(files, oldfd)) + if (!files_lookup_fd_rcu(files, oldfd)) retval = -EBADF; rcu_read_unlock(); return retval; diff --git a/fs/proc/fd.c b/fs/proc/fd.c index 2cca9bca3b3a..3dec44d7c5c5 100644 --- a/fs/proc/fd.c +++ b/fs/proc/fd.c @@ -90,7 +90,7 @@ static bool tid_fd_mode(struct task_struct *task, unsigned fd, fmode_t *mode) return false; rcu_read_lock(); - file = fcheck_files(files, fd); + file = files_lookup_fd_rcu(files, fd); if (file) *mode = file->f_mode; rcu_read_unlock(); @@ -243,7 +243,7 @@ static int proc_readfd_common(struct file *file, struct dir_context *ctx, char name[10 + 1]; unsigned int len; - f = fcheck_files(files, fd); + f = files_lookup_fd_rcu(files, fd); if (!f) continue; data.mode = f->f_mode; diff --git a/include/linux/fdtable.h b/include/linux/fdtable.h index fda4b81dd735..fa8c402a7790 100644 --- a/include/linux/fdtable.h +++ b/include/linux/fdtable.h @@ -98,10 +98,9 @@ static inline struct file *files_lookup_fd_locked(struct files_struct *files, un return files_lookup_fd_raw(files, fd); } -static inline struct file *fcheck_files(struct files_struct *files, unsigned int fd) +static inline struct file *files_lookup_fd_rcu(struct files_struct *files, unsigned int fd) { - RCU_LOCKDEP_WARN(!rcu_read_lock_held() && - !lockdep_is_held(&files->file_lock), + RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "suspicious rcu_dereference_check() usage"); return files_lookup_fd_raw(files, fd); } @@ -109,7 +108,7 @@ static inline struct file *fcheck_files(struct files_struct *files, unsigned int /* * Check whether the specified fd has an open file. */ -#define fcheck(fd) fcheck_files(current->files, fd) +#define fcheck(fd) files_lookup_fd_rcu(current->files, fd) struct task_struct; diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c index 5b6af30bfbcd..5ab2ccfb96cb 100644 --- a/kernel/bpf/task_iter.c +++ b/kernel/bpf/task_iter.c @@ -183,7 +183,7 @@ again: for (; curr_fd < max_fds; curr_fd++) { struct file *f; - f = fcheck_files(curr_files, curr_fd); + f = files_lookup_fd_rcu(curr_files, curr_fd); if (!f) continue; if (!get_file_rcu(f)) diff --git a/kernel/kcmp.c b/kernel/kcmp.c index 87c48c0104ad..990717c1aed3 100644 --- a/kernel/kcmp.c +++ b/kernel/kcmp.c @@ -67,7 +67,7 @@ get_file_raw_ptr(struct task_struct *task, unsigned int idx) rcu_read_lock(); if (task->files) - file = fcheck_files(task->files, idx); + file = files_lookup_fd_rcu(task->files, idx); rcu_read_unlock(); task_unlock(task); -- cgit v1.2.3 From ed77e80e14a3cd55c73848b9e8043020e717ce12 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 20 Nov 2020 17:14:30 -0600 Subject: kcmp: In get_file_raw_ptr use task_lookup_fd_rcu Modify get_file_raw_ptr to use task_lookup_fd_rcu. The helper task_lookup_fd_rcu does the work of taking the task lock and verifying that task->files != NULL and then calls files_lookup_fd_rcu. So let use the helper to make a simpler implementation of get_file_raw_ptr. Acked-by: Cyrill Gorcunov Link: https://lkml.kernel.org/r/20201120231441.29911-13-ebiederm@xmission.com Signed-off-by: Eric W. 
Biederman --- kernel/kcmp.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/kcmp.c b/kernel/kcmp.c index 990717c1aed3..36e58eb5a11d 100644 --- a/kernel/kcmp.c +++ b/kernel/kcmp.c @@ -61,16 +61,11 @@ static int kcmp_ptr(void *v1, void *v2, enum kcmp_type type) static struct file * get_file_raw_ptr(struct task_struct *task, unsigned int idx) { - struct file *file = NULL; + struct file *file; - task_lock(task); rcu_read_lock(); - - if (task->files) - file = files_lookup_fd_rcu(task->files, idx); - + file = task_lookup_fd_rcu(task, idx); rcu_read_unlock(); - task_unlock(task); return file; } -- cgit v1.2.3 From 66ed594409a10b1cc6fa1e8d22bc8aed2a080d0c Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 20 Nov 2020 17:14:33 -0600 Subject: bpf/task_iter: In task_file_seq_get_next use task_lookup_next_fd_rcu When discussing[1] exec and posix file locks it was realized that none of the callers of get_files_struct fundamentally needed to call get_files_struct, and that by switching them to helper functions instead it will both simplify their code and remove unnecessary increments of files_struct.count. Those unnecessary increments can result in exec unnecessarily unsharing files_struct which breaking posix locks, and it can result in fget_light having to fallback to fget reducing system performance. Using task_lookup_next_fd_rcu simplifies task_file_seq_get_next, by moving the checking for the maximum file descritor into the generic code, and by remvoing the need for capturing and releasing a reference on files_struct. As the reference count of files_struct no longer needs to be maintained bpf_iter_seq_task_file_info can have it's files member removed and task_file_seq_get_next no longer needs it's fstruct argument. The curr_fd local variable does need to become unsigned to be used with fnext_task. As curr_fd is assigned from and assigned a u32 making curr_fd an unsigned int won't cause problems and might prevent them. [1] https://lkml.kernel.org/r/20180915160423.GA31461@redhat.com Suggested-by: Oleg Nesterov v1: https://lkml.kernel.org/r/20200817220425.9389-11-ebiederm@xmission.com Link: https://lkml.kernel.org/r/20201120231441.29911-16-ebiederm@xmission.com Signed-off-by: Eric W. Biederman --- kernel/bpf/task_iter.c | 44 ++++++++++---------------------------------- 1 file changed, 10 insertions(+), 34 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c index 5ab2ccfb96cb..4ec63170c741 100644 --- a/kernel/bpf/task_iter.c +++ b/kernel/bpf/task_iter.c @@ -130,45 +130,33 @@ struct bpf_iter_seq_task_file_info { */ struct bpf_iter_seq_task_common common; struct task_struct *task; - struct files_struct *files; u32 tid; u32 fd; }; static struct file * task_file_seq_get_next(struct bpf_iter_seq_task_file_info *info, - struct task_struct **task, struct files_struct **fstruct) + struct task_struct **task) { struct pid_namespace *ns = info->common.ns; - u32 curr_tid = info->tid, max_fds; - struct files_struct *curr_files; + u32 curr_tid = info->tid; struct task_struct *curr_task; - int curr_fd = info->fd; + unsigned int curr_fd = info->fd; /* If this function returns a non-NULL file object, - * it held a reference to the task/files_struct/file. + * it held a reference to the task/file. * Otherwise, it does not hold any reference. 
*/ again: if (*task) { curr_task = *task; - curr_files = *fstruct; curr_fd = info->fd; } else { curr_task = task_seq_get_next(ns, &curr_tid, true); if (!curr_task) return NULL; - curr_files = get_files_struct(curr_task); - if (!curr_files) { - put_task_struct(curr_task); - curr_tid = ++(info->tid); - info->fd = 0; - goto again; - } - - /* set *fstruct, *task and info->tid */ - *fstruct = curr_files; + /* set *task and info->tid */ *task = curr_task; if (curr_tid == info->tid) { curr_fd = info->fd; @@ -179,13 +167,11 @@ again: } rcu_read_lock(); - max_fds = files_fdtable(curr_files)->max_fds; - for (; curr_fd < max_fds; curr_fd++) { + for (;; curr_fd++) { struct file *f; - - f = files_lookup_fd_rcu(curr_files, curr_fd); + f = task_lookup_next_fd_rcu(curr_task, &curr_fd); if (!f) - continue; + break; if (!get_file_rcu(f)) continue; @@ -197,10 +183,8 @@ again: /* the current task is done, go to the next task */ rcu_read_unlock(); - put_files_struct(curr_files); put_task_struct(curr_task); *task = NULL; - *fstruct = NULL; info->fd = 0; curr_tid = ++(info->tid); goto again; @@ -209,13 +193,11 @@ again: static void *task_file_seq_start(struct seq_file *seq, loff_t *pos) { struct bpf_iter_seq_task_file_info *info = seq->private; - struct files_struct *files = NULL; struct task_struct *task = NULL; struct file *file; - file = task_file_seq_get_next(info, &task, &files); + file = task_file_seq_get_next(info, &task); if (!file) { - info->files = NULL; info->task = NULL; return NULL; } @@ -223,7 +205,6 @@ static void *task_file_seq_start(struct seq_file *seq, loff_t *pos) if (*pos == 0) ++*pos; info->task = task; - info->files = files; return file; } @@ -231,22 +212,19 @@ static void *task_file_seq_start(struct seq_file *seq, loff_t *pos) static void *task_file_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct bpf_iter_seq_task_file_info *info = seq->private; - struct files_struct *files = info->files; struct task_struct *task = info->task; struct file *file; ++*pos; ++info->fd; fput((struct file *)v); - file = task_file_seq_get_next(info, &task, &files); + file = task_file_seq_get_next(info, &task); if (!file) { - info->files = NULL; info->task = NULL; return NULL; } info->task = task; - info->files = files; return file; } @@ -295,9 +273,7 @@ static void task_file_seq_stop(struct seq_file *seq, void *v) (void)__task_file_seq_show(seq, v, true); } else { fput((struct file *)v); - put_files_struct(info->files); put_task_struct(info->task); - info->files = NULL; info->task = NULL; } } -- cgit v1.2.3 From f7cfd871ae0c5008d94b6f66834e7845caa93c15 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Thu, 3 Dec 2020 14:12:00 -0600 Subject: exec: Transform exec_update_mutex into a rw_semaphore Recently syzbot reported[0] that there is a deadlock amongst the users of exec_update_mutex. The problematic lock ordering found by lockdep was: perf_event_open (exec_update_mutex -> ovl_i_mutex) chown (ovl_i_mutex -> sb_writes) sendfile (sb_writes -> p->lock) by reading from a proc file and writing to overlayfs proc_pid_syscall (p->lock -> exec_update_mutex) While looking at possible solutions it occured to me that all of the users and possible users involved only wanted to state of the given process to remain the same. They are all readers. The only writer is exec. There is no reason for readers to block on each other. So fix this deadlock by transforming exec_update_mutex into a rw_semaphore named exec_update_lock that only exec takes for writing. 
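
As an illustration only (not part of the patch that follows), the reader/writer pattern the conversion relies on is the stock rw_semaphore API: any number of readers may hold the lock concurrently, and only the single writer (exec) excludes them. The lock and both functions below are hypothetical stand-ins for the real signal_struct member and its users, shown as a minimal sketch.

#include <linux/rwsem.h>

/* Hypothetical stand-in for signal_struct::exec_update_lock. */
static DECLARE_RWSEM(example_exec_update_lock);

/* Reader side: procfs, perf, kcmp, ... take the lock shared. */
static int example_reader(void)
{
	int err = down_read_killable(&example_exec_update_lock);

	if (err)
		return err;	/* interrupted by a fatal signal */
	/* inspect task state while exec is held off */
	up_read(&example_exec_update_lock);
	return 0;
}

/* Writer side: only exec takes the lock exclusively. */
static int example_writer(void)
{
	int err = down_write_killable(&example_exec_update_lock);

	if (err)
		return err;
	/* swap mm and credentials while all readers are excluded */
	up_write(&example_exec_update_lock);
	return 0;
}

Because every path in the reported cycle is a reader, none of them has to wait for another once shared acquisition is allowed; exec remains the only writer.
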
Cc: Jann Horn Cc: Vasiliy Kulikov Cc: Al Viro Cc: Bernd Edlinger Cc: Oleg Nesterov Cc: Christopher Yeoh Cc: Cyrill Gorcunov Cc: Sargun Dhillon Cc: Christian Brauner Cc: Arnd Bergmann Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Arnaldo Carvalho de Melo Fixes: eea9673250db ("exec: Add exec_update_mutex to replace cred_guard_mutex") [0] https://lkml.kernel.org/r/00000000000063640c05ade8e3de@google.com Reported-by: syzbot+db9cdf3dd1f64252c6ef@syzkaller.appspotmail.com Link: https://lkml.kernel.org/r/87ft4mbqen.fsf@x220.int.ebiederm.org Signed-off-by: Eric W. Biederman --- fs/exec.c | 12 ++++++------ fs/proc/base.c | 10 +++++----- include/linux/sched/signal.h | 11 ++++++----- init/init_task.c | 2 +- kernel/events/core.c | 12 ++++++------ kernel/fork.c | 6 +++--- kernel/kcmp.c | 30 +++++++++++++++--------------- kernel/pid.c | 4 ++-- 8 files changed, 44 insertions(+), 43 deletions(-) (limited to 'kernel') diff --git a/fs/exec.c b/fs/exec.c index 547a2390baf5..ca89e0e3ef10 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -965,8 +965,8 @@ EXPORT_SYMBOL(read_code); /* * Maps the mm_struct mm into the current task struct. - * On success, this function returns with the mutex - * exec_update_mutex locked. + * On success, this function returns with exec_update_lock + * held for writing. */ static int exec_mmap(struct mm_struct *mm) { @@ -981,7 +981,7 @@ static int exec_mmap(struct mm_struct *mm) if (old_mm) sync_mm_rss(old_mm); - ret = mutex_lock_killable(&tsk->signal->exec_update_mutex); + ret = down_write_killable(&tsk->signal->exec_update_lock); if (ret) return ret; @@ -995,7 +995,7 @@ static int exec_mmap(struct mm_struct *mm) mmap_read_lock(old_mm); if (unlikely(old_mm->core_state)) { mmap_read_unlock(old_mm); - mutex_unlock(&tsk->signal->exec_update_mutex); + up_write(&tsk->signal->exec_update_lock); return -EINTR; } } @@ -1382,7 +1382,7 @@ int begin_new_exec(struct linux_binprm * bprm) return 0; out_unlock: - mutex_unlock(&me->signal->exec_update_mutex); + up_write(&me->signal->exec_update_lock); out: return retval; } @@ -1423,7 +1423,7 @@ void setup_new_exec(struct linux_binprm * bprm) * some architectures like powerpc */ me->mm->task_size = TASK_SIZE; - mutex_unlock(&me->signal->exec_update_mutex); + up_write(&me->signal->exec_update_lock); mutex_unlock(&me->signal->cred_guard_mutex); } EXPORT_SYMBOL(setup_new_exec); diff --git a/fs/proc/base.c b/fs/proc/base.c index b362523a9829..55ce0ee9c5c7 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -405,11 +405,11 @@ print0: static int lock_trace(struct task_struct *task) { - int err = mutex_lock_killable(&task->signal->exec_update_mutex); + int err = down_read_killable(&task->signal->exec_update_lock); if (err) return err; if (!ptrace_may_access(task, PTRACE_MODE_ATTACH_FSCREDS)) { - mutex_unlock(&task->signal->exec_update_mutex); + up_read(&task->signal->exec_update_lock); return -EPERM; } return 0; @@ -417,7 +417,7 @@ static int lock_trace(struct task_struct *task) static void unlock_trace(struct task_struct *task) { - mutex_unlock(&task->signal->exec_update_mutex); + up_read(&task->signal->exec_update_lock); } #ifdef CONFIG_STACKTRACE @@ -2930,7 +2930,7 @@ static int do_io_accounting(struct task_struct *task, struct seq_file *m, int wh unsigned long flags; int result; - result = mutex_lock_killable(&task->signal->exec_update_mutex); + result = down_read_killable(&task->signal->exec_update_lock); if (result) return result; @@ -2966,7 +2966,7 @@ static int do_io_accounting(struct task_struct *task, struct seq_file *m, int wh result = 0; out_unlock: - 
mutex_unlock(&task->signal->exec_update_mutex); + up_read(&task->signal->exec_update_lock); return result; } diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h index 1bad18a1d8ba..4b6a8234d7fc 100644 --- a/include/linux/sched/signal.h +++ b/include/linux/sched/signal.h @@ -228,12 +228,13 @@ struct signal_struct { * credential calculations * (notably. ptrace) * Deprecated do not use in new code. - * Use exec_update_mutex instead. - */ - struct mutex exec_update_mutex; /* Held while task_struct is being - * updated during exec, and may have - * inconsistent permissions. + * Use exec_update_lock instead. */ + struct rw_semaphore exec_update_lock; /* Held while task_struct is + * being updated during exec, + * and may have inconsistent + * permissions. + */ } __randomize_layout; /* diff --git a/init/init_task.c b/init/init_task.c index a56f0abb63e9..15f6eb93a04f 100644 --- a/init/init_task.c +++ b/init/init_task.c @@ -26,7 +26,7 @@ static struct signal_struct init_signals = { .multiprocess = HLIST_HEAD_INIT, .rlim = INIT_RLIMITS, .cred_guard_mutex = __MUTEX_INITIALIZER(init_signals.cred_guard_mutex), - .exec_update_mutex = __MUTEX_INITIALIZER(init_signals.exec_update_mutex), + .exec_update_lock = __RWSEM_INITIALIZER(init_signals.exec_update_lock), #ifdef CONFIG_POSIX_TIMERS .posix_timers = LIST_HEAD_INIT(init_signals.posix_timers), .cputimer = { diff --git a/kernel/events/core.c b/kernel/events/core.c index dc568ca295bd..55b2330b556c 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1325,7 +1325,7 @@ static void put_ctx(struct perf_event_context *ctx) * function. * * Lock order: - * exec_update_mutex + * exec_update_lock * task_struct::perf_event_mutex * perf_event_context::mutex * perf_event::child_mutex; @@ -11721,14 +11721,14 @@ SYSCALL_DEFINE5(perf_event_open, } if (task) { - err = mutex_lock_interruptible(&task->signal->exec_update_mutex); + err = down_read_interruptible(&task->signal->exec_update_lock); if (err) goto err_task; /* * Preserve ptrace permission check for backwards compatibility. * - * We must hold exec_update_mutex across this and any potential + * We must hold exec_update_lock across this and any potential * perf_install_in_context() call for this new event to * serialize against exec() altering our credentials (and the * perf_event_exit_task() that could imply). @@ -12017,7 +12017,7 @@ SYSCALL_DEFINE5(perf_event_open, mutex_unlock(&ctx->mutex); if (task) { - mutex_unlock(&task->signal->exec_update_mutex); + up_read(&task->signal->exec_update_lock); put_task_struct(task); } @@ -12053,7 +12053,7 @@ err_alloc: free_event(event); err_cred: if (task) - mutex_unlock(&task->signal->exec_update_mutex); + up_read(&task->signal->exec_update_lock); err_task: if (task) put_task_struct(task); @@ -12358,7 +12358,7 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn) /* * When a child task exits, feed back event values to parent events. * - * Can be called with exec_update_mutex held when called from + * Can be called with exec_update_lock held when called from * setup_new_exec(). 
*/ void perf_event_exit_task(struct task_struct *child) diff --git a/kernel/fork.c b/kernel/fork.c index 6d266388d380..e8cb80b266d2 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1221,7 +1221,7 @@ struct mm_struct *mm_access(struct task_struct *task, unsigned int mode) struct mm_struct *mm; int err; - err = mutex_lock_killable(&task->signal->exec_update_mutex); + err = down_read_killable(&task->signal->exec_update_lock); if (err) return ERR_PTR(err); @@ -1231,7 +1231,7 @@ struct mm_struct *mm_access(struct task_struct *task, unsigned int mode) mmput(mm); mm = ERR_PTR(-EACCES); } - mutex_unlock(&task->signal->exec_update_mutex); + up_read(&task->signal->exec_update_lock); return mm; } @@ -1591,7 +1591,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) sig->oom_score_adj_min = current->signal->oom_score_adj_min; mutex_init(&sig->cred_guard_mutex); - mutex_init(&sig->exec_update_mutex); + init_rwsem(&sig->exec_update_lock); return 0; } diff --git a/kernel/kcmp.c b/kernel/kcmp.c index b3ff9288c6cc..c0d2ad9b4705 100644 --- a/kernel/kcmp.c +++ b/kernel/kcmp.c @@ -75,25 +75,25 @@ get_file_raw_ptr(struct task_struct *task, unsigned int idx) return file; } -static void kcmp_unlock(struct mutex *m1, struct mutex *m2) +static void kcmp_unlock(struct rw_semaphore *l1, struct rw_semaphore *l2) { - if (likely(m2 != m1)) - mutex_unlock(m2); - mutex_unlock(m1); + if (likely(l2 != l1)) + up_read(l2); + up_read(l1); } -static int kcmp_lock(struct mutex *m1, struct mutex *m2) +static int kcmp_lock(struct rw_semaphore *l1, struct rw_semaphore *l2) { int err; - if (m2 > m1) - swap(m1, m2); + if (l2 > l1) + swap(l1, l2); - err = mutex_lock_killable(m1); - if (!err && likely(m1 != m2)) { - err = mutex_lock_killable_nested(m2, SINGLE_DEPTH_NESTING); + err = down_read_killable(l1); + if (!err && likely(l1 != l2)) { + err = down_read_killable_nested(l2, SINGLE_DEPTH_NESTING); if (err) - mutex_unlock(m1); + up_read(l1); } return err; @@ -173,8 +173,8 @@ SYSCALL_DEFINE5(kcmp, pid_t, pid1, pid_t, pid2, int, type, /* * One should have enough rights to inspect task details. */ - ret = kcmp_lock(&task1->signal->exec_update_mutex, - &task2->signal->exec_update_mutex); + ret = kcmp_lock(&task1->signal->exec_update_lock, + &task2->signal->exec_update_lock); if (ret) goto err; if (!ptrace_may_access(task1, PTRACE_MODE_READ_REALCREDS) || @@ -229,8 +229,8 @@ SYSCALL_DEFINE5(kcmp, pid_t, pid1, pid_t, pid2, int, type, } err_unlock: - kcmp_unlock(&task1->signal->exec_update_mutex, - &task2->signal->exec_update_mutex); + kcmp_unlock(&task1->signal->exec_update_lock, + &task2->signal->exec_update_lock); err: put_task_struct(task1); put_task_struct(task2); diff --git a/kernel/pid.c b/kernel/pid.c index a96bc4bf4f86..4856818c9de1 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -628,7 +628,7 @@ static struct file *__pidfd_fget(struct task_struct *task, int fd) struct file *file; int ret; - ret = mutex_lock_killable(&task->signal->exec_update_mutex); + ret = down_read_killable(&task->signal->exec_update_lock); if (ret) return ERR_PTR(ret); @@ -637,7 +637,7 @@ static struct file *__pidfd_fget(struct task_struct *task, int fd) else file = ERR_PTR(-EPERM); - mutex_unlock(&task->signal->exec_update_mutex); + up_read(&task->signal->exec_update_lock); return file ?: ERR_PTR(-EBADF); } -- cgit v1.2.3 From b02709587ea3d699a608568ee8157d8db4fd8cae Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 8 Dec 2020 19:01:51 +0100 Subject: bpf: Fix propagation of 32-bit signed bounds from 64-bit bounds. 
The 64-bit signed bounds should not affect 32-bit signed bounds unless the verifier knows that upper 32-bits are either all 1s or all 0s. For example the register with smin_value==1 doesn't mean that s32_min_value is also equal to 1, since smax_value could be larger than 32-bit subregister can hold. The verifier refines the smax/s32_max return value from certain helpers in do_refine_retval_range(). Teach the verifier to recognize that smin/s32_min value is also bounded. When both smin and smax bounds fit into 32-bit subregister the verifier can propagate those bounds. Fixes: 3f50f132d840 ("bpf: Verifier, do explicit ALU32 bounds tracking") Reported-by: Jean-Philippe Brucker Acked-by: John Fastabend Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 1388bf733071..53fe6ef6d931 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1298,9 +1298,7 @@ static void __reg_combine_32_into_64(struct bpf_reg_state *reg) static bool __reg64_bound_s32(s64 a) { - if (a > S32_MIN && a < S32_MAX) - return true; - return false; + return a > S32_MIN && a < S32_MAX; } static bool __reg64_bound_u32(u64 a) @@ -1314,10 +1312,10 @@ static void __reg_combine_64_into_32(struct bpf_reg_state *reg) { __mark_reg32_unbounded(reg); - if (__reg64_bound_s32(reg->smin_value)) + if (__reg64_bound_s32(reg->smin_value) && __reg64_bound_s32(reg->smax_value)) { reg->s32_min_value = (s32)reg->smin_value; - if (__reg64_bound_s32(reg->smax_value)) reg->s32_max_value = (s32)reg->smax_value; + } if (__reg64_bound_u32(reg->umin_value)) reg->u32_min_value = (u32)reg->umin_value; if (__reg64_bound_u32(reg->umax_value)) @@ -4895,6 +4893,8 @@ static void do_refine_retval_range(struct bpf_reg_state *regs, int ret_type, ret_reg->smax_value = meta->msize_max_value; ret_reg->s32_max_value = meta->msize_max_value; + ret_reg->smin_value = -MAX_ERRNO; + ret_reg->s32_min_value = -MAX_ERRNO; __reg_deduce_bounds(ret_reg); __reg_bound_offset(ret_reg); __update_reg_bounds(ret_reg); -- cgit v1.2.3 From 59a74b1544e1c07ffbfd1edff5fd73ce7d3d3146 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 1 Dec 2020 13:09:06 +0100 Subject: sched: Fix kernel-doc markup Kernel-doc requires that a kernel-doc markup to be immediately below the function prototype, as otherwise it will rename it. So, move sys_sched_yield() markup to the right place. Also fix the cpu_util() markup: Kernel-doc markups should use this format: identifier - description Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Reviewed-by: Vincent Guittot Link: https://lkml.kernel.org/r/50cd6f460aeb872ebe518a8e9cfffda2df8bdb0a.1606823973.git.mchehab+huawei@kernel.org --- kernel/sched/core.c | 16 ++++++++-------- kernel/sched/fair.c | 2 +- 2 files changed, 9 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index a7abbba98083..7af80c3fce12 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6611,14 +6611,6 @@ SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len, return ret; } -/** - * sys_sched_yield - yield the current processor to other threads. - * - * This function yields the current CPU to other tasks. If there are no - * other threads running on this CPU then this function will return. - * - * Return: 0. 
- */ static void do_sched_yield(void) { struct rq_flags rf; @@ -6636,6 +6628,14 @@ static void do_sched_yield(void) schedule(); } +/** + * sys_sched_yield - yield the current processor to other threads. + * + * This function yields the current CPU to other tasks. If there are no + * other threads running on this CPU then this function will return. + * + * Return: 0. + */ SYSCALL_DEFINE0(sched_yield) { do_sched_yield(); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index e7e21ac479a2..f5dcedacc104 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6330,7 +6330,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) } /** - * Amount of capacity of a CPU that is (estimated to be) used by CFS tasks + * cpu_util - Estimates the amount of capacity of a CPU used by CFS tasks. * @cpu: the CPU to get the utilization of * * The unit of the return value must be the one of capacity so we can compare -- cgit v1.2.3 From 13d5a5e9f9b8515da3c04305ae1bb03ab91be7a7 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 30 Nov 2020 14:40:20 +0000 Subject: sched/fair: Clear SMT siblings after determining the core is not idle The clearing of SMT siblings from the SIS mask before checking for an idle core is a small but unnecessary cost. Defer the clearing of the siblings until the scan moves to the next potential target. The cost of this was not measured as it is borderline noise but it should be self-evident. Signed-off-by: Mel Gorman Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Reviewed-by: Vincent Guittot Link: https://lkml.kernel.org/r/20201130144020.GS3371@techsingularity.net --- kernel/sched/fair.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index f5dcedacc104..efac224c703f 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6086,10 +6086,11 @@ static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int break; } } - cpumask_andnot(cpus, cpus, cpu_smt_mask(core)); if (idle) return core; + + cpumask_andnot(cpus, cpus, cpu_smt_mask(core)); } /* -- cgit v1.2.3 From 5b78f2dc315354c05300795064f587366a02c6ff Mon Sep 17 00:00:00 2001 From: Barry Song Date: Thu, 3 Dec 2020 11:06:41 +1300 Subject: sched/fair: Trivial correction of the newidle_balance() comment idle_balance() has been renamed to newidle_balance(). To differentiate with nohz_idle_balance, it seems refining the comment will be helpful for the readers of the code. Signed-off-by: Barry Song Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lkml.kernel.org/r/20201202220641.22752-1-song.bao.hua@hisilicon.com --- kernel/sched/fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index efac224c703f..04a3ce20da67 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -10550,7 +10550,7 @@ static inline void nohz_newidle_balance(struct rq *this_rq) { } #endif /* CONFIG_NO_HZ_COMMON */ /* - * idle_balance is called by schedule() if this_cpu is about to become + * newidle_balance is called by schedule() if this_cpu is about to become * idle. Attempts to pull tasks from other CPUs. 
* * Returns: -- cgit v1.2.3 From c9e6189fb03123a7dfb93589280347b46f30b161 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 6 Dec 2020 22:46:18 +0100 Subject: ntp: Make the RTC synchronization more reliable Miroslav reported that the periodic RTC synchronization in the NTP code fails more often than not to hit the specified update window. The reason is that the code uses delayed_work to schedule the update which needs to be in thread context as the underlying RTC might be connected via a slow bus, e.g. I2C. In the update function it verifies whether the current time is correct vs. the requirements of the underlying RTC. But delayed_work is using the timer wheel for scheduling which is inaccurate by design. Depending on the distance to the expiry the wheel gets less granular to allow batching and to avoid the cascading of the original timer wheel. See 500462a9de65 ("timers: Switch to a non-cascading wheel") and the code for further details. The code already deals with this by splitting the 660 seconds period into a long 659 seconds timer and then retrying with a smaller delta. But looking at the actual granularities of the timer wheel (which depend on the HZ configuration) the 659 seconds timer ends up in an outer wheel level and is affected by a worst case granularity of: HZ Granularity 1000 32s 250 16s 100 40s So the initial timer can be already off by max 12.5% which is not a big issue as the period of the sync is defined as ~11 minutes. The fine grained second attempt schedules to the desired update point with a timer expiring less than a second from now. Depending on the actual delta and the HZ setting even the second attempt can end up in outer wheel levels which have a large enough granularity to make the correctness check fail. As this is a fundamental property of the timer wheel there is no way to make this more accurate short of iterating in one jiffies steps towards the update point. Switch it to an hrtimer instead which schedules the actual update work. The hrtimer will expire precisely (max 1 jiffie delay when high resolution timers are not available). The actual scheduling delay of the work is the same as before. The update is triggered from do_adjtimex() which is a bit racy but not much more racy than it was before: if (ntp_synced()) queue_delayed_work(system_power_efficient_wq, &sync_work, 0); which is racy when the work is currently executed and has not managed to reschedule itself. This becomes now: if (ntp_synced() && !hrtimer_is_queued(&sync_hrtimer)) queue_work(system_power_efficient_wq, &sync_work, 0); which is racy when the hrtimer has expired and the work is currently executed and has not yet managed to rearm the hrtimer. Not a big problem as it just schedules work for nothing. The new implementation has a safe guard in place to catch the case where the hrtimer is queued on entry to the work function and avoids an extra update attempt of the RTC that way. 
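For illustration only (simplified, with hypothetical demo_* names rather than the identifiers used in the patch below), the hrtimer-plus-workqueue pattern described above looks roughly like this: the timer callback runs in hard interrupt context and merely queues the work, while the work function performs the slow RTC access in thread context and rearms the timer with a precise absolute expiry.

#include <linux/hrtimer.h>
#include <linux/init.h>
#include <linux/ktime.h>
#include <linux/workqueue.h>

/* Hypothetical sketch of the scheme described above; not the actual patch. */
static void demo_sync_work_fn(struct work_struct *work);
static DECLARE_WORK(demo_sync_work, demo_sync_work_fn);
static struct hrtimer demo_sync_hrtimer;

static enum hrtimer_restart demo_sync_timer_fn(struct hrtimer *timer)
{
	/* Hard interrupt context: no slow bus access here, just kick the work */
	queue_work(system_power_efficient_wq, &demo_sync_work);
	return HRTIMER_NORESTART;
}

static void demo_sync_work_fn(struct work_struct *work)
{
	/* Thread context: safe to talk to a slow (e.g. I2C) RTC here ... */

	/*
	 * ... then rearm with an absolute CLOCK_REALTIME expiry, which is
	 * precise instead of being subject to timer wheel granularity.
	 */
	hrtimer_start(&demo_sync_hrtimer,
		      ktime_add_ns(ktime_get_real(), 11UL * 60 * NSEC_PER_SEC),
		      HRTIMER_MODE_ABS);
}

static void __init demo_sync_init(void)
{
	hrtimer_init(&demo_sync_hrtimer, CLOCK_REALTIME, HRTIMER_MODE_ABS);
	demo_sync_hrtimer.function = demo_sync_timer_fn;
}
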
Reported-by: Miroslav Lichvar Signed-off-by: Thomas Gleixner Tested-by: Miroslav Lichvar Reviewed-by: Jason Gunthorpe Acked-by: Alexandre Belloni Link: https://lore.kernel.org/r/20201206220542.062910520@linutronix.de --- include/linux/timex.h | 1 - kernel/time/ntp.c | 90 ++++++++++++++++++++++++---------------------- kernel/time/ntp_internal.h | 7 ++++ 3 files changed, 55 insertions(+), 43 deletions(-) (limited to 'kernel') diff --git a/include/linux/timex.h b/include/linux/timex.h index ce0859763670..9c2e54faf9b7 100644 --- a/include/linux/timex.h +++ b/include/linux/timex.h @@ -157,7 +157,6 @@ extern int do_clock_adjtime(const clockid_t which_clock, struct __kernel_timex * extern void hardpps(const struct timespec64 *, const struct timespec64 *); int read_current_timer(unsigned long *timer_val); -void ntp_notify_cmos_timer(void); /* The clock frequency of the i8253/i8254 PIT */ #define PIT_TICK_RATE 1193182ul diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 069ca78fb0bf..ff1a7b8ec4ef 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -494,65 +494,55 @@ out: return leap; } +#if defined(CONFIG_GENERIC_CMOS_UPDATE) || defined(CONFIG_RTC_SYSTOHC) static void sync_hw_clock(struct work_struct *work); -static DECLARE_DELAYED_WORK(sync_work, sync_hw_clock); - -static void sched_sync_hw_clock(struct timespec64 now, - unsigned long target_nsec, bool fail) +static DECLARE_WORK(sync_work, sync_hw_clock); +static struct hrtimer sync_hrtimer; +#define SYNC_PERIOD_NS (11UL * 60 * NSEC_PER_SEC) +static enum hrtimer_restart sync_timer_callback(struct hrtimer *timer) { - struct timespec64 next; + queue_work(system_power_efficient_wq, &sync_work); - ktime_get_real_ts64(&next); - if (!fail) - next.tv_sec = 659; - else { - /* - * Try again as soon as possible. Delaying long periods - * decreases the accuracy of the work queue timer. Due to this - * the algorithm is very likely to require a short-sleep retry - * after the above long sleep to synchronize ts_nsec. - */ - next.tv_sec = 0; - } + return HRTIMER_NORESTART; +} - /* Compute the needed delay that will get to tv_nsec == target_nsec */ - next.tv_nsec = target_nsec - next.tv_nsec; - if (next.tv_nsec <= 0) - next.tv_nsec += NSEC_PER_SEC; - if (next.tv_nsec >= NSEC_PER_SEC) { - next.tv_sec++; - next.tv_nsec -= NSEC_PER_SEC; - } +static void sched_sync_hw_clock(unsigned long offset_nsec, bool retry) +{ + ktime_t exp = ktime_set(ktime_get_real_seconds(), 0); + + if (retry) + exp = ktime_add_ns(exp, 2 * NSEC_PER_SEC - offset_nsec); + else + exp = ktime_add_ns(exp, SYNC_PERIOD_NS - offset_nsec); - queue_delayed_work(system_power_efficient_wq, &sync_work, - timespec64_to_jiffies(&next)); + hrtimer_start(&sync_hrtimer, exp, HRTIMER_MODE_ABS); } static void sync_rtc_clock(void) { - unsigned long target_nsec; - struct timespec64 adjust, now; + unsigned long offset_nsec; + struct timespec64 adjust; int rc; if (!IS_ENABLED(CONFIG_RTC_SYSTOHC)) return; - ktime_get_real_ts64(&now); + ktime_get_real_ts64(&adjust); - adjust = now; if (persistent_clock_is_local) adjust.tv_sec -= (sys_tz.tz_minuteswest * 60); /* - * The current RTC in use will provide the target_nsec it wants to be - * called at, and does rtc_tv_nsec_ok internally. + * The current RTC in use will provide the nanoseconds offset prior + * to a full second it wants to be called at, and invokes + * rtc_tv_nsec_ok() internally. 
*/ - rc = rtc_set_ntp_time(adjust, &target_nsec); + rc = rtc_set_ntp_time(adjust, &offset_nsec); if (rc == -ENODEV) return; - sched_sync_hw_clock(now, target_nsec, rc); + sched_sync_hw_clock(offset_nsec, rc != 0); } #ifdef CONFIG_GENERIC_CMOS_UPDATE @@ -599,7 +589,7 @@ static bool sync_cmos_clock(void) } } - sched_sync_hw_clock(now, target_nsec, rc); + sched_sync_hw_clock(target_nsec, rc != 0); return true; } @@ -613,7 +603,12 @@ static bool sync_cmos_clock(void) */ static void sync_hw_clock(struct work_struct *work) { - if (!ntp_synced()) + /* + * Don't update if STA_UNSYNC is set and if ntp_notify_cmos_timer() + * managed to schedule the work between the timer firing and the + * work being able to rearm the timer. Wait for the timer to expire. + */ + if (!ntp_synced() || hrtimer_is_queued(&sync_hrtimer)) return; if (sync_cmos_clock()) @@ -624,13 +619,23 @@ static void sync_hw_clock(struct work_struct *work) void ntp_notify_cmos_timer(void) { - if (!ntp_synced()) - return; + /* + * When the work is currently executed but has not yet the timer + * rearmed this queues the work immediately again. No big issue, + * just a pointless work scheduled. + */ + if (ntp_synced() && !hrtimer_is_queued(&sync_hrtimer)) + queue_work(system_power_efficient_wq, &sync_work); +} - if (IS_ENABLED(CONFIG_GENERIC_CMOS_UPDATE) || - IS_ENABLED(CONFIG_RTC_SYSTOHC)) - queue_delayed_work(system_power_efficient_wq, &sync_work, 0); +static void __init ntp_init_cmos_sync(void) +{ + hrtimer_init(&sync_hrtimer, CLOCK_REALTIME, HRTIMER_MODE_ABS); + sync_hrtimer.function = sync_timer_callback; } +#else /* CONFIG_GENERIC_CMOS_UPDATE) || defined(CONFIG_RTC_SYSTOHC) */ +static inline void __init ntp_init_cmos_sync(void) { } +#endif /* !CONFIG_GENERIC_CMOS_UPDATE) || defined(CONFIG_RTC_SYSTOHC) */ /* * Propagate a new txc->status value into the NTP state: @@ -1044,4 +1049,5 @@ __setup("ntp_tick_adj=", ntp_tick_adj_setup); void __init ntp_init(void) { ntp_clear(); + ntp_init_cmos_sync(); } diff --git a/kernel/time/ntp_internal.h b/kernel/time/ntp_internal.h index 908ecaa65fc3..23d1b74c3065 100644 --- a/kernel/time/ntp_internal.h +++ b/kernel/time/ntp_internal.h @@ -12,4 +12,11 @@ extern int __do_adjtimex(struct __kernel_timex *txc, const struct timespec64 *ts, s32 *time_tai, struct audit_ntp_data *ad); extern void __hardpps(const struct timespec64 *phase_ts, const struct timespec64 *raw_ts); + +#if defined(CONFIG_GENERIC_CMOS_UPDATE) || defined(CONFIG_RTC_SYSTOHC) +extern void ntp_notify_cmos_timer(void); +#else +static inline void ntp_notify_cmos_timer(void) { } +#endif + #endif /* _LINUX_NTP_INTERNAL_H */ -- cgit v1.2.3 From 33e62e832384c8cb523044e0e9d99d7133f98e93 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 6 Dec 2020 22:46:19 +0100 Subject: ntp, rtc: Move rtc_set_ntp_time() to ntp code rtc_set_ntp_time() is not really RTC functionality as the code is just a user of RTC. Move it into the NTP code which allows further cleanups. 
Requested-by: Alexandre Belloni Signed-off-by: Thomas Gleixner Reviewed-by: Jason Gunthorpe Acked-by: Alexandre Belloni Link: https://lore.kernel.org/r/20201206220542.166871172@linutronix.de --- drivers/rtc/Makefile | 1 - drivers/rtc/systohc.c | 61 ----------------------------------- include/linux/rtc.h | 34 -------------------- kernel/time/ntp.c | 88 +++++++++++++++++++++++++++++++++++++++++++++++++-- 4 files changed, 85 insertions(+), 99 deletions(-) delete mode 100644 drivers/rtc/systohc.c (limited to 'kernel') diff --git a/drivers/rtc/Makefile b/drivers/rtc/Makefile index bfb57464118d..bb8f319b09fb 100644 --- a/drivers/rtc/Makefile +++ b/drivers/rtc/Makefile @@ -6,7 +6,6 @@ ccflags-$(CONFIG_RTC_DEBUG) := -DDEBUG obj-$(CONFIG_RTC_LIB) += lib.o -obj-$(CONFIG_RTC_SYSTOHC) += systohc.o obj-$(CONFIG_RTC_CLASS) += rtc-core.o obj-$(CONFIG_RTC_MC146818_LIB) += rtc-mc146818-lib.o rtc-core-y := class.o interface.o diff --git a/drivers/rtc/systohc.c b/drivers/rtc/systohc.c deleted file mode 100644 index 8b70f0520e13..000000000000 --- a/drivers/rtc/systohc.c +++ /dev/null @@ -1,61 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include -#include - -/** - * rtc_set_ntp_time - Save NTP synchronized time to the RTC - * @now: Current time of day - * @target_nsec: pointer for desired now->tv_nsec value - * - * Replacement for the NTP platform function update_persistent_clock64 - * that stores time for later retrieval by rtc_hctosys. - * - * Returns 0 on successful RTC update, -ENODEV if a RTC update is not - * possible at all, and various other -errno for specific temporary failure - * cases. - * - * -EPROTO is returned if now.tv_nsec is not close enough to *target_nsec. - * - * If temporary failure is indicated the caller should try again 'soon' - */ -int rtc_set_ntp_time(struct timespec64 now, unsigned long *target_nsec) -{ - struct rtc_device *rtc; - struct rtc_time tm; - struct timespec64 to_set; - int err = -ENODEV; - bool ok; - - rtc = rtc_class_open(CONFIG_RTC_SYSTOHC_DEVICE); - if (!rtc) - goto out_err; - - if (!rtc->ops || !rtc->ops->set_time) - goto out_close; - - /* Compute the value of tv_nsec we require the caller to supply in - * now.tv_nsec. This is the value such that (now + - * set_offset_nsec).tv_nsec == 0. - */ - set_normalized_timespec64(&to_set, 0, -rtc->set_offset_nsec); - *target_nsec = to_set.tv_nsec; - - /* The ntp code must call this with the correct value in tv_nsec, if - * it does not we update target_nsec and return EPROTO to make the ntp - * code try again later. 
- */ - ok = rtc_tv_nsec_ok(rtc->set_offset_nsec, &to_set, &now); - if (!ok) { - err = -EPROTO; - goto out_close; - } - - rtc_time64_to_tm(to_set.tv_sec, &tm); - - err = rtc_set_time(rtc, &tm); - -out_close: - rtc_class_close(rtc); -out_err: - return err; -} diff --git a/include/linux/rtc.h b/include/linux/rtc.h index 22d1575e4991..ff62680b48ca 100644 --- a/include/linux/rtc.h +++ b/include/linux/rtc.h @@ -165,7 +165,6 @@ int __rtc_register_device(struct module *owner, struct rtc_device *rtc); extern int rtc_read_time(struct rtc_device *rtc, struct rtc_time *tm); extern int rtc_set_time(struct rtc_device *rtc, struct rtc_time *tm); -extern int rtc_set_ntp_time(struct timespec64 now, unsigned long *target_nsec); int __rtc_read_alarm(struct rtc_device *rtc, struct rtc_wkalrm *alarm); extern int rtc_read_alarm(struct rtc_device *rtc, struct rtc_wkalrm *alrm); @@ -205,39 +204,6 @@ static inline bool is_leap_year(unsigned int year) return (!(year % 4) && (year % 100)) || !(year % 400); } -/* Determine if we can call to driver to set the time. Drivers can only be - * called to set a second aligned time value, and the field set_offset_nsec - * specifies how far away from the second aligned time to call the driver. - * - * This also computes 'to_set' which is the time we are trying to set, and has - * a zero in tv_nsecs, such that: - * to_set - set_delay_nsec == now +/- FUZZ - * - */ -static inline bool rtc_tv_nsec_ok(s64 set_offset_nsec, - struct timespec64 *to_set, - const struct timespec64 *now) -{ - /* Allowed error in tv_nsec, arbitarily set to 5 jiffies in ns. */ - const unsigned long TIME_SET_NSEC_FUZZ = TICK_NSEC * 5; - struct timespec64 delay = {.tv_sec = 0, - .tv_nsec = set_offset_nsec}; - - *to_set = timespec64_add(*now, delay); - - if (to_set->tv_nsec < TIME_SET_NSEC_FUZZ) { - to_set->tv_nsec = 0; - return true; - } - - if (to_set->tv_nsec > NSEC_PER_SEC - TIME_SET_NSEC_FUZZ) { - to_set->tv_sec++; - to_set->tv_nsec = 0; - return true; - } - return false; -} - #define rtc_register_device(device) \ __rtc_register_device(THIS_MODULE, device) diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index ff1a7b8ec4ef..84a554622cee 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -519,15 +519,94 @@ static void sched_sync_hw_clock(unsigned long offset_nsec, bool retry) hrtimer_start(&sync_hrtimer, exp, HRTIMER_MODE_ABS); } +/* + * Determine if we can call to driver to set the time. Drivers can only be + * called to set a second aligned time value, and the field set_offset_nsec + * specifies how far away from the second aligned time to call the driver. + * + * This also computes 'to_set' which is the time we are trying to set, and has + * a zero in tv_nsecs, such that: + * to_set - set_delay_nsec == now +/- FUZZ + * + */ +static inline bool rtc_tv_nsec_ok(long set_offset_nsec, + struct timespec64 *to_set, + const struct timespec64 *now) +{ + /* Allowed error in tv_nsec, arbitarily set to 5 jiffies in ns. 
*/ + const unsigned long TIME_SET_NSEC_FUZZ = TICK_NSEC * 5; + struct timespec64 delay = {.tv_sec = 0, + .tv_nsec = set_offset_nsec}; + + *to_set = timespec64_add(*now, delay); + + if (to_set->tv_nsec < TIME_SET_NSEC_FUZZ) { + to_set->tv_nsec = 0; + return true; + } + + if (to_set->tv_nsec > NSEC_PER_SEC - TIME_SET_NSEC_FUZZ) { + to_set->tv_sec++; + to_set->tv_nsec = 0; + return true; + } + return false; +} + +#ifdef CONFIG_RTC_SYSTOHC +/* + * rtc_set_ntp_time - Save NTP synchronized time to the RTC + */ +static int rtc_set_ntp_time(struct timespec64 now, unsigned long *target_nsec) +{ + struct rtc_device *rtc; + struct rtc_time tm; + struct timespec64 to_set; + int err = -ENODEV; + bool ok; + + rtc = rtc_class_open(CONFIG_RTC_SYSTOHC_DEVICE); + if (!rtc) + goto out_err; + + if (!rtc->ops || !rtc->ops->set_time) + goto out_close; + + /* + * Compute the value of tv_nsec we require the caller to supply in + * now.tv_nsec. This is the value such that (now + + * set_offset_nsec).tv_nsec == 0. + */ + set_normalized_timespec64(&to_set, 0, -rtc->set_offset_nsec); + *target_nsec = to_set.tv_nsec; + + /* + * The ntp code must call this with the correct value in tv_nsec, if + * it does not we update target_nsec and return EPROTO to make the ntp + * code try again later. + */ + ok = rtc_tv_nsec_ok(rtc->set_offset_nsec, &to_set, &now); + if (!ok) { + err = -EPROTO; + goto out_close; + } + + rtc_time64_to_tm(to_set.tv_sec, &tm); + + err = rtc_set_time(rtc, &tm); + +out_close: + rtc_class_close(rtc); +out_err: + return err; +} + static void sync_rtc_clock(void) { unsigned long offset_nsec; struct timespec64 adjust; int rc; - if (!IS_ENABLED(CONFIG_RTC_SYSTOHC)) - return; - ktime_get_real_ts64(&adjust); if (persistent_clock_is_local) @@ -544,6 +623,9 @@ static void sync_rtc_clock(void) sched_sync_hw_clock(offset_nsec, rc != 0); } +#else +static inline void sync_rtc_clock(void) { } +#endif #ifdef CONFIG_GENERIC_CMOS_UPDATE int __weak update_persistent_clock64(struct timespec64 now64) -- cgit v1.2.3 From 69eca258c85000564577642ba28335eb4e1df8f0 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 6 Dec 2020 22:46:20 +0100 Subject: ntp: Make the RTC sync offset less obscure The current RTC set_offset_nsec value is not really intuitive to understand. tsched twrite(t2.tv_sec - 1) t2 (seconds increment) The offset is calculated from twrite based on the assumption that t2 - twrite == 1s. That means for the MC146818 RTC the offset needs to be negative so that the write happens 500ms before t2. It's easier to understand when the whole calculation is based on t2. That avoids negative offsets and the meaning is obvious: t2 - twrite: The time defined by the chip when seconds increment after the write. twrite - tsched: The time for the transport to the point where the chip is updated. ==> set_offset_nsec = t2 - tsched ttransport = twrite - tsched tRTCinc = t2 - twrite ==> set_offset_nsec = ttransport + tRTCinc tRTCinc is a chip property and can be obtained from the data sheet. ttransport depends on how the RTC is connected. It is close to 0 for directly accessible RTCs. For RTCs behind a slow bus, e.g. i2c, it's the time required to send the update over the bus. This can be estimated or even calibrated, but that's a different problem. Adjust the implementation and update comments accordingly. 
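Read concretely (these numbers are illustrative, derived from the defaults in the diff below rather than prescribed by it), the formula set_offset_nsec = ttransport + tRTCinc gives:

/*
 * Worked examples, illustrative only; the real values live in the drivers.
 *
 * MC146818/CMOS: directly accessible (ttransport ~ 0), seconds increment
 * 500 ms after the write -> offset = 0.5 s
 */
unsigned long cmos_offset_nsec  = 0 + NSEC_PER_SEC / 2;

/*
 * RTC class default: ~5 ms assumed transport time, seconds increment a
 * full second after the write -> offset = 1.005 s
 */
unsigned long class_offset_nsec = 5 * NSEC_PER_MSEC + NSEC_PER_SEC;

With those offsets, tsched = t2 - set_offset_nsec schedules the write early enough to cover both the transport time and the chip's internal delay before the seconds increment.
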
Signed-off-by: Thomas Gleixner Acked-by: Alexandre Belloni Link: https://lore.kernel.org/r/20201206220542.263204937@linutronix.de --- drivers/rtc/class.c | 9 +++++++-- drivers/rtc/rtc-cmos.c | 2 +- include/linux/rtc.h | 35 +++++++++++++++++++++++++++++------ kernel/time/ntp.c | 47 ++++++++++++++++++++++++----------------------- 4 files changed, 61 insertions(+), 32 deletions(-) (limited to 'kernel') diff --git a/drivers/rtc/class.c b/drivers/rtc/class.c index d7957376eb96..5855aa2eef62 100644 --- a/drivers/rtc/class.c +++ b/drivers/rtc/class.c @@ -200,8 +200,13 @@ static struct rtc_device *rtc_allocate_device(void) device_initialize(&rtc->dev); - /* Drivers can revise this default after allocating the device. */ - rtc->set_offset_nsec = 5 * NSEC_PER_MSEC; + /* + * Drivers can revise this default after allocating the device. + * The default is what most RTCs do: Increment seconds exactly one + * second after the write happened. This adds a default transport + * time of 5ms which is at least halfways close to reality. + */ + rtc->set_offset_nsec = NSEC_PER_SEC + 5 * NSEC_PER_MSEC; rtc->irq_freq = 1; rtc->max_user_freq = 64; diff --git a/drivers/rtc/rtc-cmos.c b/drivers/rtc/rtc-cmos.c index 7728faca01b4..c5bcd2adc9fe 100644 --- a/drivers/rtc/rtc-cmos.c +++ b/drivers/rtc/rtc-cmos.c @@ -869,7 +869,7 @@ cmos_do_probe(struct device *dev, struct resource *ports, int rtc_irq) goto cleanup2; /* Set the sync offset for the periodic 11min update correct */ - cmos_rtc.rtc->set_offset_nsec = -(NSEC_PER_SEC / 2); + cmos_rtc.rtc->set_offset_nsec = NSEC_PER_SEC / 2; /* export at least the first block of NVRAM */ nvmem_cfg.size = address_space - NVRAM_OFFSET; diff --git a/include/linux/rtc.h b/include/linux/rtc.h index ff62680b48ca..b829382de6c3 100644 --- a/include/linux/rtc.h +++ b/include/linux/rtc.h @@ -110,13 +110,36 @@ struct rtc_device { /* Some hardware can't support UIE mode */ int uie_unsupported; - /* Number of nsec it takes to set the RTC clock. This influences when - * the set ops are called. An offset: - * - of 0.5 s will call RTC set for wall clock time 10.0 s at 9.5 s - * - of 1.5 s will call RTC set for wall clock time 10.0 s at 8.5 s - * - of -0.5 s will call RTC set for wall clock time 10.0 s at 10.5 s + /* + * This offset specifies the update timing of the RTC. + * + * tsched t1 write(t2.tv_sec - 1sec)) t2 RTC increments seconds + * + * The offset defines how tsched is computed so that the write to + * the RTC (t2.tv_sec - 1sec) is correct versus the time required + * for the transport of the write and the time which the RTC needs + * to increment seconds the first time after the write (t2). + * + * For direct accessible RTCs tsched ~= t1 because the write time + * is negligible. For RTCs behind slow busses the transport time is + * significant and has to be taken into account. + * + * The time between the write (t1) and the first increment after + * the write (t2) is RTC specific. For a MC146818 RTC it's 500ms, + * for many others it's exactly 1 second. Consult the datasheet. + * + * The value of this offset is also used to calculate the to be + * written value (t2.tv_sec - 1sec) at tsched. + * + * The default value for this is NSEC_PER_SEC + 10 msec default + * transport time. 
The offset can be adjusted by drivers so the + * calculation for the to be written value at tsched becomes + * correct: + * + * newval = tsched + set_offset_nsec - NSEC_PER_SEC + * and (tsched + set_offset_nsec) % NSEC_PER_SEC == 0 */ - long set_offset_nsec; + unsigned long set_offset_nsec; bool registered; diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 84a554622cee..a34ac069335f 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -520,22 +520,33 @@ static void sched_sync_hw_clock(unsigned long offset_nsec, bool retry) } /* - * Determine if we can call to driver to set the time. Drivers can only be - * called to set a second aligned time value, and the field set_offset_nsec - * specifies how far away from the second aligned time to call the driver. + * Check whether @now is correct versus the required time to update the RTC + * and calculate the value which needs to be written to the RTC so that the + * next seconds increment of the RTC after the write is aligned with the next + * seconds increment of clock REALTIME. * - * This also computes 'to_set' which is the time we are trying to set, and has - * a zero in tv_nsecs, such that: - * to_set - set_delay_nsec == now +/- FUZZ + * tsched t1 write(t2.tv_sec - 1sec)) t2 RTC increments seconds * + * t2.tv_nsec == 0 + * tsched = t2 - set_offset_nsec + * newval = t2 - NSEC_PER_SEC + * + * ==> neval = tsched + set_offset_nsec - NSEC_PER_SEC + * + * As the execution of this code is not guaranteed to happen exactly at + * tsched this allows it to happen within a fuzzy region: + * + * abs(now - tsched) < FUZZ + * + * If @now is not inside the allowed window the function returns false. */ -static inline bool rtc_tv_nsec_ok(long set_offset_nsec, +static inline bool rtc_tv_nsec_ok(unsigned long set_offset_nsec, struct timespec64 *to_set, const struct timespec64 *now) { /* Allowed error in tv_nsec, arbitarily set to 5 jiffies in ns. */ const unsigned long TIME_SET_NSEC_FUZZ = TICK_NSEC * 5; - struct timespec64 delay = {.tv_sec = 0, + struct timespec64 delay = {.tv_sec = -1, .tv_nsec = set_offset_nsec}; *to_set = timespec64_add(*now, delay); @@ -557,11 +568,11 @@ static inline bool rtc_tv_nsec_ok(long set_offset_nsec, /* * rtc_set_ntp_time - Save NTP synchronized time to the RTC */ -static int rtc_set_ntp_time(struct timespec64 now, unsigned long *target_nsec) +static int rtc_set_ntp_time(struct timespec64 now, unsigned long *offset_nsec) { + struct timespec64 to_set; struct rtc_device *rtc; struct rtc_time tm; - struct timespec64 to_set; int err = -ENODEV; bool ok; @@ -572,19 +583,9 @@ static int rtc_set_ntp_time(struct timespec64 now, unsigned long *target_nsec) if (!rtc->ops || !rtc->ops->set_time) goto out_close; - /* - * Compute the value of tv_nsec we require the caller to supply in - * now.tv_nsec. This is the value such that (now + - * set_offset_nsec).tv_nsec == 0. - */ - set_normalized_timespec64(&to_set, 0, -rtc->set_offset_nsec); - *target_nsec = to_set.tv_nsec; + /* Store the update offset for this RTC */ + *offset_nsec = rtc->set_offset_nsec; - /* - * The ntp code must call this with the correct value in tv_nsec, if - * it does not we update target_nsec and return EPROTO to make the ntp - * code try again later. - */ ok = rtc_tv_nsec_ok(rtc->set_offset_nsec, &to_set, &now); if (!ok) { err = -EPROTO; @@ -657,7 +658,7 @@ static bool sync_cmos_clock(void) * implement this legacy API. 
*/ ktime_get_real_ts64(&now); - if (rtc_tv_nsec_ok(-1 * target_nsec, &adjust, &now)) { + if (rtc_tv_nsec_ok(target_nsec, &adjust, &now)) { if (persistent_clock_is_local) adjust.tv_sec -= (sys_tz.tz_minuteswest * 60); rc = update_persistent_clock64(adjust); -- cgit v1.2.3 From 76e87d96b30b5fee91b381fbc444a3eabcd9469a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 6 Dec 2020 22:46:21 +0100 Subject: ntp: Consolidate the RTC update implementation The code for the legacy RTC and the RTC class based update are pretty much the same. Consolidate the common parts into one function and just invoke the actual setter functions. For RTC class based devices the update code checks whether the offset is valid for the device, which is usually not the case for the first invocation. If it's not the same it stores the correct offset and lets the caller try again. That's not much different from the previous approach where the first invocation had a pretty low probability to actually hit the allowed window. Signed-off-by: Thomas Gleixner Reviewed-by: Jason Gunthorpe Acked-by: Alexandre Belloni Link: https://lore.kernel.org/r/20201206220542.355743355@linutronix.de --- kernel/time/ntp.c | 144 ++++++++++++++++++++---------------------------------- 1 file changed, 52 insertions(+), 92 deletions(-) (limited to 'kernel') diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index a34ac069335f..7404d3831527 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -564,118 +564,53 @@ static inline bool rtc_tv_nsec_ok(unsigned long set_offset_nsec, return false; } +#ifdef CONFIG_GENERIC_CMOS_UPDATE +int __weak update_persistent_clock64(struct timespec64 now64) +{ + return -ENODEV; +} +#else +static inline int update_persistent_clock64(struct timespec64 now64) +{ + return -ENODEV; +} +#endif + #ifdef CONFIG_RTC_SYSTOHC -/* - * rtc_set_ntp_time - Save NTP synchronized time to the RTC - */ -static int rtc_set_ntp_time(struct timespec64 now, unsigned long *offset_nsec) +/* Save NTP synchronized time to the RTC */ +static int update_rtc(struct timespec64 *to_set, unsigned long *offset_nsec) { - struct timespec64 to_set; struct rtc_device *rtc; struct rtc_time tm; int err = -ENODEV; - bool ok; rtc = rtc_class_open(CONFIG_RTC_SYSTOHC_DEVICE); if (!rtc) - goto out_err; + return -ENODEV; if (!rtc->ops || !rtc->ops->set_time) goto out_close; - /* Store the update offset for this RTC */ - *offset_nsec = rtc->set_offset_nsec; - - ok = rtc_tv_nsec_ok(rtc->set_offset_nsec, &to_set, &now); - if (!ok) { - err = -EPROTO; - goto out_close; + /* First call might not have the correct offset */ + if (*offset_nsec == rtc->set_offset_nsec) { + rtc_time64_to_tm(to_set->tv_sec, &tm); + err = rtc_set_time(rtc, &tm); + } else { + /* Store the update offset and let the caller try again */ + *offset_nsec = rtc->set_offset_nsec; + err = -EAGAIN; } - - rtc_time64_to_tm(to_set.tv_sec, &tm); - - err = rtc_set_time(rtc, &tm); - out_close: rtc_class_close(rtc); -out_err: return err; } - -static void sync_rtc_clock(void) -{ - unsigned long offset_nsec; - struct timespec64 adjust; - int rc; - - ktime_get_real_ts64(&adjust); - - if (persistent_clock_is_local) - adjust.tv_sec -= (sys_tz.tz_minuteswest * 60); - - /* - * The current RTC in use will provide the nanoseconds offset prior - * to a full second it wants to be called at, and invokes - * rtc_tv_nsec_ok() internally. 
- */ - rc = rtc_set_ntp_time(adjust, &offset_nsec); - if (rc == -ENODEV) - return; - - sched_sync_hw_clock(offset_nsec, rc != 0); -} #else -static inline void sync_rtc_clock(void) { } -#endif - -#ifdef CONFIG_GENERIC_CMOS_UPDATE -int __weak update_persistent_clock64(struct timespec64 now64) +static inline int update_rtc(struct timespec64 *to_set, unsigned long *offset_nsec) { return -ENODEV; } #endif -static bool sync_cmos_clock(void) -{ - static bool no_cmos; - struct timespec64 now; - struct timespec64 adjust; - int rc = -EPROTO; - long target_nsec = NSEC_PER_SEC / 2; - - if (!IS_ENABLED(CONFIG_GENERIC_CMOS_UPDATE)) - return false; - - if (no_cmos) - return false; - - /* - * Historically update_persistent_clock64() has followed x86 - * semantics, which match the MC146818A/etc RTC. This RTC will store - * 'adjust' and then in .5s it will advance once second. - * - * Architectures are strongly encouraged to use rtclib and not - * implement this legacy API. - */ - ktime_get_real_ts64(&now); - if (rtc_tv_nsec_ok(target_nsec, &adjust, &now)) { - if (persistent_clock_is_local) - adjust.tv_sec -= (sys_tz.tz_minuteswest * 60); - rc = update_persistent_clock64(adjust); - /* - * The machine does not support update_persistent_clock64 even - * though it defines CONFIG_GENERIC_CMOS_UPDATE. - */ - if (rc == -ENODEV) { - no_cmos = true; - return false; - } - } - - sched_sync_hw_clock(target_nsec, rc != 0); - return true; -} - /* * If we have an externally synchronized Linux clock, then update RTC clock * accordingly every ~11 minutes. Generally RTCs can only store second @@ -686,6 +621,15 @@ static bool sync_cmos_clock(void) */ static void sync_hw_clock(struct work_struct *work) { + /* + * The default synchronization offset is 500ms for the deprecated + * update_persistent_clock64() under the assumption that it uses + * the infamous CMOS clock (MC146818). + */ + static unsigned long offset_nsec = NSEC_PER_SEC / 2; + struct timespec64 now, to_set; + int res = -EAGAIN; + /* * Don't update if STA_UNSYNC is set and if ntp_notify_cmos_timer() * managed to schedule the work between the timer firing and the @@ -694,10 +638,26 @@ static void sync_hw_clock(struct work_struct *work) if (!ntp_synced() || hrtimer_is_queued(&sync_hrtimer)) return; - if (sync_cmos_clock()) - return; + ktime_get_real_ts64(&now); + /* If @now is not in the allowed window, try again */ + if (!rtc_tv_nsec_ok(offset_nsec, &to_set, &now)) + goto rearm; - sync_rtc_clock(); + /* Take timezone adjusted RTCs into account */ + if (persistent_clock_is_local) + to_set.tv_sec -= (sys_tz.tz_minuteswest * 60); + + /* Try the legacy RTC first. */ + res = update_persistent_clock64(to_set); + if (res != -ENODEV) + goto rearm; + + /* Try the RTC class */ + res = update_rtc(&to_set, &offset_nsec); + if (res == -ENODEV) + return; +rearm: + sched_sync_hw_clock(offset_nsec, res != 0); } void ntp_notify_cmos_timer(void) -- cgit v1.2.3 From b388fa50142510fb6477f130bb1b3f05a0a263a1 Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Mon, 9 Nov 2020 09:41:21 +0000 Subject: Revert "genirq: Add fasteoi IPI flow" handle_percpu_devid_fasteoi_ipi() has no more users, and handle_percpu_devid_irq() can do all that it was supposed to do. Get rid of it. This reverts commit c5e5ec033c4ab25c53f1fd217849e75deb0bf7bf. 
Signed-off-by: Valentin Schneider Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20201109094121.29975-6-valentin.schneider@arm.com --- include/linux/irq.h | 1 - kernel/irq/chip.c | 27 --------------------------- 2 files changed, 28 deletions(-) (limited to 'kernel') diff --git a/include/linux/irq.h b/include/linux/irq.h index c54365309e97..ca26bec51cec 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -647,7 +647,6 @@ static inline int irq_set_parent(int irq, int parent_irq) */ extern void handle_level_irq(struct irq_desc *desc); extern void handle_fasteoi_irq(struct irq_desc *desc); -extern void handle_percpu_devid_fasteoi_ipi(struct irq_desc *desc); extern void handle_edge_irq(struct irq_desc *desc); extern void handle_edge_eoi_irq(struct irq_desc *desc); extern void handle_simple_irq(struct irq_desc *desc); diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index b9b9618e1aca..0ae308efa604 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -944,33 +944,6 @@ void handle_percpu_devid_irq(struct irq_desc *desc) chip->irq_eoi(&desc->irq_data); } -/** - * handle_percpu_devid_fasteoi_ipi - Per CPU local IPI handler with per cpu - * dev ids - * @desc: the interrupt description structure for this irq - * - * The biggest difference with the IRQ version is that the interrupt is - * EOIed early, as the IPI could result in a context switch, and we need to - * make sure the IPI can fire again. We also assume that the arch code has - * registered an action. If not, we are positively doomed. - */ -void handle_percpu_devid_fasteoi_ipi(struct irq_desc *desc) -{ - struct irq_chip *chip = irq_desc_get_chip(desc); - struct irqaction *action = desc->action; - unsigned int irq = irq_desc_get_irq(desc); - irqreturn_t res; - - __kstat_incr_irqs_this_cpu(desc); - - if (chip->irq_eoi) - chip->irq_eoi(&desc->irq_data); - - trace_irq_handler_entry(irq, action); - res = action->handler(irq, raw_cpu_ptr(action->percpu_dev_id)); - trace_irq_handler_exit(irq, action, res); -} - /** * handle_percpu_devid_fasteoi_nmi - Per CPU local NMI handler with per cpu * dev ids -- cgit v1.2.3 From 1d3aec89286254487df7641c30f1b14ad1d127a5 Mon Sep 17 00:00:00 2001 From: John Garry Date: Wed, 2 Dec 2020 18:36:53 +0800 Subject: genirq/affinity: Add irq_update_affinity_desc() Add a function to allow the affinity of an interrupt be switched to managed, such that interrupts allocated for platform devices may be managed. This new interface has certain limitations, and attempts to use it in the following circumstances will fail: - For when the kernel is configured for generic IRQ reservation mode (in config GENERIC_IRQ_RESERVATION_MODE). The reason being that it could conflict with managed vs. non-managed interrupt accounting. 
- The interrupt is already started, which should not be the case during init - The interrupt is already configured as managed, which means double init Suggested-by: Thomas Gleixner Signed-off-by: John Garry Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/1606905417-183214-2-git-send-email-john.garry@huawei.com --- include/linux/interrupt.h | 8 ++++++ kernel/irq/manage.c | 70 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 78 insertions(+) (limited to 'kernel') diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index ee8299eb1f52..870b3251e174 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -352,6 +352,8 @@ extern int irq_can_set_affinity(unsigned int irq); extern int irq_select_affinity(unsigned int irq); extern int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m); +extern int irq_update_affinity_desc(unsigned int irq, + struct irq_affinity_desc *affinity); extern int irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *notify); @@ -387,6 +389,12 @@ static inline int irq_set_affinity_hint(unsigned int irq, return -EINVAL; } +static inline int irq_update_affinity_desc(unsigned int irq, + struct irq_affinity_desc *affinity) +{ + return -EINVAL; +} + static inline int irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *notify) { diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index c460e0496006..c826ba4141fe 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -371,6 +371,76 @@ int irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask, return ret; } +/** + * irq_update_affinity_desc - Update affinity management for an interrupt + * @irq: The interrupt number to update + * @affinity: Pointer to the affinity descriptor + * + * This interface can be used to configure the affinity management of + * interrupts which have been allocated already. + * + * There are certain limitations on when it may be used - attempts to use it + * for when the kernel is configured for generic IRQ reservation mode (in + * config GENERIC_IRQ_RESERVATION_MODE) will fail, as it may conflict with + * managed/non-managed interrupt accounting. In addition, attempts to use it on + * an interrupt which is already started or which has already been configured + * as managed will also fail, as these mean invalid init state or double init. + */ +int irq_update_affinity_desc(unsigned int irq, + struct irq_affinity_desc *affinity) +{ + struct irq_desc *desc; + unsigned long flags; + bool activated; + int ret = 0; + + /* + * Supporting this with the reservation scheme used by x86 needs + * some more thought. Fail it for now. + */ + if (IS_ENABLED(CONFIG_GENERIC_IRQ_RESERVATION_MODE)) + return -EOPNOTSUPP; + + desc = irq_get_desc_buslock(irq, &flags, 0); + if (!desc) + return -EINVAL; + + /* Requires the interrupt to be shut down */ + if (irqd_is_started(&desc->irq_data)) { + ret = -EBUSY; + goto out_unlock; + } + + /* Interrupts which are already managed cannot be modified */ + if (irqd_affinity_is_managed(&desc->irq_data)) { + ret = -EBUSY; + goto out_unlock; + } + + /* + * Deactivate the interrupt. That's required to undo + * anything an earlier activation has established. 
+ */ + activated = irqd_is_activated(&desc->irq_data); + if (activated) + irq_domain_deactivate_irq(&desc->irq_data); + + if (affinity->is_managed) { + irqd_set(&desc->irq_data, IRQD_AFFINITY_MANAGED); + irqd_set(&desc->irq_data, IRQD_MANAGED_SHUTDOWN); + } + + cpumask_copy(desc->irq_common_data.affinity, &affinity->mask); + + /* Restore the activation state */ + if (activated) + irq_domain_activate_irq(&desc->irq_data, false); + +out_unlock: + irq_put_desc_busunlock(desc, flags); + return ret; +} + int __irq_set_affinity(unsigned int irq, const struct cpumask *mask, bool force) { struct irq_desc *desc = irq_to_desc(irq); -- cgit v1.2.3 From 90ac908a418b836427d6eaf84fbc5062881747fd Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 12 Nov 2020 20:26:42 +0100 Subject: cpufreq: schedutil: Simplify sugov_update_next_freq() Rearrange a conditional to make it more straightforward. Signed-off-by: Rafael J. Wysocki Acked-by: Viresh Kumar --- kernel/sched/cpufreq_schedutil.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 97d318b0cd0c..77736058d8e4 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -102,12 +102,10 @@ static bool sugov_should_update_freq(struct sugov_policy *sg_policy, u64 time) static bool sugov_update_next_freq(struct sugov_policy *sg_policy, u64 time, unsigned int next_freq) { - if (!sg_policy->need_freq_update) { - if (sg_policy->next_freq == next_freq) - return false; - } else { + if (sg_policy->need_freq_update) sg_policy->need_freq_update = cpufreq_driver_test_flags(CPUFREQ_NEED_UPDATE_LIMITS); - } + else if (sg_policy->next_freq == next_freq) + return false; sg_policy->next_freq = next_freq; sg_policy->last_freq_update_time = time; -- cgit v1.2.3 From e998879d4fb7991856916972168cf27c0d86ed12 Mon Sep 17 00:00:00 2001 From: Ashish Kalra Date: Thu, 10 Dec 2020 01:25:15 +0000 Subject: x86,swiotlb: Adjust SWIOTLB bounce buffer size for SEV guests For SEV, all DMA to and from guest has to use shared (un-encrypted) pages. SEV uses SWIOTLB to make this happen without requiring changes to device drivers. However, depending on the workload being run, the default 64MB of it might not be enough and it may run out of buffers to use for DMA, resulting in I/O errors and/or performance degradation for high I/O workloads. Adjust the default size of SWIOTLB for SEV guests using a percentage of the total memory available to guest for the SWIOTLB buffers. Adds a new sev_setup_arch() function which is invoked from setup_arch() and it calls into a new swiotlb generic code function swiotlb_adjust_size() to do the SWIOTLB buffer adjustment. 
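As a rough sanity check of the sizing heuristic described above (numbers are illustrative; the actual computation is in the diff below), the 6% rule with its clamp works out to:

#include <linux/kernel.h>

/* Illustrative only: mirrors size = clamp(total_mem * 6 / 100, 64MB, 1GB) */
static unsigned long demo_sev_swiotlb_bytes(unsigned long total_mem)
{
	return clamp_val(total_mem * 6 / 100, 64UL << 20, 1UL << 30);
}

/*
 *  1 GB guest:  ~61 MB  -> clamped up to the 64 MB default
 *  4 GB guest: ~246 MB
 * 32 GB guest: ~1.9 GB  -> clamped down to 1 GB
 */
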
v5 fixed build errors and warnings as Reported-by: kbuild test robot Signed-off-by: Ashish Kalra Co-developed-by: Borislav Petkov Signed-off-by: Borislav Petkov Signed-off-by: Konrad Rzeszutek Wilk --- arch/x86/include/asm/mem_encrypt.h | 2 ++ arch/x86/kernel/setup.c | 6 ++++++ arch/x86/mm/mem_encrypt.c | 31 +++++++++++++++++++++++++++++++ include/linux/swiotlb.h | 8 ++++++++ kernel/dma/swiotlb.c | 20 ++++++++++++++++++-- 5 files changed, 65 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/arch/x86/include/asm/mem_encrypt.h b/arch/x86/include/asm/mem_encrypt.h index 2f62bbdd9d12..31c4df123aa0 100644 --- a/arch/x86/include/asm/mem_encrypt.h +++ b/arch/x86/include/asm/mem_encrypt.h @@ -37,6 +37,7 @@ void __init sme_map_bootdata(char *real_mode_data); void __init sme_unmap_bootdata(char *real_mode_data); void __init sme_early_init(void); +void __init sev_setup_arch(void); void __init sme_encrypt_kernel(struct boot_params *bp); void __init sme_enable(struct boot_params *bp); @@ -69,6 +70,7 @@ static inline void __init sme_map_bootdata(char *real_mode_data) { } static inline void __init sme_unmap_bootdata(char *real_mode_data) { } static inline void __init sme_early_init(void) { } +static inline void __init sev_setup_arch(void) { } static inline void __init sme_encrypt_kernel(struct boot_params *bp) { } static inline void __init sme_enable(struct boot_params *bp) { } diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 84f581c91db4..874b2c17af41 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -1054,6 +1054,12 @@ void __init setup_arch(char **cmdline_p) memblock_set_current_limit(ISA_END_ADDRESS); e820__memblock_setup(); + /* + * Needs to run after memblock setup because it needs the physical + * memory size. + */ + sev_setup_arch(); + reserve_bios_regions(); efi_fake_memmap(); diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c index bc0833713be9..c79e5736ab2b 100644 --- a/arch/x86/mm/mem_encrypt.c +++ b/arch/x86/mm/mem_encrypt.c @@ -198,6 +198,37 @@ void __init sme_early_init(void) swiotlb_force = SWIOTLB_FORCE; } +void __init sev_setup_arch(void) +{ + phys_addr_t total_mem = memblock_phys_mem_size(); + unsigned long size; + + if (!sev_active()) + return; + + /* + * For SEV, all DMA has to occur via shared/unencrypted pages. + * SEV uses SWIOTLB to make this happen without changing device + * drivers. However, depending on the workload being run, the + * default 64MB of SWIOTLB may not be enough and SWIOTLB may + * run out of buffers for DMA, resulting in I/O errors and/or + * performance degradation especially with high I/O workloads. + * + * Adjust the default size of SWIOTLB for SEV guests using + * a percentage of guest memory for SWIOTLB buffers. + * Also, as the SWIOTLB bounce buffer memory is allocated + * from low memory, ensure that the adjusted size is within + * the limits of low available memory. 
+ * + * The percentage of guest memory used here for SWIOTLB buffers + * is more of an approximation of the static adjustment which + * 64MB for <1G, and ~128M to 256M for 1G-to-4G, i.e., the 6% + */ + size = total_mem * 6 / 100; + size = clamp_val(size, IO_TLB_DEFAULT_SIZE, SZ_1G); + swiotlb_adjust_size(size); +} + static void __init __set_clr_pte_enc(pte_t *kpte, int level, bool enc) { pgprot_t old_prot, new_prot; diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h index fbdc65782195..d9c9fc9ca5d2 100644 --- a/include/linux/swiotlb.h +++ b/include/linux/swiotlb.h @@ -30,6 +30,9 @@ enum swiotlb_force { */ #define IO_TLB_SHIFT 11 +/* default to 64MB */ +#define IO_TLB_DEFAULT_SIZE (64UL<<20) + extern void swiotlb_init(int verbose); int swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose); extern unsigned long swiotlb_nr_tbl(void); @@ -78,6 +81,7 @@ void __init swiotlb_exit(void); unsigned int swiotlb_max_segment(void); size_t swiotlb_max_mapping_size(struct device *dev); bool is_swiotlb_active(void); +void __init swiotlb_adjust_size(unsigned long new_size); #else #define swiotlb_force SWIOTLB_NO_FORCE static inline bool is_swiotlb_buffer(phys_addr_t paddr) @@ -100,6 +104,10 @@ static inline bool is_swiotlb_active(void) { return false; } + +static inline void swiotlb_adjust_size(unsigned long new_size) +{ +} #endif /* CONFIG_SWIOTLB */ extern void swiotlb_print_info(void); diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index 781b9dca197c..7c42df6e6100 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -152,8 +152,6 @@ void swiotlb_set_max_segment(unsigned int val) max_segment = rounddown(val, PAGE_SIZE); } -/* default to 64MB */ -#define IO_TLB_DEFAULT_SIZE (64UL<<20) unsigned long swiotlb_size_or_default(void) { unsigned long size; @@ -163,6 +161,24 @@ unsigned long swiotlb_size_or_default(void) return size ? size : (IO_TLB_DEFAULT_SIZE); } +void __init swiotlb_adjust_size(unsigned long new_size) +{ + unsigned long size; + + /* + * If swiotlb parameter has not been specified, give a chance to + * architectures such as those supporting memory encryption to + * adjust/expand SWIOTLB size for their use. + */ + if (!io_tlb_nslabs) { + size = ALIGN(new_size, 1 << IO_TLB_SHIFT); + io_tlb_nslabs = size >> IO_TLB_SHIFT; + io_tlb_nslabs = ALIGN(io_tlb_nslabs, IO_TLB_SEGSIZE); + + pr_info("SWIOTLB bounce buffer size adjusted to %luMB", size >> 20); + } +} + void swiotlb_print_info(void) { unsigned long bytes = io_tlb_nslabs << IO_TLB_SHIFT; -- cgit v1.2.3 From 6e7b64b9dd6d96537d816ea07ec26b7dedd397b9 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 11 Dec 2020 13:36:46 -0800 Subject: elfcore: fix building with clang kernel/elfcore.c only contains weak symbols, which triggers a bug with clang in combination with recordmcount: Cannot find symbol for section 2: .text. kernel/elfcore.o: failed Move the empty stubs into linux/elfcore.h as inline functions. As only two architectures use these, just use the architecture specific Kconfig symbols to key off the declaration. 
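In general terms (using a hypothetical helper name, not the functions touched by this patch), the shape of the change is: drop the translation unit that contains nothing but __weak out-of-line stubs and provide static inline stubs in the header instead, selected by the Kconfig symbols of the two architectures that override them.

/* Hypothetical helper used only to illustrate the pattern. */

/* Before: kernel/foo_stubs.c contains only weak definitions, which the
 * clang + recordmcount combination cannot cope with. */
int __weak foo_core_extra(void)
{
	return 0;
}

/* After: header-only, no extra object file needed. */
#if defined(CONFIG_UM) || defined(CONFIG_IA64)
extern int foo_core_extra(void);	/* real version provided by arch code */
#else
static inline int foo_core_extra(void)
{
	return 0;
}
#endif
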
Link: https://lkml.kernel.org/r/20201204165742.3815221-2-arnd@kernel.org Signed-off-by: Arnd Bergmann Cc: Nathan Chancellor Cc: Nick Desaulniers Cc: Barret Rhoden Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/elfcore.h | 22 ++++++++++++++++++++++ kernel/Makefile | 1 - kernel/elfcore.c | 26 -------------------------- 3 files changed, 22 insertions(+), 27 deletions(-) delete mode 100644 kernel/elfcore.c (limited to 'kernel') diff --git a/include/linux/elfcore.h b/include/linux/elfcore.h index 46c3d691f677..de51c1bef27d 100644 --- a/include/linux/elfcore.h +++ b/include/linux/elfcore.h @@ -104,6 +104,7 @@ static inline int elf_core_copy_task_fpregs(struct task_struct *t, struct pt_reg #endif } +#if defined(CONFIG_UM) || defined(CONFIG_IA64) /* * These functions parameterize elf_core_dump in fs/binfmt_elf.c to write out * extra segments containing the gate DSO contents. Dumping its @@ -118,5 +119,26 @@ elf_core_write_extra_phdrs(struct coredump_params *cprm, loff_t offset); extern int elf_core_write_extra_data(struct coredump_params *cprm); extern size_t elf_core_extra_data_size(void); +#else +static inline Elf_Half elf_core_extra_phdrs(void) +{ + return 0; +} + +static inline int elf_core_write_extra_phdrs(struct coredump_params *cprm, loff_t offset) +{ + return 1; +} + +static inline int elf_core_write_extra_data(struct coredump_params *cprm) +{ + return 1; +} + +static inline size_t elf_core_extra_data_size(void) +{ + return 0; +} +#endif #endif /* _LINUX_ELFCORE_H */ diff --git a/kernel/Makefile b/kernel/Makefile index af601b9bda0e..6c9f19911be0 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -97,7 +97,6 @@ obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o obj-$(CONFIG_TRACEPOINTS) += tracepoint.o obj-$(CONFIG_LATENCYTOP) += latencytop.o -obj-$(CONFIG_ELFCORE) += elfcore.o obj-$(CONFIG_FUNCTION_TRACER) += trace/ obj-$(CONFIG_TRACING) += trace/ obj-$(CONFIG_TRACE_CLOCK) += trace/ diff --git a/kernel/elfcore.c b/kernel/elfcore.c deleted file mode 100644 index 57fb4dcff434..000000000000 --- a/kernel/elfcore.c +++ /dev/null @@ -1,26 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include -#include -#include -#include -#include - -Elf_Half __weak elf_core_extra_phdrs(void) -{ - return 0; -} - -int __weak elf_core_write_extra_phdrs(struct coredump_params *cprm, loff_t offset) -{ - return 1; -} - -int __weak elf_core_write_extra_data(struct coredump_params *cprm) -{ - return 1; -} - -size_t __weak elf_core_extra_data_size(void) -{ - return 0; -} -- cgit v1.2.3 From b7906b70a2337e445b8dca3ce7ba8976b6ebd07d Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Fri, 11 Dec 2020 22:36:25 +0100 Subject: bpf: Fix enum names for bpf_this_cpu_ptr() and bpf_per_cpu_ptr() helpers Remove bpf_ prefix, which causes these helpers to be reported in verifier dump as bpf_bpf_this_cpu_ptr() and bpf_bpf_per_cpu_ptr(), respectively. Lets fix it as long as it is still possible before UAPI freezes on these helpers. 
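The stray prefix falls out of how the helper list macro is expanded when helper-name strings are generated; roughly (a simplified sketch, the exact kernel macros differ in detail):

/* Simplified sketch, not the exact kernel macros. */
#define DEMO_FUNC_STR(x)  [BPF_FUNC_ ## x] = "bpf_" #x

/*
 * Old list entry:  DEMO_FUNC_STR(bpf_per_cpu_ptr)
 *   -> [BPF_FUNC_bpf_per_cpu_ptr] = "bpf_bpf_per_cpu_ptr"   (double prefix)
 * Fixed entry:     DEMO_FUNC_STR(per_cpu_ptr)
 *   -> [BPF_FUNC_per_cpu_ptr]     = "bpf_per_cpu_ptr"
 */
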
Fixes: eaa6bcb71ef6 ("bpf: Introduce bpf_per_cpu_ptr()") Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Signed-off-by: Linus Torvalds --- include/uapi/linux/bpf.h | 4 ++-- kernel/bpf/helpers.c | 4 ++-- kernel/trace/bpf_trace.c | 4 ++-- tools/include/uapi/linux/bpf.h | 4 ++-- 4 files changed, 8 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index e6ceac3f7d62..556216dc9703 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3897,8 +3897,8 @@ union bpf_attr { FN(seq_printf_btf), \ FN(skb_cgroup_classid), \ FN(redirect_neigh), \ - FN(bpf_per_cpu_ptr), \ - FN(bpf_this_cpu_ptr), \ + FN(per_cpu_ptr), \ + FN(this_cpu_ptr), \ FN(redirect_peer), \ /* */ diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 25520f5eeaf6..deda1185237b 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -717,9 +717,9 @@ bpf_base_func_proto(enum bpf_func_id func_id) return &bpf_snprintf_btf_proto; case BPF_FUNC_jiffies64: return &bpf_jiffies64_proto; - case BPF_FUNC_bpf_per_cpu_ptr: + case BPF_FUNC_per_cpu_ptr: return &bpf_per_cpu_ptr_proto; - case BPF_FUNC_bpf_this_cpu_ptr: + case BPF_FUNC_this_cpu_ptr: return &bpf_this_cpu_ptr_proto; default: break; diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 048c655315f1..a125ea5e04cd 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1337,9 +1337,9 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return prog->aux->sleepable ? &bpf_copy_from_user_proto : NULL; case BPF_FUNC_snprintf_btf: return &bpf_snprintf_btf_proto; - case BPF_FUNC_bpf_per_cpu_ptr: + case BPF_FUNC_per_cpu_ptr: return &bpf_per_cpu_ptr_proto; - case BPF_FUNC_bpf_this_cpu_ptr: + case BPF_FUNC_this_cpu_ptr: return &bpf_this_cpu_ptr_proto; default: return NULL; diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index e6ceac3f7d62..556216dc9703 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -3897,8 +3897,8 @@ union bpf_attr { FN(seq_printf_btf), \ FN(skb_cgroup_classid), \ FN(redirect_neigh), \ - FN(bpf_per_cpu_ptr), \ - FN(bpf_this_cpu_ptr), \ + FN(per_cpu_ptr), \ + FN(this_cpu_ptr), \ FN(redirect_peer), \ /* */ -- cgit v1.2.3 From aa3b66f401b372598b29421bab4d17b631b92407 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 4 Dec 2020 11:55:19 +0100 Subject: tick/sched: Make jiffies update quick check more robust The quick check in tick_do_update_jiffies64() whether jiffies need to be updated is not really correct under all circumstances and on all architectures, especially not on 32bit systems. The quick check does: if (now < READ_ONCE(tick_next_period)) return; and the counterpart in the update is: WRITE_ONCE(tick_next_period, next_update_time); This has two problems: 1) On weakly ordered architectures there is no guarantee that the stores before the WRITE_ONCE() are visible which means that other CPUs can operate on a stale jiffies value. 2) On 32bit the store of tick_next_period which is an u64 is split into two 32bit stores. If the first 32bit store advances tick_next_period far out and the second 32bit store is delayed (virt, NMI ...) then jiffies will become stale until the second 32bit store happens. Address this by seperating the handling for 32bit and 64bit. On 64bit problem #1 is addressed by replacing READ_ONCE() / WRITE_ONCE() with smp_load_acquire() / smp_store_release(). 
On 32bit problem #2 is addressed by protecting the quick check with the jiffies sequence counter. The load and stores can be plain because the sequence count mechanics provides the required barriers already. Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Link: https://lore.kernel.org/r/87czzpc02w.fsf@nanos.tec.linutronix.de --- kernel/time/tick-sched.c | 74 ++++++++++++++++++++++++++++++------------------ 1 file changed, 47 insertions(+), 27 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index cc7cba20382e..a9e68936822d 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -57,36 +57,42 @@ static ktime_t last_jiffies_update; static void tick_do_update_jiffies64(ktime_t now) { unsigned long ticks = 1; - ktime_t delta; + ktime_t delta, nextp; /* - * Do a quick check without holding jiffies_lock. The READ_ONCE() + * 64bit can do a quick check without holding jiffies lock and + * without looking at the sequence count. The smp_load_acquire() * pairs with the update done later in this function. * - * This is also an intentional data race which is even safe on - * 32bit in theory. If there is a concurrent update then the check - * might give a random answer. It does not matter because if it - * returns then the concurrent update is already taking care, if it - * falls through then it will pointlessly contend on jiffies_lock. - * - * Though there is one nasty case on 32bit due to store tearing of - * the 64bit value. If the first 32bit store makes the quick check - * return on all other CPUs and the writing CPU context gets - * delayed to complete the second store (scheduled out on virt) - * then jiffies can become stale for up to ~2^32 nanoseconds - * without noticing. After that point all CPUs will wait for - * jiffies lock. - * - * OTOH, this is not any different than the situation with NOHZ=off - * where one CPU is responsible for updating jiffies and - * timekeeping. If that CPU goes out for lunch then all other CPUs - * will operate on stale jiffies until it decides to come back. + * 32bit cannot do that because the store of tick_next_period + * consists of two 32bit stores and the first store could move it + * to a random point in the future. */ - if (ktime_before(now, READ_ONCE(tick_next_period))) - return; + if (IS_ENABLED(CONFIG_64BIT)) { + if (ktime_before(now, smp_load_acquire(&tick_next_period))) + return; + } else { + unsigned int seq; - /* Reevaluate with jiffies_lock held */ + /* + * Avoid contention on jiffies_lock and protect the quick + * check with the sequence count. + */ + do { + seq = read_seqcount_begin(&jiffies_seq); + nextp = tick_next_period; + } while (read_seqcount_retry(&jiffies_seq, seq)); + + if (ktime_before(now, nextp)) + return; + } + + /* Quick check failed, i.e. update is required. */ raw_spin_lock(&jiffies_lock); + /* + * Reevaluate with the lock held. Another CPU might have done the + * update already. + */ if (ktime_before(now, tick_next_period)) { raw_spin_unlock(&jiffies_lock); return; @@ -112,11 +118,25 @@ static void tick_do_update_jiffies64(ktime_t now) jiffies_64 += ticks; /* - * Keep the tick_next_period variable up to date. WRITE_ONCE() - * pairs with the READ_ONCE() in the lockless quick check above. + * Keep the tick_next_period variable up to date. 
*/ - WRITE_ONCE(tick_next_period, - ktime_add_ns(last_jiffies_update, TICK_NSEC)); + nextp = ktime_add_ns(last_jiffies_update, TICK_NSEC); + + if (IS_ENABLED(CONFIG_64BIT)) { + /* + * Pairs with smp_load_acquire() in the lockless quick + * check above and ensures that the update to jiffies_64 is + * not reordered vs. the store to tick_next_period, neither + * by the compiler nor by the CPU. + */ + smp_store_release(&tick_next_period, nextp); + } else { + /* + * A plain store is good enough on 32bit as the quick check + * above is protected by the sequence count. + */ + tick_next_period = nextp; + } /* * Release the sequence count. calc_global_load() below is not -- cgit v1.2.3 From 03941ccfda161c2680147fa5ab92aead2a79cac1 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 9 Oct 2020 16:01:33 -0600 Subject: task_work: remove legacy TWA_SIGNAL path All archs now support TIF_NOTIFY_SIGNAL. Signed-off-by: Jens Axboe --- kernel/task_work.c | 30 +----------------------------- 1 file changed, 1 insertion(+), 29 deletions(-) (limited to 'kernel') diff --git a/kernel/task_work.c b/kernel/task_work.c index 15b087286bea..9cde961875c0 100644 --- a/kernel/task_work.c +++ b/kernel/task_work.c @@ -5,34 +5,6 @@ static struct callback_head work_exited; /* all we need is ->next == NULL */ -/* - * TWA_SIGNAL signaling - use TIF_NOTIFY_SIGNAL, if available, as it's faster - * than TIF_SIGPENDING as there's no dependency on ->sighand. The latter is - * shared for threads, and can cause contention on sighand->lock. Even for - * the non-threaded case TIF_NOTIFY_SIGNAL is more efficient, as no locking - * or IRQ disabling is involved for notification (or running) purposes. - */ -static void task_work_notify_signal(struct task_struct *task) -{ -#if defined(TIF_NOTIFY_SIGNAL) - set_notify_signal(task); -#else - unsigned long flags; - - /* - * Only grab the sighand lock if we don't already have some - * task_work pending. This pairs with the smp_store_mb() - * in get_signal(), see comment there. - */ - if (!(READ_ONCE(task->jobctl) & JOBCTL_TASK_WORK) && - lock_task_sighand(task, &flags)) { - task->jobctl |= JOBCTL_TASK_WORK; - signal_wake_up(task, 0); - unlock_task_sighand(task, &flags); - } -#endif -} - /** * task_work_add - ask the @task to execute @work->func() * @task: the task which should run the callback @@ -76,7 +48,7 @@ int task_work_add(struct task_struct *task, struct callback_head *work, set_notify_resume(task); break; case TWA_SIGNAL: - task_work_notify_signal(task); + set_notify_signal(task); break; default: WARN_ON_ONCE(1); -- cgit v1.2.3 From 98b89b649fce39dacb9dc036d6d0fdb8caff73f7 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 9 Oct 2020 16:03:01 -0600 Subject: signal: kill JOBCTL_TASK_WORK It's no longer used, get rid of it. 
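A hedged caller-side sketch of what remains after these two task_work/signal cleanups (the callback and wrapper are hypothetical): queueing work with TWA_SIGNAL now always goes through TIF_NOTIFY_SIGNAL via set_notify_signal(), with no sighand locking or JOBCTL bit involved.

#include <linux/sched.h>
#include <linux/task_work.h>

/* hypothetical callback: runs in the target task's context before user mode */
static void my_twork_fn(struct callback_head *cb)
{
}

static int queue_notify(struct task_struct *task, struct callback_head *cb)
{
	init_task_work(cb, my_twork_fn);
	/* returns -ESRCH if the task is already exiting */
	return task_work_add(task, cb, TWA_SIGNAL);
}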
Signed-off-by: Jens Axboe --- include/linux/sched/jobctl.h | 4 +--- kernel/signal.c | 20 -------------------- 2 files changed, 1 insertion(+), 23 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched/jobctl.h b/include/linux/sched/jobctl.h index d2b4204ba4d3..fa067de9f1a9 100644 --- a/include/linux/sched/jobctl.h +++ b/include/linux/sched/jobctl.h @@ -19,7 +19,6 @@ struct task_struct; #define JOBCTL_TRAPPING_BIT 21 /* switching to TRACED */ #define JOBCTL_LISTENING_BIT 22 /* ptracer is listening for events */ #define JOBCTL_TRAP_FREEZE_BIT 23 /* trap for cgroup freezer */ -#define JOBCTL_TASK_WORK_BIT 24 /* set by TWA_SIGNAL */ #define JOBCTL_STOP_DEQUEUED (1UL << JOBCTL_STOP_DEQUEUED_BIT) #define JOBCTL_STOP_PENDING (1UL << JOBCTL_STOP_PENDING_BIT) @@ -29,10 +28,9 @@ struct task_struct; #define JOBCTL_TRAPPING (1UL << JOBCTL_TRAPPING_BIT) #define JOBCTL_LISTENING (1UL << JOBCTL_LISTENING_BIT) #define JOBCTL_TRAP_FREEZE (1UL << JOBCTL_TRAP_FREEZE_BIT) -#define JOBCTL_TASK_WORK (1UL << JOBCTL_TASK_WORK_BIT) #define JOBCTL_TRAP_MASK (JOBCTL_TRAP_STOP | JOBCTL_TRAP_NOTIFY) -#define JOBCTL_PENDING_MASK (JOBCTL_STOP_PENDING | JOBCTL_TRAP_MASK | JOBCTL_TASK_WORK) +#define JOBCTL_PENDING_MASK (JOBCTL_STOP_PENDING | JOBCTL_TRAP_MASK) extern bool task_set_jobctl_pending(struct task_struct *task, unsigned long mask); extern void task_clear_jobctl_trapping(struct task_struct *task); diff --git a/kernel/signal.c b/kernel/signal.c index 923230ff6cfc..cf8b057ca2ac 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2556,26 +2556,6 @@ bool get_signal(struct ksignal *ksig) relock: spin_lock_irq(&sighand->siglock); - /* - * Make sure we can safely read ->jobctl() in task_work add. As Oleg - * states: - * - * It pairs with mb (implied by cmpxchg) before READ_ONCE. So we - * roughly have - * - * task_work_add: get_signal: - * STORE(task->task_works, new_work); STORE(task->jobctl); - * mb(); mb(); - * LOAD(task->jobctl); LOAD(task->task_works); - * - * and we can rely on STORE-MB-LOAD [ in task_work_add]. - */ - smp_store_mb(current->jobctl, current->jobctl & ~JOBCTL_TASK_WORK); - if (unlikely(current->task_works)) { - spin_unlock_irq(&sighand->siglock); - task_work_run(); - goto relock; - } /* * Every stopped thread goes here after wakeup. Check to see if -- cgit v1.2.3 From e296dc4996b8094ccde45d19090d804c4103513e Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 9 Oct 2020 16:04:39 -0600 Subject: kernel: remove checking for TIF_NOTIFY_SIGNAL It's available everywhere now, no need to check or add dummy defines. 
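A hedged sketch of why the now-unconditional TIF_NOTIFY_SIGNAL check in signal_pending() matters (the wait helper is hypothetical): ordinary interruptible wait loops break out on signal_pending(), and task_work queued with TWA_SIGNAL relies on exactly that to get the target task moving toward its return to user mode.

#include <linux/sched.h>
#include <linux/sched/signal.h>
#include <linux/wait.h>

/* hypothetical interruptible wait loop */
static int wait_for_event(wait_queue_head_t *wq, bool *done)
{
	DEFINE_WAIT(wait);
	int ret = 0;

	for (;;) {
		prepare_to_wait(wq, &wait, TASK_INTERRUPTIBLE);
		if (READ_ONCE(*done))
			break;
		/* also true when only TIF_NOTIFY_SIGNAL is set */
		if (signal_pending(current)) {
			ret = -ERESTARTSYS;
			break;
		}
		schedule();
	}
	finish_wait(wq, &wait);
	return ret;
}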
Signed-off-by: Jens Axboe --- include/linux/entry-common.h | 4 ---- include/linux/sched/signal.h | 2 -- include/linux/tracehook.h | 4 ---- kernel/signal.c | 2 -- 4 files changed, 12 deletions(-) (limited to 'kernel') diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h index b9711e813ec2..abec3a5ae799 100644 --- a/include/linux/entry-common.h +++ b/include/linux/entry-common.h @@ -37,10 +37,6 @@ # define _TIF_UPROBE (0) #endif -#ifndef _TIF_NOTIFY_SIGNAL -# define _TIF_NOTIFY_SIGNAL (0) -#endif - /* * TIF flags handled in syscall_enter_from_user_mode() */ diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h index bd5afa076189..24b7b862e043 100644 --- a/include/linux/sched/signal.h +++ b/include/linux/sched/signal.h @@ -360,7 +360,6 @@ static inline int task_sigpending(struct task_struct *p) static inline int signal_pending(struct task_struct *p) { -#if defined(TIF_NOTIFY_SIGNAL) /* * TIF_NOTIFY_SIGNAL isn't really a signal, but it requires the same * behavior in terms of ensuring that we break out of wait loops @@ -368,7 +367,6 @@ static inline int signal_pending(struct task_struct *p) */ if (unlikely(test_tsk_thread_flag(p, TIF_NOTIFY_SIGNAL))) return 1; -#endif return task_sigpending(p); } diff --git a/include/linux/tracehook.h b/include/linux/tracehook.h index f7d82e4fafd6..ee9ab7dbc8c3 100644 --- a/include/linux/tracehook.h +++ b/include/linux/tracehook.h @@ -205,12 +205,10 @@ static inline void tracehook_notify_resume(struct pt_regs *regs) */ static inline void tracehook_notify_signal(void) { -#if defined(TIF_NOTIFY_SIGNAL) clear_thread_flag(TIF_NOTIFY_SIGNAL); smp_mb__after_atomic(); if (current->task_works) task_work_run(); -#endif } /* @@ -218,11 +216,9 @@ static inline void tracehook_notify_signal(void) */ static inline void set_notify_signal(struct task_struct *task) { -#if defined(TIF_NOTIFY_SIGNAL) if (!test_and_set_tsk_thread_flag(task, TIF_NOTIFY_SIGNAL) && !wake_up_state(task, TASK_INTERRUPTIBLE)) kick_process(task); -#endif } #endif /* */ diff --git a/kernel/signal.c b/kernel/signal.c index cf8b057ca2ac..ccd530509201 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2535,14 +2535,12 @@ bool get_signal(struct ksignal *ksig) * that the arch handlers don't all have to do it. If we get here * without TIF_SIGPENDING, just exit after running signal work. */ -#ifdef TIF_NOTIFY_SIGNAL if (!IS_ENABLED(CONFIG_GENERIC_ENTRY)) { if (test_thread_flag(TIF_NOTIFY_SIGNAL)) tracehook_notify_signal(); if (!task_sigpending(current)) return false; } -#endif if (unlikely(uprobe_deny_signal())) return false; -- cgit v1.2.3 From 60efe21e5976d3d4170a8190ca76a271d6419754 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 8 Dec 2020 17:54:09 +0900 Subject: tracing: Disable ftrace selftests when any tracer is running Disable ftrace selftests when any tracer (kernel command line options like ftrace=, trace_events=, kprobe_events=, and boot-time tracing) starts running because selftest can disturb it. Currently ftrace= and trace_events= are checked, but kprobe_events has a different flag, and boot-time tracing didn't checked. This unifies the disabled flag and all of those boot-time tracing features sets the flag. 
This also fixes warnings on kprobe-event selftest (CONFIG_FTRACE_STARTUP_TEST=y and CONFIG_KPROBE_EVENTS=y) with boot-time tracing (ftrace.event.kprobes.EVENT.probes) like below; [ 59.803496] trace_kprobe: Testing kprobe tracing: [ 59.804258] ------------[ cut here ]------------ [ 59.805682] WARNING: CPU: 3 PID: 1 at kernel/trace/trace_kprobe.c:1987 kprobe_trace_self_tests_ib [ 59.806944] Modules linked in: [ 59.807335] CPU: 3 PID: 1 Comm: swapper/0 Not tainted 5.10.0-rc7+ #172 [ 59.808029] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.13.0-1ubuntu1 04/01/204 [ 59.808999] RIP: 0010:kprobe_trace_self_tests_init+0x5f/0x42b [ 59.809696] Code: e8 03 00 00 48 c7 c7 30 8e 07 82 e8 6d 3c 46 ff 48 c7 c6 00 b2 1a 81 48 c7 c7 7 [ 59.812439] RSP: 0018:ffffc90000013e78 EFLAGS: 00010282 [ 59.813038] RAX: 00000000ffffffef RBX: 0000000000000000 RCX: 0000000000049443 [ 59.813780] RDX: 0000000000049403 RSI: 0000000000049403 RDI: 000000000002deb0 [ 59.814589] RBP: ffffc90000013e90 R08: 0000000000000001 R09: 0000000000000001 [ 59.815349] R10: 0000000000000001 R11: 0000000000000000 R12: 00000000ffffffef [ 59.816138] R13: ffff888004613d80 R14: ffffffff82696940 R15: ffff888004429138 [ 59.816877] FS: 0000000000000000(0000) GS:ffff88807dcc0000(0000) knlGS:0000000000000000 [ 59.817772] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 59.818395] CR2: 0000000001a8dd38 CR3: 0000000002222000 CR4: 00000000000006a0 [ 59.819144] Call Trace: [ 59.819469] ? init_kprobe_trace+0x6b/0x6b [ 59.819948] do_one_initcall+0x5f/0x300 [ 59.820392] ? rcu_read_lock_sched_held+0x4f/0x80 [ 59.820916] kernel_init_freeable+0x22a/0x271 [ 59.821416] ? rest_init+0x241/0x241 [ 59.821841] kernel_init+0xe/0x10f [ 59.822251] ret_from_fork+0x22/0x30 [ 59.822683] irq event stamp: 16403349 [ 59.823121] hardirqs last enabled at (16403359): [] console_unlock+0x48e/0x580 [ 59.824074] hardirqs last disabled at (16403368): [] console_unlock+0x3f6/0x580 [ 59.825036] softirqs last enabled at (16403200): [] __do_softirq+0x33a/0x484 [ 59.825982] softirqs last disabled at (16403087): [] asm_call_irq_on_stack+0x10 [ 59.827034] ---[ end trace 200c544775cdfeb3 ]--- [ 59.827635] trace_kprobe: error on probing function entry. Link: https://lkml.kernel.org/r/160741764955.3448999.3347769358299456915.stgit@devnote2 Fixes: 4d655281eb1b ("tracing/boot Add kprobe event support") Cc: Ingo Molnar Cc: stable@vger.kernel.org Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 19 +++++++++++++------ kernel/trace/trace.h | 5 +++++ kernel/trace/trace_boot.c | 2 ++ kernel/trace/trace_events.c | 2 +- kernel/trace/trace_kprobe.c | 9 +++------ kernel/trace/trace_selftest.c | 2 +- 6 files changed, 25 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 6a282bbc7e7f..eee484afcc51 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -68,10 +68,21 @@ bool ring_buffer_expanded; static bool __read_mostly tracing_selftest_running; /* - * If a tracer is running, we do not want to run SELFTEST. + * If boot-time tracing including tracers/events via kernel cmdline + * is running, we do not want to run SELFTEST. 
*/ bool __read_mostly tracing_selftest_disabled; +#ifdef CONFIG_FTRACE_STARTUP_TEST +void __init disable_tracing_selftest(const char *reason) +{ + if (!tracing_selftest_disabled) { + tracing_selftest_disabled = true; + pr_info("Ftrace startup test is disabled due to %s\n", reason); + } +} +#endif + /* Pipe tracepoints to printk */ struct trace_iterator *tracepoint_print_iter; int tracepoint_printk; @@ -2112,11 +2123,7 @@ int __init register_tracer(struct tracer *type) apply_trace_boot_options(); /* disable other selftests, since this will break it. */ - tracing_selftest_disabled = true; -#ifdef CONFIG_FTRACE_STARTUP_TEST - printk(KERN_INFO "Disabling FTRACE selftests due to running tracer '%s'\n", - type->name); -#endif + disable_tracing_selftest("running a tracer"); out_unlock: return ret; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 9462251cab92..e448d2da0b99 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -719,6 +719,8 @@ extern bool ring_buffer_expanded; extern bool tracing_selftest_disabled; #ifdef CONFIG_FTRACE_STARTUP_TEST +extern void __init disable_tracing_selftest(const char *reason); + extern int trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr); extern int trace_selftest_startup_function_graph(struct tracer *trace, @@ -742,6 +744,9 @@ extern int trace_selftest_startup_branch(struct tracer *trace, */ #define __tracer_data __refdata #else +static inline void __init disable_tracing_selftest(const char *reason) +{ +} /* Tracers are seldom changed. Optimize when selftests are disabled. */ #define __tracer_data __read_mostly #endif /* CONFIG_FTRACE_STARTUP_TEST */ diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c index c22a152ef0b4..a82f03f385f8 100644 --- a/kernel/trace/trace_boot.c +++ b/kernel/trace/trace_boot.c @@ -344,6 +344,8 @@ static int __init trace_boot_init(void) trace_boot_init_one_instance(tr, trace_node); trace_boot_init_instances(trace_node); + disable_tracing_selftest("running boot-time tracing"); + return 0; } /* diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 98d194d8460e..7d207c5e9802 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -3201,7 +3201,7 @@ static __init int setup_trace_event(char *str) { strlcpy(bootup_event_buf, str, COMMAND_LINE_SIZE); ring_buffer_expanded = true; - tracing_selftest_disabled = true; + disable_tracing_selftest("running event tracing"); return 1; } diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index b911e9f6d9f5..b29f92c51b1a 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -25,11 +25,12 @@ /* Kprobe early definition from command line */ static char kprobe_boot_events_buf[COMMAND_LINE_SIZE] __initdata; -static bool kprobe_boot_events_enabled __initdata; static int __init set_kprobe_boot_events(char *str) { strlcpy(kprobe_boot_events_buf, str, COMMAND_LINE_SIZE); + disable_tracing_selftest("running kprobe events"); + return 0; } __setup("kprobe_event=", set_kprobe_boot_events); @@ -1887,8 +1888,6 @@ static __init void setup_boot_kprobe_events(void) ret = trace_run_command(cmd, create_or_delete_trace_kprobe); if (ret) pr_warn("Failed to add event(%d): %s\n", ret, cmd); - else - kprobe_boot_events_enabled = true; cmd = p; } @@ -1973,10 +1972,8 @@ static __init int kprobe_trace_self_tests_init(void) if (tracing_is_disabled()) return -ENODEV; - if (kprobe_boot_events_enabled) { - pr_info("Skipping kprobe tests due to kprobe_event on cmdline\n"); + if 
(tracing_selftest_disabled) return 0; - } target = kprobe_trace_selftest_target; diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 5ed081c6471c..73ef12092250 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -786,7 +786,7 @@ trace_selftest_startup_function_graph(struct tracer *trace, /* Have we just recovered from a hang? */ if (graph_hang_thresh > GRAPH_MAX_FUNC_TEST) { - tracing_selftest_disabled = true; + disable_tracing_selftest("recovering from a hang"); ret = -1; goto out; } -- cgit v1.2.3 From 3b3493531c4d415044442349c9d37ad48ad44c85 Mon Sep 17 00:00:00 2001 From: Lukas Bulwahn Date: Mon, 14 Dec 2020 09:45:03 +0100 Subject: tracing: Drop unneeded assignment in ring_buffer_resize() Since commit 0a1754b2a97e ("ring-buffer: Return 0 on success from ring_buffer_resize()"), computing the size is not needed anymore. Drop unneeded assignment in ring_buffer_resize(). Link: https://lkml.kernel.org/r/20201214084503.3079-1-lukas.bulwahn@gmail.com Signed-off-by: Lukas Bulwahn Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ring_buffer.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index f09d3f5911cb..8b57251ebf9d 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1974,8 +1974,6 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size, if (nr_pages < 2) nr_pages = 2; - size = nr_pages * BUF_PAGE_SIZE; - /* prevent another thread from changing buffer sizes */ mutex_lock(&buffer->mutex); -- cgit v1.2.3 From 82db909e6be667f2993802f3a1e86426cab57049 Mon Sep 17 00:00:00 2001 From: Qiujun Huang Date: Wed, 14 Oct 2020 23:27:49 +0800 Subject: ring-buffer: Fix two typos in comments s/inerrupting/interrupting/ s/beween/between/ Link: https://lkml.kernel.org/r/20201014152749.29986-1-hqjagain@gmail.com Signed-off-by: Qiujun Huang Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ring_buffer.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 8b57251ebf9d..e97ecf72c727 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -3399,7 +3399,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, /* This did not interrupt any time update */ info->delta = info->ts - info->after; else - /* Just use full timestamp for inerrupting event */ + /* Just use full timestamp for interrupting event */ info->delta = info->ts; barrier(); check_buffer(cpu_buffer, info, tail); @@ -3436,7 +3436,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, info->ts = ts; } else { /* - * Interrupted beween C and E: + * Interrupted between C and E: * Lost the previous events time stamp. Just set the * delta to zero, and this will be the same time as * the event this event interrupted. And the events that -- cgit v1.2.3 From 74e2afc6df5782ea34bc7ac350aeb206c3666f9a Mon Sep 17 00:00:00 2001 From: Qiujun Huang Date: Thu, 15 Oct 2020 19:38:42 +0800 Subject: ring-buffer: Add rb_check_bpage in __rb_allocate_pages It may be better to check each page is aligned by 4 bytes. The 2 least significant bits of the address will be used as flags. 
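A hedged, generic illustration of the pointer-tagging trick this check guards (the macro and helpers below are illustrative, not the ring buffer's actual flag names): with at least 4-byte alignment the low 2 bits of a buffer-page pointer are always zero, so they can carry list flags and be masked off before dereferencing.

#include <linux/types.h>

#define PTR_FLAG_MASK	3UL	/* the two low bits used as flags */

static inline void *tag_ptr(void *p, unsigned long flags)
{
	return (void *)((unsigned long)p | (flags & PTR_FLAG_MASK));
}

static inline void *untag_ptr(void *p)
{
	return (void *)((unsigned long)p & ~PTR_FLAG_MASK);
}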
Link: https://lkml.kernel.org/r/20201015113842.2921-1-hqjagain@gmail.com Signed-off-by: Qiujun Huang Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ring_buffer.c | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index e97ecf72c727..e03bc4e5d482 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1423,7 +1423,8 @@ static int rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer) return 0; } -static int __rb_allocate_pages(long nr_pages, struct list_head *pages, int cpu) +static int __rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, + long nr_pages, struct list_head *pages) { struct buffer_page *bpage, *tmp; bool user_thread = current->mm != NULL; @@ -1463,13 +1464,15 @@ static int __rb_allocate_pages(long nr_pages, struct list_head *pages, int cpu) struct page *page; bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), - mflags, cpu_to_node(cpu)); + mflags, cpu_to_node(cpu_buffer->cpu)); if (!bpage) goto free_pages; + rb_check_bpage(cpu_buffer, bpage); + list_add(&bpage->list, pages); - page = alloc_pages_node(cpu_to_node(cpu), mflags, 0); + page = alloc_pages_node(cpu_to_node(cpu_buffer->cpu), mflags, 0); if (!page) goto free_pages; bpage->page = page_address(page); @@ -1501,7 +1504,7 @@ static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer, WARN_ON(!nr_pages); - if (__rb_allocate_pages(nr_pages, &pages, cpu_buffer->cpu)) + if (__rb_allocate_pages(cpu_buffer, nr_pages, &pages)) return -ENOMEM; /* @@ -2008,8 +2011,8 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size, * allocated without receiving ENOMEM */ INIT_LIST_HEAD(&cpu_buffer->new_pages); - if (__rb_allocate_pages(cpu_buffer->nr_pages_to_update, - &cpu_buffer->new_pages, cpu)) { + if (__rb_allocate_pages(cpu_buffer, cpu_buffer->nr_pages_to_update, + &cpu_buffer->new_pages)) { /* not enough memory for new pages */ err = -ENOMEM; goto out_err; @@ -2074,8 +2077,8 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size, INIT_LIST_HEAD(&cpu_buffer->new_pages); if (cpu_buffer->nr_pages_to_update > 0 && - __rb_allocate_pages(cpu_buffer->nr_pages_to_update, - &cpu_buffer->new_pages, cpu_id)) { + __rb_allocate_pages(cpu_buffer, cpu_buffer->nr_pages_to_update, + &cpu_buffer->new_pages)) { err = -ENOMEM; goto out_err; } -- cgit v1.2.3 From adab66b71abfe206a020f11e561f4df41f0b2aba Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Mon, 14 Dec 2020 12:33:51 -0500 Subject: Revert: "ring-buffer: Remove HAVE_64BIT_ALIGNED_ACCESS" It was believed that metag was the only architecture that required the ring buffer to keep 8 byte words aligned on 8 byte architectures, and with its removal, it was assumed that the ring buffer code did not need to handle this case. It appears that sparc64 also requires this. 
The following was reported on a sparc64 boot up: kernel: futex hash table entries: 65536 (order: 9, 4194304 bytes, linear) kernel: Running postponed tracer tests: kernel: Testing tracer function: kernel: Kernel unaligned access at TPC[552a20] trace_function+0x40/0x140 kernel: Kernel unaligned access at TPC[552a24] trace_function+0x44/0x140 kernel: Kernel unaligned access at TPC[552a20] trace_function+0x40/0x140 kernel: Kernel unaligned access at TPC[552a24] trace_function+0x44/0x140 kernel: Kernel unaligned access at TPC[552a20] trace_function+0x40/0x140 kernel: PASSED Need to put back the 64BIT aligned code for the ring buffer. Link: https://lore.kernel.org/r/CADxRZqzXQRYgKc=y-KV=S_yHL+Y8Ay2mh5ezeZUnpRvg+syWKw@mail.gmail.com Cc: stable@vger.kernel.org Fixes: 86b3de60a0b6 ("ring-buffer: Remove HAVE_64BIT_ALIGNED_ACCESS") Reported-by: Anatoly Pugachev Signed-off-by: Steven Rostedt (VMware) --- arch/Kconfig | 16 ++++++++++++++++ kernel/trace/ring_buffer.c | 17 +++++++++++++---- 2 files changed, 29 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/arch/Kconfig b/arch/Kconfig index 56b6ccc0e32d..fa716994f77e 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -143,6 +143,22 @@ config UPROBES managed by the kernel and kept transparent to the probed application. ) +config HAVE_64BIT_ALIGNED_ACCESS + def_bool 64BIT && !HAVE_EFFICIENT_UNALIGNED_ACCESS + help + Some architectures require 64 bit accesses to be 64 bit + aligned, which also requires structs containing 64 bit values + to be 64 bit aligned too. This includes some 32 bit + architectures which can do 64 bit accesses, as well as 64 bit + architectures without unaligned access. + + This symbol should be selected by an architecture if 64 bit + accesses are required to be 64 bit aligned in this way even + though it is not a 64 bit architecture. + + See Documentation/unaligned-memory-access.txt for more + information on the topic of unaligned memory accesses. + config HAVE_EFFICIENT_UNALIGNED_ACCESS bool help diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index e03bc4e5d482..926845eb5ab5 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -130,7 +130,16 @@ int ring_buffer_print_entry_header(struct trace_seq *s) #define RB_ALIGNMENT 4U #define RB_MAX_SMALL_DATA (RB_ALIGNMENT * RINGBUF_TYPE_DATA_TYPE_LEN_MAX) #define RB_EVNT_MIN_SIZE 8U /* two 32bit words */ -#define RB_ALIGN_DATA __aligned(RB_ALIGNMENT) + +#ifndef CONFIG_HAVE_64BIT_ALIGNED_ACCESS +# define RB_FORCE_8BYTE_ALIGNMENT 0 +# define RB_ARCH_ALIGNMENT RB_ALIGNMENT +#else +# define RB_FORCE_8BYTE_ALIGNMENT 1 +# define RB_ARCH_ALIGNMENT 8U +#endif + +#define RB_ALIGN_DATA __aligned(RB_ARCH_ALIGNMENT) /* define RINGBUF_TYPE_DATA for 'case RINGBUF_TYPE_DATA:' */ #define RINGBUF_TYPE_DATA 0 ... 
RINGBUF_TYPE_DATA_TYPE_LEN_MAX @@ -2718,7 +2727,7 @@ rb_update_event(struct ring_buffer_per_cpu *cpu_buffer, event->time_delta = delta; length -= RB_EVNT_HDR_SIZE; - if (length > RB_MAX_SMALL_DATA) { + if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT) { event->type_len = 0; event->array[0] = length; } else @@ -2733,11 +2742,11 @@ static unsigned rb_calculate_event_length(unsigned length) if (!length) length++; - if (length > RB_MAX_SMALL_DATA) + if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT) length += sizeof(event.array[0]); length += RB_EVNT_HDR_SIZE; - length = ALIGN(length, RB_ALIGNMENT); + length = ALIGN(length, RB_ARCH_ALIGNMENT); /* * In case the time delta is larger than the 27 bits for it -- cgit v1.2.3 From cd17d38f8b28f808c368121041c0a4fa91757e0d Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Wed, 9 Dec 2020 17:33:49 -0800 Subject: bpf: Permits pointers on stack for helper calls Currently, when checking stack memory accessed by helper calls, for spills, only PTR_TO_BTF_ID and SCALAR_VALUE are allowed. Song discovered an issue where the below bpf program int dump_task(struct bpf_iter__task *ctx) { struct seq_file *seq = ctx->meta->seq; static char[] info = "abc"; BPF_SEQ_PRINTF(seq, "%s\n", info); return 0; } may cause a verifier failure. The verifier output looks like: ; struct seq_file *seq = ctx->meta->seq; 1: (79) r1 = *(u64 *)(r1 +0) ; BPF_SEQ_PRINTF(seq, "%s\n", info); 2: (18) r2 = 0xffff9054400f6000 4: (7b) *(u64 *)(r10 -8) = r2 5: (bf) r4 = r10 ; 6: (07) r4 += -8 ; BPF_SEQ_PRINTF(seq, "%s\n", info); 7: (18) r2 = 0xffff9054400fe000 9: (b4) w3 = 4 10: (b4) w5 = 8 11: (85) call bpf_seq_printf#126 R1_w=ptr_seq_file(id=0,off=0,imm=0) R2_w=map_value(id=0,off=0,ks=4,vs=4,imm=0) R3_w=inv4 R4_w=fp-8 R5_w=inv8 R10=fp0 fp-8_w=map_value last_idx 11 first_idx 0 regs=8 stack=0 before 10: (b4) w5 = 8 regs=8 stack=0 before 9: (b4) w3 = 4 invalid indirect read from stack off -8+0 size 8 Basically, the verifier complains the map_value pointer at "fp-8" location. To fix the issue, if env->allow_ptr_leaks is true, let us also permit pointers on the stack to be accessible by the helper. Reported-by: Song Liu Suggested-by: Alexei Starovoitov Signed-off-by: Yonghong Song Signed-off-by: Daniel Borkmann Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20201210013349.943719-1-yhs@fb.com --- kernel/bpf/verifier.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 93def76cf32b..9159c9822ede 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3769,7 +3769,8 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno, goto mark; if (state->stack[spi].slot_type[0] == STACK_SPILL && - state->stack[spi].spilled_ptr.type == SCALAR_VALUE) { + (state->stack[spi].spilled_ptr.type == SCALAR_VALUE || + env->allow_ptr_leaks)) { __mark_reg_unknown(env, &state->stack[spi].spilled_ptr); for (j = 0; j < BPF_REG_SIZE; j++) state->stack[spi].slot_type[j] = STACK_MISC; -- cgit v1.2.3 From 1b04fa9900263b4e217ca2509fd778b32c2b4eb2 Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Wed, 9 Dec 2020 21:27:31 +0100 Subject: rcu-tasks: Move RCU-tasks initialization to before early_initcall() PowerPC testing encountered boot failures due to RCU Tasks not being fully initialized until core_initcall() time. 
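A hedged sketch of the ordering problem (the fix follows below; the initcall here is hypothetical, not taken from the report): before this change the RCU Tasks grace-period kthreads were only spawned from core_initcall(), so an early_initcall() handler that waits for an RCU Tasks grace period could stall the boot.

#include <linux/init.h>
#include <linux/rcupdate.h>

/* hypothetical early user of RCU Tasks */
static int __init early_rcu_tasks_user(void)
{
	/* needs the RCU Tasks grace-period kthread to already be running */
	synchronize_rcu_tasks();
	return 0;
}
early_initcall(early_rcu_tasks_user);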
This commit therefore initializes RCU Tasks (along with Rude RCU and RCU Tasks Trace) just before early_initcall() time, thus allowing waiting on RCU Tasks grace periods from early_initcall() handlers. Link: https://lore.kernel.org/rcu/87eekfh80a.fsf@dja-thinkpad.axtens.net/ Fixes: 36dadef23fcc ("kprobes: Init kprobes in early_initcall") Tested-by: Daniel Axtens Signed-off-by: Uladzislau Rezki (Sony) Signed-off-by: Paul E. McKenney --- include/linux/rcupdate.h | 6 ++++++ init/main.c | 1 + kernel/rcu/tasks.h | 25 +++++++++++++++++++++---- 3 files changed, 28 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 6cdd0152c253..5c119d6cecf1 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -86,6 +86,12 @@ void rcu_sched_clock_irq(int user); void rcu_report_dead(unsigned int cpu); void rcutree_migrate_callbacks(int cpu); +#ifdef CONFIG_TASKS_RCU_GENERIC +void rcu_init_tasks_generic(void); +#else +static inline void rcu_init_tasks_generic(void) { } +#endif + #ifdef CONFIG_RCU_STALL_COMMON void rcu_sysrq_start(void); void rcu_sysrq_end(void); diff --git a/init/main.c b/init/main.c index 32b2a8affafd..9d964511fe0c 100644 --- a/init/main.c +++ b/init/main.c @@ -1512,6 +1512,7 @@ static noinline void __init kernel_init_freeable(void) init_mm_internals(); + rcu_init_tasks_generic(); do_pre_smp_initcalls(); lockup_detector_init(); diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index d5d9f2d03e8a..73bbe792fe1e 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -241,7 +241,7 @@ static int __noreturn rcu_tasks_kthread(void *arg) } } -/* Spawn RCU-tasks grace-period kthread, e.g., at core_initcall() time. */ +/* Spawn RCU-tasks grace-period kthread. */ static void __init rcu_spawn_tasks_kthread_generic(struct rcu_tasks *rtp) { struct task_struct *t; @@ -569,7 +569,6 @@ static int __init rcu_spawn_tasks_kthread(void) rcu_spawn_tasks_kthread_generic(&rcu_tasks); return 0; } -core_initcall(rcu_spawn_tasks_kthread); #ifndef CONFIG_TINY_RCU static void show_rcu_tasks_classic_gp_kthread(void) @@ -697,7 +696,6 @@ static int __init rcu_spawn_tasks_rude_kthread(void) rcu_spawn_tasks_kthread_generic(&rcu_tasks_rude); return 0; } -core_initcall(rcu_spawn_tasks_rude_kthread); #ifndef CONFIG_TINY_RCU static void show_rcu_tasks_rude_gp_kthread(void) @@ -975,6 +973,11 @@ static void rcu_tasks_trace_pregp_step(void) static void rcu_tasks_trace_pertask(struct task_struct *t, struct list_head *hop) { + // During early boot when there is only the one boot CPU, there + // is no idle task for the other CPUs. Just return. 
+ if (unlikely(t == NULL)) + return; + WRITE_ONCE(t->trc_reader_special.b.need_qs, false); WRITE_ONCE(t->trc_reader_checked, false); t->trc_ipi_to_cpu = -1; @@ -1200,7 +1203,6 @@ static int __init rcu_spawn_tasks_trace_kthread(void) rcu_spawn_tasks_kthread_generic(&rcu_tasks_trace); return 0; } -core_initcall(rcu_spawn_tasks_trace_kthread); #ifndef CONFIG_TINY_RCU static void show_rcu_tasks_trace_gp_kthread(void) @@ -1229,6 +1231,21 @@ void show_rcu_tasks_gp_kthreads(void) } #endif /* #ifndef CONFIG_TINY_RCU */ +void __init rcu_init_tasks_generic(void) +{ +#ifdef CONFIG_TASKS_RCU + rcu_spawn_tasks_kthread(); +#endif + +#ifdef CONFIG_TASKS_RUDE_RCU + rcu_spawn_tasks_rude_kthread(); +#endif + +#ifdef CONFIG_TASKS_TRACE_RCU + rcu_spawn_tasks_trace_kthread(); +#endif +} + #else /* #ifdef CONFIG_TASKS_RCU_GENERIC */ static inline void rcu_tasks_bootup_oddness(void) {} void show_rcu_tasks_gp_kthreads(void) {} -- cgit v1.2.3 From ae7927023243dcc7389b2d59b16c09cbbeaecc36 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 10 Dec 2020 17:14:08 +0100 Subject: sched: Optimize finish_lock_switch() The kernel test robot measured a -1.6% performance regression on will-it-scale/sched_yield due to commit: 2558aacff858 ("sched/hotplug: Ensure only per-cpu kthreads run during hotplug") Even though we were careful to replace a single load with another single load from the same cacheline. Restore finish_lock_switch() to the exact state before the offending patch and solve the problem differently. Fixes: 2558aacff858 ("sched/hotplug: Ensure only per-cpu kthreads run during hotplug") Reported-by: kernel test robot Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20201210161408.GX3021@hirez.programming.kicks-ass.net --- kernel/sched/core.c | 40 +++++++++++++++------------------------- kernel/sched/sched.h | 13 +++++-------- 2 files changed, 20 insertions(+), 33 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 7af80c3fce12..0ca7d2dc16d5 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3985,15 +3985,20 @@ static void do_balance_callbacks(struct rq *rq, struct callback_head *head) } } +static void balance_push(struct rq *rq); + +struct callback_head balance_push_callback = { + .next = NULL, + .func = (void (*)(struct callback_head *))balance_push, +}; + static inline struct callback_head *splice_balance_callbacks(struct rq *rq) { struct callback_head *head = rq->balance_callback; lockdep_assert_held(&rq->lock); - if (head) { + if (head) rq->balance_callback = NULL; - rq->balance_flags &= ~BALANCE_WORK; - } return head; } @@ -4014,21 +4019,6 @@ static inline void balance_callbacks(struct rq *rq, struct callback_head *head) } } -static void balance_push(struct rq *rq); - -static inline void balance_switch(struct rq *rq) -{ - if (likely(!rq->balance_flags)) - return; - - if (rq->balance_flags & BALANCE_PUSH) { - balance_push(rq); - return; - } - - __balance_callbacks(rq); -} - #else static inline void __balance_callbacks(struct rq *rq) @@ -4044,10 +4034,6 @@ static inline void balance_callbacks(struct rq *rq, struct callback_head *head) { } -static inline void balance_switch(struct rq *rq) -{ -} - #endif static inline void @@ -4075,7 +4061,7 @@ static inline void finish_lock_switch(struct rq *rq) * prev into current: */ spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_); - balance_switch(rq); + __balance_callbacks(rq); raw_spin_unlock_irq(&rq->lock); } @@ -7256,6 +7242,10 @@ static void balance_push(struct rq *rq) 
lockdep_assert_held(&rq->lock); SCHED_WARN_ON(rq->cpu != smp_processor_id()); + /* + * Ensure the thing is persistent until balance_push_set(.on = false); + */ + rq->balance_callback = &balance_push_callback; /* * Both the cpu-hotplug and stop task are in this case and are @@ -7305,9 +7295,9 @@ static void balance_push_set(int cpu, bool on) rq_lock_irqsave(rq, &rf); if (on) - rq->balance_flags |= BALANCE_PUSH; + rq->balance_callback = &balance_push_callback; else - rq->balance_flags &= ~BALANCE_PUSH; + rq->balance_callback = NULL; rq_unlock_irqrestore(rq, &rf); } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index f5acb6c5ce49..12ada79d40f3 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -975,7 +975,6 @@ struct rq { unsigned long cpu_capacity_orig; struct callback_head *balance_callback; - unsigned char balance_flags; unsigned char nohz_idle_balance; unsigned char idle_balance; @@ -1226,6 +1225,8 @@ struct rq_flags { #endif }; +extern struct callback_head balance_push_callback; + /* * Lockdep annotation that avoids accidental unlocks; it's like a * sticky/continuous lockdep_assert_held(). @@ -1243,9 +1244,9 @@ static inline void rq_pin_lock(struct rq *rq, struct rq_flags *rf) #ifdef CONFIG_SCHED_DEBUG rq->clock_update_flags &= (RQCF_REQ_SKIP|RQCF_ACT_SKIP); rf->clock_update_flags = 0; -#endif #ifdef CONFIG_SMP - SCHED_WARN_ON(rq->balance_callback); + SCHED_WARN_ON(rq->balance_callback && rq->balance_callback != &balance_push_callback); +#endif #endif } @@ -1408,9 +1409,6 @@ init_numa_balancing(unsigned long clone_flags, struct task_struct *p) #ifdef CONFIG_SMP -#define BALANCE_WORK 0x01 -#define BALANCE_PUSH 0x02 - static inline void queue_balance_callback(struct rq *rq, struct callback_head *head, @@ -1418,13 +1416,12 @@ queue_balance_callback(struct rq *rq, { lockdep_assert_held(&rq->lock); - if (unlikely(head->next || (rq->balance_flags & BALANCE_PUSH))) + if (unlikely(head->next || rq->balance_callback == &balance_push_callback)) return; head->func = (void (*)(struct callback_head *))func; head->next = rq->balance_callback; rq->balance_callback = head; - rq->balance_flags |= BALANCE_WORK; } #define rcu_dereference_check_sched_domain(p) \ -- cgit v1.2.3 From f6a694665f132cbf6e2222dd2f173dc35330a8aa Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Mon, 14 Dec 2020 21:03:27 -0500 Subject: tracing: Offload eval map updates to a work queue In order for tracepoints to export their enums to user space, the use of the TRACE_DEFINE_ENUM() macro is used. On boot up, the strings shown in the tracefs "print fmt" lines are processed, and all the enums registered by TRACE_DEFINE_ENUM are replaced with the interger value. This way, userspace tools that read the raw binary data, knows how to evaluate the raw events. This is currently done in an initcall, but it has been noticed that slow embedded boards that have tracing may take a few seconds to process them all, and a few seconds slow down on an embedded device is detrimental to the system. Instead, offload the work to a work queue and make sure that its finished by destroying the work queue (which flushes all work) in a late initcall. This will allow the system to continue to boot and run the updates in the background, and this speeds up the boot time. Note, the strings being updated are only used by user space, so finishing the process before the system is fully booted will prevent any race issues. 
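A hedged sketch of the mechanism whose processing is being deferred (enum and event names are hypothetical): inside an event's trace header, TRACE_DEFINE_ENUM() registers an eval map so the enum symbols appearing in the event's "print fmt" string can be replaced by their numeric values for user-space tools. This patch only changes when those replacements run, not how they work.

#include <linux/tracepoint.h>

/* hypothetical hardware state reported by a trace event */
enum my_hw_state {
	MY_HW_IDLE = 0,
	MY_HW_BUSY = 7,
};

TRACE_DEFINE_ENUM(MY_HW_IDLE);
TRACE_DEFINE_ENUM(MY_HW_BUSY);

/*
 * ...later used in the event's TP_printk(), e.g.
 *   __print_symbolic(__entry->state, { MY_HW_IDLE, "idle" },
 *                                    { MY_HW_BUSY, "busy" })
 * where the symbols in "print fmt" get rewritten to 0 and 7.
 */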
Link: https://lore.kernel.org/r/68d7b3327052757d0cd6359a6c9015a85b437232.camel@pengutronix.de Reported-by: Lucas Stach Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 32 +++++++++++++++++++++++++++++++- 1 file changed, 31 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index eee484afcc51..eb5205e48733 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -9066,7 +9066,10 @@ int tracing_init_dentry(void) extern struct trace_eval_map *__start_ftrace_eval_maps[]; extern struct trace_eval_map *__stop_ftrace_eval_maps[]; -static void __init trace_eval_init(void) +static struct workqueue_struct *eval_map_wq __initdata; +static struct work_struct eval_map_work __initdata; + +static void __init eval_map_work_func(struct work_struct *work) { int len; @@ -9074,6 +9077,33 @@ static void __init trace_eval_init(void) trace_insert_eval_map(NULL, __start_ftrace_eval_maps, len); } +static int __init trace_eval_init(void) +{ + INIT_WORK(&eval_map_work, eval_map_work_func); + + eval_map_wq = alloc_workqueue("eval_map_wq", WQ_UNBOUND, 0); + if (!eval_map_wq) { + pr_err("Unable to allocate eval_map_wq\n"); + /* Do work here */ + eval_map_work_func(&eval_map_work); + return -ENOMEM; + } + + queue_work(eval_map_wq, &eval_map_work); + return 0; +} + +static int __init trace_eval_sync(void) +{ + /* Make sure the eval map updates are finished */ + if (eval_map_wq) + destroy_workqueue(eval_map_wq); + return 0; +} + +late_initcall_sync(trace_eval_sync); + + #ifdef CONFIG_MODULES static void trace_module_add_evals(struct module *mod) { -- cgit v1.2.3 From a313357e704f2617f298333e3e617a38b1719760 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 10 Dec 2020 20:25:37 +0100 Subject: genirq: Move irq_has_action() into core code This function uses irq_to_desc() and is going to be used by modules to replace the open coded irq_to_desc() (ab)usage. The final goal is to remove the export of irq_to_desc() so driver cannot fiddle with it anymore. Move it into the core code and fixup the usage sites to include the proper header. Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20201210194042.548936472@linutronix.de --- arch/alpha/kernel/sys_jensen.c | 2 +- arch/x86/kernel/topology.c | 1 + include/linux/interrupt.h | 1 + include/linux/irqdesc.h | 7 +------ kernel/irq/manage.c | 17 +++++++++++++++++ 5 files changed, 21 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/arch/alpha/kernel/sys_jensen.c b/arch/alpha/kernel/sys_jensen.c index 0a2ab6cb18db..e5d870ff225f 100644 --- a/arch/alpha/kernel/sys_jensen.c +++ b/arch/alpha/kernel/sys_jensen.c @@ -7,7 +7,7 @@ * * Code supporting the Jensen. 
*/ - +#include #include #include #include diff --git a/arch/x86/kernel/topology.c b/arch/x86/kernel/topology.c index 0a2ec801b63f..f5477eab5692 100644 --- a/arch/x86/kernel/topology.c +++ b/arch/x86/kernel/topology.c @@ -25,6 +25,7 @@ * * Send feedback to */ +#include #include #include #include diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index 870b3251e174..bb8ff9083e7d 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -232,6 +232,7 @@ extern void devm_free_irq(struct device *dev, unsigned int irq, void *dev_id); # define local_irq_enable_in_hardirq() local_irq_enable() #endif +bool irq_has_action(unsigned int irq); extern void disable_irq_nosync(unsigned int irq); extern bool disable_hardirq(unsigned int irq); extern void disable_irq(unsigned int irq); diff --git a/include/linux/irqdesc.h b/include/linux/irqdesc.h index 5745491303e0..385a4fafe631 100644 --- a/include/linux/irqdesc.h +++ b/include/linux/irqdesc.h @@ -179,12 +179,7 @@ int handle_domain_nmi(struct irq_domain *domain, unsigned int hwirq, /* Test to see if a driver has successfully requested an irq */ static inline int irq_desc_has_action(struct irq_desc *desc) { - return desc->action != NULL; -} - -static inline int irq_has_action(unsigned int irq) -{ - return irq_desc_has_action(irq_to_desc(irq)); + return desc && desc->action != NULL; } /** diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index c826ba4141fe..a5a1cde5c1a2 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -2822,3 +2822,20 @@ out_unlock: return err; } EXPORT_SYMBOL_GPL(irq_set_irqchip_state); + +/** + * irq_has_action - Check whether an interrupt is requested + * @irq: The linux irq number + * + * Returns: A snapshot of the current state + */ +bool irq_has_action(unsigned int irq) +{ + bool res; + + rcu_read_lock(); + res = irq_desc_has_action(irq_to_desc(irq)); + rcu_read_unlock(); + return res; +} +EXPORT_SYMBOL_GPL(irq_has_action); -- cgit v1.2.3 From fdd029630434b434b127efc7fba337da28f45658 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 10 Dec 2020 20:25:38 +0100 Subject: genirq: Move status flag checks to core These checks are used by modules and prevent the removal of the export of irq_to_desc(). Move the accessor into the core. 
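A hedged driver-side sketch of what these accessors (together with irq_has_action() from the previous patch) allow (the reporting function is hypothetical): a module can query IRQ state without ever touching struct irq_desc or irq_to_desc().

#include <linux/interrupt.h>
#include <linux/irqdesc.h>
#include <linux/printk.h>

static void report_irq(unsigned int irq)
{
	if (!irq_has_action(irq))		/* nothing has requested this IRQ */
		return;
	if (irq_balancing_disabled(irq))	/* IRQ_NO_BALANCING_MASK set */
		pr_info("irq %u: balancing disabled\n", irq);
	if (irq_is_percpu(irq))			/* IRQ_PER_CPU set */
		pr_info("irq %u: per-CPU interrupt\n", irq);
}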
Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20201210194042.703779349@linutronix.de --- include/linux/irqdesc.h | 17 +++++------------ kernel/irq/manage.c | 20 ++++++++++++++++++++ 2 files changed, 25 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/include/linux/irqdesc.h b/include/linux/irqdesc.h index 385a4fafe631..308d7db8991f 100644 --- a/include/linux/irqdesc.h +++ b/include/linux/irqdesc.h @@ -223,28 +223,21 @@ irq_set_chip_handler_name_locked(struct irq_data *data, struct irq_chip *chip, data->chip = chip; } +bool irq_check_status_bit(unsigned int irq, unsigned int bitmask); + static inline bool irq_balancing_disabled(unsigned int irq) { - struct irq_desc *desc; - - desc = irq_to_desc(irq); - return desc->status_use_accessors & IRQ_NO_BALANCING_MASK; + return irq_check_status_bit(irq, IRQ_NO_BALANCING_MASK); } static inline bool irq_is_percpu(unsigned int irq) { - struct irq_desc *desc; - - desc = irq_to_desc(irq); - return desc->status_use_accessors & IRQ_PER_CPU; + return irq_check_status_bit(irq, IRQ_PER_CPU); } static inline bool irq_is_percpu_devid(unsigned int irq) { - struct irq_desc *desc; - - desc = irq_to_desc(irq); - return desc->status_use_accessors & IRQ_PER_CPU_DEVID; + return irq_check_status_bit(irq, IRQ_PER_CPU_DEVID); } static inline void diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index a5a1cde5c1a2..ab8567f32501 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -2839,3 +2839,23 @@ bool irq_has_action(unsigned int irq) return res; } EXPORT_SYMBOL_GPL(irq_has_action); + +/** + * irq_check_status_bit - Check whether bits in the irq descriptor status are set + * @irq: The linux irq number + * @bitmask: The bitmask to evaluate + * + * Returns: True if one of the bits in @bitmask is set + */ +bool irq_check_status_bit(unsigned int irq, unsigned int bitmask) +{ + struct irq_desc *desc; + bool res = false; + + rcu_read_lock(); + desc = irq_to_desc(irq); + if (desc) + res = !!(desc->status_use_accessors & bitmask); + rcu_read_unlock(); + return res; +} -- cgit v1.2.3 From f1c6306c0d6b50844ba02c8a53e35405e9c0db05 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 10 Dec 2020 20:25:39 +0100 Subject: genirq: Move irq_set_lockdep_class() to core irq_set_lockdep_class() is used from modules and requires irq_to_desc() to be exported. Move it into the core code which lifts another requirement for the export. 
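A hedged sketch of the typical caller this keeps working (hypothetical driver and key names): an irqchip or gpio driver assigns its child interrupts distinct lockdep classes from static keys. With this change the out-of-line helper is only reached when CONFIG_LOCKDEP is enabled; otherwise the inline wrapper compiles away.

#include <linux/irqdesc.h>
#include <linux/lockdep.h>

/* hypothetical per-driver lockdep classes */
static struct lock_class_key demo_irq_lock_class;
static struct lock_class_key demo_irq_request_class;

static void demo_setup_child_irq(unsigned int irq)
{
	irq_set_lockdep_class(irq, &demo_irq_lock_class,
			      &demo_irq_request_class);
}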
Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20201210194042.860029489@linutronix.de --- include/linux/irqdesc.h | 10 ++++------ kernel/irq/irqdesc.c | 14 ++++++++++++++ 2 files changed, 18 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/include/linux/irqdesc.h b/include/linux/irqdesc.h index 308d7db8991f..4a1d016716f4 100644 --- a/include/linux/irqdesc.h +++ b/include/linux/irqdesc.h @@ -240,16 +240,14 @@ static inline bool irq_is_percpu_devid(unsigned int irq) return irq_check_status_bit(irq, IRQ_PER_CPU_DEVID); } +void __irq_set_lockdep_class(unsigned int irq, struct lock_class_key *lock_class, + struct lock_class_key *request_class); static inline void irq_set_lockdep_class(unsigned int irq, struct lock_class_key *lock_class, struct lock_class_key *request_class) { - struct irq_desc *desc = irq_to_desc(irq); - - if (desc) { - lockdep_set_class(&desc->lock, lock_class); - lockdep_set_class(&desc->request_mutex, request_class); - } + if (IS_ENABLED(CONFIG_LOCKDEP)) + __irq_set_lockdep_class(irq, lock_class, request_class); } #endif diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index e810eb9906ea..20a54fa7cd30 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -968,3 +968,17 @@ unsigned int kstat_irqs_usr(unsigned int irq) rcu_read_unlock(); return sum; } + +#ifdef CONFIG_LOCKDEP +void __irq_set_lockdep_class(unsigned int irq, struct lock_class_key *lock_class, + struct lock_class_key *request_class) +{ + struct irq_desc *desc = irq_to_desc(irq); + + if (desc) { + lockdep_set_class(&desc->lock, lock_class); + lockdep_set_class(&desc->request_mutex, request_class); + } +} +EXPORT_SYMBOL_GPL(__irq_set_lockdep_class); +#endif -- cgit v1.2.3 From 9e42ad10cedf0632fc39860381375806092212bd Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 10 Dec 2020 20:25:41 +0100 Subject: genirq: Annotate irq stats data races Both the per cpu stats and the accumulated count are accessed lockless and can be concurrently modified. That's intentional and the stats are a rough estimate anyway. Annotate them with data_race(). 
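A hedged minimal illustration of what the annotation means (hypothetical counter, not the irq code): data_race() tells KCSAN that a racy plain access is intentional and should not be reported; it adds no ordering or atomicity, which is acceptable here because the statistics are only a rough estimate.

#include <linux/compiler.h>

/* hypothetical counter updated concurrently, used for statistics only */
static unsigned long demo_hits;

static unsigned long demo_read_hits(void)
{
	/* racy read is fine for a rough estimate; silence KCSAN */
	return data_race(demo_hits);
}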
Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20201210194043.067097663@linutronix.de --- kernel/irq/irqdesc.c | 4 ++-- kernel/irq/proc.c | 5 +++-- 2 files changed, 5 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 20a54fa7cd30..02b446a21ce6 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -943,10 +943,10 @@ unsigned int kstat_irqs(unsigned int irq) if (!irq_settings_is_per_cpu_devid(desc) && !irq_settings_is_per_cpu(desc) && !irq_is_nmi(desc)) - return desc->tot_count; + return data_race(desc->tot_count); for_each_possible_cpu(cpu) - sum += *per_cpu_ptr(desc->kstat_irqs, cpu); + sum += data_race(*per_cpu_ptr(desc->kstat_irqs, cpu)); return sum; } diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index 72513ed2a5fc..98138788cb04 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -488,9 +488,10 @@ int show_interrupts(struct seq_file *p, void *v) if (!desc || irq_settings_is_hidden(desc)) goto outsparse; - if (desc->kstat_irqs) + if (desc->kstat_irqs) { for_each_online_cpu(j) - any_count |= *per_cpu_ptr(desc->kstat_irqs, j); + any_count |= data_race(*per_cpu_ptr(desc->kstat_irqs, j)); + } if ((!desc->action || irq_desc_is_chained(desc)) && !any_count) goto outsparse; -- cgit v1.2.3 From 26c19d0a8610fb233b31730fe26a31145f2d9796 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 10 Dec 2020 20:25:43 +0100 Subject: genirq: Make kstat_irqs() static No more users outside the core code. Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20201210194043.268774449@linutronix.de --- include/linux/kernel_stat.h | 1 - kernel/irq/irqdesc.c | 19 ++++++------------- 2 files changed, 6 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h index 89f0745c096d..44ae1a7eb9e3 100644 --- a/include/linux/kernel_stat.h +++ b/include/linux/kernel_stat.h @@ -67,7 +67,6 @@ static inline unsigned int kstat_softirqs_cpu(unsigned int irq, int cpu) /* * Number of interrupts per specific IRQ source, since bootup */ -extern unsigned int kstat_irqs(unsigned int irq); extern unsigned int kstat_irqs_usr(unsigned int irq); /* diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 02b446a21ce6..2eb076f4a566 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -924,15 +924,7 @@ static bool irq_is_nmi(struct irq_desc *desc) return desc->istate & IRQS_NMI; } -/** - * kstat_irqs - Get the statistics for an interrupt - * @irq: The interrupt number - * - * Returns the sum of interrupt counts on all cpus since boot for - * @irq. The caller must ensure that the interrupt is not removed - * concurrently. - */ -unsigned int kstat_irqs(unsigned int irq) +static unsigned int kstat_irqs(unsigned int irq) { struct irq_desc *desc = irq_to_desc(irq); unsigned int sum = 0; @@ -951,13 +943,14 @@ unsigned int kstat_irqs(unsigned int irq) } /** - * kstat_irqs_usr - Get the statistics for an interrupt + * kstat_irqs_usr - Get the statistics for an interrupt from thread context * @irq: The interrupt number * * Returns the sum of interrupt counts on all cpus since boot for @irq. - * Contrary to kstat_irqs() this can be called from any context. - * It uses rcu since a concurrent removal of an interrupt descriptor is - * observing an rcu grace period before delayed_free_desc()/irq_kobj_release(). 
+ * + * It uses rcu to protect the access since a concurrent removal of an + * interrupt descriptor is observing an rcu grace period before + * delayed_free_desc()/irq_kobj_release(). */ unsigned int kstat_irqs_usr(unsigned int irq) { -- cgit v1.2.3 From 501e2db67fa4264b517de5c7934e94cca89b3a1e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 10 Dec 2020 20:25:44 +0100 Subject: genirq: Provide kstat_irqdesc_cpu() Most users of kstat_irqs_cpu() have the irq descriptor already. No point in calling into the core code and looking it up once more. Use it in per_cpu_count_show() to start with. Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20201210194043.362094758@linutronix.de --- include/linux/irqdesc.h | 6 ++++++ kernel/irq/irqdesc.c | 4 ++-- 2 files changed, 8 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/include/linux/irqdesc.h b/include/linux/irqdesc.h index 4a1d016716f4..891b323266df 100644 --- a/include/linux/irqdesc.h +++ b/include/linux/irqdesc.h @@ -113,6 +113,12 @@ static inline void irq_unlock_sparse(void) { } extern struct irq_desc irq_desc[NR_IRQS]; #endif +static inline unsigned int irq_desc_kstat_cpu(struct irq_desc *desc, + unsigned int cpu) +{ + return desc->kstat_irqs ? *per_cpu_ptr(desc->kstat_irqs, cpu) : 0; +} + static inline struct irq_desc *irq_data_to_desc(struct irq_data *data) { return container_of(data->common, struct irq_desc, irq_common_data); diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 2eb076f4a566..f509c4db2029 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -147,12 +147,12 @@ static ssize_t per_cpu_count_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { struct irq_desc *desc = container_of(kobj, struct irq_desc, kobj); - int cpu, irq = desc->irq_data.irq; ssize_t ret = 0; char *p = ""; + int cpu; for_each_possible_cpu(cpu) { - unsigned int c = kstat_irqs_cpu(irq, cpu); + unsigned int c = irq_desc_kstat_cpu(desc, cpu); ret += scnprintf(buf + ret, PAGE_SIZE - ret, "%s%u", p, c); p = ","; -- cgit v1.2.3 From 64a1b95bb9fe3ec76e1a2cd803eff06389341ae4 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 10 Dec 2020 20:26:06 +0100 Subject: genirq: Restrict export of irq_to_desc() No more (ab)use in drivers finally. There is still the modular build of PPC/KVM which needs it, so restrict it to this case which still makes it unavailable for most drivers. Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20201210194045.551428291@linutronix.de --- kernel/irq/irqdesc.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index f509c4db2029..3d0bc38a0bcf 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -352,7 +352,9 @@ struct irq_desc *irq_to_desc(unsigned int irq) { return radix_tree_lookup(&irq_desc_tree, irq); } -EXPORT_SYMBOL(irq_to_desc); +#ifdef CONFIG_KVM_BOOK3S_64_HV +EXPORT_SYMBOL_GPL(irq_to_desc); +#endif static void delete_irq_desc(unsigned int irq) { -- cgit v1.2.3 From ca6827de4b67367e73fdf43d2ea0a0064423edfb Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 14 Dec 2020 21:04:11 +0100 Subject: cpufreq: schedutil: Add util to struct sg_cpu Instead of passing util and max between functions while computing the utilization and capacity, store the former in struct sg_cpu (along with the latter and bw_dl). 
This will allow the current utilization value to be compared with the one obtained previously (which is requisite for some code changes to follow this one), but also it causes the code to look slightly more consistent and cleaner. Signed-off-by: Rafael J. Wysocki Acked-by: Viresh Kumar --- kernel/sched/cpufreq_schedutil.c | 42 +++++++++++++++++++--------------------- 1 file changed, 20 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 77736058d8e4..319a270d13c1 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -53,6 +53,7 @@ struct sugov_cpu { unsigned int iowait_boost; u64 last_update; + unsigned long util; unsigned long bw_dl; unsigned long max; @@ -276,16 +277,15 @@ unsigned long schedutil_cpu_util(int cpu, unsigned long util_cfs, return min(max, util); } -static unsigned long sugov_get_util(struct sugov_cpu *sg_cpu) +static void sugov_get_util(struct sugov_cpu *sg_cpu) { struct rq *rq = cpu_rq(sg_cpu->cpu); - unsigned long util = cpu_util_cfs(rq); unsigned long max = arch_scale_cpu_capacity(sg_cpu->cpu); sg_cpu->max = max; sg_cpu->bw_dl = cpu_bw_dl(rq); - - return schedutil_cpu_util(sg_cpu->cpu, util, max, FREQUENCY_UTIL, NULL); + sg_cpu->util = schedutil_cpu_util(sg_cpu->cpu, cpu_util_cfs(rq), max, + FREQUENCY_UTIL, NULL); } /** @@ -362,8 +362,6 @@ static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, u64 time, * sugov_iowait_apply() - Apply the IO boost to a CPU. * @sg_cpu: the sugov data for the cpu to boost * @time: the update time from the caller - * @util: the utilization to (eventually) boost - * @max: the maximum value the utilization can be boosted to * * A CPU running a task which woken up after an IO operation can have its * utilization boosted to speed up the completion of those IO operations. @@ -377,18 +375,17 @@ static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, u64 time, * This mechanism is designed to boost high frequently IO waiting tasks, while * being more conservative on tasks which does sporadic IO operations. */ -static unsigned long sugov_iowait_apply(struct sugov_cpu *sg_cpu, u64 time, - unsigned long util, unsigned long max) +static void sugov_iowait_apply(struct sugov_cpu *sg_cpu, u64 time) { unsigned long boost; /* No boost currently required */ if (!sg_cpu->iowait_boost) - return util; + return; /* Reset boost if the CPU appears to have been idle enough */ if (sugov_iowait_reset(sg_cpu, time, false)) - return util; + return; if (!sg_cpu->iowait_boost_pending) { /* @@ -397,18 +394,19 @@ static unsigned long sugov_iowait_apply(struct sugov_cpu *sg_cpu, u64 time, sg_cpu->iowait_boost >>= 1; if (sg_cpu->iowait_boost < IOWAIT_BOOST_MIN) { sg_cpu->iowait_boost = 0; - return util; + return; } } sg_cpu->iowait_boost_pending = false; /* - * @util is already in capacity scale; convert iowait_boost + * sg_cpu->util is already in capacity scale; convert iowait_boost * into the same scale so we can compare. 
*/ - boost = (sg_cpu->iowait_boost * max) >> SCHED_CAPACITY_SHIFT; - return max(boost, util); + boost = (sg_cpu->iowait_boost * sg_cpu->max) >> SCHED_CAPACITY_SHIFT; + if (sg_cpu->util < boost) + sg_cpu->util = boost; } #ifdef CONFIG_NO_HZ_COMMON @@ -439,9 +437,8 @@ static void sugov_update_single(struct update_util_data *hook, u64 time, { struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util); struct sugov_policy *sg_policy = sg_cpu->sg_policy; - unsigned long util, max; - unsigned int next_f; unsigned int cached_freq = sg_policy->cached_raw_freq; + unsigned int next_f; sugov_iowait_boost(sg_cpu, time, flags); sg_cpu->last_update = time; @@ -451,10 +448,10 @@ static void sugov_update_single(struct update_util_data *hook, u64 time, if (!sugov_should_update_freq(sg_policy, time)) return; - util = sugov_get_util(sg_cpu); - max = sg_cpu->max; - util = sugov_iowait_apply(sg_cpu, time, util, max); - next_f = get_next_freq(sg_policy, util, max); + sugov_get_util(sg_cpu); + sugov_iowait_apply(sg_cpu, time); + + next_f = get_next_freq(sg_policy, sg_cpu->util, sg_cpu->max); /* * Do not reduce the frequency if the CPU has not been idle * recently, as the reduction is likely to be premature then. @@ -491,9 +488,10 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time) struct sugov_cpu *j_sg_cpu = &per_cpu(sugov_cpu, j); unsigned long j_util, j_max; - j_util = sugov_get_util(j_sg_cpu); + sugov_get_util(j_sg_cpu); + sugov_iowait_apply(j_sg_cpu, time); + j_util = j_sg_cpu->util; j_max = j_sg_cpu->max; - j_util = sugov_iowait_apply(j_sg_cpu, time, j_util, j_max); if (j_util * max > j_max * util) { util = j_util; -- cgit v1.2.3 From ee2cc4276ba4909438f5894a218877660e1536d9 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 14 Dec 2020 21:08:00 +0100 Subject: cpufreq: Add special-purpose fast-switching callback for drivers First off, some cpufreq drivers (eg. intel_pstate) can pass hints beyond the current target frequency to the hardware and there are no provisions for doing that in the cpufreq framework. In particular, today the driver has to assume that it should not allow the frequency to fall below the one requested by the governor (or the required capacity may not be provided) which may not be the case and which may lead to excessive energy usage in some scenarios. Second, the hints passed by these drivers to the hardware need not be in terms of the frequency, so representing the utilization numbers coming from the scheduler as frequency before passing them to those drivers is not really useful. Address the two points above by adding a special-purpose replacement for the ->fast_switch callback, called ->adjust_perf, allowing the governor to pass abstract performance level (rather than frequency) values for the minimum (required) and target (desired) performance along with the CPU capacity to compare them to. Also update the schedutil governor to use the new callback instead of ->fast_switch if present and if the utilization mertics are frequency-invariant (that is requisite for the direct mapping between the utilization and the CPU performance levels to be a reasonable approximation). Signed-off-by: Rafael J. 
Wysocki Acked-by: Viresh Kumar --- drivers/cpufreq/cpufreq.c | 40 +++++++++++++++++++++++ include/linux/cpufreq.h | 14 +++++++++ include/linux/sched/cpufreq.h | 5 +++ kernel/sched/cpufreq_schedutil.c | 68 ++++++++++++++++++++++++++++++++++------ 4 files changed, 117 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index c17aa2973c44..d0a3525ce27f 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -2097,6 +2097,46 @@ unsigned int cpufreq_driver_fast_switch(struct cpufreq_policy *policy, } EXPORT_SYMBOL_GPL(cpufreq_driver_fast_switch); +/** + * cpufreq_driver_adjust_perf - Adjust CPU performance level in one go. + * @cpu: Target CPU. + * @min_perf: Minimum (required) performance level (units of @capacity). + * @target_perf: Terget (desired) performance level (units of @capacity). + * @capacity: Capacity of the target CPU. + * + * Carry out a fast performance level switch of @cpu without sleeping. + * + * The driver's ->adjust_perf() callback invoked by this function must be + * suitable for being called from within RCU-sched read-side critical sections + * and it is expected to select a suitable performance level equal to or above + * @min_perf and preferably equal to or below @target_perf. + * + * This function must not be called if policy->fast_switch_enabled is unset. + * + * Governors calling this function must guarantee that it will never be invoked + * twice in parallel for the same CPU and that it will never be called in + * parallel with either ->target() or ->target_index() or ->fast_switch() for + * the same CPU. + */ +void cpufreq_driver_adjust_perf(unsigned int cpu, + unsigned long min_perf, + unsigned long target_perf, + unsigned long capacity) +{ + cpufreq_driver->adjust_perf(cpu, min_perf, target_perf, capacity); +} + +/** + * cpufreq_driver_has_adjust_perf - Check "direct fast switch" callback. + * + * Return 'true' if the ->adjust_perf callback is present for the + * current driver or 'false' otherwise. + */ +bool cpufreq_driver_has_adjust_perf(void) +{ + return !!cpufreq_driver->adjust_perf; +} + /* Must set freqs->new to intermediate frequency */ static int __target_intermediate(struct cpufreq_policy *policy, struct cpufreq_freqs *freqs, int index) diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index 584fccd4fcab..9c8b7437b6cd 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -320,6 +320,15 @@ struct cpufreq_driver { unsigned int index); unsigned int (*fast_switch)(struct cpufreq_policy *policy, unsigned int target_freq); + /* + * ->fast_switch() replacement for drivers that use an internal + * representation of performance levels and can pass hints other than + * the target performance level to the hardware. 
+ */ + void (*adjust_perf)(unsigned int cpu, + unsigned long min_perf, + unsigned long target_perf, + unsigned long capacity); /* * Caches and returns the lowest driver-supported frequency greater than @@ -588,6 +597,11 @@ struct cpufreq_governor { /* Pass a target to the cpufreq driver */ unsigned int cpufreq_driver_fast_switch(struct cpufreq_policy *policy, unsigned int target_freq); +void cpufreq_driver_adjust_perf(unsigned int cpu, + unsigned long min_perf, + unsigned long target_perf, + unsigned long capacity); +bool cpufreq_driver_has_adjust_perf(void); int cpufreq_driver_target(struct cpufreq_policy *policy, unsigned int target_freq, unsigned int relation); diff --git a/include/linux/sched/cpufreq.h b/include/linux/sched/cpufreq.h index 3ed5aa18593f..6205578ab6ee 100644 --- a/include/linux/sched/cpufreq.h +++ b/include/linux/sched/cpufreq.h @@ -28,6 +28,11 @@ static inline unsigned long map_util_freq(unsigned long util, { return (freq + (freq >> 2)) * util / cap; } + +static inline unsigned long map_util_perf(unsigned long util) +{ + return util + (util >> 2); +} #endif /* CONFIG_CPU_FREQ */ #endif /* _LINUX_SCHED_CPUFREQ_H */ diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 319a270d13c1..803bcb30db27 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -432,13 +432,10 @@ static inline void ignore_dl_rate_limit(struct sugov_cpu *sg_cpu, struct sugov_p sg_policy->limits_changed = true; } -static void sugov_update_single(struct update_util_data *hook, u64 time, - unsigned int flags) +static inline bool sugov_update_single_common(struct sugov_cpu *sg_cpu, + u64 time, unsigned int flags) { - struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util); struct sugov_policy *sg_policy = sg_cpu->sg_policy; - unsigned int cached_freq = sg_policy->cached_raw_freq; - unsigned int next_f; sugov_iowait_boost(sg_cpu, time, flags); sg_cpu->last_update = time; @@ -446,11 +443,25 @@ static void sugov_update_single(struct update_util_data *hook, u64 time, ignore_dl_rate_limit(sg_cpu, sg_policy); if (!sugov_should_update_freq(sg_policy, time)) - return; + return false; sugov_get_util(sg_cpu); sugov_iowait_apply(sg_cpu, time); + return true; +} + +static void sugov_update_single_freq(struct update_util_data *hook, u64 time, + unsigned int flags) +{ + struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util); + struct sugov_policy *sg_policy = sg_cpu->sg_policy; + unsigned int cached_freq = sg_policy->cached_raw_freq; + unsigned int next_f; + + if (!sugov_update_single_common(sg_cpu, time, flags)) + return; + next_f = get_next_freq(sg_policy, sg_cpu->util, sg_cpu->max); /* * Do not reduce the frequency if the CPU has not been idle @@ -477,6 +488,38 @@ static void sugov_update_single(struct update_util_data *hook, u64 time, } } +static void sugov_update_single_perf(struct update_util_data *hook, u64 time, + unsigned int flags) +{ + struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util); + unsigned long prev_util = sg_cpu->util; + + /* + * Fall back to the "frequency" path if frequency invariance is not + * supported, because the direct mapping between the utilization and + * the performance levels depends on the frequency invariance. 
+ */ + if (!arch_scale_freq_invariant()) { + sugov_update_single_freq(hook, time, flags); + return; + } + + if (!sugov_update_single_common(sg_cpu, time, flags)) + return; + + /* + * Do not reduce the target performance level if the CPU has not been + * idle recently, as the reduction is likely to be premature then. + */ + if (sugov_cpu_is_busy(sg_cpu) && sg_cpu->util < prev_util) + sg_cpu->util = prev_util; + + cpufreq_driver_adjust_perf(sg_cpu->cpu, map_util_perf(sg_cpu->bw_dl), + map_util_perf(sg_cpu->util), sg_cpu->max); + + sg_cpu->sg_policy->last_freq_update_time = time; +} + static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time) { struct sugov_policy *sg_policy = sg_cpu->sg_policy; @@ -815,6 +858,7 @@ static void sugov_exit(struct cpufreq_policy *policy) static int sugov_start(struct cpufreq_policy *policy) { struct sugov_policy *sg_policy = policy->governor_data; + void (*uu)(struct update_util_data *data, u64 time, unsigned int flags); unsigned int cpu; sg_policy->freq_update_delay_ns = sg_policy->tunables->rate_limit_us * NSEC_PER_USEC; @@ -834,13 +878,17 @@ static int sugov_start(struct cpufreq_policy *policy) sg_cpu->sg_policy = sg_policy; } + if (policy_is_shared(policy)) + uu = sugov_update_shared; + else if (policy->fast_switch_enabled && cpufreq_driver_has_adjust_perf()) + uu = sugov_update_single_perf; + else + uu = sugov_update_single_freq; + for_each_cpu(cpu, policy->cpus) { struct sugov_cpu *sg_cpu = &per_cpu(sugov_cpu, cpu); - cpufreq_add_update_util_hook(cpu, &sg_cpu->update_util, - policy_is_shared(policy) ? - sugov_update_shared : - sugov_update_single); + cpufreq_add_update_util_hook(cpu, &sg_cpu->update_util, uu); } return 0; } -- cgit v1.2.3 From f630c7c6f10546ebff15c3a856e7949feb7a2372 Mon Sep 17 00:00:00 2001 From: Rob Clark Date: Mon, 14 Dec 2020 19:03:14 -0800 Subject: kthread: add kthread_work tracepoints While migrating some code from wq to kthread_worker, I found that I missed the execute_start/end tracepoints. So add similar tracepoints for kthread_work. And for completeness, queue_work tracepoint (although this one differs slightly from the matching workqueue tracepoint). Link: https://lkml.kernel.org/r/20201010180323.126634-1-robdclark@gmail.com Signed-off-by: Rob Clark Cc: Rob Clark Cc: Steven Rostedt Cc: Ingo Molnar Cc: "Peter Zijlstra (Intel)" Cc: Phil Auld Cc: Valentin Schneider Cc: Thara Gopinath Cc: Randy Dunlap Cc: Vincent Donnefort Cc: Mel Gorman Cc: Jens Axboe Cc: Marcelo Tosatti Cc: Frederic Weisbecker Cc: Ilias Stamatis Cc: Liang Chen Cc: Ben Dooks Cc: Peter Zijlstra Cc: "J. Bruce Fields" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/trace/events/sched.h | 84 ++++++++++++++++++++++++++++++++++++++++++++ kernel/kthread.c | 9 +++++ 2 files changed, 93 insertions(+) (limited to 'kernel') diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index c96a4337afe6..5039af667645 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -5,6 +5,7 @@ #if !defined(_TRACE_SCHED_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_SCHED_H +#include #include #include #include @@ -51,6 +52,89 @@ TRACE_EVENT(sched_kthread_stop_ret, TP_printk("ret=%d", __entry->ret) ); +/** + * sched_kthread_work_queue_work - called when a work gets queued + * @worker: pointer to the kthread_worker + * @work: pointer to struct kthread_work + * + * This event occurs when a work is queued immediately or once a + * delayed work is actually queued (ie: once the delay has been + * reached). 
+ */ +TRACE_EVENT(sched_kthread_work_queue_work, + + TP_PROTO(struct kthread_worker *worker, + struct kthread_work *work), + + TP_ARGS(worker, work), + + TP_STRUCT__entry( + __field( void *, work ) + __field( void *, function) + __field( void *, worker) + ), + + TP_fast_assign( + __entry->work = work; + __entry->function = work->func; + __entry->worker = worker; + ), + + TP_printk("work struct=%p function=%ps worker=%p", + __entry->work, __entry->function, __entry->worker) +); + +/** + * sched_kthread_work_execute_start - called immediately before the work callback + * @work: pointer to struct kthread_work + * + * Allows to track kthread work execution. + */ +TRACE_EVENT(sched_kthread_work_execute_start, + + TP_PROTO(struct kthread_work *work), + + TP_ARGS(work), + + TP_STRUCT__entry( + __field( void *, work ) + __field( void *, function) + ), + + TP_fast_assign( + __entry->work = work; + __entry->function = work->func; + ), + + TP_printk("work struct %p: function %ps", __entry->work, __entry->function) +); + +/** + * sched_kthread_work_execute_end - called immediately after the work callback + * @work: pointer to struct work_struct + * @function: pointer to worker function + * + * Allows to track workqueue execution. + */ +TRACE_EVENT(sched_kthread_work_execute_end, + + TP_PROTO(struct kthread_work *work, kthread_work_func_t function), + + TP_ARGS(work, function), + + TP_STRUCT__entry( + __field( void *, work ) + __field( void *, function) + ), + + TP_fast_assign( + __entry->work = work; + __entry->function = function; + ), + + TP_printk("work struct %p: function %ps", __entry->work, __entry->function) +); + /* * Tracepoint for waking up a task: */ diff --git a/kernel/kthread.c b/kernel/kthread.c index 933a625621b8..34516b0a6eb7 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -704,8 +704,15 @@ repeat: raw_spin_unlock_irq(&worker->lock); if (work) { + kthread_work_func_t func = work->func; __set_current_state(TASK_RUNNING); + trace_sched_kthread_work_execute_start(work); work->func(work); + /* + * Avoid dereferencing work after this point. The trace + * event only cares about the address. + */ + trace_sched_kthread_work_execute_end(work, func); } else if (!freezing(current)) schedule(); @@ -834,6 +841,8 @@ static void kthread_insert_work(struct kthread_worker *worker, { kthread_insert_work_sanity_check(worker, work); + trace_sched_kthread_work_queue_work(worker, work); + list_add_tail(&work->node, pos); work->worker = worker; if (!worker->current_work && likely(worker->task)) -- cgit v1.2.3 From ebb2bdcef8a00d59b27d3532c423110559821e1d Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Mon, 14 Dec 2020 19:03:18 -0800 Subject: kthread_worker: document CPU hotplug handling The kthread worker API is simple. In short, it allows to create, use, and destroy workers. kthread_create_worker_on_cpu() just allows to bind a newly created worker to a given CPU. It is up to the API user how to handle CPU hotplug. They have to decide how to handle pending work items, prevent queuing new ones, and restore the functionality when the CPU goes off and on. There are few catches: + The CPU affinity gets lost when it is scheduled on an offline CPU. + The worker might not exist when the CPU was off when the user created the workers. A good practice is to implement two CPU hotplug callbacks and destroy/create the worker when CPU goes down/up. Mention this in the function description. 
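A minimal sketch of that practice, with hypothetical names, error handling trimmed, and assuming a driver that keeps one bound worker per CPU:

#include <linux/cpuhotplug.h>
#include <linux/err.h>
#include <linux/init.h>
#include <linux/kthread.h>
#include <linux/percpu.h>

static DEFINE_PER_CPU(struct kthread_worker *, demo_worker);

static int demo_cpu_online(unsigned int cpu)
{
	struct kthread_worker *w;

	/* (Re)create the bound worker whenever the CPU comes up. */
	w = kthread_create_worker_on_cpu(cpu, 0, "demo_helper/%u", cpu);
	if (IS_ERR(w))
		return PTR_ERR(w);

	per_cpu(demo_worker, cpu) = w;
	return 0;
}

static int demo_cpu_offline(unsigned int cpu)
{
	/* Flushes pending work items and stops the kthread. */
	kthread_destroy_worker(per_cpu(demo_worker, cpu));
	per_cpu(demo_worker, cpu) = NULL;
	return 0;
}

static int __init demo_init(void)
{
	int ret;

	/* The online callback also runs for CPUs that are already up. */
	ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "demo:online",
				demo_cpu_online, demo_cpu_offline);
	return ret < 0 ? ret : 0;
}

Queueing paths in such a driver then have to tolerate per_cpu(demo_worker, cpu) being NULL while the CPU is down, which is precisely the responsibility the new comment places on the API user.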
[akpm@linux-foundation.org: grammar tweaks] Link: https://lore.kernel.org/r/20201028073031.4536-1-qiang.zhang@windriver.com Link: https://lkml.kernel.org/r/20201102101039.19227-1-pmladek@suse.com Reported-by: Zhang Qiang Signed-off-by: Petr Mladek Cc: Tejun Heo Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kthread.c | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/kthread.c b/kernel/kthread.c index 34516b0a6eb7..97e053ade74a 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -793,7 +793,25 @@ EXPORT_SYMBOL(kthread_create_worker); * A good practice is to add the cpu number also into the worker name. * For example, use kthread_create_worker_on_cpu(cpu, "helper/%d", cpu). * - * Returns a pointer to the allocated worker on success, ERR_PTR(-ENOMEM) + * CPU hotplug: + * The kthread worker API is simple and generic. It just provides a way + * to create, use, and destroy workers. + * + * It is up to the API user how to handle CPU hotplug. They have to decide + * how to handle pending work items, prevent queuing new ones, and + * restore the functionality when the CPU goes off and on. There are a + * few catches: + * + * - CPU affinity gets lost when it is scheduled on an offline CPU. + * + * - The worker might not exist when the CPU was off when the user + * created the workers. + * + * Good practice is to implement two CPU hotplug callbacks and to + * destroy/create the worker when the CPU goes down/up. + * + * Return: + * The pointer to the allocated worker on success, ERR_PTR(-ENOMEM) * when the needed structures could not get allocated, and ERR_PTR(-EINTR) * when the worker was SIGKILLed. */ -- cgit v1.2.3 From 57efa1fe5957694fa541c9062de0a127f0b9acb0 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Mon, 14 Dec 2020 19:05:44 -0800 Subject: mm/gup: prevent gup_fast from racing with COW during fork Since commit 70e806e4e645 ("mm: Do early cow for pinned pages during fork() for ptes") pages under a FOLL_PIN will not be write protected during COW for fork. This means that pages returned from pin_user_pages(FOLL_WRITE) should not become write protected while the pin is active. However, there is a small race where get_user_pages_fast(FOLL_PIN) can establish a FOLL_PIN at the same time copy_present_page() is write protecting it: CPU 0 CPU 1 get_user_pages_fast() internal_get_user_pages_fast() copy_page_range() pte_alloc_map_lock() copy_present_page() atomic_read(has_pinned) == 0 page_maybe_dma_pinned() == false atomic_set(has_pinned, 1); gup_pgd_range() gup_pte_range() pte_t pte = gup_get_pte(ptep) pte_access_permitted(pte) try_grab_compound_head() pte = pte_wrprotect(pte) set_pte_at(); pte_unmap_unlock() // GUP now returns with a write protected page The first attempt to resolve this by using the write protect caused problems (and was missing a barrrier), see commit f3c64eda3e50 ("mm: avoid early COW write protect games during fork()") Instead wrap copy_p4d_range() with the write side of a seqcount and check the read side around gup_pgd_range(). If there is a collision then get_user_pages_fast() fails and falls back to slow GUP. Slow GUP is safe against this race because copy_page_range() is only called while holding the exclusive side of the mmap_lock on the src mm_struct. 
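Stripped of the mm specifics, the synchronization added here is a plain seqcount speculate-and-retry scheme; a rough sketch with hypothetical names (the writer is assumed to run under an outer exclusive lock, as copy_page_range() does under mmap_lock):

#include <linux/seqlock.h>

static seqcount_t demo_wp_seq = SEQCNT_ZERO(demo_wp_seq);

/* Writer side: serialized externally, so the raw write API is enough. */
static void demo_write_protect(void)
{
	raw_write_seqcount_begin(&demo_wp_seq);
	/* ... write protect the pages for the later COW ... */
	raw_write_seqcount_end(&demo_wp_seq);
}

/* Lockless reader: returns false to send the caller to the slow path. */
static bool demo_fast_path(void)
{
	unsigned int seq = raw_read_seqcount(&demo_wp_seq);

	if (seq & 1)	/* a writer is currently active */
		return false;

	/* ... speculatively take page references ... */

	if (read_seqcount_retry(&demo_wp_seq, seq)) {
		/* ... drop the references taken above ... */
		return false;
	}
	return true;
}

The odd/even check covers a write-protect pass already in progress; the retry covers one that started after the reader sampled the count, which is the window described in the race diagram above.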
[akpm@linux-foundation.org: coding style fixes] Link: https://lore.kernel.org/r/CAHk-=wi=iCnYCARbPGjkVJu9eyYeZ13N64tZYLdOB8CP5Q_PLw@mail.gmail.com Link: https://lkml.kernel.org/r/2-v4-908497cf359a+4782-gup_fork_jgg@nvidia.com Fixes: f3c64eda3e50 ("mm: avoid early COW write protect games during fork()") Signed-off-by: Jason Gunthorpe Suggested-by: Linus Torvalds Reviewed-by: John Hubbard Reviewed-by: Jan Kara Reviewed-by: Peter Xu Acked-by: "Ahmed S. Darwish" [seqcount_t parts] Cc: Andrea Arcangeli Cc: "Aneesh Kumar K.V" Cc: Christoph Hellwig Cc: Hugh Dickins Cc: Jann Horn Cc: Kirill Shutemov Cc: Kirill Tkhai Cc: Leon Romanovsky Cc: Michal Hocko Cc: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/tboot.c | 1 + drivers/firmware/efi/efi.c | 1 + include/linux/mm_types.h | 8 ++++++++ kernel/fork.c | 1 + mm/gup.c | 18 ++++++++++++++++++ mm/init-mm.c | 1 + mm/memory.c | 13 ++++++++++++- 7 files changed, 42 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c index ae64f98ec2ab..4c09ba110204 100644 --- a/arch/x86/kernel/tboot.c +++ b/arch/x86/kernel/tboot.c @@ -93,6 +93,7 @@ static struct mm_struct tboot_mm = { .pgd = swapper_pg_dir, .mm_users = ATOMIC_INIT(2), .mm_count = ATOMIC_INIT(1), + .write_protect_seq = SEQCNT_ZERO(tboot_mm.write_protect_seq), MMAP_LOCK_INITIALIZER(init_mm) .page_table_lock = __SPIN_LOCK_UNLOCKED(init_mm.page_table_lock), .mmlist = LIST_HEAD_INIT(init_mm.mmlist), diff --git a/drivers/firmware/efi/efi.c b/drivers/firmware/efi/efi.c index 6c6eec044a97..df3f9bcab581 100644 --- a/drivers/firmware/efi/efi.c +++ b/drivers/firmware/efi/efi.c @@ -57,6 +57,7 @@ struct mm_struct efi_mm = { .mm_rb = RB_ROOT, .mm_users = ATOMIC_INIT(2), .mm_count = ATOMIC_INIT(1), + .write_protect_seq = SEQCNT_ZERO(efi_mm.write_protect_seq), MMAP_LOCK_INITIALIZER(efi_mm) .page_table_lock = __SPIN_LOCK_UNLOCKED(efi_mm.page_table_lock), .mmlist = LIST_HEAD_INIT(efi_mm.mmlist), diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 5a9238f6caad..915f4f100383 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -14,6 +14,7 @@ #include #include #include +#include #include @@ -446,6 +447,13 @@ struct mm_struct { */ atomic_t has_pinned; + /** + * @write_protect_seq: Locked when any thread is write + * protecting pages mapped by this mm to enforce a later COW, + * for instance during page table copying for fork(). + */ + seqcount_t write_protect_seq; + #ifdef CONFIG_MMU atomic_long_t pgtables_bytes; /* PTE page table pages */ #endif diff --git a/kernel/fork.c b/kernel/fork.c index 6d266388d380..dc55f68a6ee3 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1007,6 +1007,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, mm->vmacache_seqnum = 0; atomic_set(&mm->mm_users, 1); atomic_set(&mm->mm_count, 1); + seqcount_init(&mm->write_protect_seq); mmap_init_lock(mm); INIT_LIST_HEAD(&mm->mmlist); mm->core_state = NULL; diff --git a/mm/gup.c b/mm/gup.c index c7e24301860a..9c6a2f5001c5 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -2684,11 +2684,18 @@ static unsigned long lockless_pages_from_mm(unsigned long start, { unsigned long flags; int nr_pinned = 0; + unsigned seq; if (!IS_ENABLED(CONFIG_HAVE_FAST_GUP) || !gup_fast_permitted(start, end)) return 0; + if (gup_flags & FOLL_PIN) { + seq = raw_read_seqcount(¤t->mm->write_protect_seq); + if (seq & 1) + return 0; + } + /* * Disable interrupts. 
The nested form is used, in order to allow full, * general purpose use of this routine. @@ -2703,6 +2710,17 @@ static unsigned long lockless_pages_from_mm(unsigned long start, local_irq_save(flags); gup_pgd_range(start, end, gup_flags, pages, &nr_pinned); local_irq_restore(flags); + + /* + * When pinning pages for DMA there could be a concurrent write protect + * from fork() via copy_page_range(), in this case always fail fast GUP. + */ + if (gup_flags & FOLL_PIN) { + if (read_seqcount_retry(¤t->mm->write_protect_seq, seq)) { + unpin_user_pages(pages, nr_pinned); + return 0; + } + } return nr_pinned; } diff --git a/mm/init-mm.c b/mm/init-mm.c index 3a613c85f9ed..153162669f80 100644 --- a/mm/init-mm.c +++ b/mm/init-mm.c @@ -31,6 +31,7 @@ struct mm_struct init_mm = { .pgd = swapper_pg_dir, .mm_users = ATOMIC_INIT(2), .mm_count = ATOMIC_INIT(1), + .write_protect_seq = SEQCNT_ZERO(init_mm.write_protect_seq), MMAP_LOCK_INITIALIZER(init_mm) .page_table_lock = __SPIN_LOCK_UNLOCKED(init_mm.page_table_lock), .arg_lock = __SPIN_LOCK_UNLOCKED(init_mm.arg_lock), diff --git a/mm/memory.c b/mm/memory.c index c48f8df6e502..50632c4366b8 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1171,6 +1171,15 @@ copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma) mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_PAGE, 0, src_vma, src_mm, addr, end); mmu_notifier_invalidate_range_start(&range); + /* + * Disabling preemption is not needed for the write side, as + * the read side doesn't spin, but goes to the mmap_lock. + * + * Use the raw variant of the seqcount_t write API to avoid + * lockdep complaining about preemptibility. + */ + mmap_assert_write_locked(src_mm); + raw_write_seqcount_begin(&src_mm->write_protect_seq); } ret = 0; @@ -1187,8 +1196,10 @@ copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma) } } while (dst_pgd++, src_pgd++, addr = next, addr != end); - if (is_cow) + if (is_cow) { + raw_write_seqcount_end(&src_mm->write_protect_seq); mmu_notifier_invalidate_range_end(&range); + } return ret; } -- cgit v1.2.3 From bef8620cd8e0a117c1a0719604052e424eb418f9 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Mon, 14 Dec 2020 19:06:49 -0800 Subject: mm: memcg: deprecate the non-hierarchical mode Patch series "mm: memcg: deprecate cgroup v1 non-hierarchical mode", v1. The non-hierarchical cgroup v1 mode is a legacy of early days of the memory controller and doesn't bring any value today. However, it complicates the code and creates many edge cases all over the memory controller code. It's a good time to deprecate it completely. This patchset removes the internal logic, adjusts the user interface and updates the documentation. The alt patch removes some bits of the cgroup core code, which become obsolete. Michal Hocko said: "All that we know today is that we have a warning in place to complain loudly when somebody relies on use_hierarchy=0 with a deeper hierarchy. For all those years we have seen _zero_ reports that would describe a sensible usecase. Moreover we (SUSE) have backported this warning into old distribution kernels (since 3.0 based kernels) to extend the coverage and didn't hear even for users who adopt new kernels only very slowly. The only report we have seen so far was a LTP test suite which doesn't really reflect any real life usecase" This patch (of 3): The non-hierarchical cgroup v1 mode is a legacy of early days of the memory controller and doesn't bring any value today. 
However, it complicates the code and creates many edge cases all over the memory controller code. It's a good time to deprecate it completely. Functionally this patch enabled is by default for all cgroups and forbids switching it off. Nothing changes if cgroup v2 is used: hierarchical mode was enforced from scratch. To protect the ABI memory.use_hierarchy interface is preserved with a limited functionality: reading always returns "1", writing of "1" passes silently, writing of any other value fails with -EINVAL and a warning to dmesg (on the first occasion). Link: https://lkml.kernel.org/r/20201110220800.929549-1-guro@fb.com Link: https://lkml.kernel.org/r/20201110220800.929549-2-guro@fb.com Signed-off-by: Roman Gushchin Acked-by: Michal Hocko Reviewed-by: Shakeel Butt Acked-by: David Rientjes Acked-by: Johannes Weiner Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 7 ---- kernel/cgroup/cgroup.c | 5 --- mm/memcontrol.c | 90 +++++++--------------------------------------- 3 files changed, 13 insertions(+), 89 deletions(-) (limited to 'kernel') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 1f0f64fa3ccd..dd992b81bcb7 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -234,11 +234,6 @@ struct mem_cgroup { /* vmpressure notifications */ struct vmpressure vmpressure; - /* - * Should the accounting and control be hierarchical, per subtree? - */ - bool use_hierarchy; - /* * Should the OOM killer kill all belonging tasks, had it kill one? */ @@ -588,8 +583,6 @@ static inline bool mem_cgroup_is_descendant(struct mem_cgroup *memcg, { if (root == memcg) return true; - if (!root->use_hierarchy) - return false; return cgroup_is_descendant(memcg->css.cgroup, root->css.cgroup); } diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index e41c21819ba0..80c5c34416e8 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -281,9 +281,6 @@ bool cgroup_ssid_enabled(int ssid) * - cpuset: a task can be moved into an empty cpuset, and again it takes * masks of ancestors. * - * - memcg: use_hierarchy is on by default and the cgroup file for the flag - * is not created. - * * - blkcg: blk-throttle becomes properly hierarchical. * * - debug: disallowed on the default hierarchy. @@ -5156,8 +5153,6 @@ static struct cgroup_subsys_state *css_create(struct cgroup *cgrp, cgroup_parent(parent)) { pr_warn("%s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. Nested cgroups may change behavior in the future.\n", current->comm, current->pid, ss->name); - if (!strcmp(ss->name, "memory")) - pr_warn("\"memory\" requires setting use_hierarchy to 1 on the root\n"); ss->warned_broken_hierarchy = true; } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 466d9aed6cc8..fc18a2b7d25a 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1141,12 +1141,6 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, if (prev && !reclaim) pos = prev; - if (!root->use_hierarchy && root != root_mem_cgroup) { - if (prev) - goto out; - return root; - } - rcu_read_lock(); if (reclaim) { @@ -1226,7 +1220,6 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, out_unlock: rcu_read_unlock(); -out: if (prev && prev != root) css_put(&prev->css); @@ -3461,10 +3454,7 @@ unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order, } /* - * Test whether @memcg has children, dead or alive. 
Note that this - * function doesn't care whether @memcg has use_hierarchy enabled and - * returns %true if there are child csses according to the cgroup - * hierarchy. Testing use_hierarchy is the caller's responsibility. + * Test whether @memcg has children, dead or alive. */ static inline bool memcg_has_children(struct mem_cgroup *memcg) { @@ -3524,37 +3514,20 @@ static ssize_t mem_cgroup_force_empty_write(struct kernfs_open_file *of, static u64 mem_cgroup_hierarchy_read(struct cgroup_subsys_state *css, struct cftype *cft) { - return mem_cgroup_from_css(css)->use_hierarchy; + return 1; } static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css, struct cftype *cft, u64 val) { - int retval = 0; - struct mem_cgroup *memcg = mem_cgroup_from_css(css); - struct mem_cgroup *parent_memcg = mem_cgroup_from_css(memcg->css.parent); - - if (memcg->use_hierarchy == val) + if (val == 1) return 0; - /* - * If parent's use_hierarchy is set, we can't make any modifications - * in the child subtrees. If it is unset, then the change can - * occur, provided the current cgroup has no children. - * - * For the root cgroup, parent_mem is NULL, we allow value to be - * set if there are no children. - */ - if ((!parent_memcg || !parent_memcg->use_hierarchy) && - (val == 1 || val == 0)) { - if (!memcg_has_children(memcg)) - memcg->use_hierarchy = val; - else - retval = -EBUSY; - } else - retval = -EINVAL; + pr_warn_once("Non-hierarchical mode is deprecated. " + "Please report your usecase to linux-mm@kvack.org if you " + "depend on this functionality.\n"); - return retval; + return -EINVAL; } static unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap) @@ -3742,8 +3715,6 @@ static void memcg_offline_kmem(struct mem_cgroup *memcg) child = mem_cgroup_from_css(css); BUG_ON(child->kmemcg_id != kmemcg_id); child->kmemcg_id = parent->kmemcg_id; - if (!memcg->use_hierarchy) - break; } rcu_read_unlock(); @@ -5334,38 +5305,22 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) if (parent) { memcg->swappiness = mem_cgroup_swappiness(parent); memcg->oom_kill_disable = parent->oom_kill_disable; - } - if (!parent) { - page_counter_init(&memcg->memory, NULL); - page_counter_init(&memcg->swap, NULL); - page_counter_init(&memcg->kmem, NULL); - page_counter_init(&memcg->tcpmem, NULL); - } else if (parent->use_hierarchy) { - memcg->use_hierarchy = true; + page_counter_init(&memcg->memory, &parent->memory); page_counter_init(&memcg->swap, &parent->swap); page_counter_init(&memcg->kmem, &parent->kmem); page_counter_init(&memcg->tcpmem, &parent->tcpmem); } else { - page_counter_init(&memcg->memory, &root_mem_cgroup->memory); - page_counter_init(&memcg->swap, &root_mem_cgroup->swap); - page_counter_init(&memcg->kmem, &root_mem_cgroup->kmem); - page_counter_init(&memcg->tcpmem, &root_mem_cgroup->tcpmem); - /* - * Deeper hierachy with use_hierarchy == false doesn't make - * much sense so let cgroup subsystem know about this - * unfortunate state in our controller. 
- */ - if (parent != root_mem_cgroup) - memory_cgrp_subsys.broken_hierarchy = true; - } + page_counter_init(&memcg->memory, NULL); + page_counter_init(&memcg->swap, NULL); + page_counter_init(&memcg->kmem, NULL); + page_counter_init(&memcg->tcpmem, NULL); - /* The following stuff does not apply to the root */ - if (!parent) { root_mem_cgroup = memcg; return &memcg->css; } + /* The following stuff does not apply to the root */ error = memcg_online_kmem(memcg); if (error) goto fail; @@ -6202,24 +6157,6 @@ static void mem_cgroup_move_task(void) } #endif -/* - * Cgroup retains root cgroups across [un]mount cycles making it necessary - * to verify whether we're attached to the default hierarchy on each mount - * attempt. - */ -static void mem_cgroup_bind(struct cgroup_subsys_state *root_css) -{ - /* - * use_hierarchy is forced on the default hierarchy. cgroup core - * guarantees that @root doesn't have any children, so turning it - * on for the root memcg is enough. - */ - if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) - root_mem_cgroup->use_hierarchy = true; - else - root_mem_cgroup->use_hierarchy = false; -} - static int seq_puts_memcg_tunable(struct seq_file *m, unsigned long value) { if (value == PAGE_COUNTER_MAX) @@ -6557,7 +6494,6 @@ struct cgroup_subsys memory_cgrp_subsys = { .can_attach = mem_cgroup_can_attach, .cancel_attach = mem_cgroup_cancel_attach, .post_attach = mem_cgroup_move_task, - .bind = mem_cgroup_bind, .dfl_cftypes = memory_files, .legacy_cftypes = mem_cgroup_legacy_files, .early_init = 0, -- cgit v1.2.3 From 9d9d341df4d519d96e7927941d91f5785c5cea07 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Mon, 14 Dec 2020 19:06:55 -0800 Subject: cgroup: remove obsoleted broken_hierarchy and warned_broken_hierarchy With the deprecation of the non-hierarchical mode of the memory controller there are no more examples of broken hierarchies left. Let's remove the cgroup core code which was supposed to print warnings about creating of broken hierarchies. Link: https://lkml.kernel.org/r/20201110220800.929549-4-guro@fb.com Signed-off-by: Roman Gushchin Reviewed-by: Shakeel Butt Acked-by: David Rientjes Acked-by: Johannes Weiner Cc: Michal Hocko Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/cgroup-defs.h | 15 --------------- kernel/cgroup/cgroup.c | 7 ------- 2 files changed, 22 deletions(-) (limited to 'kernel') diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index fee0b5547cd0..559ee05f86b2 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -668,21 +668,6 @@ struct cgroup_subsys { */ bool threaded:1; - /* - * If %false, this subsystem is properly hierarchical - - * configuration, resource accounting and restriction on a parent - * cgroup cover those of its children. If %true, hierarchy support - * is broken in some ways - some subsystems ignore hierarchy - * completely while others are only implemented half-way. - * - * It's now disallowed to create nested cgroups if the subsystem is - * broken and cgroup core will emit a warning message on such - * cases. Eventually, all subsystems will be made properly - * hierarchical and this will go away. 
- */ - bool broken_hierarchy:1; - bool warned_broken_hierarchy:1; - /* the following two fields are initialized automtically during boot */ int id; const char *name; diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 80c5c34416e8..16f4692dc961 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -5149,13 +5149,6 @@ static struct cgroup_subsys_state *css_create(struct cgroup *cgrp, if (err) goto err_list_del; - if (ss->broken_hierarchy && !ss->warned_broken_hierarchy && - cgroup_parent(parent)) { - pr_warn("%s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. Nested cgroups may change behavior in the future.\n", - current->comm, current->pid, ss->name); - ss->warned_broken_hierarchy = true; - } - return css; err_list_del: -- cgit v1.2.3 From da3ceeff923e3bc750a8423c840462760c463926 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Mon, 14 Dec 2020 19:07:04 -0800 Subject: mm: memcg/slab: rename *_lruvec_slab_state to *_lruvec_kmem_state The *_lruvec_slab_state is also suitable for pages allocated from buddy, not just for the slab objects. But the function name seems to tell us that only slab object is applicable. So we can rename the keyword of slab to kmem. Link: https://lkml.kernel.org/r/20201117085249.24319-1-songmuchun@bytedance.com Signed-off-by: Muchun Song Acked-by: Roman Gushchin Reviewed-by: Shakeel Butt Acked-by: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 18 +++++++++--------- kernel/fork.c | 2 +- mm/memcontrol.c | 2 +- mm/workingset.c | 8 ++++---- 4 files changed, 15 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index dd992b81bcb7..71c961452d9a 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -788,15 +788,15 @@ void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, int val); void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, int val); -void __mod_lruvec_slab_state(void *p, enum node_stat_item idx, int val); +void __mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val); -static inline void mod_lruvec_slab_state(void *p, enum node_stat_item idx, +static inline void mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val) { unsigned long flags; local_irq_save(flags); - __mod_lruvec_slab_state(p, idx, val); + __mod_lruvec_kmem_state(p, idx, val); local_irq_restore(flags); } @@ -1229,7 +1229,7 @@ static inline void mod_lruvec_page_state(struct page *page, mod_node_page_state(page_pgdat(page), idx, val); } -static inline void __mod_lruvec_slab_state(void *p, enum node_stat_item idx, +static inline void __mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val) { struct page *page = virt_to_head_page(p); @@ -1237,7 +1237,7 @@ static inline void __mod_lruvec_slab_state(void *p, enum node_stat_item idx, __mod_node_page_state(page_pgdat(page), idx, val); } -static inline void mod_lruvec_slab_state(void *p, enum node_stat_item idx, +static inline void mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val) { struct page *page = virt_to_head_page(p); @@ -1332,14 +1332,14 @@ static inline void __dec_lruvec_page_state(struct page *page, __mod_lruvec_page_state(page, idx, -1); } -static inline void __inc_lruvec_slab_state(void *p, enum node_stat_item idx) +static inline void __inc_lruvec_kmem_state(void *p, enum node_stat_item idx) { - __mod_lruvec_slab_state(p, idx, 1); + 
__mod_lruvec_kmem_state(p, idx, 1); } -static inline void __dec_lruvec_slab_state(void *p, enum node_stat_item idx) +static inline void __dec_lruvec_kmem_state(void *p, enum node_stat_item idx) { - __mod_lruvec_slab_state(p, idx, -1); + __mod_lruvec_kmem_state(p, idx, -1); } /* idx can be of type enum memcg_stat_item or node_stat_item */ diff --git a/kernel/fork.c b/kernel/fork.c index dc55f68a6ee3..5e7cc88eadb5 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -385,7 +385,7 @@ static void account_kernel_stack(struct task_struct *tsk, int account) mod_lruvec_page_state(vm->pages[0], NR_KERNEL_STACK_KB, account * (THREAD_SIZE / 1024)); else - mod_lruvec_slab_state(stack, NR_KERNEL_STACK_KB, + mod_lruvec_kmem_state(stack, NR_KERNEL_STACK_KB, account * (THREAD_SIZE / 1024)); } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index ce19e8484c89..b50f7f336b16 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -853,7 +853,7 @@ void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, __mod_memcg_lruvec_state(lruvec, idx, val); } -void __mod_lruvec_slab_state(void *p, enum node_stat_item idx, int val) +void __mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val) { pg_data_t *pgdat = page_pgdat(virt_to_page(p)); struct mem_cgroup *memcg; diff --git a/mm/workingset.c b/mm/workingset.c index 975a4d2dd02e..25f75bbe80e0 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -445,12 +445,12 @@ void workingset_update_node(struct xa_node *node) if (node->count && node->count == node->nr_values) { if (list_empty(&node->private_list)) { list_lru_add(&shadow_nodes, &node->private_list); - __inc_lruvec_slab_state(node, WORKINGSET_NODES); + __inc_lruvec_kmem_state(node, WORKINGSET_NODES); } } else { if (!list_empty(&node->private_list)) { list_lru_del(&shadow_nodes, &node->private_list); - __dec_lruvec_slab_state(node, WORKINGSET_NODES); + __dec_lruvec_kmem_state(node, WORKINGSET_NODES); } } } @@ -544,7 +544,7 @@ static enum lru_status shadow_lru_isolate(struct list_head *item, } list_lru_isolate(lru, item); - __dec_lruvec_slab_state(node, WORKINGSET_NODES); + __dec_lruvec_kmem_state(node, WORKINGSET_NODES); spin_unlock(lru_lock); @@ -559,7 +559,7 @@ static enum lru_status shadow_lru_isolate(struct list_head *item, goto out_invalid; mapping->nrexceptional -= node->nr_values; xa_delete_node(node, workingset_update_node); - __inc_lruvec_slab_state(node, WORKINGSET_NODERECLAIM); + __inc_lruvec_kmem_state(node, WORKINGSET_NODERECLAIM); out_invalid: xa_unlock_irq(&mapping->i_pages); -- cgit v1.2.3 From d3f5ffcacd1528736471bc78f03f06da6c4551cc Mon Sep 17 00:00:00 2001 From: John Hubbard Date: Mon, 14 Dec 2020 19:07:45 -0800 Subject: mm: cleanup: remove unused tsk arg from __access_remote_vm Despite a comment that said that page fault accounting would be charged to whatever task_struct* was passed into __access_remote_vm(), the tsk argument was actually unused. Making page fault accounting actually use this task struct is quite a project, so there is no point in keeping the tsk argument. Delete both the comment, and the argument. 
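After the change the mm is the only handle a caller needs; a hedged usage sketch (hypothetical helper, FOLL_FORCE chosen only for illustration, error handling trimmed):

#include <linux/mm.h>
#include <linux/sched/mm.h>

/* Copy @len bytes at @addr in @task's address space into @buf. */
static int demo_peek_task_mem(struct task_struct *task, unsigned long addr,
			      void *buf, int len)
{
	struct mm_struct *mm;
	int copied;

	mm = get_task_mm(task);		/* NULL for kernel threads */
	if (!mm)
		return -EINVAL;

	copied = access_remote_vm(mm, addr, buf, len, FOLL_FORCE);

	mmput(mm);
	return copied;
}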
[rppt@linux.ibm.com: changelog addition] Link: https://lkml.kernel.org/r/20201026074137.4147787-1-jhubbard@nvidia.com Signed-off-by: John Hubbard Reviewed-by: Mike Rapoport Cc: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 4 ++-- kernel/ptrace.c | 2 +- mm/memory.c | 11 +++++------ mm/nommu.c | 8 ++++---- 4 files changed, 12 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/include/linux/mm.h b/include/linux/mm.h index 5bbbf4aeee94..1d8e84bf718a 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1716,8 +1716,8 @@ extern int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, unsigned int gup_flags); extern int access_remote_vm(struct mm_struct *mm, unsigned long addr, void *buf, int len, unsigned int gup_flags); -extern int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm, - unsigned long addr, void *buf, int len, unsigned int gup_flags); +extern int __access_remote_vm(struct mm_struct *mm, unsigned long addr, + void *buf, int len, unsigned int gup_flags); long get_user_pages_remote(struct mm_struct *mm, unsigned long start, unsigned long nr_pages, diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 79de1294f8eb..a77d25c641e9 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -57,7 +57,7 @@ int ptrace_access_vm(struct task_struct *tsk, unsigned long addr, return 0; } - ret = __access_remote_vm(tsk, mm, addr, buf, len, gup_flags); + ret = __access_remote_vm(mm, addr, buf, len, gup_flags); mmput(mm); return ret; diff --git a/mm/memory.c b/mm/memory.c index 50632c4366b8..4a42a74a2240 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4885,11 +4885,10 @@ EXPORT_SYMBOL_GPL(generic_access_phys); #endif /* - * Access another process' address space as given in mm. If non-NULL, use the - * given task for page fault accounting. + * Access another process' address space as given in mm. 
*/ -int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm, - unsigned long addr, void *buf, int len, unsigned int gup_flags) +int __access_remote_vm(struct mm_struct *mm, unsigned long addr, void *buf, + int len, unsigned int gup_flags) { struct vm_area_struct *vma; void *old_buf = buf; @@ -4966,7 +4965,7 @@ int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm, int access_remote_vm(struct mm_struct *mm, unsigned long addr, void *buf, int len, unsigned int gup_flags) { - return __access_remote_vm(NULL, mm, addr, buf, len, gup_flags); + return __access_remote_vm(mm, addr, buf, len, gup_flags); } /* @@ -4984,7 +4983,7 @@ int access_process_vm(struct task_struct *tsk, unsigned long addr, if (!mm) return 0; - ret = __access_remote_vm(tsk, mm, addr, buf, len, gup_flags); + ret = __access_remote_vm(mm, addr, buf, len, gup_flags); mmput(mm); diff --git a/mm/nommu.c b/mm/nommu.c index 0faf39b32cdb..870fea12823e 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -1675,8 +1675,8 @@ void filemap_map_pages(struct vm_fault *vmf, } EXPORT_SYMBOL(filemap_map_pages); -int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm, - unsigned long addr, void *buf, int len, unsigned int gup_flags) +int __access_remote_vm(struct mm_struct *mm, unsigned long addr, void *buf, + int len, unsigned int gup_flags) { struct vm_area_struct *vma; int write = gup_flags & FOLL_WRITE; @@ -1722,7 +1722,7 @@ int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm, int access_remote_vm(struct mm_struct *mm, unsigned long addr, void *buf, int len, unsigned int gup_flags) { - return __access_remote_vm(NULL, mm, addr, buf, len, gup_flags); + return __access_remote_vm(mm, addr, buf, len, gup_flags); } /* @@ -1741,7 +1741,7 @@ int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, in if (!mm) return 0; - len = __access_remote_vm(tsk, mm, addr, buf, len, gup_flags); + len = __access_remote_vm(mm, addr, buf, len, gup_flags); mmput(mm); return len; -- cgit v1.2.3 From e89a85d63fb2e187f5afcbf83c12743132596563 Mon Sep 17 00:00:00 2001 From: Walter Wu Date: Mon, 14 Dec 2020 19:09:09 -0800 Subject: workqueue: kasan: record workqueue stack Patch series "kasan: add workqueue stack for generic KASAN", v5. Syzbot reports many UAF issues for workqueue, see [1]. In some of these access/allocation happened in process_one_work(), we see the free stack is useless in KASAN report, it doesn't help programmers to solve UAF for workqueue issue. This patchset improves KASAN reports by making them to have workqueue queueing stack. It is useful for programmers to solve use-after-free or double-free memory issue. Generic KASAN also records the last two workqueue stacks and prints them in KASAN report. It is only suitable for generic KASAN. [1] https://groups.google.com/g/syzkaller-bugs/search?q=%22use-after-free%22+process_one_work [2] https://bugzilla.kernel.org/show_bug.cgi?id=198437 This patch (of 4): When analyzing use-after-free or double-free issue, recording the enqueuing work stacks is helpful to preserve usage history which potentially gives a hint about the affected code. For workqueue it has turned out to be useful to record the enqueuing work call stacks. Because user can see KASAN report to determine whether it is root cause. They don't need to enable debugobjects, but they have a chance to find out the root cause. 
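The helper used below is not specific to the workqueue code; any layer that hands objects to asynchronous machinery can record an auxiliary stack the same way. A hedged sketch with hypothetical driver names (@req is assumed to come from kmalloc() so KASAN has metadata to attach the stack to; with generic KASAN disabled the call is a stub):

#include <linux/kasan.h>
#include <linux/workqueue.h>

struct demo_request {
	struct work_struct work;
	/* ... payload ... */
};

static void demo_work_fn(struct work_struct *work)
{
	/* ... process the request ... */
}

static void demo_submit(struct demo_request *req)
{
	INIT_WORK(&req->work, demo_work_fn);

	/*
	 * Remember the submitter's call stack; if @req is later involved
	 * in a use-after-free, the KASAN report can print this as an
	 * auxiliary stack in addition to the alloc/free stacks.
	 */
	kasan_record_aux_stack(req);

	schedule_work(&req->work);
}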
Link: https://lkml.kernel.org/r/20201203022148.29754-1-walter-zh.wu@mediatek.com Link: https://lkml.kernel.org/r/20201203022442.30006-1-walter-zh.wu@mediatek.com Signed-off-by: Walter Wu Suggested-by: Marco Elver Acked-by: Marco Elver Acked-by: Tejun Heo Reviewed-by: Dmitry Vyukov Reviewed-by: Andrey Konovalov Cc: Andrey Ryabinin Cc: Alexander Potapenko Cc: Lai Jiangshan Cc: Marco Elver Cc: Matthias Brugger Cc: Jonathan Corbet Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/workqueue.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 437935e7a199..33608a8c611e 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1327,6 +1327,9 @@ static void insert_work(struct pool_workqueue *pwq, struct work_struct *work, { struct worker_pool *pool = pwq->pool; + /* record the work call stack in order to print it in KASAN reports */ + kasan_record_aux_stack(work); + /* we own @work, set data and link */ set_work_pwq(work, pwq, extra_flags); list_add_tail(&work->entry, head); -- cgit v1.2.3 From 2abf962a8d42b32f5ffeb827826290b799c85f86 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Mon, 14 Dec 2020 19:10:25 -0800 Subject: PM: hibernate: make direct map manipulations more explicit When DEBUG_PAGEALLOC or ARCH_HAS_SET_DIRECT_MAP is enabled a page may be not present in the direct map and has to be explicitly mapped before it could be copied. Introduce hibernate_map_page() and hibernation_unmap_page() that will explicitly use set_direct_map_{default,invalid}_noflush() for ARCH_HAS_SET_DIRECT_MAP case and debug_pagealloc_{map,unmap}_pages() for DEBUG_PAGEALLOC case. The remapping of the pages in safe_copy_page() presumes that it only changes protection bits in an existing PTE and so it is safe to ignore return value of set_direct_map_{default,invalid}_noflush(). Still, add a pr_warn() so that future changes in set_memory APIs will not silently break hibernation. Link: https://lkml.kernel.org/r/20201109192128.960-3-rppt@kernel.org Signed-off-by: Mike Rapoport Acked-by: Rafael J. Wysocki Reviewed-by: David Hildenbrand Acked-by: Kirill A. Shutemov Acked-by: Vlastimil Babka Cc: Albert Ou Cc: Andy Lutomirski Cc: Benjamin Herrenschmidt Cc: Borislav Petkov Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Christoph Lameter Cc: Dave Hansen Cc: David Rientjes Cc: "David S. Miller" Cc: "Edgecombe, Rick P" Cc: Heiko Carstens Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Joonsoo Kim Cc: Len Brown Cc: Michael Ellerman Cc: Palmer Dabbelt Cc: Paul Mackerras Cc: Paul Walmsley Cc: Pavel Machek Cc: Pekka Enberg Cc: Peter Zijlstra Cc: "Rafael J. 
Wysocki" Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 12 ------------ kernel/power/snapshot.c | 38 ++++++++++++++++++++++++++++++++++++-- 2 files changed, 36 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/include/linux/mm.h b/include/linux/mm.h index 0f4c34672a13..6b5df31387b5 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2934,16 +2934,6 @@ static inline bool debug_pagealloc_enabled_static(void) #if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_ARCH_HAS_SET_DIRECT_MAP) extern void __kernel_map_pages(struct page *page, int numpages, int enable); -/* - * When called in DEBUG_PAGEALLOC context, the call should most likely be - * guarded by debug_pagealloc_enabled() or debug_pagealloc_enabled_static() - */ -static inline void -kernel_map_pages(struct page *page, int numpages, int enable) -{ - __kernel_map_pages(page, numpages, enable); -} - static inline void debug_pagealloc_map_pages(struct page *page, int numpages) { if (debug_pagealloc_enabled_static()) @@ -2960,8 +2950,6 @@ static inline void debug_pagealloc_unmap_pages(struct page *page, int numpages) extern bool kernel_page_present(struct page *page); #endif /* CONFIG_HIBERNATION */ #else /* CONFIG_DEBUG_PAGEALLOC || CONFIG_ARCH_HAS_SET_DIRECT_MAP */ -static inline void -kernel_map_pages(struct page *page, int numpages, int enable) {} static inline void debug_pagealloc_map_pages(struct page *page, int numpages) {} static inline void debug_pagealloc_unmap_pages(struct page *page, int numpages) {} #ifdef CONFIG_HIBERNATION diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 46b1804c1ddf..d848377dd8dc 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -76,6 +76,40 @@ static inline void hibernate_restore_protect_page(void *page_address) {} static inline void hibernate_restore_unprotect_page(void *page_address) {} #endif /* CONFIG_STRICT_KERNEL_RWX && CONFIG_ARCH_HAS_SET_MEMORY */ + +/* + * The calls to set_direct_map_*() should not fail because remapping a page + * here means that we only update protection bits in an existing PTE. + * It is still worth to have a warning here if something changes and this + * will no longer be the case. 
+ */ +static inline void hibernate_map_page(struct page *page) +{ + if (IS_ENABLED(CONFIG_ARCH_HAS_SET_DIRECT_MAP)) { + int ret = set_direct_map_default_noflush(page); + + if (ret) + pr_warn_once("Failed to remap page\n"); + } else { + debug_pagealloc_map_pages(page, 1); + } +} + +static inline void hibernate_unmap_page(struct page *page) +{ + if (IS_ENABLED(CONFIG_ARCH_HAS_SET_DIRECT_MAP)) { + unsigned long addr = (unsigned long)page_address(page); + int ret = set_direct_map_invalid_noflush(page); + + if (ret) + pr_warn_once("Failed to remap page\n"); + + flush_tlb_kernel_range(addr, addr + PAGE_SIZE); + } else { + debug_pagealloc_unmap_pages(page, 1); + } +} + static int swsusp_page_is_free(struct page *); static void swsusp_set_page_forbidden(struct page *); static void swsusp_unset_page_forbidden(struct page *); @@ -1355,9 +1389,9 @@ static void safe_copy_page(void *dst, struct page *s_page) if (kernel_page_present(s_page)) { do_copy_page(dst, page_address(s_page)); } else { - kernel_map_pages(s_page, 1, 1); + hibernate_map_page(s_page); do_copy_page(dst, page_address(s_page)); - kernel_map_pages(s_page, 1, 0); + hibernate_unmap_page(s_page); } } -- cgit v1.2.3 From 03b6c9a3e8805606c0bb4ad41855fac3bf85c3b9 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Mon, 14 Dec 2020 19:13:38 -0800 Subject: kernel/power: allow hibernation with page_poison sanity checking Page poisoning used to be incompatible with hibernation, as the state of poisoned pages was lost after resume, thus enabling CONFIG_HIBERNATION forces CONFIG_PAGE_POISONING_NO_SANITY. For the same reason, the poisoning with zeroes variant CONFIG_PAGE_POISONING_ZERO used to disable hibernation. The latter restriction was removed by commit 1ad1410f632d ("PM / Hibernate: allow hibernation with PAGE_POISONING_ZERO") and similarly for init_on_free by commit 18451f9f9e58 ("PM: hibernate: fix crashes with init_on_free=1") by making sure free pages are cleared after resume. We can use the same mechanism to instead poison free pages with PAGE_POISON after resume. This covers both zero and 0xAA patterns. Thus we can remove the Kconfig restriction that disables page poison sanity checking when hibernation is enabled. Link: https://lkml.kernel.org/r/20201113104033.22907-4-vbabka@suse.cz Signed-off-by: Vlastimil Babka Acked-by: Rafael J. 
Wysocki [hibernation] Reviewed-by: David Hildenbrand Cc: Mike Rapoport Cc: Alexander Potapenko Cc: Kees Cook Cc: Laura Abbott Cc: Mateusz Nosek Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 1 + kernel/power/hibernate.c | 2 +- kernel/power/power.h | 2 +- kernel/power/snapshot.c | 14 +++++++++++--- mm/Kconfig.debug | 1 - 5 files changed, 14 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/include/linux/mm.h b/include/linux/mm.h index 026707a58159..3e1fe8ca9720 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2903,6 +2903,7 @@ static inline void kernel_unpoison_pages(struct page *page, int numpages) #else static inline bool page_poisoning_enabled(void) { return false; } static inline bool page_poisoning_enabled_static(void) { return false; } +static inline void __kernel_poison_pages(struct page *page, int nunmpages) { } static inline void kernel_poison_pages(struct page *page, int numpages) { } static inline void kernel_unpoison_pages(struct page *page, int numpages) { } #endif diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 2fc7d509a34f..da0b41914177 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -326,7 +326,7 @@ static int create_image(int platform_mode) if (!in_suspend) { events_check_enabled = false; - clear_free_pages(); + clear_or_poison_free_pages(); } platform_leave(platform_mode); diff --git a/kernel/power/power.h b/kernel/power/power.h index 24f12d534515..778bf431ec02 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -106,7 +106,7 @@ extern int create_basic_memory_bitmaps(void); extern void free_basic_memory_bitmaps(void); extern int hibernate_preallocate_memory(void); -extern void clear_free_pages(void); +extern void clear_or_poison_free_pages(void); /** * Auxiliary structure used for reading the snapshot image data and diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index d848377dd8dc..d63560e1cf87 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -1178,7 +1178,15 @@ void free_basic_memory_bitmaps(void) pr_debug("Basic memory bitmaps freed\n"); } -void clear_free_pages(void) +static void clear_or_poison_free_page(struct page *page) +{ + if (page_poisoning_enabled_static()) + __kernel_poison_pages(page, 1); + else if (want_init_on_free()) + clear_highpage(page); +} + +void clear_or_poison_free_pages(void) { struct memory_bitmap *bm = free_pages_map; unsigned long pfn; @@ -1186,12 +1194,12 @@ void clear_free_pages(void) if (WARN_ON(!(free_pages_map))) return; - if (IS_ENABLED(CONFIG_PAGE_POISONING_ZERO) || want_init_on_free()) { + if (page_poisoning_enabled() || want_init_on_free()) { memory_bm_position_reset(bm); pfn = memory_bm_next_pfn(bm); while (pfn != BM_END_OF_MAP) { if (pfn_valid(pfn)) - clear_highpage(pfn_to_page(pfn)); + clear_or_poison_free_page(pfn_to_page(pfn)); pfn = memory_bm_next_pfn(bm); } diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug index 864f129f1937..c57786ad5be9 100644 --- a/mm/Kconfig.debug +++ b/mm/Kconfig.debug @@ -64,7 +64,6 @@ config PAGE_OWNER config PAGE_POISONING bool "Poison pages after freeing" - select PAGE_POISONING_NO_SANITY if HIBERNATION help Fill the pages with poison patterns after free_pages() and verify the patterns before alloc_pages. 
The filling of the memory helps -- cgit v1.2.3 From 35189b8ff18ee0c6f7c04f4c674584d1149d5c55 Mon Sep 17 00:00:00 2001 From: Hui Su Date: Tue, 15 Dec 2020 20:42:52 -0800 Subject: kernel/acct.c: use #elif instead of #end and #elif Cleanup: use #elif instead of #end and #elif. Link: https://lkml.kernel.org/r/20201015150736.GA91603@rlk Signed-off-by: Hui Su Reviewed-by: Andrew Morton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/acct.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/acct.c b/kernel/acct.c index f175df8f6aa4..a64102be2bb0 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -381,9 +381,7 @@ static comp2_t encode_comp2_t(u64 value) return (value & (MAXFRACT2>>1)) | (exp << (MANTSIZE2-1)); } } -#endif - -#if ACCT_VERSION == 3 +#elif ACCT_VERSION == 3 /* * encode an u64 into a 32 bit IEEE float */ @@ -500,8 +498,7 @@ static void do_acct_process(struct bsd_acct_struct *acct) /* backward-compatible 16 bit fields */ ac.ac_uid16 = ac.ac_uid; ac.ac_gid16 = ac.ac_gid; -#endif -#if ACCT_VERSION == 3 +#elif ACCT_VERSION == 3 { struct pid_namespace *ns = acct->ns; -- cgit v1.2.3 From ca4a9241cc5e718de86a34afd41972869546a5e3 Mon Sep 17 00:00:00 2001 From: Alexander Egorenkov Date: Tue, 15 Dec 2020 20:45:31 -0800 Subject: kdump: append uts_namespace.name offset to VMCOREINFO The offset of the field 'init_uts_ns.name' has changed since commit 9a56493f6942 ("uts: Use generic ns_common::count"). Make the offset of the field 'uts_namespace.name' available in VMCOREINFO because tools like 'crash-utility' and 'makedumpfile' must be able to read it from crash dumps. Link: https://lore.kernel.org/r/159644978167.604812.1773586504374412107.stgit@localhost.localdomain Link: https://lkml.kernel.org/r/20200930102328.396488-1-egorenar@linux.ibm.com Signed-off-by: Alexander Egorenkov Acked-by: lijiang Acked-by: Baoquan He Cc: Dave Young Cc: Vivek Goyal Cc: "Eric W . Biederman" Cc: Kirill Tkhai Cc: Kees Cook Cc: Christian Brauner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/admin-guide/kdump/vmcoreinfo.rst | 6 ++++++ kernel/crash_core.c | 1 + 2 files changed, 7 insertions(+) (limited to 'kernel') diff --git a/Documentation/admin-guide/kdump/vmcoreinfo.rst b/Documentation/admin-guide/kdump/vmcoreinfo.rst index e44a6c01f336..3861a25faae1 100644 --- a/Documentation/admin-guide/kdump/vmcoreinfo.rst +++ b/Documentation/admin-guide/kdump/vmcoreinfo.rst @@ -39,6 +39,12 @@ call. User-space tools can get the kernel name, host name, kernel release number, kernel version, architecture name and OS type from it. +(uts_namespace, name) +--------------------- + +Offset of the name's member. Crash Utility and Makedumpfile get +the start address of the init_uts_ns.name from this. 
+ node_online_map --------------- diff --git a/kernel/crash_core.c b/kernel/crash_core.c index 4fcfe0b70c4e..825284baaf46 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -447,6 +447,7 @@ static int __init crash_save_vmcoreinfo_init(void) VMCOREINFO_PAGESIZE(PAGE_SIZE); VMCOREINFO_SYMBOL(init_uts_ns); + VMCOREINFO_OFFSET(uts_namespace, name); VMCOREINFO_SYMBOL(node_online_map); #ifdef CONFIG_MMU VMCOREINFO_SYMBOL_ARRAY(swapper_pg_dir); -- cgit v1.2.3 From 99b75eb7c86b05f9594e8a7826174b8bf22e82b8 Mon Sep 17 00:00:00 2001 From: Nick Desaulniers Date: Tue, 15 Dec 2020 20:45:37 -0800 Subject: gcov: remove support for GCC < 4.9 Since commit 0bddd227f3dc ("Documentation: update for gcc 4.9 requirement") the minimum supported version of GCC is gcc-4.9. It's now safe to remove this code. Similar to commit 10415533a906 ("gcov: Remove old GCC 3.4 support") but that was for GCC 4.8 and this is for GCC 4.9. Link: https://github.com/ClangBuiltLinux/linux/issues/427 Link: https://lkml.kernel.org/r/20201111030557.2015680-1-ndesaulniers@google.com Signed-off-by: Nick Desaulniers Reviewed-by: Peter Oberparleiter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/gcov/gcc_4_7.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/gcov/gcc_4_7.c b/kernel/gcov/gcc_4_7.c index 53c67c87f141..0da0aacc1f26 100644 --- a/kernel/gcov/gcc_4_7.c +++ b/kernel/gcov/gcc_4_7.c @@ -25,10 +25,8 @@ #define GCOV_COUNTERS 9 #elif (__GNUC__ > 5) || (__GNUC__ == 5 && __GNUC_MINOR__ >= 1) #define GCOV_COUNTERS 10 -#elif __GNUC__ == 4 && __GNUC_MINOR__ >= 9 -#define GCOV_COUNTERS 9 #else -#define GCOV_COUNTERS 8 +#define GCOV_COUNTERS 9 #endif #define GCOV_TAG_FUNCTION_LENGTH 3 -- cgit v1.2.3 From 26ecea089f422b6f518f2906495a2d64ca7938d7 Mon Sep 17 00:00:00 2001 From: Alex Shi Date: Tue, 15 Dec 2020 20:45:40 -0800 Subject: gcov: fix kernel-doc markup issue Fix the following kernel-doc issue in gcov: kernel/gcov/gcc_4_7.c:238: warning: Function parameter or member 'dst' not described in 'gcov_info_add' kernel/gcov/gcc_4_7.c:238: warning: Function parameter or member 'src' not described in 'gcov_info_add' kernel/gcov/gcc_4_7.c:238: warning: Excess function parameter 'dest' description in 'gcov_info_add' kernel/gcov/gcc_4_7.c:238: warning: Excess function parameter 'source' description in 'gcov_info_add' Link: https://lkml.kernel.org/r/1605252352-63983-1-git-send-email-alex.shi@linux.alibaba.com Signed-off-by: Alex Shi Acked-by: Peter Oberparleiter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/gcov/gcc_4_7.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/gcov/gcc_4_7.c b/kernel/gcov/gcc_4_7.c index 0da0aacc1f26..c53408a00d0b 100644 --- a/kernel/gcov/gcc_4_7.c +++ b/kernel/gcov/gcc_4_7.c @@ -227,10 +227,10 @@ int gcov_info_is_compatible(struct gcov_info *info1, struct gcov_info *info2) /** * gcov_info_add - add up profiling data - * @dest: profiling data set to which data is added - * @source: profiling data set which is added + * @dst: profiling data set to which data is added + * @src: profiling data set which is added * - * Adds profiling counts of @source to @dest. + * Adds profiling counts of @src to @dst. 
*/ void gcov_info_add(struct gcov_info *dst, struct gcov_info *src) { -- cgit v1.2.3 From 3d03295a7e9194c2318977b44999972ce3609664 Mon Sep 17 00:00:00 2001 From: Jani Nikula Date: Tue, 15 Dec 2020 20:45:47 -0800 Subject: relay: remove unused buf_mapped and buf_unmapped callbacks Patch series "relay: cleanup and const callbacks", v2. None of the relay users require the use of mutable structs for callbacks, however the relay code does. Instead of assigning default callbacks when there is none, add callback wrappers to conditionally call the client callbacks if available, and fall back to default behaviour (typically no-op) otherwise. This lets all relay users make their struct rchan_callbacks const data. This series starts with a number of cleanups first based on Christoph's feedback. This patch (of 9): No relay client uses the buf_mapped or buf_unmapped callbacks. Remove them. This makes relay's vm_operations_struct close callback a dummy, remove it as well. Link: https://lkml.kernel.org/r/cover.1606153547.git.jani.nikula@intel.com Link: https://lkml.kernel.org/r/c69fff6e0cd485563604240bbfcc028434983bec.1606153547.git.jani.nikula@intel.com Signed-off-by: Jani Nikula Suggested-by: Christoph Hellwig Reviewed-by: Christoph Hellwig Cc: Jens Axboe Cc: Kalle Valo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/relay.h | 19 ------------------- kernel/relay.c | 34 ---------------------------------- 2 files changed, 53 deletions(-) (limited to 'kernel') diff --git a/include/linux/relay.h b/include/linux/relay.h index e13a333e7c37..b3c4f49f6951 100644 --- a/include/linux/relay.h +++ b/include/linux/relay.h @@ -101,25 +101,6 @@ struct rchan_callbacks void *prev_subbuf, size_t prev_padding); - /* - * buf_mapped - relay buffer mmap notification - * @buf: the channel buffer - * @filp: relay file pointer - * - * Called when a relay file is successfully mmapped - */ - void (*buf_mapped)(struct rchan_buf *buf, - struct file *filp); - - /* - * buf_unmapped - relay buffer unmap notification - * @buf: the channel buffer - * @filp: relay file pointer - * - * Called when a relay file is successfully unmapped - */ - void (*buf_unmapped)(struct rchan_buf *buf, - struct file *filp); /* * create_buf_file - create file to represent a relay channel buffer * @filename: the name of the file to create diff --git a/kernel/relay.c b/kernel/relay.c index b08d936d5fa7..b51343642bf4 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -27,15 +27,6 @@ static DEFINE_MUTEX(relay_channels_mutex); static LIST_HEAD(relay_channels); -/* - * close() vm_op implementation for relay file mapping. - */ -static void relay_file_mmap_close(struct vm_area_struct *vma) -{ - struct rchan_buf *buf = vma->vm_private_data; - buf->chan->cb->buf_unmapped(buf, vma->vm_file); -} - /* * fault() vm_op implementation for relay file mapping. 
*/ @@ -62,7 +53,6 @@ static vm_fault_t relay_buf_fault(struct vm_fault *vmf) */ static const struct vm_operations_struct relay_file_mmap_ops = { .fault = relay_buf_fault, - .close = relay_file_mmap_close, }; /* @@ -96,7 +86,6 @@ static void relay_free_page_array(struct page **array) static int relay_mmap_buf(struct rchan_buf *buf, struct vm_area_struct *vma) { unsigned long length = vma->vm_end - vma->vm_start; - struct file *filp = vma->vm_file; if (!buf) return -EBADF; @@ -107,7 +96,6 @@ static int relay_mmap_buf(struct rchan_buf *buf, struct vm_area_struct *vma) vma->vm_ops = &relay_file_mmap_ops; vma->vm_flags |= VM_DONTEXPAND; vma->vm_private_data = buf; - buf->chan->cb->buf_mapped(buf, filp); return 0; } @@ -283,22 +271,6 @@ static int subbuf_start_default_callback (struct rchan_buf *buf, return 1; } -/* - * buf_mapped() default callback. Does nothing. - */ -static void buf_mapped_default_callback(struct rchan_buf *buf, - struct file *filp) -{ -} - -/* - * buf_unmapped() default callback. Does nothing. - */ -static void buf_unmapped_default_callback(struct rchan_buf *buf, - struct file *filp) -{ -} - /* * create_buf_file_create() default callback. Does nothing. */ @@ -322,8 +294,6 @@ static int remove_buf_file_default_callback(struct dentry *dentry) /* relay channel default callbacks */ static struct rchan_callbacks default_channel_callbacks = { .subbuf_start = subbuf_start_default_callback, - .buf_mapped = buf_mapped_default_callback, - .buf_unmapped = buf_unmapped_default_callback, .create_buf_file = create_buf_file_default_callback, .remove_buf_file = remove_buf_file_default_callback, }; @@ -509,10 +479,6 @@ static void setup_callbacks(struct rchan *chan, if (!cb->subbuf_start) cb->subbuf_start = subbuf_start_default_callback; - if (!cb->buf_mapped) - cb->buf_mapped = buf_mapped_default_callback; - if (!cb->buf_unmapped) - cb->buf_unmapped = buf_unmapped_default_callback; if (!cb->create_buf_file) cb->create_buf_file = create_buf_file_default_callback; if (!cb->remove_buf_file) -- cgit v1.2.3 From 6f8f25440d791855e8b6a26cd2bff9d738468416 Mon Sep 17 00:00:00 2001 From: Jani Nikula Date: Tue, 15 Dec 2020 20:45:50 -0800 Subject: relay: require non-NULL callbacks in relay_open() There are no clients passing NULL callbacks, which makes sense as it wouldn't even create a file. Require non-NULL callbacks, and throw away the handling for NULL callbacks. 
Link: https://lkml.kernel.org/r/e40642f3b027d2bb6bc851ddb60e0a61ea51f5f8.1606153547.git.jani.nikula@intel.com Signed-off-by: Jani Nikula Suggested-by: Christoph Hellwig Reviewed-by: Christoph Hellwig Cc: Jens Axboe Cc: Kalle Valo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/relay.c | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/relay.c b/kernel/relay.c index b51343642bf4..d9b8185161a8 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -291,13 +291,6 @@ static int remove_buf_file_default_callback(struct dentry *dentry) return -EINVAL; } -/* relay channel default callbacks */ -static struct rchan_callbacks default_channel_callbacks = { - .subbuf_start = subbuf_start_default_callback, - .create_buf_file = create_buf_file_default_callback, - .remove_buf_file = remove_buf_file_default_callback, -}; - /** * wakeup_readers - wake up readers waiting on a channel * @work: contains the channel buffer @@ -472,11 +465,6 @@ static void relay_close_buf(struct rchan_buf *buf) static void setup_callbacks(struct rchan *chan, struct rchan_callbacks *cb) { - if (!cb) { - chan->cb = &default_channel_callbacks; - return; - } - if (!cb->subbuf_start) cb->subbuf_start = subbuf_start_default_callback; if (!cb->create_buf_file) @@ -542,6 +530,8 @@ struct rchan *relay_open(const char *base_filename, return NULL; if (subbuf_size > UINT_MAX / n_subbufs) return NULL; + if (!cb) + return NULL; chan = kzalloc(sizeof(struct rchan), GFP_KERNEL); if (!chan) -- cgit v1.2.3 From 371e03880d9d34534d3eafd2a7581042be598e39 Mon Sep 17 00:00:00 2001 From: Jani Nikula Date: Tue, 15 Dec 2020 20:45:53 -0800 Subject: relay: make create_buf_file and remove_buf_file callbacks mandatory All clients provide create_buf_file and remove_buf_file callbacks, and they're required for relay to make sense. There is no point in them being optional. Also document whether each callback is mandatory/optional. Link: https://lkml.kernel.org/r/88003c1527386b93036e286e7917f1e33aec84ac.1606153547.git.jani.nikula@intel.com Signed-off-by: Jani Nikula Suggested-by: Christoph Hellwig Reviewed-by: Christoph Hellwig Cc: Jens Axboe Cc: Kalle Valo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/relay.h | 6 ++++++ kernel/relay.c | 26 +------------------------- 2 files changed, 7 insertions(+), 25 deletions(-) (limited to 'kernel') diff --git a/include/linux/relay.h b/include/linux/relay.h index b3c4f49f6951..99d024475ba5 100644 --- a/include/linux/relay.h +++ b/include/linux/relay.h @@ -89,6 +89,8 @@ struct rchan_callbacks * The client should return 1 to continue logging, 0 to stop * logging. * + * This callback is optional. + * * NOTE: subbuf_start will also be invoked when the buffer is * created, so that the first sub-buffer can be initialized * if necessary. In this case, prev_subbuf will be NULL. @@ -122,6 +124,8 @@ struct rchan_callbacks * cause relay_open() to create a single global buffer rather * than the default set of per-cpu buffers. * + * This callback is mandatory. + * * See Documentation/filesystems/relay.rst for more info. */ struct dentry *(*create_buf_file)(const char *filename, @@ -139,6 +143,8 @@ struct rchan_callbacks * channel buffer. * * The callback should return 0 if successful, negative if not. + * + * This callback is mandatory. 
*/ int (*remove_buf_file)(struct dentry *dentry); }; diff --git a/kernel/relay.c b/kernel/relay.c index d9b8185161a8..dd4ec4ec07f3 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -271,26 +271,6 @@ static int subbuf_start_default_callback (struct rchan_buf *buf, return 1; } -/* - * create_buf_file_create() default callback. Does nothing. - */ -static struct dentry *create_buf_file_default_callback(const char *filename, - struct dentry *parent, - umode_t mode, - struct rchan_buf *buf, - int *is_global) -{ - return NULL; -} - -/* - * remove_buf_file() default callback. Does nothing. - */ -static int remove_buf_file_default_callback(struct dentry *dentry) -{ - return -EINVAL; -} - /** * wakeup_readers - wake up readers waiting on a channel * @work: contains the channel buffer @@ -467,10 +447,6 @@ static void setup_callbacks(struct rchan *chan, { if (!cb->subbuf_start) cb->subbuf_start = subbuf_start_default_callback; - if (!cb->create_buf_file) - cb->create_buf_file = create_buf_file_default_callback; - if (!cb->remove_buf_file) - cb->remove_buf_file = remove_buf_file_default_callback; chan->cb = cb; } @@ -530,7 +506,7 @@ struct rchan *relay_open(const char *base_filename, return NULL; if (subbuf_size > UINT_MAX / n_subbufs) return NULL; - if (!cb) + if (!cb || !cb->create_buf_file || !cb->remove_buf_file) return NULL; chan = kzalloc(sizeof(struct rchan), GFP_KERNEL); -- cgit v1.2.3 From 023542f48b57d6b785fcadb86ac336ae80653e58 Mon Sep 17 00:00:00 2001 From: Jani Nikula Date: Tue, 15 Dec 2020 20:45:57 -0800 Subject: relay: allow the use of const callback structs None of the relay users require the use of mutable structs for callbacks, however the relay code does. Instead of assigning the default callback for subbuf_start, add a wrapper to conditionally call the client callback if available, and fall back to default behaviour otherwise. This lets all relay users make their struct rchan_callbacks const data. 
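To make the effect of this series concrete, a minimal hypothetical relay client could now look like the sketch below (the demo_* names are invented; the debugfs pattern mirrors what blktrace uses). The callbacks struct is const, only the mandatory create_buf_file/remove_buf_file hooks are provided, and the optional subbuf_start is simply omitted:

#include <linux/debugfs.h>
#include <linux/relay.h>

static struct dentry *demo_create_buf_file(const char *filename,
					   struct dentry *parent, umode_t mode,
					   struct rchan_buf *buf, int *is_global)
{
	/* expose each per-cpu relay buffer as a debugfs file */
	return debugfs_create_file(filename, mode, parent, buf,
				   &relay_file_operations);
}

static int demo_remove_buf_file(struct dentry *dentry)
{
	debugfs_remove(dentry);
	return 0;
}

/* const is now possible: relay no longer patches defaults into the struct */
static const struct rchan_callbacks demo_relay_callbacks = {
	.create_buf_file = demo_create_buf_file,
	.remove_buf_file = demo_remove_buf_file,
	/* .subbuf_start omitted: relay falls back to the default behaviour */
};

static struct rchan *demo_open(struct dentry *dir)
{
	/* eight 64 KiB sub-buffers; relay_open() returns NULL if cb were NULL */
	return relay_open("demo", dir, 65536, 8, &demo_relay_callbacks, NULL);
}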
[jani.nikula@intel.com: cleanups, per Christoph] Link: https://lkml.kernel.org/r/20201124115412.32402-1-jani.nikula@intel.com Link: https://lkml.kernel.org/r/cc3ff292e4eb4fdc56bee3d690c7b8e39209cd37.1606153547.git.jani.nikula@intel.com Signed-off-by: Jani Nikula Reviewed-by: Christoph Hellwig Cc: Jens Axboe Cc: Kalle Valo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/relay.h | 4 ++-- kernel/relay.c | 37 ++++++++++--------------------------- 2 files changed, 12 insertions(+), 29 deletions(-) (limited to 'kernel') diff --git a/include/linux/relay.h b/include/linux/relay.h index 99d024475ba5..72b876dd5cb8 100644 --- a/include/linux/relay.h +++ b/include/linux/relay.h @@ -62,7 +62,7 @@ struct rchan size_t subbuf_size; /* sub-buffer size */ size_t n_subbufs; /* number of sub-buffers per buffer */ size_t alloc_size; /* total buffer size allocated */ - struct rchan_callbacks *cb; /* client callbacks */ + const struct rchan_callbacks *cb; /* client callbacks */ struct kref kref; /* channel refcount */ void *private_data; /* for user-defined data */ size_t last_toobig; /* tried to log event > subbuf size */ @@ -157,7 +157,7 @@ struct rchan *relay_open(const char *base_filename, struct dentry *parent, size_t subbuf_size, size_t n_subbufs, - struct rchan_callbacks *cb, + const struct rchan_callbacks *cb, void *private_data); extern int relay_late_setup_files(struct rchan *chan, const char *base_filename, diff --git a/kernel/relay.c b/kernel/relay.c index dd4ec4ec07f3..d1a67fbb819d 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -252,23 +252,14 @@ EXPORT_SYMBOL_GPL(relay_buf_full); * High-level relay kernel API and associated functions. */ -/* - * rchan_callback implementations defining default channel behavior. Used - * in place of corresponding NULL values in client callback struct. - */ - -/* - * subbuf_start() default callback. Does nothing. 
- */ -static int subbuf_start_default_callback (struct rchan_buf *buf, - void *subbuf, - void *prev_subbuf, - size_t prev_padding) +static int relay_subbuf_start(struct rchan_buf *buf, void *subbuf, + void *prev_subbuf, size_t prev_padding) { - if (relay_buf_full(buf)) - return 0; + if (!buf->chan->cb->subbuf_start) + return !relay_buf_full(buf); - return 1; + return buf->chan->cb->subbuf_start(buf, subbuf, + prev_subbuf, prev_padding); } /** @@ -314,7 +305,7 @@ static void __relay_reset(struct rchan_buf *buf, unsigned int init) for (i = 0; i < buf->chan->n_subbufs; i++) buf->padding[i] = 0; - buf->chan->cb->subbuf_start(buf, buf->data, NULL, 0); + relay_subbuf_start(buf, buf->data, NULL, 0); } /** @@ -442,14 +433,6 @@ static void relay_close_buf(struct rchan_buf *buf) kref_put(&buf->kref, relay_remove_buf); } -static void setup_callbacks(struct rchan *chan, - struct rchan_callbacks *cb) -{ - if (!cb->subbuf_start) - cb->subbuf_start = subbuf_start_default_callback; - chan->cb = cb; -} - int relay_prepare_cpu(unsigned int cpu) { struct rchan *chan; @@ -495,7 +478,7 @@ struct rchan *relay_open(const char *base_filename, struct dentry *parent, size_t subbuf_size, size_t n_subbufs, - struct rchan_callbacks *cb, + const struct rchan_callbacks *cb, void *private_data) { unsigned int i; @@ -529,7 +512,7 @@ struct rchan *relay_open(const char *base_filename, chan->has_base_filename = 1; strlcpy(chan->base_filename, base_filename, NAME_MAX); } - setup_callbacks(chan, cb); + chan->cb = cb; kref_init(&chan->kref); mutex_lock(&relay_channels_mutex); @@ -712,7 +695,7 @@ size_t relay_switch_subbuf(struct rchan_buf *buf, size_t length) new_subbuf = buf->subbufs_produced % buf->chan->n_subbufs; new = buf->start + new_subbuf * buf->chan->subbuf_size; buf->offset = 0; - if (!buf->chan->cb->subbuf_start(buf, new, old, buf->prev_padding)) { + if (!relay_subbuf_start(buf, new, old, buf->prev_padding)) { buf->offset = buf->chan->subbuf_size + 1; return 0; } -- cgit v1.2.3 From abf4e00c7bc69f7b878039ebe57d885e3bdc4fdb Mon Sep 17 00:00:00 2001 From: Jani Nikula Date: Tue, 15 Dec 2020 20:46:13 -0800 Subject: blktrace: make relay callbacks const Now that relay_open() accepts const callbacks, make relay callbacks const. Link: https://lkml.kernel.org/r/7ff5ce0b735901eb4f10e13da2704f1d8c4a2507.1606153547.git.jani.nikula@intel.com Signed-off-by: Jani Nikula Reviewed-by: Christoph Hellwig Cc: Jens Axboe Cc: Kalle Valo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/trace/blktrace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index f1022945e346..b5c4b9ade960 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -449,7 +449,7 @@ static struct dentry *blk_create_buf_file_callback(const char *filename, &relay_file_operations); } -static struct rchan_callbacks blk_relay_callbacks = { +static const struct rchan_callbacks blk_relay_callbacks = { .subbuf_start = blk_subbuf_start_callback, .create_buf_file = blk_create_buf_file_callback, .remove_buf_file = blk_remove_buf_file_callback, -- cgit v1.2.3 From 3be8da570868a7989f1a0c11820ee1413877fa8c Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 15 Dec 2020 20:46:16 -0800 Subject: kernel/resource.c: fix kernel-doc markups Kernel-doc markups should use this format: identifier - description While here, fix a kernel-doc tag that was using, instead, a normal comment block. 
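For reference, the canonical kernel-doc shape that these comments are being moved toward looks like this (a generic sketch, not text from the patch):

/**
 * function_name - short one-line description
 * @first:  what the first parameter means
 * @second: what the second parameter means
 *
 * Longer free-form description of behaviour and constraints.
 *
 * Return: what the function returns.
 */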
[akpm@linux-foundation.org: coding style fixes] Link: https://lkml.kernel.org/r/c5e38e1070f8dbe2f9607a10b44afe2875bd966c.1605521731.git.mchehab+huawei@kernel.org Signed-off-by: Mauro Carvalho Chehab Cc: "Jonathan Corbet" Cc: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/resource.c | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/resource.c b/kernel/resource.c index 82df80417489..833394f9c608 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -320,9 +320,8 @@ int release_resource(struct resource *old) EXPORT_SYMBOL(release_resource); /** - * Finds the lowest iomem resource that covers part of [@start..@end]. The - * caller must specify @start, @end, @flags, and @desc (which may be - * IORES_DESC_NONE). + * find_next_iomem_res - Finds the lowest iomem resource that covers part of + * [@start..@end]. * * If a resource is found, returns 0 and @*res is overwritten with the part * of the resource that's within [@start..@end]; if none is found, returns @@ -337,6 +336,9 @@ EXPORT_SYMBOL(release_resource); * @desc: descriptor the resource must have * @first_lvl: walk only the first level children, if set * @res: return ptr, if resource found + * + * The caller must specify @start, @end, @flags, and @desc + * (which may be IORES_DESC_NONE). */ static int find_next_iomem_res(resource_size_t start, resource_size_t end, unsigned long flags, unsigned long desc, @@ -416,11 +418,9 @@ static int __walk_iomem_res_desc(resource_size_t start, resource_size_t end, } /** - * Walks through iomem resources and calls func() with matching resource - * ranges. This walks through whole tree and not just first level children. - * All the memory ranges which overlap start,end and also match flags and - * desc are valid candidates. - * + * walk_iomem_res_desc - Walks through iomem resources and calls func() + * with matching resource ranges. + * * * @desc: I/O resource descriptor. Use IORES_DESC_NONE to skip @desc check. * @flags: I/O resource flags * @start: start addr @@ -428,6 +428,10 @@ static int __walk_iomem_res_desc(resource_size_t start, resource_size_t end, * @arg: function argument for the callback @func * @func: callback function that is called for each qualifying resource area * + * This walks through whole tree and not just first level children. + * All the memory ranges which overlap start,end and also match flags and + * desc are valid candidates. + * * NOTE: For a new descriptor search, define a new IORES_DESC in * and set it in 'desc' of a target resource entry. */ @@ -1372,9 +1376,9 @@ static bool system_ram_resources_mergeable(struct resource *r1, !r1->child && !r2->child; } -/* +/** * merge_system_ram_resource - mark the System RAM resource mergeable and try to - * merge it with adjacent, mergeable resources + * merge it with adjacent, mergeable resources * @res: resource descriptor * * This interface is intended for memory hotplug, whereby lots of contiguous -- cgit v1.2.3 From c1cb05e77f8e3ec89eec7bed64af07cd20ed24de Mon Sep 17 00:00:00 2001 From: Dmitry Vyukov Date: Tue, 15 Dec 2020 20:46:46 -0800 Subject: kcov: don't instrument with UBSAN Both KCOV and UBSAN use compiler instrumentation. If UBSAN detects a bug in KCOV, it may cause infinite recursion via printk and other common functions. We already don't instrument KCOV with KASAN/KCSAN for this reason, don't instrument it with UBSAN as well. 
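The per-object opt-out used here is the generic kbuild pattern for keeping instrumentation out of a file that the instrumentation itself may call back into; a sketch for a hypothetical foo.o looks like this:

# Makefile fragment (foo.o is a placeholder for the object being excluded)
KCOV_INSTRUMENT_foo.o := n
KASAN_SANITIZE_foo.o := n
KCSAN_SANITIZE_foo.o := n
UBSAN_SANITIZE_foo.o := n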
As a side effect this also resolves the following gcc warning: conflicting types for built-in function '__sanitizer_cov_trace_switch'; expected 'void(long unsigned int, void *)' [-Wbuiltin-declaration-mismatch] It's only reported when kcov.c is compiled with any of the sanitizers enabled. Size of the arguments is correct, it's just that gcc uses 'long' on 64-bit arches and 'long long' on 32-bit arches, while kernel type is always 'long long'. Link: https://lkml.kernel.org/r/20201209100152.2492072-1-dvyukov@google.com Signed-off-by: Dmitry Vyukov Reported-by: Stephen Rothwell Suggested-by: Marco Elver Acked-by: Marco Elver Reviewed-by: Andrey Konovalov Reviewed-by: Kees Cook Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/Makefile | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index dddf51266719..aa7368c7eabf 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -34,8 +34,11 @@ KCOV_INSTRUMENT_extable.o := n KCOV_INSTRUMENT_stacktrace.o := n # Don't self-instrument. KCOV_INSTRUMENT_kcov.o := n +# If sanitizers detect any issues in kcov, it may lead to recursion +# via printk, etc. KASAN_SANITIZE_kcov.o := n KCSAN_SANITIZE_kcov.o := n +UBSAN_SANITIZE_kcov.o := n CFLAGS_kcov.o := $(call cc-option, -fno-conserve-stack) -fno-stack-protector obj-y += sched/ -- cgit v1.2.3 From f9a90501faac55ddbea93c1f73497857f1997227 Mon Sep 17 00:00:00 2001 From: Matteo Croce Date: Tue, 15 Dec 2020 20:46:53 -0800 Subject: reboot: refactor and comment the cpu selection code Small improvements to the code, without changing the way it works: - use a local variable, to avoid a small time lapse where reboot_cpu can have an invalid value - comment the code which is not easy to understand at a glance - merge two identical code blocks into one - replace pointer arithmetics with equivalent array syntax Link: https://lkml.kernel.org/r/20201103214025.116799-4-mcroce@linux.microsoft.com Signed-off-by: Matteo Croce Cc: Arnd Bergmann Cc: Fabian Frederick Cc: Greg Kroah-Hartman Cc: Guenter Roeck Cc: Kees Cook Cc: Mike Rapoport Cc: Pavel Tatashin Cc: Petr Mladek Cc: Robin Holt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/reboot.c | 30 +++++++++++++++++------------- 1 file changed, 17 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/reboot.c b/kernel/reboot.c index 2a18b76ffc06..aa3bfd6c673b 100644 --- a/kernel/reboot.c +++ b/kernel/reboot.c @@ -553,20 +553,24 @@ static int __init reboot_setup(char *str) break; case 's': - if (isdigit(*(str+1))) - reboot_cpu = simple_strtoul(str+1, NULL, 0); - else if (str[1] == 'm' && str[2] == 'p' && - isdigit(*(str+3))) - reboot_cpu = simple_strtoul(str+3, NULL, 0); - else + /* + * reboot_cpu is s[mp]#### with #### being the processor + * to be used for rebooting. Skip 's' or 'smp' prefix. + */ + str += str[1] == 'm' && str[2] == 'p' ? 3 : 1; + + if (isdigit(str[0])) { + int cpu = simple_strtoul(str, NULL, 0); + + if (cpu >= num_possible_cpus()) { + pr_err("Ignoring the CPU number in reboot= option. " + "CPU %d exceeds possible cpu number %d\n", + cpu, num_possible_cpus()); + break; + } + reboot_cpu = cpu; + } else *mode = REBOOT_SOFT; - if (reboot_cpu >= num_possible_cpus()) { - pr_err("Ignoring the CPU number in reboot= option. 
" - "CPU %d exceeds possible cpu number %d\n", - reboot_cpu, num_possible_cpus()); - reboot_cpu = 0; - break; - } break; case 'g': -- cgit v1.2.3 From 2c622ed0eaa38b68d7440bedb8c6cdd138b5a860 Mon Sep 17 00:00:00 2001 From: Matteo Croce Date: Tue, 15 Dec 2020 20:46:57 -0800 Subject: reboot: allow to specify reboot mode via sysfs The kernel cmdline reboot= option offers some sort of control on how the reboot is issued. We don't always know in advance what type of reboot to perform. Sometimes a warm reboot is preferred to persist certain memory regions across the reboot. Others a cold one is needed to apply a future system update that makes a memory memory model change, like changing the base page size or resizing a persistent memory region. Or simply we want to enable reboot_force because we noticed that something bad happened. Add handles in sysfs to allow setting these reboot options, so they can be changed when the system is booted, other than at boot time. The handlers are under /kernel/reboot, can be read to get the current configuration and written to alter it. # cd /sys/kernel/reboot/ # grep . * cpu:0 force:0 mode:cold type:acpi # echo 2 >cpu # echo yes >force # echo soft >mode # echo bios >type # grep . * cpu:2 force:1 mode:soft type:bios Before setting anything, check for CAP_SYS_BOOT capability, so it's possible to allow an unpriviledged process to change these settings simply by relaxing the handles permissions, without opening them to the world. [natechancellor@gmail.com: fix variable assignments in type_store] Link: https://lkml.kernel.org/r/20201112035023.974748-1-natechancellor@gmail.com Link: https://github.com/ClangBuiltLinux/linux/issues/1197 Link: https://lkml.kernel.org/r/20201110202746.9690-1-mcroce@linux.microsoft.com Signed-off-by: Matteo Croce Signed-off-by: Nathan Chancellor Reviewed-by: Petr Mladek Cc: Mike Rapoport Cc: Guenter Roeck Cc: Arnd Bergmann Cc: Pavel Tatashin Cc: Kees Cook Cc: Tyler Hicks Cc: Nathan Chancellor Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/ABI/testing/sysfs-kernel-reboot | 32 ++++ kernel/reboot.c | 206 ++++++++++++++++++++++++++ 2 files changed, 238 insertions(+) create mode 100644 Documentation/ABI/testing/sysfs-kernel-reboot (limited to 'kernel') diff --git a/Documentation/ABI/testing/sysfs-kernel-reboot b/Documentation/ABI/testing/sysfs-kernel-reboot new file mode 100644 index 000000000000..837330fb2511 --- /dev/null +++ b/Documentation/ABI/testing/sysfs-kernel-reboot @@ -0,0 +1,32 @@ +What: /sys/kernel/reboot +Date: November 2020 +KernelVersion: 5.11 +Contact: Matteo Croce +Description: Interface to set the kernel reboot behavior, similarly to + what can be done via the reboot= cmdline option. + (see Documentation/admin-guide/kernel-parameters.txt) + +What: /sys/kernel/reboot/mode +Date: November 2020 +KernelVersion: 5.11 +Contact: Matteo Croce +Description: Reboot mode. Valid values are: cold warm hard soft gpio + +What: /sys/kernel/reboot/type +Date: November 2020 +KernelVersion: 5.11 +Contact: Matteo Croce +Description: Reboot type. Valid values are: bios acpi kbd triple efi pci + +What: /sys/kernel/reboot/cpu +Date: November 2020 +KernelVersion: 5.11 +Contact: Matteo Croce +Description: CPU number to use to reboot. + +What: /sys/kernel/reboot/force +Date: November 2020 +KernelVersion: 5.11 +Contact: Matteo Croce +Description: Don't wait for any other CPUs on reboot and + avoid anything that could hang. 
diff --git a/kernel/reboot.c b/kernel/reboot.c index aa3bfd6c673b..940cbb784e17 100644 --- a/kernel/reboot.c +++ b/kernel/reboot.c @@ -600,3 +600,209 @@ static int __init reboot_setup(char *str) return 1; } __setup("reboot=", reboot_setup); + +#ifdef CONFIG_SYSFS + +#define REBOOT_COLD_STR "cold" +#define REBOOT_WARM_STR "warm" +#define REBOOT_HARD_STR "hard" +#define REBOOT_SOFT_STR "soft" +#define REBOOT_GPIO_STR "gpio" +#define REBOOT_UNDEFINED_STR "undefined" + +#define BOOT_TRIPLE_STR "triple" +#define BOOT_KBD_STR "kbd" +#define BOOT_BIOS_STR "bios" +#define BOOT_ACPI_STR "acpi" +#define BOOT_EFI_STR "efi" +#define BOOT_CF9_FORCE_STR "cf9_force" +#define BOOT_CF9_SAFE_STR "cf9_safe" + +static ssize_t mode_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) +{ + const char *val; + + switch (reboot_mode) { + case REBOOT_COLD: + val = REBOOT_COLD_STR; + break; + case REBOOT_WARM: + val = REBOOT_WARM_STR; + break; + case REBOOT_HARD: + val = REBOOT_HARD_STR; + break; + case REBOOT_SOFT: + val = REBOOT_SOFT_STR; + break; + case REBOOT_GPIO: + val = REBOOT_GPIO_STR; + break; + default: + val = REBOOT_UNDEFINED_STR; + } + + return sprintf(buf, "%s\n", val); +} +static ssize_t mode_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t count) +{ + if (!capable(CAP_SYS_BOOT)) + return -EPERM; + + if (!strncmp(buf, REBOOT_COLD_STR, strlen(REBOOT_COLD_STR))) + reboot_mode = REBOOT_COLD; + else if (!strncmp(buf, REBOOT_WARM_STR, strlen(REBOOT_WARM_STR))) + reboot_mode = REBOOT_WARM; + else if (!strncmp(buf, REBOOT_HARD_STR, strlen(REBOOT_HARD_STR))) + reboot_mode = REBOOT_HARD; + else if (!strncmp(buf, REBOOT_SOFT_STR, strlen(REBOOT_SOFT_STR))) + reboot_mode = REBOOT_SOFT; + else if (!strncmp(buf, REBOOT_GPIO_STR, strlen(REBOOT_GPIO_STR))) + reboot_mode = REBOOT_GPIO; + else + return -EINVAL; + + return count; +} +static struct kobj_attribute reboot_mode_attr = __ATTR_RW(mode); + +static ssize_t type_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) +{ + const char *val; + + switch (reboot_type) { + case BOOT_TRIPLE: + val = BOOT_TRIPLE_STR; + break; + case BOOT_KBD: + val = BOOT_KBD_STR; + break; + case BOOT_BIOS: + val = BOOT_BIOS_STR; + break; + case BOOT_ACPI: + val = BOOT_ACPI_STR; + break; + case BOOT_EFI: + val = BOOT_EFI_STR; + break; + case BOOT_CF9_FORCE: + val = BOOT_CF9_FORCE_STR; + break; + case BOOT_CF9_SAFE: + val = BOOT_CF9_SAFE_STR; + break; + default: + val = REBOOT_UNDEFINED_STR; + } + + return sprintf(buf, "%s\n", val); +} +static ssize_t type_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t count) +{ + if (!capable(CAP_SYS_BOOT)) + return -EPERM; + + if (!strncmp(buf, BOOT_TRIPLE_STR, strlen(BOOT_TRIPLE_STR))) + reboot_type = BOOT_TRIPLE; + else if (!strncmp(buf, BOOT_KBD_STR, strlen(BOOT_KBD_STR))) + reboot_type = BOOT_KBD; + else if (!strncmp(buf, BOOT_BIOS_STR, strlen(BOOT_BIOS_STR))) + reboot_type = BOOT_BIOS; + else if (!strncmp(buf, BOOT_ACPI_STR, strlen(BOOT_ACPI_STR))) + reboot_type = BOOT_ACPI; + else if (!strncmp(buf, BOOT_EFI_STR, strlen(BOOT_EFI_STR))) + reboot_type = BOOT_EFI; + else if (!strncmp(buf, BOOT_CF9_FORCE_STR, strlen(BOOT_CF9_FORCE_STR))) + reboot_type = BOOT_CF9_FORCE; + else if (!strncmp(buf, BOOT_CF9_SAFE_STR, strlen(BOOT_CF9_SAFE_STR))) + reboot_type = BOOT_CF9_SAFE; + else + return -EINVAL; + + return count; +} +static struct kobj_attribute reboot_type_attr = __ATTR_RW(type); + +static ssize_t cpu_show(struct kobject *kobj, struct kobj_attribute *attr, 
char *buf) +{ + return sprintf(buf, "%d\n", reboot_cpu); +} +static ssize_t cpu_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t count) +{ + unsigned int cpunum; + int rc; + + if (!capable(CAP_SYS_BOOT)) + return -EPERM; + + rc = kstrtouint(buf, 0, &cpunum); + + if (rc) + return rc; + + if (cpunum >= num_possible_cpus()) + return -ERANGE; + + reboot_cpu = cpunum; + + return count; +} +static struct kobj_attribute reboot_cpu_attr = __ATTR_RW(cpu); + +static ssize_t force_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "%d\n", reboot_force); +} +static ssize_t force_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t count) +{ + bool res; + + if (!capable(CAP_SYS_BOOT)) + return -EPERM; + + if (kstrtobool(buf, &res)) + return -EINVAL; + + reboot_force = res; + + return count; +} +static struct kobj_attribute reboot_force_attr = __ATTR_RW(force); + +static struct attribute *reboot_attrs[] = { + &reboot_mode_attr.attr, + &reboot_type_attr.attr, + &reboot_cpu_attr.attr, + &reboot_force_attr.attr, + NULL, +}; + +static const struct attribute_group reboot_attr_group = { + .attrs = reboot_attrs, +}; + +static int __init reboot_ksysfs_init(void) +{ + struct kobject *reboot_kobj; + int ret; + + reboot_kobj = kobject_create_and_add("reboot", kernel_kobj); + if (!reboot_kobj) + return -ENOMEM; + + ret = sysfs_create_group(reboot_kobj, &reboot_attr_group); + if (ret) { + kobject_put(reboot_kobj); + return ret; + } + + return 0; +} +late_initcall(reboot_ksysfs_init); + +#endif -- cgit v1.2.3 From 0c5c0179e2cddb0d1c52ba1487f9f9e77714c8af Mon Sep 17 00:00:00 2001 From: Matteo Croce Date: Tue, 15 Dec 2020 20:47:00 -0800 Subject: reboot: remove cf9_safe from allowed types and rename cf9_force BOOT_CF9_SAFE_STR is an internal value used only by the x86 code and it's not possible to set it from userspace. Remove it, and rename 'cf9_force' to 'pci', so to make it coherent with the kernel command line reboot= option. 
Tested with this script: cd /sys/kernel/reboot/ for i in cold warm hard soft gpio; do echo $i >mode read j type read j cpu read j force read j Cc: Arnd Bergmann Cc: Guenter Roeck Cc: Kees Cook Cc: Mike Rapoport Cc: Nathan Chancellor Cc: Pavel Tatashin Cc: Petr Mladek Cc: Tyler Hicks Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/reboot.c | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/reboot.c b/kernel/reboot.c index 940cbb784e17..769ad55c7187 100644 --- a/kernel/reboot.c +++ b/kernel/reboot.c @@ -615,8 +615,7 @@ __setup("reboot=", reboot_setup); #define BOOT_BIOS_STR "bios" #define BOOT_ACPI_STR "acpi" #define BOOT_EFI_STR "efi" -#define BOOT_CF9_FORCE_STR "cf9_force" -#define BOOT_CF9_SAFE_STR "cf9_safe" +#define BOOT_PCI_STR "pci" static ssize_t mode_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { @@ -688,10 +687,7 @@ static ssize_t type_show(struct kobject *kobj, struct kobj_attribute *attr, char val = BOOT_EFI_STR; break; case BOOT_CF9_FORCE: - val = BOOT_CF9_FORCE_STR; - break; - case BOOT_CF9_SAFE: - val = BOOT_CF9_SAFE_STR; + val = BOOT_PCI_STR; break; default: val = REBOOT_UNDEFINED_STR; @@ -715,10 +711,8 @@ static ssize_t type_store(struct kobject *kobj, struct kobj_attribute *attr, reboot_type = BOOT_ACPI; else if (!strncmp(buf, BOOT_EFI_STR, strlen(BOOT_EFI_STR))) reboot_type = BOOT_EFI; - else if (!strncmp(buf, BOOT_CF9_FORCE_STR, strlen(BOOT_CF9_FORCE_STR))) + else if (!strncmp(buf, BOOT_PCI_STR, strlen(BOOT_PCI_STR))) reboot_type = BOOT_CF9_FORCE; - else if (!strncmp(buf, BOOT_CF9_SAFE_STR, strlen(BOOT_CF9_SAFE_STR))) - reboot_type = BOOT_CF9_SAFE; else return -EINVAL; -- cgit v1.2.3 From 1a9d079f43e3215b81ec13d427950093b8f04c70 Mon Sep 17 00:00:00 2001 From: Matteo Croce Date: Tue, 15 Dec 2020 20:47:04 -0800 Subject: reboot: allow to override reboot type if quirks are found Patch series "reboot: sysfs improvements". Some improvements to the sysfs reboot interface: hide not working settings and support machines with known reboot quirks. This patch (of 2): On some machines a quirk can force a specific reboot type. Quirks are found during a DMI scan, the list of machines which need special reboot handling is defined in reboot_dmi_table. The kernel command line reboot= option overrides this via a global variable `reboot_default`, so that the reboot type requested in the command line is really performed. This was not true when setting the reboot type via the new sysfs interface. Fix this by setting reboot_default upon the first change, like reboot_setup() does for the command line. 
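As an illustration (not from the patch), on a machine listed in reboot_dmi_table a sysfs write now behaves like passing reboot= on the command line: the stored preference sticks instead of being overridden by the quirk.

# echo efi >/sys/kernel/reboot/type
# cat /sys/kernel/reboot/type
efi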
Link: https://lkml.kernel.org/r/20201130173717.198952-1-mcroce@linux.microsoft.com Link: https://lkml.kernel.org/r/20201130173717.198952-2-mcroce@linux.microsoft.com Signed-off-by: Matteo Croce Reviewed-by: Petr Mladek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/reboot.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/reboot.c b/kernel/reboot.c index 769ad55c7187..4a1a66452ec4 100644 --- a/kernel/reboot.c +++ b/kernel/reboot.c @@ -662,6 +662,8 @@ static ssize_t mode_store(struct kobject *kobj, struct kobj_attribute *attr, else return -EINVAL; + reboot_default = 0; + return count; } static struct kobj_attribute reboot_mode_attr = __ATTR_RW(mode); @@ -716,6 +718,8 @@ static ssize_t type_store(struct kobject *kobj, struct kobj_attribute *attr, else return -EINVAL; + reboot_default = 0; + return count; } static struct kobj_attribute reboot_type_attr = __ATTR_RW(type); @@ -741,6 +745,7 @@ static ssize_t cpu_store(struct kobject *kobj, struct kobj_attribute *attr, if (cpunum >= num_possible_cpus()) return -ERANGE; + reboot_default = 0; reboot_cpu = cpunum; return count; @@ -762,6 +767,7 @@ static ssize_t force_store(struct kobject *kobj, struct kobj_attribute *attr, if (kstrtobool(buf, &res)) return -EINVAL; + reboot_default = 0; reboot_force = res; return count; -- cgit v1.2.3 From 40247e55ba099067bf160332365ed78b5aeb62da Mon Sep 17 00:00:00 2001 From: Matteo Croce Date: Tue, 15 Dec 2020 20:47:07 -0800 Subject: reboot: hide from sysfs not applicable settings Not all the reboot settings from both the kernel command line or sysfs interface are available to all platforms. Filter out reboot_type and reboot_force which are x86 only, and also remove reboot_cpu on kernels without SMP support. This saves some space, and avoid confusing the user with settings which will have no effect. 
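For instance (an illustrative sketch of the visible effect), a non-x86 uniprocessor build ends up exposing only the generic attribute, while an x86 SMP build keeps all four:

# ls /sys/kernel/reboot        (non-x86, CONFIG_SMP=n)
mode
# ls /sys/kernel/reboot        (x86, CONFIG_SMP=y)
cpu  force  mode  type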
Link: https://lkml.kernel.org/r/20201130173717.198952-3-mcroce@linux.microsoft.com Signed-off-by: Matteo Croce Reviewed-by: Petr Mladek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/reboot.c | 54 +++++++++++++++++++++++++++++++----------------------- 1 file changed, 31 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/kernel/reboot.c b/kernel/reboot.c index 4a1a66452ec4..eb1b15850761 100644 --- a/kernel/reboot.c +++ b/kernel/reboot.c @@ -668,6 +668,29 @@ static ssize_t mode_store(struct kobject *kobj, struct kobj_attribute *attr, } static struct kobj_attribute reboot_mode_attr = __ATTR_RW(mode); +#ifdef CONFIG_X86 +static ssize_t force_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "%d\n", reboot_force); +} +static ssize_t force_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t count) +{ + bool res; + + if (!capable(CAP_SYS_BOOT)) + return -EPERM; + + if (kstrtobool(buf, &res)) + return -EINVAL; + + reboot_default = 0; + reboot_force = res; + + return count; +} +static struct kobj_attribute reboot_force_attr = __ATTR_RW(force); + static ssize_t type_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { const char *val; @@ -723,7 +746,9 @@ static ssize_t type_store(struct kobject *kobj, struct kobj_attribute *attr, return count; } static struct kobj_attribute reboot_type_attr = __ATTR_RW(type); +#endif +#ifdef CONFIG_SMP static ssize_t cpu_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { return sprintf(buf, "%d\n", reboot_cpu); @@ -751,34 +776,17 @@ static ssize_t cpu_store(struct kobject *kobj, struct kobj_attribute *attr, return count; } static struct kobj_attribute reboot_cpu_attr = __ATTR_RW(cpu); - -static ssize_t force_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) -{ - return sprintf(buf, "%d\n", reboot_force); -} -static ssize_t force_store(struct kobject *kobj, struct kobj_attribute *attr, - const char *buf, size_t count) -{ - bool res; - - if (!capable(CAP_SYS_BOOT)) - return -EPERM; - - if (kstrtobool(buf, &res)) - return -EINVAL; - - reboot_default = 0; - reboot_force = res; - - return count; -} -static struct kobj_attribute reboot_force_attr = __ATTR_RW(force); +#endif static struct attribute *reboot_attrs[] = { &reboot_mode_attr.attr, +#ifdef CONFIG_X86 + &reboot_force_attr.attr, &reboot_type_attr.attr, +#endif +#ifdef CONFIG_SMP &reboot_cpu_attr.attr, - &reboot_force_attr.attr, +#endif NULL, }; -- cgit v1.2.3 From 537cd89484ab57ca38ae25d9557361c0815977d1 Mon Sep 17 00:00:00 2001 From: Barnabás Pőcze Date: Tue, 15 Dec 2020 20:47:10 -0800 Subject: fault-injection: handle EI_ETYPE_TRUE MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit af3b854492f351d1 ("mm/page_alloc.c: allow error injection") introduced EI_ETYPE_TRUE, but did not extend * lib/error-inject.c:error_type_string(), and * kernel/fail_function.c:adjust_error_retval() to accommodate for this change. Handle EI_ETYPE_TRUE in both functions appropriately by * returning "TRUE" in error_type_string(), * adjusting the return value to true (1) in adjust_error_retval(). Furthermore, simplify the logic of handling EI_ETYPE_NULL in adjust_error_retval(). Link: https://lkml.kernel.org/r/njB1czX0ZgWPR9h61euHIBb5bEyePw9D4D2m3i5lc9Cl96P8Q1308dTcmsEZW7Vtz3Ifz4do-rOtSfuFTyGoEDYokkK2aUqBePVptzZEWfU=@protonmail.com Signed-off-by: Barnabás Pőcze Acked-by: Masami Hiramatsu Reviewed-by: Akinobu Mita Cc: "Naveen N. 
Rao" Cc: Anil S Keshavamurthy Cc: "David S. Miller" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fail_function.c | 6 +++--- lib/error-inject.c | 2 ++ 2 files changed, 5 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/fail_function.c b/kernel/fail_function.c index b0b1ad93fa95..60dc825ecc2b 100644 --- a/kernel/fail_function.c +++ b/kernel/fail_function.c @@ -37,9 +37,7 @@ static unsigned long adjust_error_retval(unsigned long addr, unsigned long retv) { switch (get_injectable_error_type(addr)) { case EI_ETYPE_NULL: - if (retv != 0) - return 0; - break; + return 0; case EI_ETYPE_ERRNO: if (retv < (unsigned long)-MAX_ERRNO) return (unsigned long)-EINVAL; @@ -48,6 +46,8 @@ static unsigned long adjust_error_retval(unsigned long addr, unsigned long retv) if (retv != 0 && retv < (unsigned long)-MAX_ERRNO) return (unsigned long)-EINVAL; break; + case EI_ETYPE_TRUE: + return 1; } return retv; diff --git a/lib/error-inject.c b/lib/error-inject.c index aa63751c916f..c73651b15b76 100644 --- a/lib/error-inject.c +++ b/lib/error-inject.c @@ -180,6 +180,8 @@ static const char *error_type_string(int etype) return "ERRNO"; case EI_ETYPE_ERRNO_NULL: return "ERRNO_NULL"; + case EI_ETYPE_TRUE: + return "TRUE"; default: return "(unknown)"; } -- cgit v1.2.3 From f12ad423c4af877b2e4b5a80928b95195fccab04 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 6 Dec 2020 22:12:54 +0100 Subject: tick: Remove pointless cpu valid check in hotplug code tick_handover_do_timer() which is invoked when a CPU is unplugged has a check for cpumask_first(cpu_online_mask) when it tries to hand over the tick update duty. Checking the result of cpumask_first() there is pointless because if the online mask is empty at this point, then this would be the last CPU in the system going offline, which is impossible. There is always at least one CPU remaining. If online mask would be really empty then the timer duty would be the least of the resulting problems. Remove the well meant check simply because it is pointless and confusing. Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Link: https://lore.kernel.org/r/20201206212002.582579516@linutronix.de --- kernel/time/tick-common.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index a03764df5366..9d3a22510bab 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -407,17 +407,13 @@ EXPORT_SYMBOL_GPL(tick_broadcast_oneshot_control); /* * Transfer the do_timer job away from a dying cpu. * - * Called with interrupts disabled. Not locking required. If + * Called with interrupts disabled. No locking required. If * tick_do_timer_cpu is owned by this cpu, nothing can change it. */ void tick_handover_do_timer(void) { - if (tick_do_timer_cpu == smp_processor_id()) { - int cpu = cpumask_first(cpu_online_mask); - - tick_do_timer_cpu = (cpu < nr_cpu_ids) ? cpu : - TICK_DO_TIMER_NONE; - } + if (tick_do_timer_cpu == smp_processor_id()) + tick_do_timer_cpu = cpumask_first(cpu_online_mask); } /* -- cgit v1.2.3 From ba8ea8e7dd6e1662e34e730eadfc52aa6816f9dd Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 6 Dec 2020 22:12:55 +0100 Subject: tick/sched: Remove bogus boot "safety" check can_stop_idle_tick() checks whether the do_timer() duty has been taken over by a CPU on boot. That's silly because the boot CPU always takes over with the initial clockevent device. 
But even if no CPU would have installed a clockevent and taken over the duty then the question whether the tick on the current CPU can be stopped or not is moot. In that case the current CPU would have no clockevent either, so there would be nothing to keep ticking. Remove it. Signed-off-by: Thomas Gleixner Acked-by: Frederic Weisbecker Link: https://lore.kernel.org/r/20201206212002.725238293@linutronix.de --- kernel/time/tick-sched.c | 7 ------- 1 file changed, 7 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index a9e68936822d..5fbc748f0058 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -991,13 +991,6 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts) */ if (tick_do_timer_cpu == cpu) return false; - /* - * Boot safety: make sure the timekeeping duty has been - * assigned before entering dyntick-idle mode, - * tick_do_timer_cpu is TICK_DO_TIMER_BOOT - */ - if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_BOOT)) - return false; /* Should not happen for nohz-full */ if (WARN_ON_ONCE(tick_do_timer_cpu == TICK_DO_TIMER_NONE)) -- cgit v1.2.3 From 2d18e54dd8662442ef5898c6bdadeaf90b3cebbc Mon Sep 17 00:00:00 2001 From: Qinglang Miao Date: Thu, 10 Dec 2020 09:29:43 +0800 Subject: cgroup: Fix memory leak when parsing multiple source parameters A memory leak is found in cgroup1_parse_param() when multiple source parameters overwrite fc->source in the fs_context struct without free. unreferenced object 0xffff888100d930e0 (size 16): comm "mount", pid 520, jiffies 4303326831 (age 152.783s) hex dump (first 16 bytes): 74 65 73 74 6c 65 61 6b 00 00 00 00 00 00 00 00 testleak........ backtrace: [<000000003e5023ec>] kmemdup_nul+0x2d/0xa0 [<00000000377dbdaa>] vfs_parse_fs_string+0xc0/0x150 [<00000000cb2b4882>] generic_parse_monolithic+0x15a/0x1d0 [<000000000f750198>] path_mount+0xee1/0x1820 [<0000000004756de2>] do_mount+0xea/0x100 [<0000000094cafb0a>] __x64_sys_mount+0x14b/0x1f0 Fix this bug by permitting a single source parameter and rejecting with an error all subsequent ones. Fixes: 8d2451f4994f ("cgroup1: switch to option-by-option parsing") Reported-by: Hulk Robot Signed-off-by: Qinglang Miao Reviewed-by: Zefan Li Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup-v1.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index 191c329e482a..32596fdbcd5b 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -908,6 +908,8 @@ int cgroup1_parse_param(struct fs_context *fc, struct fs_parameter *param) opt = fs_parse(fc, cgroup1_fs_parameters, param, &result); if (opt == -ENOPARAM) { if (strcmp(param->key, "source") == 0) { + if (fc->source) + return invalf(fc, "Multiple sources not supported"); fc->source = param->string; param->string = NULL; return 0; -- cgit v1.2.3 From d467d80dc399ba77875d647f2f37b7d1a70d94c2 Mon Sep 17 00:00:00 2001 From: Tian Tao Date: Wed, 16 Dec 2020 10:47:15 +0800 Subject: bpf: Remove unused including Remove including that don't need it. 
Signed-off-by: Tian Tao Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/1608086835-54523-1-git-send-email-tiantao6@hisilicon.com --- kernel/bpf/syscall.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 287be337d5f6..bb2700ec5bf3 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -17,7 +17,6 @@ #include #include #include -#include #include #include #include -- cgit v1.2.3 From 55d2eba8e7cd439c11cdb204898c2d384227629b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 16 Dec 2020 12:21:36 +0100 Subject: jump_label: Fix usage in module __init When the static_key is part of the module, and the module calls static_key_inc/enable() from it's __init section *AND* has a static_branch_*() user in that very same __init section, things go wobbly. If the static_key lives outside the module, jump_label_add_module() would append this module's sites to the key and jump_label_update() would take the static_key_linked() branch and all would be fine. If all the sites are outside of __init, then everything will be fine too. However, when all is aligned just as described above, jump_label_update() calls __jump_label_update(.init = false) and we'll not update sites in __init text. Fixes: 19483677684b ("jump_label: Annotate entries that operate on __init code earlier") Reported-by: Dexuan Cui Signed-off-by: Peter Zijlstra (Intel) Acked-by: Josh Poimboeuf Tested-by: Jessica Yu Link: https://lkml.kernel.org/r/20201216135435.GV3092@hirez.programming.kicks-ass.net --- kernel/jump_label.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/jump_label.c b/kernel/jump_label.c index 015ef903ce8c..c6a39d662935 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -793,6 +793,7 @@ int jump_label_text_reserved(void *start, void *end) static void jump_label_update(struct static_key *key) { struct jump_entry *stop = __stop___jump_table; + bool init = system_state < SYSTEM_RUNNING; struct jump_entry *entry; #ifdef CONFIG_MODULES struct module *mod; @@ -804,15 +805,16 @@ static void jump_label_update(struct static_key *key) preempt_disable(); mod = __module_address((unsigned long)key); - if (mod) + if (mod) { stop = mod->jump_entries + mod->num_jump_entries; + init = mod->state == MODULE_STATE_COMING; + } preempt_enable(); #endif entry = static_key_entries(key); /* if there are no users, entry can be NULL */ if (entry) - __jump_label_update(key, entry, stop, - system_state < SYSTEM_RUNNING); + __jump_label_update(key, entry, stop, init); } #ifdef CONFIG_STATIC_KEYS_SELFTEST -- cgit v1.2.3 From 91ea62d58bd661827c328a2c6c02a87fa4aae88b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 18 Dec 2020 16:39:14 +0100 Subject: softirq: Avoid bad tracing / lockdep interaction Similar to commit: 1a63dcd8765b ("softirq: Reorder trace_softirqs_on to prevent lockdep splat") __local_bh_enable_ip() can also call into tracing with inconsistent state. Unlike that commit we don't need to bother about the tracepoint because 'cnt-1' never matches preempt_count() (by construction). 
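As a rough, simplified sketch of the two helpers involved (paraphrased from asm-generic/preempt.h and kernel/sched/core.c of roughly this era; the exact code is architecture- and config-dependent, so treat it as an approximation rather than the real implementation): __preempt_count_sub() only adjusts the counter, while preempt_count_sub() is the instrumented variant that can call into the preempt-off latency tracer and debug checks. Using the double-underscore variant for the intermediate 'cnt - 1' adjustment keeps that instrumentation from running while the softirq accounting is in a transient state.

    /* Simplified sketch -- not the verbatim kernel code. */
    static __always_inline void __preempt_count_sub(int val)
    {
            *preempt_count_ptr() -= val;    /* raw bookkeeping only */
    }

    void preempt_count_sub(int val)         /* instrumented variant */
    {
            /* DEBUG_PREEMPT sanity checks elided */
            preempt_latency_stop(val);      /* may call into tracing */
            __preempt_count_sub(val);
    }
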
Reported-by: Heiko Carstens Signed-off-by: Peter Zijlstra (Intel) Tested-by: Heiko Carstens Link: https://lkml.kernel.org/r/20201218154519.GW3092@hirez.programming.kicks-ass.net --- kernel/softirq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/softirq.c b/kernel/softirq.c index 09229ad82209..0f1d3a32d53b 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -185,7 +185,7 @@ void __local_bh_enable_ip(unsigned long ip, unsigned int cnt) * Keep preemption disabled until we are done with * softirq processing: */ - preempt_count_sub(cnt - 1); + __preempt_count_sub(cnt - 1); if (unlikely(!in_interrupt() && local_softirq_pending())) { /* -- cgit v1.2.3 From 06fde695ee76429634c1e8c8c1154035aa61191e Mon Sep 17 00:00:00 2001 From: Zenghui Yu Date: Fri, 18 Dec 2020 14:00:39 +0800 Subject: genirq/msi: Initialize msi_alloc_info before calling msi_domain_prepare_irqs() Since commit 5fe71d271df8 ("irqchip/gic-v3-its: Tag ITS device as shared if allocating for a proxy device"), some of the devices are wrongly marked as "shared" by the ITS driver on systems equipped with the ITS(es). The problem is that the @info->flags may not be initialized anywhere and we end up looking at random bits on the stack. That's obviously not good. We can perform the initialization in the IRQ core layer before calling msi_domain_prepare_irqs(), which is neat enough. Fixes: 5fe71d271df8 ("irqchip/gic-v3-its: Tag ITS device as shared if allocating for a proxy device") Signed-off-by: Zenghui Yu Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/20201218060039.1770-1-yuzenghui@huawei.com --- kernel/irq/msi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index 2c0c4d6d0f83..dc0e2d7fbdfd 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -402,7 +402,7 @@ int __msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev, struct msi_domain_ops *ops = info->ops; struct irq_data *irq_data; struct msi_desc *desc; - msi_alloc_info_t arg; + msi_alloc_info_t arg = { }; int i, ret, virq; bool can_reserve; -- cgit v1.2.3 From f6f5cd840ae782680c5e94048c72420e4e6857f9 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Thu, 17 Dec 2020 17:17:05 +0000 Subject: timekeeping: Fix spelling mistake in Kconfig "fullfill" -> "fulfill" There is a spelling mistake in the Kconfig help text. Fix it. Signed-off-by: Colin Ian King Signed-off-by: Thomas Gleixner Acked-by: Linus Walleij Link: https://lore.kernel.org/r/20201217171705.57586-1-colin.king@canonical.com --- kernel/time/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index a09b1d61df6a..64051f47475c 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig @@ -141,7 +141,7 @@ config CONTEXT_TRACKING_FORCE dynticks working. This option stands for testing when an arch implements the - context tracking backend but doesn't yet fullfill all the + context tracking backend but doesn't yet fulfill all the requirements to make the full dynticks feature working. Without the full dynticks, there is no way to test the support for context tracking and the subsystems that rely on it: RCU -- cgit v1.2.3 From b0a0c2615f6f199a656ed8549d7dce625d77aa77 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Fri, 18 Dec 2020 14:05:41 -0800 Subject: epoll: wire up syscall epoll_pwait2 Split off from prev patch in the series that implements the syscall. 
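As a userspace-facing illustration of what this wiring enables (not part of the patch): the new syscall behaves like epoll_pwait() but takes a struct __kernel_timespec timeout instead of an integer millisecond count. The sketch below invokes it directly via syscall(2), assuming a 64-bit system where struct timespec matches __kernel_timespec and using syscall number 441 as wired up for x86_64 here; on kernels without the syscall it simply fails with ENOSYS.

    /* Hypothetical userspace sketch; the epoll setup is minimal on purpose. */
    #include <stdio.h>
    #include <sys/epoll.h>
    #include <sys/syscall.h>
    #include <time.h>
    #include <unistd.h>

    #ifndef __NR_epoll_pwait2
    #define __NR_epoll_pwait2 441           /* x86_64 number from this patch */
    #endif

    int main(void)
    {
            int epfd = epoll_create1(0);
            struct epoll_event ev = { .events = EPOLLIN, .data.fd = 0 };
            struct epoll_event out[8];
            struct timespec ts = { .tv_sec = 0, .tv_nsec = 250 * 1000 * 1000 };
            int n;

            epoll_ctl(epfd, EPOLL_CTL_ADD, 0, &ev);         /* watch stdin */

            /* args: epfd, events, maxevents, timeout, sigmask, sigsetsize */
            n = syscall(__NR_epoll_pwait2, epfd, out, 8, &ts, NULL, 0);
            printf("epoll_pwait2 returned %d\n", n);

            close(epfd);
            return 0;
    }
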
Link: https://lkml.kernel.org/r/20201121144401.3727659-4-willemdebruijn.kernel@gmail.com Signed-off-by: Willem de Bruijn Cc: Al Viro Cc: Arnd Bergmann Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/alpha/kernel/syscalls/syscall.tbl | 1 + arch/arm/tools/syscall.tbl | 1 + arch/arm64/include/asm/unistd.h | 2 +- arch/arm64/include/asm/unistd32.h | 2 ++ arch/ia64/kernel/syscalls/syscall.tbl | 1 + arch/m68k/kernel/syscalls/syscall.tbl | 1 + arch/microblaze/kernel/syscalls/syscall.tbl | 1 + arch/mips/kernel/syscalls/syscall_n32.tbl | 1 + arch/mips/kernel/syscalls/syscall_n64.tbl | 1 + arch/mips/kernel/syscalls/syscall_o32.tbl | 1 + arch/parisc/kernel/syscalls/syscall.tbl | 1 + arch/powerpc/kernel/syscalls/syscall.tbl | 1 + arch/s390/kernel/syscalls/syscall.tbl | 1 + arch/sh/kernel/syscalls/syscall.tbl | 1 + arch/sparc/kernel/syscalls/syscall.tbl | 1 + arch/x86/entry/syscalls/syscall_32.tbl | 1 + arch/x86/entry/syscalls/syscall_64.tbl | 1 + arch/xtensa/kernel/syscalls/syscall.tbl | 1 + include/linux/compat.h | 6 ++++++ include/linux/syscalls.h | 5 +++++ include/uapi/asm-generic/unistd.h | 4 +++- kernel/sys_ni.c | 2 ++ 22 files changed, 35 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/arch/alpha/kernel/syscalls/syscall.tbl b/arch/alpha/kernel/syscalls/syscall.tbl index ee7b01bb7346..a6617067dbe6 100644 --- a/arch/alpha/kernel/syscalls/syscall.tbl +++ b/arch/alpha/kernel/syscalls/syscall.tbl @@ -480,3 +480,4 @@ 548 common pidfd_getfd sys_pidfd_getfd 549 common faccessat2 sys_faccessat2 550 common process_madvise sys_process_madvise +551 common epoll_pwait2 sys_epoll_pwait2 diff --git a/arch/arm/tools/syscall.tbl b/arch/arm/tools/syscall.tbl index d056a548358e..20e1170e2e0a 100644 --- a/arch/arm/tools/syscall.tbl +++ b/arch/arm/tools/syscall.tbl @@ -454,3 +454,4 @@ 438 common pidfd_getfd sys_pidfd_getfd 439 common faccessat2 sys_faccessat2 440 common process_madvise sys_process_madvise +441 common epoll_pwait2 sys_epoll_pwait2 diff --git a/arch/arm64/include/asm/unistd.h b/arch/arm64/include/asm/unistd.h index b3b2019f8d16..86a9d7b3eabe 100644 --- a/arch/arm64/include/asm/unistd.h +++ b/arch/arm64/include/asm/unistd.h @@ -38,7 +38,7 @@ #define __ARM_NR_compat_set_tls (__ARM_NR_COMPAT_BASE + 5) #define __ARM_NR_COMPAT_END (__ARM_NR_COMPAT_BASE + 0x800) -#define __NR_compat_syscalls 441 +#define __NR_compat_syscalls 442 #endif #define __ARCH_WANT_SYS_CLONE diff --git a/arch/arm64/include/asm/unistd32.h b/arch/arm64/include/asm/unistd32.h index 107f08e03b9f..f4bca2b90218 100644 --- a/arch/arm64/include/asm/unistd32.h +++ b/arch/arm64/include/asm/unistd32.h @@ -889,6 +889,8 @@ __SYSCALL(__NR_pidfd_getfd, sys_pidfd_getfd) __SYSCALL(__NR_faccessat2, sys_faccessat2) #define __NR_process_madvise 440 __SYSCALL(__NR_process_madvise, sys_process_madvise) +#define __NR_epoll_pwait2 441 +__SYSCALL(__NR_epoll_pwait2, sys_epoll_pwait2) /* * Please add new compat syscalls above this comment and update diff --git a/arch/ia64/kernel/syscalls/syscall.tbl b/arch/ia64/kernel/syscalls/syscall.tbl index b96ed8b8a508..bfc00f2bd437 100644 --- a/arch/ia64/kernel/syscalls/syscall.tbl +++ b/arch/ia64/kernel/syscalls/syscall.tbl @@ -361,3 +361,4 @@ 438 common pidfd_getfd sys_pidfd_getfd 439 common faccessat2 sys_faccessat2 440 common process_madvise sys_process_madvise +441 common epoll_pwait2 sys_epoll_pwait2 diff --git a/arch/m68k/kernel/syscalls/syscall.tbl b/arch/m68k/kernel/syscalls/syscall.tbl index 625fb6d32842..7fe4e45c864c 100644 --- 
a/arch/m68k/kernel/syscalls/syscall.tbl +++ b/arch/m68k/kernel/syscalls/syscall.tbl @@ -440,3 +440,4 @@ 438 common pidfd_getfd sys_pidfd_getfd 439 common faccessat2 sys_faccessat2 440 common process_madvise sys_process_madvise +441 common epoll_pwait2 sys_epoll_pwait2 diff --git a/arch/microblaze/kernel/syscalls/syscall.tbl b/arch/microblaze/kernel/syscalls/syscall.tbl index aae729c95cf9..a522adf194ab 100644 --- a/arch/microblaze/kernel/syscalls/syscall.tbl +++ b/arch/microblaze/kernel/syscalls/syscall.tbl @@ -446,3 +446,4 @@ 438 common pidfd_getfd sys_pidfd_getfd 439 common faccessat2 sys_faccessat2 440 common process_madvise sys_process_madvise +441 common epoll_pwait2 sys_epoll_pwait2 diff --git a/arch/mips/kernel/syscalls/syscall_n32.tbl b/arch/mips/kernel/syscalls/syscall_n32.tbl index 32817c954435..ad9c3dd0ab1f 100644 --- a/arch/mips/kernel/syscalls/syscall_n32.tbl +++ b/arch/mips/kernel/syscalls/syscall_n32.tbl @@ -379,3 +379,4 @@ 438 n32 pidfd_getfd sys_pidfd_getfd 439 n32 faccessat2 sys_faccessat2 440 n32 process_madvise sys_process_madvise +441 n32 epoll_pwait2 sys_epoll_pwait2 diff --git a/arch/mips/kernel/syscalls/syscall_n64.tbl b/arch/mips/kernel/syscalls/syscall_n64.tbl index 9e4ea3c31b1c..91649690b52f 100644 --- a/arch/mips/kernel/syscalls/syscall_n64.tbl +++ b/arch/mips/kernel/syscalls/syscall_n64.tbl @@ -355,3 +355,4 @@ 438 n64 pidfd_getfd sys_pidfd_getfd 439 n64 faccessat2 sys_faccessat2 440 n64 process_madvise sys_process_madvise +441 n64 epoll_pwait2 sys_epoll_pwait2 diff --git a/arch/mips/kernel/syscalls/syscall_o32.tbl b/arch/mips/kernel/syscalls/syscall_o32.tbl index 29f5f28cf5ce..4bad0c40aed6 100644 --- a/arch/mips/kernel/syscalls/syscall_o32.tbl +++ b/arch/mips/kernel/syscalls/syscall_o32.tbl @@ -428,3 +428,4 @@ 438 o32 pidfd_getfd sys_pidfd_getfd 439 o32 faccessat2 sys_faccessat2 440 o32 process_madvise sys_process_madvise +441 o32 epoll_pwait2 sys_epoll_pwait2 compat_sys_epoll_pwait2 diff --git a/arch/parisc/kernel/syscalls/syscall.tbl b/arch/parisc/kernel/syscalls/syscall.tbl index f375ea528e59..6bcc31966b44 100644 --- a/arch/parisc/kernel/syscalls/syscall.tbl +++ b/arch/parisc/kernel/syscalls/syscall.tbl @@ -438,3 +438,4 @@ 438 common pidfd_getfd sys_pidfd_getfd 439 common faccessat2 sys_faccessat2 440 common process_madvise sys_process_madvise +441 common epoll_pwait2 sys_epoll_pwait2 compat_sys_epoll_pwait2 diff --git a/arch/powerpc/kernel/syscalls/syscall.tbl b/arch/powerpc/kernel/syscalls/syscall.tbl index 1275daec7fec..f744eb5cba88 100644 --- a/arch/powerpc/kernel/syscalls/syscall.tbl +++ b/arch/powerpc/kernel/syscalls/syscall.tbl @@ -530,3 +530,4 @@ 438 common pidfd_getfd sys_pidfd_getfd 439 common faccessat2 sys_faccessat2 440 common process_madvise sys_process_madvise +441 common epoll_pwait2 sys_epoll_pwait2 compat_sys_epoll_pwait2 diff --git a/arch/s390/kernel/syscalls/syscall.tbl b/arch/s390/kernel/syscalls/syscall.tbl index 28c168000483..14f6525886a8 100644 --- a/arch/s390/kernel/syscalls/syscall.tbl +++ b/arch/s390/kernel/syscalls/syscall.tbl @@ -443,3 +443,4 @@ 438 common pidfd_getfd sys_pidfd_getfd sys_pidfd_getfd 439 common faccessat2 sys_faccessat2 sys_faccessat2 440 common process_madvise sys_process_madvise sys_process_madvise +441 common epoll_pwait2 sys_epoll_pwait2 sys_epoll_pwait2 diff --git a/arch/sh/kernel/syscalls/syscall.tbl b/arch/sh/kernel/syscalls/syscall.tbl index 783738448ff5..9df40ac0ebc0 100644 --- a/arch/sh/kernel/syscalls/syscall.tbl +++ b/arch/sh/kernel/syscalls/syscall.tbl @@ -443,3 +443,4 @@ 438 common pidfd_getfd 
sys_pidfd_getfd 439 common faccessat2 sys_faccessat2 440 common process_madvise sys_process_madvise +441 common epoll_pwait2 sys_epoll_pwait2 diff --git a/arch/sparc/kernel/syscalls/syscall.tbl b/arch/sparc/kernel/syscalls/syscall.tbl index 78160260991b..c7da4c3271e6 100644 --- a/arch/sparc/kernel/syscalls/syscall.tbl +++ b/arch/sparc/kernel/syscalls/syscall.tbl @@ -486,3 +486,4 @@ 438 common pidfd_getfd sys_pidfd_getfd 439 common faccessat2 sys_faccessat2 440 common process_madvise sys_process_madvise +441 common epoll_pwait2 sys_epoll_pwait2 diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl index 0d0667a9fbd7..874aeacde2dd 100644 --- a/arch/x86/entry/syscalls/syscall_32.tbl +++ b/arch/x86/entry/syscalls/syscall_32.tbl @@ -445,3 +445,4 @@ 438 i386 pidfd_getfd sys_pidfd_getfd 439 i386 faccessat2 sys_faccessat2 440 i386 process_madvise sys_process_madvise +441 i386 epoll_pwait2 sys_epoll_pwait2 compat_sys_epoll_pwait2 diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl index 379819244b91..78672124d28b 100644 --- a/arch/x86/entry/syscalls/syscall_64.tbl +++ b/arch/x86/entry/syscalls/syscall_64.tbl @@ -362,6 +362,7 @@ 438 common pidfd_getfd sys_pidfd_getfd 439 common faccessat2 sys_faccessat2 440 common process_madvise sys_process_madvise +441 common epoll_pwait2 sys_epoll_pwait2 # # Due to a historical design error, certain syscalls are numbered differently diff --git a/arch/xtensa/kernel/syscalls/syscall.tbl b/arch/xtensa/kernel/syscalls/syscall.tbl index b070f272995d..46116a28eeed 100644 --- a/arch/xtensa/kernel/syscalls/syscall.tbl +++ b/arch/xtensa/kernel/syscalls/syscall.tbl @@ -411,3 +411,4 @@ 438 common pidfd_getfd sys_pidfd_getfd 439 common faccessat2 sys_faccessat2 440 common process_madvise sys_process_madvise +441 common epoll_pwait2 sys_epoll_pwait2 diff --git a/include/linux/compat.h b/include/linux/compat.h index 400c0941c8af..6e65be753603 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h @@ -537,6 +537,12 @@ asmlinkage long compat_sys_epoll_pwait(int epfd, int maxevents, int timeout, const compat_sigset_t __user *sigmask, compat_size_t sigsetsize); +asmlinkage long compat_sys_epoll_pwait2(int epfd, + struct epoll_event __user *events, + int maxevents, + const struct __kernel_timespec __user *timeout, + const compat_sigset_t __user *sigmask, + compat_size_t sigsetsize); /* fs/fcntl.c */ asmlinkage long compat_sys_fcntl(unsigned int fd, unsigned int cmd, diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index df0c3c74609e..f3929aff39cf 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -362,6 +362,11 @@ asmlinkage long sys_epoll_pwait(int epfd, struct epoll_event __user *events, int maxevents, int timeout, const sigset_t __user *sigmask, size_t sigsetsize); +asmlinkage long sys_epoll_pwait2(int epfd, struct epoll_event __user *events, + int maxevents, + const struct __kernel_timespec __user *timeout, + const sigset_t __user *sigmask, + size_t sigsetsize); /* fs/fcntl.c */ asmlinkage long sys_dup(unsigned int fildes); diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h index fc48c64700eb..728752917785 100644 --- a/include/uapi/asm-generic/unistd.h +++ b/include/uapi/asm-generic/unistd.h @@ -859,9 +859,11 @@ __SYSCALL(__NR_pidfd_getfd, sys_pidfd_getfd) __SYSCALL(__NR_faccessat2, sys_faccessat2) #define __NR_process_madvise 440 __SYSCALL(__NR_process_madvise, sys_process_madvise) +#define __NR_epoll_pwait2 441 
+__SC_COMP(__NR_epoll_pwait2, sys_epoll_pwait2, compat_sys_epoll_pwait2) #undef __NR_syscalls -#define __NR_syscalls 441 +#define __NR_syscalls 442 /* * 32 bit systems traditionally used different diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index f27ac94d5fa7..19aa806890d5 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -68,6 +68,8 @@ COND_SYSCALL(epoll_create1); COND_SYSCALL(epoll_ctl); COND_SYSCALL(epoll_pwait); COND_SYSCALL_COMPAT(epoll_pwait); +COND_SYSCALL(epoll_pwait2); +COND_SYSCALL_COMPAT(epoll_pwait2); /* fs/fcntl.c */ -- cgit v1.2.3 From e7e518053c267bb6be3799520d9f4a34c7264a2e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 21 Dec 2020 11:25:06 -0800 Subject: bpf: Add schedule point in htab_init_buckets() We noticed that with a LOCKDEP enabled kernel, allocating a hash table with 65536 buckets would use more than 60ms. htab_init_buckets() runs from process context, it is safe to schedule to avoid latency spikes. Fixes: c50eb518e262 ("bpf: Use separate lockdep class for each hashtab") Reported-by: John Sperbeck Signed-off-by: Eric Dumazet Signed-off-by: Daniel Borkmann Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20201221192506.707584-1-eric.dumazet@gmail.com --- kernel/bpf/hashtab.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 7e848200cd26..c1ac7f964bc9 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -152,6 +152,7 @@ static void htab_init_buckets(struct bpf_htab *htab) lockdep_set_class(&htab->buckets[i].lock, &htab->lockdep_key); } + cond_resched(); } } -- cgit v1.2.3 From cebd0eb29acdfc2f5e44e5f356ffcd0c44f16b4a Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Tue, 22 Dec 2020 12:00:21 -0800 Subject: kasan: rename (un)poison_shadow to (un)poison_range This is a preparatory commit for the upcoming addition of a new hardware tag-based (MTE-based) KASAN mode. The new mode won't be using shadow memory. Rename external annotation kasan_unpoison_shadow() to kasan_unpoison_range(), and introduce internal functions (un)poison_range() (without kasan_ prefix). 
Co-developed-by: Marco Elver Link: https://lkml.kernel.org/r/fccdcaa13dc6b2211bf363d6c6d499279a54fe3a.1606161801.git.andreyknvl@google.com Signed-off-by: Marco Elver Signed-off-by: Andrey Konovalov Signed-off-by: Vincenzo Frascino Reviewed-by: Alexander Potapenko Tested-by: Vincenzo Frascino Cc: Andrey Ryabinin Cc: Branislav Rankov Cc: Catalin Marinas Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Kevin Brodsky Cc: Vasily Gorbik Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kasan.h | 6 +++--- kernel/fork.c | 4 ++-- mm/kasan/common.c | 49 +++++++++++++++++++++++++++---------------------- mm/kasan/generic.c | 23 +++++++++++------------ mm/kasan/kasan.h | 3 ++- mm/kasan/tags.c | 2 +- mm/slab_common.c | 2 +- 7 files changed, 47 insertions(+), 42 deletions(-) (limited to 'kernel') diff --git a/include/linux/kasan.h b/include/linux/kasan.h index 7828436a3a99..9740c06a04a1 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -72,7 +72,7 @@ extern void kasan_enable_current(void); /* Disable reporting bugs for current task */ extern void kasan_disable_current(void); -void kasan_unpoison_shadow(const void *address, size_t size); +void kasan_unpoison_range(const void *address, size_t size); void kasan_unpoison_task_stack(struct task_struct *task); @@ -109,7 +109,7 @@ struct kasan_cache { size_t __ksize(const void *); static inline void kasan_unpoison_slab(const void *ptr) { - kasan_unpoison_shadow(ptr, __ksize(ptr)); + kasan_unpoison_range(ptr, __ksize(ptr)); } size_t kasan_metadata_size(struct kmem_cache *cache); @@ -118,7 +118,7 @@ void kasan_restore_multi_shot(bool enabled); #else /* CONFIG_KASAN */ -static inline void kasan_unpoison_shadow(const void *address, size_t size) {} +static inline void kasan_unpoison_range(const void *address, size_t size) {} static inline void kasan_unpoison_task_stack(struct task_struct *task) {} diff --git a/kernel/fork.c b/kernel/fork.c index 41906a52a764..37720a6d04ea 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -225,8 +225,8 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node) if (!s) continue; - /* Clear the KASAN shadow of the stack. */ - kasan_unpoison_shadow(s->addr, THREAD_SIZE); + /* Mark stack accessible for KASAN. */ + kasan_unpoison_range(s->addr, THREAD_SIZE); /* Clear stale pointers from reused stack. */ memset(s->addr, 0, THREAD_SIZE); diff --git a/mm/kasan/common.c b/mm/kasan/common.c index 89e5ef9417a7..73e79a34671b 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -108,7 +108,7 @@ void *memcpy(void *dest, const void *src, size_t len) * Poisons the shadow memory for 'size' bytes starting from 'addr'. * Memory addresses should be aligned to KASAN_SHADOW_SCALE_SIZE. 
*/ -void kasan_poison_shadow(const void *address, size_t size, u8 value) +void poison_range(const void *address, size_t size, u8 value) { void *shadow_start, *shadow_end; @@ -125,7 +125,7 @@ void kasan_poison_shadow(const void *address, size_t size, u8 value) __memset(shadow_start, value, shadow_end - shadow_start); } -void kasan_unpoison_shadow(const void *address, size_t size) +void unpoison_range(const void *address, size_t size) { u8 tag = get_tag(address); @@ -136,7 +136,7 @@ void kasan_unpoison_shadow(const void *address, size_t size) */ address = reset_tag(address); - kasan_poison_shadow(address, size, tag); + poison_range(address, size, tag); if (size & KASAN_SHADOW_MASK) { u8 *shadow = (u8 *)kasan_mem_to_shadow(address + size); @@ -148,12 +148,17 @@ void kasan_unpoison_shadow(const void *address, size_t size) } } +void kasan_unpoison_range(const void *address, size_t size) +{ + unpoison_range(address, size); +} + static void __kasan_unpoison_stack(struct task_struct *task, const void *sp) { void *base = task_stack_page(task); size_t size = sp - base; - kasan_unpoison_shadow(base, size); + unpoison_range(base, size); } /* Unpoison the entire stack for a task. */ @@ -172,7 +177,7 @@ asmlinkage void kasan_unpoison_task_stack_below(const void *watermark) */ void *base = (void *)((unsigned long)watermark & ~(THREAD_SIZE - 1)); - kasan_unpoison_shadow(base, watermark - base); + unpoison_range(base, watermark - base); } void kasan_alloc_pages(struct page *page, unsigned int order) @@ -186,13 +191,13 @@ void kasan_alloc_pages(struct page *page, unsigned int order) tag = random_tag(); for (i = 0; i < (1 << order); i++) page_kasan_tag_set(page + i, tag); - kasan_unpoison_shadow(page_address(page), PAGE_SIZE << order); + unpoison_range(page_address(page), PAGE_SIZE << order); } void kasan_free_pages(struct page *page, unsigned int order) { if (likely(!PageHighMem(page))) - kasan_poison_shadow(page_address(page), + poison_range(page_address(page), PAGE_SIZE << order, KASAN_FREE_PAGE); } @@ -284,18 +289,18 @@ void kasan_poison_slab(struct page *page) for (i = 0; i < compound_nr(page); i++) page_kasan_tag_reset(page + i); - kasan_poison_shadow(page_address(page), page_size(page), - KASAN_KMALLOC_REDZONE); + poison_range(page_address(page), page_size(page), + KASAN_KMALLOC_REDZONE); } void kasan_unpoison_object_data(struct kmem_cache *cache, void *object) { - kasan_unpoison_shadow(object, cache->object_size); + unpoison_range(object, cache->object_size); } void kasan_poison_object_data(struct kmem_cache *cache, void *object) { - kasan_poison_shadow(object, + poison_range(object, round_up(cache->object_size, KASAN_SHADOW_SCALE_SIZE), KASAN_KMALLOC_REDZONE); } @@ -408,7 +413,7 @@ static bool __kasan_slab_free(struct kmem_cache *cache, void *object, } rounded_up_size = round_up(cache->object_size, KASAN_SHADOW_SCALE_SIZE); - kasan_poison_shadow(object, rounded_up_size, KASAN_KMALLOC_FREE); + poison_range(object, rounded_up_size, KASAN_KMALLOC_FREE); if ((IS_ENABLED(CONFIG_KASAN_GENERIC) && !quarantine) || unlikely(!(cache->flags & SLAB_KASAN))) @@ -448,9 +453,9 @@ static void *__kasan_kmalloc(struct kmem_cache *cache, const void *object, tag = assign_tag(cache, object, false, keep_tag); /* Tag is ignored in set_tag without CONFIG_KASAN_SW_TAGS */ - kasan_unpoison_shadow(set_tag(object, tag), size); - kasan_poison_shadow((void *)redzone_start, redzone_end - redzone_start, - KASAN_KMALLOC_REDZONE); + unpoison_range(set_tag(object, tag), size); + poison_range((void *)redzone_start, redzone_end - 
redzone_start, + KASAN_KMALLOC_REDZONE); if (cache->flags & SLAB_KASAN) kasan_set_track(&get_alloc_info(cache, object)->alloc_track, flags); @@ -489,9 +494,9 @@ void * __must_check kasan_kmalloc_large(const void *ptr, size_t size, KASAN_SHADOW_SCALE_SIZE); redzone_end = (unsigned long)ptr + page_size(page); - kasan_unpoison_shadow(ptr, size); - kasan_poison_shadow((void *)redzone_start, redzone_end - redzone_start, - KASAN_PAGE_REDZONE); + unpoison_range(ptr, size); + poison_range((void *)redzone_start, redzone_end - redzone_start, + KASAN_PAGE_REDZONE); return (void *)ptr; } @@ -523,7 +528,7 @@ void kasan_poison_kfree(void *ptr, unsigned long ip) kasan_report_invalid_free(ptr, ip); return; } - kasan_poison_shadow(ptr, page_size(page), KASAN_FREE_PAGE); + poison_range(ptr, page_size(page), KASAN_FREE_PAGE); } else { __kasan_slab_free(page->slab_cache, ptr, ip, false); } @@ -709,7 +714,7 @@ int kasan_populate_vmalloc(unsigned long addr, unsigned long size) * // vmalloc() allocates memory * // let a = area->addr * // we reach kasan_populate_vmalloc - * // and call kasan_unpoison_shadow: + * // and call unpoison_range: * STORE shadow(a), unpoison_val * ... * STORE shadow(a+99), unpoison_val x = LOAD p @@ -744,7 +749,7 @@ void kasan_poison_vmalloc(const void *start, unsigned long size) return; size = round_up(size, KASAN_SHADOW_SCALE_SIZE); - kasan_poison_shadow(start, size, KASAN_VMALLOC_INVALID); + poison_range(start, size, KASAN_VMALLOC_INVALID); } void kasan_unpoison_vmalloc(const void *start, unsigned long size) @@ -752,7 +757,7 @@ void kasan_unpoison_vmalloc(const void *start, unsigned long size) if (!is_vmalloc_or_module_addr(start)) return; - kasan_unpoison_shadow(start, size); + unpoison_range(start, size); } static int kasan_depopulate_vmalloc_pte(pte_t *ptep, unsigned long addr, diff --git a/mm/kasan/generic.c b/mm/kasan/generic.c index d341859a1b95..9fe44f9b3b30 100644 --- a/mm/kasan/generic.c +++ b/mm/kasan/generic.c @@ -202,11 +202,11 @@ static void register_global(struct kasan_global *global) { size_t aligned_size = round_up(global->size, KASAN_SHADOW_SCALE_SIZE); - kasan_unpoison_shadow(global->beg, global->size); + unpoison_range(global->beg, global->size); - kasan_poison_shadow(global->beg + aligned_size, - global->size_with_redzone - aligned_size, - KASAN_GLOBAL_REDZONE); + poison_range(global->beg + aligned_size, + global->size_with_redzone - aligned_size, + KASAN_GLOBAL_REDZONE); } void __asan_register_globals(struct kasan_global *globals, size_t size) @@ -285,13 +285,12 @@ void __asan_alloca_poison(unsigned long addr, size_t size) WARN_ON(!IS_ALIGNED(addr, KASAN_ALLOCA_REDZONE_SIZE)); - kasan_unpoison_shadow((const void *)(addr + rounded_down_size), - size - rounded_down_size); - kasan_poison_shadow(left_redzone, KASAN_ALLOCA_REDZONE_SIZE, - KASAN_ALLOCA_LEFT); - kasan_poison_shadow(right_redzone, - padding_size + KASAN_ALLOCA_REDZONE_SIZE, - KASAN_ALLOCA_RIGHT); + unpoison_range((const void *)(addr + rounded_down_size), + size - rounded_down_size); + poison_range(left_redzone, KASAN_ALLOCA_REDZONE_SIZE, + KASAN_ALLOCA_LEFT); + poison_range(right_redzone, padding_size + KASAN_ALLOCA_REDZONE_SIZE, + KASAN_ALLOCA_RIGHT); } EXPORT_SYMBOL(__asan_alloca_poison); @@ -301,7 +300,7 @@ void __asan_allocas_unpoison(const void *stack_top, const void *stack_bottom) if (unlikely(!stack_top || stack_top > stack_bottom)) return; - kasan_unpoison_shadow(stack_top, stack_bottom - stack_top); + unpoison_range(stack_top, stack_bottom - stack_top); } EXPORT_SYMBOL(__asan_allocas_unpoison); 
diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index ac499456740f..42ab02c61331 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -150,7 +150,8 @@ static inline bool addr_has_shadow(const void *addr) return (addr >= kasan_shadow_to_mem((void *)KASAN_SHADOW_START)); } -void kasan_poison_shadow(const void *address, size_t size, u8 value); +void poison_range(const void *address, size_t size, u8 value); +void unpoison_range(const void *address, size_t size); /** * check_memory_region - Check memory region, and report if invalid access. diff --git a/mm/kasan/tags.c b/mm/kasan/tags.c index 5c8b08a25715..c0b3f327812b 100644 --- a/mm/kasan/tags.c +++ b/mm/kasan/tags.c @@ -153,7 +153,7 @@ EXPORT_SYMBOL(__hwasan_storeN_noabort); void __hwasan_tag_memory(unsigned long addr, u8 tag, unsigned long size) { - kasan_poison_shadow((void *)addr, size, tag); + poison_range((void *)addr, size, tag); } EXPORT_SYMBOL(__hwasan_tag_memory); diff --git a/mm/slab_common.c b/mm/slab_common.c index 2f2b55c2798e..573fbacd9ef5 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -1176,7 +1176,7 @@ size_t ksize(const void *objp) * We assume that ksize callers could use whole allocated area, * so we need to unpoison this area. */ - kasan_unpoison_shadow(objp, size); + kasan_unpoison_range(objp, size); return size; } EXPORT_SYMBOL(ksize); -- cgit v1.2.3 From 69ca310f34168eae0ada434796bfc22fb4a0fa26 Mon Sep 17 00:00:00 2001 From: Jonathan Lemon Date: Fri, 18 Dec 2020 10:50:30 -0800 Subject: bpf: Save correct stopping point in file seq iteration On some systems, some variant of the following splat is repeatedly seen. The common factor in all traces seems to be the entry point to task_file_seq_next(). With the patch, all warnings go away. rcu: INFO: rcu_sched self-detected stall on CPU rcu: \x0926-....: (20992 ticks this GP) idle=d7e/1/0x4000000000000002 softirq=81556231/81556231 fqs=4876 \x09(t=21033 jiffies g=159148529 q=223125) NMI backtrace for cpu 26 CPU: 26 PID: 2015853 Comm: bpftool Kdump: loaded Not tainted 5.6.13-0_fbk4_3876_gd8d1f9bf80bb #1 Hardware name: Quanta Twin Lakes MP/Twin Lakes Passive MP, BIOS F09_3A12 10/08/2018 Call Trace: dump_stack+0x50/0x70 nmi_cpu_backtrace.cold.6+0x13/0x50 ? lapic_can_unplug_cpu.cold.30+0x40/0x40 nmi_trigger_cpumask_backtrace+0xba/0xca rcu_dump_cpu_stacks+0x99/0xc7 rcu_sched_clock_irq.cold.90+0x1b4/0x3aa ? tick_sched_do_timer+0x60/0x60 update_process_times+0x24/0x50 tick_sched_timer+0x37/0x70 __hrtimer_run_queues+0xfe/0x270 hrtimer_interrupt+0xf4/0x210 smp_apic_timer_interrupt+0x5e/0x120 apic_timer_interrupt+0xf/0x20 RIP: 0010:get_pid_task+0x38/0x80 Code: 89 f6 48 8d 44 f7 08 48 8b 00 48 85 c0 74 2b 48 83 c6 55 48 c1 e6 04 48 29 f0 74 19 48 8d 78 20 ba 01 00 00 00 f0 0f c1 50 20 <85> d2 74 27 78 11 83 c2 01 78 0c 48 83 c4 08 c3 31 c0 48 83 c4 08 RSP: 0018:ffffc9000d293dc8 EFLAGS: 00000202 ORIG_RAX: ffffffffffffff13 RAX: ffff888637c05600 RBX: ffffc9000d293e0c RCX: 0000000000000000 RDX: 0000000000000001 RSI: 0000000000000550 RDI: ffff888637c05620 RBP: ffffffff8284eb80 R08: ffff88831341d300 R09: ffff88822ffd8248 R10: ffff88822ffd82d0 R11: 00000000003a93c0 R12: 0000000000000001 R13: 00000000ffffffff R14: ffff88831341d300 R15: 0000000000000000 ? find_ge_pid+0x1b/0x20 task_seq_get_next+0x52/0xc0 task_file_seq_get_next+0x159/0x220 task_file_seq_next+0x4f/0xa0 bpf_seq_read+0x159/0x390 vfs_read+0x8a/0x140 ksys_read+0x59/0xd0 do_syscall_64+0x42/0x110 entry_SYSCALL_64_after_hwframe+0x44/0xa9 RIP: 0033:0x7f95ae73e76e Code: Bad RIP value. 
RSP: 002b:00007ffc02c1dbf8 EFLAGS: 00000246 ORIG_RAX: 0000000000000000 RAX: ffffffffffffffda RBX: 000000000170faa0 RCX: 00007f95ae73e76e RDX: 0000000000001000 RSI: 00007ffc02c1dc30 RDI: 0000000000000007 RBP: 00007ffc02c1ec70 R08: 0000000000000005 R09: 0000000000000006 R10: fffffffffffff20b R11: 0000000000000246 R12: 00000000019112a0 R13: 0000000000000000 R14: 0000000000000007 R15: 00000000004283c0 If unable to obtain the file structure for the current task, proceed to the next task number after the one returned from task_seq_get_next(), instead of the next task number from the original iterator. Also, save the stopping task number from task_seq_get_next() on failure in case of restarts. Fixes: eaaacd23910f ("bpf: Add task and task/file iterator targets") Signed-off-by: Jonathan Lemon Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20201218185032.2464558-2-jonathan.lemon@gmail.com --- kernel/bpf/task_iter.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c index 0458a40edf10..8033ab19138a 100644 --- a/kernel/bpf/task_iter.c +++ b/kernel/bpf/task_iter.c @@ -158,13 +158,14 @@ again: if (!curr_task) { info->task = NULL; info->files = NULL; + info->tid = curr_tid; return NULL; } curr_files = get_files_struct(curr_task); if (!curr_files) { put_task_struct(curr_task); - curr_tid = ++(info->tid); + curr_tid = curr_tid + 1; info->fd = 0; goto again; } -- cgit v1.2.3 From a61daaf351da7c8493f2586437617d60c24350b0 Mon Sep 17 00:00:00 2001 From: Jonathan Lemon Date: Fri, 18 Dec 2020 10:50:31 -0800 Subject: bpf: Use thread_group_leader() Instead of directly comparing task->tgid and task->pid, use the thread_group_leader() helper. This helps with readability, and there should be no functional change. Signed-off-by: Jonathan Lemon Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20201218185032.2464558-3-jonathan.lemon@gmail.com --- kernel/bpf/task_iter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c index 8033ab19138a..dc4007f1843b 100644 --- a/kernel/bpf/task_iter.c +++ b/kernel/bpf/task_iter.c @@ -37,7 +37,7 @@ retry: if (!task) { ++*tid; goto retry; - } else if (skip_if_dup_files && task->tgid != task->pid && + } else if (skip_if_dup_files && !thread_group_leader(task) && task->files == task->group_leader->files) { put_task_struct(task); task = NULL; -- cgit v1.2.3 From 11cc92eb747aace5aa2b54b65b5cb8325a8981de Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Fri, 25 Dec 2020 22:30:58 +1100 Subject: genirq: Fix export of irq_to_desc() for powerpc KVM Commit 64a1b95bb9fe ("genirq: Restrict export of irq_to_desc()") removed the export of irq_to_desc() unless powerpc KVM is being built, because there is still a use of irq_to_desc() in modular code there. However it used: #ifdef CONFIG_KVM_BOOK3S_64_HV Which doesn't work when that symbol is =m, leading to a build failure: ERROR: modpost: "irq_to_desc" [arch/powerpc/kvm/kvm-hv.ko] undefined! Fix it by checking for the definedness of the correct symbol which is CONFIG_KVM_BOOK3S_64_HV_MODULE. 
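The underlying Kconfig rule is worth spelling out, since it is what bit the earlier commit: for a tristate option, =y defines the plain CONFIG_ symbol for the preprocessor, while =m defines only the _MODULE-suffixed symbol. A bare #ifdef on the plain symbol therefore silently misses the modular build, and the export only matters in the modular case anyway, because built-in code can reach irq_to_desc() without EXPORT_SYMBOL_GPL(). A generic illustration (CONFIG_FOO is a placeholder, not a real option):

    #ifdef CONFIG_FOO               /* defined only for CONFIG_FOO=y */
    /* built-in-only code */
    #endif

    #ifdef CONFIG_FOO_MODULE        /* defined only for CONFIG_FOO=m */
    /* module-build-only code -- the case the original check missed */
    #endif

    #if IS_ENABLED(CONFIG_FOO)      /* <linux/kconfig.h>: true for =y or =m */
    /* code wanted for both built-in and modular FOO */
    #endif
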
Fixes: 64a1b95bb9fe ("genirq: Restrict export of irq_to_desc()") Signed-off-by: Michael Ellerman Signed-off-by: Linus Torvalds --- kernel/irq/irqdesc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 3d0bc38a0bcf..cc1a09406c6e 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -352,7 +352,7 @@ struct irq_desc *irq_to_desc(unsigned int irq) { return radix_tree_lookup(&irq_desc_tree, irq); } -#ifdef CONFIG_KVM_BOOK3S_64_HV +#ifdef CONFIG_KVM_BOOK3S_64_HV_MODULE EXPORT_SYMBOL_GPL(irq_to_desc); #endif -- cgit v1.2.3 From b1b6b5a30dce872f500dc43f067cba8e7f86fc7d Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 30 Dec 2020 21:34:16 +0000 Subject: kernel/io_uring: cancel io_uring before task works For cancelling io_uring requests it needs either to be able to run currently enqueued task_works or having it shut down by that moment. Otherwise io_uring_cancel_files() may be waiting for requests that won't ever complete. Go with the first way and do cancellations before setting PF_EXITING and so before putting the task_work infrastructure into a transition state where task_work_run() would better not be called. Cc: stable@vger.kernel.org # 5.5+ Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/file.c | 2 -- kernel/exit.c | 2 ++ 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/fs/file.c b/fs/file.c index c0b60961c672..dab120b71e44 100644 --- a/fs/file.c +++ b/fs/file.c @@ -21,7 +21,6 @@ #include #include #include -#include unsigned int sysctl_nr_open __read_mostly = 1024*1024; unsigned int sysctl_nr_open_min = BITS_PER_LONG; @@ -428,7 +427,6 @@ void exit_files(struct task_struct *tsk) struct files_struct * files = tsk->files; if (files) { - io_uring_files_cancel(files); task_lock(tsk); tsk->files = NULL; task_unlock(tsk); diff --git a/kernel/exit.c b/kernel/exit.c index 3594291a8542..04029e35e69a 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -63,6 +63,7 @@ #include #include #include +#include #include #include @@ -776,6 +777,7 @@ void __noreturn do_exit(long code) schedule(); } + io_uring_files_cancel(tsk->files); exit_signals(tsk); /* sets PF_EXITING */ /* sync mm's RSS info before statistics gathering */ -- cgit v1.2.3 From 04901aab40ea3779f6fc6383ef74d8e130e817bf Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Wed, 30 Dec 2020 21:24:18 -0800 Subject: bpf: Fix a task_iter bug caused by a merge conflict resolution Latest bpf tree has a bug for bpf_iter selftest: $ ./test_progs -n 4/25 test_bpf_sk_storage_get:PASS:bpf_iter_bpf_sk_storage_helpers__open_and_load 0 nsec test_bpf_sk_storage_get:PASS:socket 0 nsec ... do_dummy_read:PASS:read 0 nsec test_bpf_sk_storage_get:FAIL:bpf_map_lookup_elem map value wasn't set correctly (expected 1792, got -1, err=0) #4/25 bpf_sk_storage_get:FAIL #4 bpf_iter:FAIL Summary: 0/0 PASSED, 0 SKIPPED, 2 FAILED When doing merge conflict resolution, Commit 4bfc4714849d missed to save curr_task to seq_file private data. The task pointer in seq_file private data is passed to bpf program. This caused NULL-pointer task passed to bpf program which will immediately return upon checking whether task pointer is NULL. This patch added back the assignment of curr_task to seq_file private data and fixed the issue. 
Fixes: 4bfc4714849d ("Merge git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf") Signed-off-by: Yonghong Song Signed-off-by: Daniel Borkmann Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20201231052418.577024-1-yhs@fb.com --- kernel/bpf/task_iter.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c index 3efe38191d1c..175b7b42bfc4 100644 --- a/kernel/bpf/task_iter.c +++ b/kernel/bpf/task_iter.c @@ -159,6 +159,7 @@ again: } /* set info->task and info->tid */ + info->task = curr_task; if (curr_tid == info->tid) { curr_fd = info->fd; } else { -- cgit v1.2.3 From 6bc335828056f3b301a3deadda782de4e8f0db08 Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Tue, 3 Nov 2020 09:25:57 -0500 Subject: rcu/tree: Make rcu_do_batch count how many callbacks were executed The rcu_do_batch() function extracts the ready-to-invoke callbacks from the rcu_segcblist located in the ->cblist field of the current CPU's rcu_data structure. These callbacks are first moved to a local (unsegmented) rcu_cblist. The rcu_do_batch() function then uses this rcu_cblist's ->len field to count how many CBs it has invoked, but it does so by counting that field down from zero. Finally, this function negates the value in this ->len field (resulting in a positive number) and subtracts the result from the ->len field of the current CPU's ->cblist field. Except that it is sometimes necessary for rcu_do_batch() to stop invoking callbacks mid-stream, despite there being more ready to invoke, for example, if a high-priority task wakes up. In this case the remaining not-yet-invoked callbacks are requeued back onto the CPU's ->cblist, but remain in the ready-to-invoke segment of that list. As above, the negative of the local rcu_cblist's ->len field is still subtracted from the ->len field of the current CPU's ->cblist field. The design of counting down from 0 is confusing and error-prone, plus use of a positive count will make it easier to provide a uniform and consistent API to deal with the per-segment counts that are added later in this series. For example, rcu_segcblist_extract_done_cbs() can unconditionally populate the resulting unsegmented list's ->len field during extraction. This commit therefore explicitly counts how many callbacks were executed in rcu_do_batch() itself, counting up from zero, and then uses that to update the per-CPU segcb list's ->len field, without relying on the downcounting of rcl->len from zero. Reviewed-by: Frederic Weisbecker Reviewed-by: Neeraj Upadhyay Signed-off-by: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney --- kernel/rcu/rcu_segcblist.c | 2 +- kernel/rcu/rcu_segcblist.h | 1 + kernel/rcu/tree.c | 11 +++++------ 3 files changed, 7 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c index 2d2a6b6b9dfb..bb246d8c6ef1 100644 --- a/kernel/rcu/rcu_segcblist.c +++ b/kernel/rcu/rcu_segcblist.c @@ -95,7 +95,7 @@ static void rcu_segcblist_set_len(struct rcu_segcblist *rsclp, long v) * This increase is fully ordered with respect to the callers accesses * both before and after. */ -static void rcu_segcblist_add_len(struct rcu_segcblist *rsclp, long v) +void rcu_segcblist_add_len(struct rcu_segcblist *rsclp, long v) { #ifdef CONFIG_RCU_NOCB_CPU smp_mb__before_atomic(); /* Up to the caller! 
*/ diff --git a/kernel/rcu/rcu_segcblist.h b/kernel/rcu/rcu_segcblist.h index 492262bcb591..1d2d61406463 100644 --- a/kernel/rcu/rcu_segcblist.h +++ b/kernel/rcu/rcu_segcblist.h @@ -76,6 +76,7 @@ static inline bool rcu_segcblist_restempty(struct rcu_segcblist *rsclp, int seg) } void rcu_segcblist_inc_len(struct rcu_segcblist *rsclp); +void rcu_segcblist_add_len(struct rcu_segcblist *rsclp, long v); void rcu_segcblist_init(struct rcu_segcblist *rsclp); void rcu_segcblist_disable(struct rcu_segcblist *rsclp); void rcu_segcblist_offload(struct rcu_segcblist *rsclp); diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 40e5e3dd253e..cc6f379c7ddf 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2434,7 +2434,7 @@ static void rcu_do_batch(struct rcu_data *rdp) const bool offloaded = rcu_segcblist_is_offloaded(&rdp->cblist); struct rcu_head *rhp; struct rcu_cblist rcl = RCU_CBLIST_INITIALIZER(rcl); - long bl, count; + long bl, count = 0; long pending, tlimit = 0; /* If no callbacks are ready, just return. */ @@ -2479,6 +2479,7 @@ static void rcu_do_batch(struct rcu_data *rdp) for (; rhp; rhp = rcu_cblist_dequeue(&rcl)) { rcu_callback_t f; + count++; debug_rcu_head_unqueue(rhp); rcu_lock_acquire(&rcu_callback_map); @@ -2492,15 +2493,14 @@ static void rcu_do_batch(struct rcu_data *rdp) /* * Stop only if limit reached and CPU has something to do. - * Note: The rcl structure counts down from zero. */ - if (-rcl.len >= bl && !offloaded && + if (count >= bl && !offloaded && (need_resched() || (!is_idle_task(current) && !rcu_is_callbacks_kthread()))) break; if (unlikely(tlimit)) { /* only call local_clock() every 32 callbacks */ - if (likely((-rcl.len & 31) || local_clock() < tlimit)) + if (likely((count & 31) || local_clock() < tlimit)) continue; /* Exceeded the time limit, so leave. */ break; @@ -2517,7 +2517,6 @@ static void rcu_do_batch(struct rcu_data *rdp) local_irq_save(flags); rcu_nocb_lock(rdp); - count = -rcl.len; rdp->n_cbs_invoked += count; trace_rcu_batch_end(rcu_state.name, count, !!rcl.head, need_resched(), is_idle_task(current), rcu_is_callbacks_kthread()); @@ -2525,7 +2524,7 @@ static void rcu_do_batch(struct rcu_data *rdp) /* Update counts and requeue any remaining callbacks. */ rcu_segcblist_insert_done_cbs(&rdp->cblist, &rcl); smp_mb(); /* List handling before counting for rcu_barrier(). */ - rcu_segcblist_insert_count(&rdp->cblist, &rcl); + rcu_segcblist_add_len(&rdp->cblist, -count); /* Reinstate batch limit if we have worked down the excess. */ count = rcu_segcblist_n_cbs(&rdp->cblist); -- cgit v1.2.3 From 84109ab58590dc6c4e7eb36329fdc7ec121ed5a5 Mon Sep 17 00:00:00 2001 From: Zqiang Date: Fri, 20 Nov 2020 06:53:11 -0800 Subject: rcu: Record kvfree_call_rcu() call stack for KASAN This commit adds a call to kasan_record_aux_stack() in kvfree_call_rcu() in order to record the call stack of the code that caused the object to be freed. Please note that this function does not update the allocated/freed state, which is important because RCU readers might still be referencing this object. Acked-by: Dmitry Vyukov Reviewed-by: Uladzislau Rezki (Sony) Signed-off-by: Zqiang Signed-off-by: Paul E. 
McKenney --- kernel/rcu/tree.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 40e5e3dd253e..2db736cbe342 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3498,6 +3498,7 @@ void kvfree_call_rcu(struct rcu_head *head, rcu_callback_t func) goto unlock_return; } + kasan_record_aux_stack(ptr); success = kvfree_call_rcu_add_ptr_to_bulk(krcp, ptr); if (!success) { run_page_cache_worker(krcp); -- cgit v1.2.3 From 2341bc4a0311e4319ced6c2828bb19309dee74fd Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Tue, 15 Dec 2020 15:16:45 +0100 Subject: rcu: Make RCU_BOOST default on CONFIG_PREEMPT_RT On PREEMPT_RT kernels, RCU callbacks are deferred to the `rcuc' kthread. This can stall RCU grace periods due to lengthy preemption not only of RCU readers but also of 'rcuc' kthreads, either of which prevent grace periods from completing, which can in turn result in OOM. Because PREEMPT_RT kernels have more kthreads that can block grace periods, it is more important for such kernels to enable RCU_BOOST. This commit therefore makes RCU_BOOST the default on PREEMPT_RT. RCU_BOOST can still be manually disabled if need be. Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Paul E. McKenney --- kernel/rcu/Kconfig | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig index cdc57b4f6d48..aa8cc8c977e7 100644 --- a/kernel/rcu/Kconfig +++ b/kernel/rcu/Kconfig @@ -188,8 +188,8 @@ config RCU_FAST_NO_HZ config RCU_BOOST bool "Enable RCU priority boosting" - depends on RT_MUTEXES && PREEMPT_RCU && RCU_EXPERT - default n + depends on (RT_MUTEXES && PREEMPT_RCU && RCU_EXPERT) || PREEMPT_RT + default y if PREEMPT_RT help This option boosts the priority of preempted RCU readers that block the current preemptible RCU grace period for too long. -- cgit v1.2.3 From 8b9a0ecc7ef5e1ed3afbc926de17399a37128c82 Mon Sep 17 00:00:00 2001 From: Scott Wood Date: Tue, 15 Dec 2020 15:16:46 +0100 Subject: rcu: Unconditionally use rcuc threads on PREEMPT_RT PREEMPT_RT systems have long used the rcutree.use_softirq kernel boot parameter to avoid use of RCU_SOFTIRQ handlers, which can disrupt real-time applications by invoking callbacks during return from interrupts that arrived while executing time-critical code. This kernel boot parameter instead runs RCU core processing in an 'rcuc' kthread, thus allowing the scheduler to do its job of avoiding disrupting time-critical code. This commit therefore disables the rcutree.use_softirq kernel boot parameter on PREEMPT_RT systems, thus forcing such systems to do RCU core processing in 'rcuc' kthreads. This approach has long been in use by users of the -rt patchset, and there have been no complaints. There is therefore no way for the system administrator to override this choice, at least without modifying and rebuilding the kernel. Signed-off-by: Scott Wood [bigeasy: Reword commit message] Signed-off-by: Sebastian Andrzej Siewior [ paulmck: Update kernel-parameters.txt accordingly. ] Signed-off-by: Paul E. 
McKenney --- Documentation/admin-guide/kernel-parameters.txt | 4 ++++ kernel/rcu/tree.c | 4 +++- 2 files changed, 7 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index c722ec19cd00..521255f4ef13 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -4092,6 +4092,10 @@ value, meaning that RCU_SOFTIRQ is used by default. Specify rcutree.use_softirq=0 to use rcuc kthreads. + But note that CONFIG_PREEMPT_RT=y kernels disable + this kernel boot parameter, forcibly setting it + to zero. + rcutree.rcu_fanout_exact= [KNL] Disable autobalancing of the rcu_node combining tree. This is used by rcutorture, and might diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 40e5e3dd253e..d60903581300 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -100,8 +100,10 @@ static struct rcu_state rcu_state = { static bool dump_tree; module_param(dump_tree, bool, 0444); /* By default, use RCU_SOFTIRQ instead of rcuc kthreads. */ -static bool use_softirq = true; +static bool use_softirq = !IS_ENABLED(CONFIG_PREEMPT_RT); +#ifndef CONFIG_PREEMPT_RT module_param(use_softirq, bool, 0444); +#endif /* Control rcu_node-tree auto-balancing at boot time. */ static bool rcu_fanout_exact; module_param(rcu_fanout_exact, bool, 0444); -- cgit v1.2.3 From 36221e109eb20ac111bc3bf3e8d5639aa457c7e0 Mon Sep 17 00:00:00 2001 From: Julia Cartwright Date: Tue, 15 Dec 2020 15:16:47 +0100 Subject: rcu: Enable rcu_normal_after_boot unconditionally for RT Expedited RCU grace periods send IPIs to all non-idle CPUs, and thus can disrupt time-critical code in real-time applications. However, there is a portion of boot-time processing (presumably before any real-time applications have started) where expedited RCU grace periods are the only option. And so it is that experience with the -rt patchset indicates that PREEMPT_RT systems should always set the rcupdate.rcu_normal_after_boot kernel boot parameter. This commit therefore makes the post-boot application environment safe for real-time applications by making PREEMPT_RT systems disable the rcupdate.rcu_normal_after_boot kernel boot parameter and acting as if this parameter had been set. This means that post-boot calls to synchronize_rcu_expedited() will be treated as if they were instead calls to synchronize_rcu(), thus preventing the IPIs, and thus avoiding disrupting real-time applications. Suggested-by: Luiz Capitulino Acked-by: Paul E. McKenney Signed-off-by: Julia Cartwright Signed-off-by: Sebastian Andrzej Siewior [ paulmck: Update kernel-parameters.txt accordingly. ] Signed-off-by: Paul E. McKenney --- Documentation/admin-guide/kernel-parameters.txt | 7 +++++++ kernel/rcu/update.c | 4 +++- 2 files changed, 10 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 521255f4ef13..e0008d9caccb 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -4474,6 +4474,13 @@ only normal grace-period primitives. No effect on CONFIG_TINY_RCU kernels. + But note that CONFIG_PREEMPT_RT=y kernels enables + this kernel boot parameter, forcibly setting + it to the value one, that is, converting any + post-boot attempt at an expedited RCU grace + period to instead use normal non-expedited + grace-period processing. 
+ rcupdate.rcu_task_ipi_delay= [KNL] Set time in jiffies during which RCU tasks will avoid sending IPIs, starting with the beginning diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 39334d2d2b37..b95ae86c40a7 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -56,8 +56,10 @@ #ifndef CONFIG_TINY_RCU module_param(rcu_expedited, int, 0); module_param(rcu_normal, int, 0); -static int rcu_normal_after_boot; +static int rcu_normal_after_boot = IS_ENABLED(CONFIG_PREEMPT_RT); +#ifndef CONFIG_PREEMPT_RT module_param(rcu_normal_after_boot, int, 0); +#endif #endif /* #ifndef CONFIG_TINY_RCU */ #ifdef CONFIG_DEBUG_LOCK_ALLOC -- cgit v1.2.3 From 74612a07b83fc46c2b2e6f71a541d55b024ebefc Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 12 Nov 2020 16:34:09 -0800 Subject: srcu: Make Tiny SRCU use multi-bit grace-period counter There is a need for a polling interface for SRCU grace periods. This polling needs to distinguish between an SRCU instance being idle on the one hand or in the middle of a grace period on the other. This commit therefore converts the Tiny SRCU srcu_struct structure's srcu_idx from a defacto boolean to a free-running counter, using the bottom bit to indicate that a grace period is in progress. The second-from-bottom bit is thus used as the index returned by srcu_read_lock(). Link: https://lore.kernel.org/rcu/20201112201547.GF3365678@moria.home.lan/ Reported-by: Kent Overstreet [ paulmck: Fix ->srcu_lock_nesting[] indexing per Neeraj Upadhyay. ] Reviewed-by: Neeraj Upadhyay Signed-off-by: Paul E. McKenney --- include/linux/srcutiny.h | 6 +++--- kernel/rcu/srcutiny.c | 5 +++-- 2 files changed, 6 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/include/linux/srcutiny.h b/include/linux/srcutiny.h index 5a5a1941ca15..b8b42d0a24f1 100644 --- a/include/linux/srcutiny.h +++ b/include/linux/srcutiny.h @@ -15,7 +15,7 @@ struct srcu_struct { short srcu_lock_nesting[2]; /* srcu_read_lock() nesting depth. */ - short srcu_idx; /* Current reader array element. */ + unsigned short srcu_idx; /* Current reader array element in bit 0x2. */ u8 srcu_gp_running; /* GP workqueue running? */ u8 srcu_gp_waiting; /* GP waiting for readers? */ struct swait_queue_head srcu_wq; @@ -59,7 +59,7 @@ static inline int __srcu_read_lock(struct srcu_struct *ssp) { int idx; - idx = READ_ONCE(ssp->srcu_idx); + idx = ((READ_ONCE(ssp->srcu_idx) + 1) & 0x2) >> 1; WRITE_ONCE(ssp->srcu_lock_nesting[idx], ssp->srcu_lock_nesting[idx] + 1); return idx; } @@ -80,7 +80,7 @@ static inline void srcu_torture_stats_print(struct srcu_struct *ssp, { int idx; - idx = READ_ONCE(ssp->srcu_idx) & 0x1; + idx = ((READ_ONCE(ssp->srcu_idx) + 1) & 0x2) >> 1; pr_alert("%s%s Tiny SRCU per-CPU(idx=%d): (%hd,%hd)\n", tt, tf, idx, READ_ONCE(ssp->srcu_lock_nesting[!idx]), diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c index 6208c1dae5c9..5598cf6f1edc 100644 --- a/kernel/rcu/srcutiny.c +++ b/kernel/rcu/srcutiny.c @@ -124,11 +124,12 @@ void srcu_drive_gp(struct work_struct *wp) ssp->srcu_cb_head = NULL; ssp->srcu_cb_tail = &ssp->srcu_cb_head; local_irq_enable(); - idx = ssp->srcu_idx; - WRITE_ONCE(ssp->srcu_idx, !ssp->srcu_idx); + idx = (ssp->srcu_idx & 0x2) / 2; + WRITE_ONCE(ssp->srcu_idx, ssp->srcu_idx + 1); WRITE_ONCE(ssp->srcu_gp_waiting, true); /* srcu_read_unlock() wakes! */ swait_event_exclusive(ssp->srcu_wq, !READ_ONCE(ssp->srcu_lock_nesting[idx])); WRITE_ONCE(ssp->srcu_gp_waiting, false); /* srcu_read_unlock() cheap. 
*/ + WRITE_ONCE(ssp->srcu_idx, ssp->srcu_idx + 1); /* Invoke the callbacks we removed above. */ while (lh) { -- cgit v1.2.3 From 1a893c711a600ab57526619b56e6f6b7be00956e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 13 Nov 2020 09:37:39 -0800 Subject: srcu: Provide internal interface to start a Tiny SRCU grace period There is a need for a polling interface for SRCU grace periods. This polling needs to initiate an SRCU grace period without having to queue (and manage) a callback. This commit therefore splits the Tiny SRCU call_srcu() function into callback-queuing and start-grace-period portions, with the latter in a new function named srcu_gp_start_if_needed(). Link: https://lore.kernel.org/rcu/20201112201547.GF3365678@moria.home.lan/ Reported-by: Kent Overstreet Reviewed-by: Neeraj Upadhyay Signed-off-by: Paul E. McKenney --- kernel/rcu/srcutiny.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c index 5598cf6f1edc..3bac1db85a85 100644 --- a/kernel/rcu/srcutiny.c +++ b/kernel/rcu/srcutiny.c @@ -152,6 +152,16 @@ void srcu_drive_gp(struct work_struct *wp) } EXPORT_SYMBOL_GPL(srcu_drive_gp); +static void srcu_gp_start_if_needed(struct srcu_struct *ssp) +{ + if (!READ_ONCE(ssp->srcu_gp_running)) { + if (likely(srcu_init_done)) + schedule_work(&ssp->srcu_work); + else if (list_empty(&ssp->srcu_work.entry)) + list_add(&ssp->srcu_work.entry, &srcu_boot_list); + } +} + /* * Enqueue an SRCU callback on the specified srcu_struct structure, * initiating grace-period processing if it is not already running. @@ -167,12 +177,7 @@ void call_srcu(struct srcu_struct *ssp, struct rcu_head *rhp, *ssp->srcu_cb_tail = rhp; ssp->srcu_cb_tail = &rhp->next; local_irq_restore(flags); - if (!READ_ONCE(ssp->srcu_gp_running)) { - if (likely(srcu_init_done)) - schedule_work(&ssp->srcu_work); - else if (list_empty(&ssp->srcu_work.entry)) - list_add(&ssp->srcu_work.entry, &srcu_boot_list); - } + srcu_gp_start_if_needed(ssp); } EXPORT_SYMBOL_GPL(call_srcu); -- cgit v1.2.3 From 29d2bb94a8a126ce80ffbb433b648b32fdea524e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 13 Nov 2020 10:08:09 -0800 Subject: srcu: Provide internal interface to start a Tree SRCU grace period There is a need for a polling interface for SRCU grace periods. This polling needs to initiate an SRCU grace period without having to queue (and manage) a callback. This commit therefore splits the Tree SRCU __call_srcu() function into callback-initialization and queuing/start-grace-period portions, with the latter in a new function named srcu_gp_start_if_needed(). This function may be passed a NULL callback pointer, in which case it will refrain from queuing anything. Why have the new function mess with queuing? Locking considerations, of course! Link: https://lore.kernel.org/rcu/20201112201547.GF3365678@moria.home.lan/ Reported-by: Kent Overstreet Reviewed-by: Neeraj Upadhyay Signed-off-by: Paul E. McKenney --- kernel/rcu/srcutree.c | 66 +++++++++++++++++++++++++++++---------------------- 1 file changed, 37 insertions(+), 29 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 0f23d20d485a..9a7b6505fdc8 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -807,6 +807,42 @@ static void srcu_leak_callback(struct rcu_head *rhp) { } +/* + * Start an SRCU grace period, and also queue the callback if non-NULL. 
+ */ +static void srcu_gp_start_if_needed(struct srcu_struct *ssp, struct rcu_head *rhp, bool do_norm) +{ + unsigned long flags; + int idx; + bool needexp = false; + bool needgp = false; + unsigned long s; + struct srcu_data *sdp; + + idx = srcu_read_lock(ssp); + sdp = raw_cpu_ptr(ssp->sda); + spin_lock_irqsave_rcu_node(sdp, flags); + rcu_segcblist_enqueue(&sdp->srcu_cblist, rhp); + rcu_segcblist_advance(&sdp->srcu_cblist, + rcu_seq_current(&ssp->srcu_gp_seq)); + s = rcu_seq_snap(&ssp->srcu_gp_seq); + (void)rcu_segcblist_accelerate(&sdp->srcu_cblist, s); + if (ULONG_CMP_LT(sdp->srcu_gp_seq_needed, s)) { + sdp->srcu_gp_seq_needed = s; + needgp = true; + } + if (!do_norm && ULONG_CMP_LT(sdp->srcu_gp_seq_needed_exp, s)) { + sdp->srcu_gp_seq_needed_exp = s; + needexp = true; + } + spin_unlock_irqrestore_rcu_node(sdp, flags); + if (needgp) + srcu_funnel_gp_start(ssp, sdp, s, do_norm); + else if (needexp) + srcu_funnel_exp_start(ssp, sdp->mynode, s); + srcu_read_unlock(ssp, idx); +} + /* * Enqueue an SRCU callback on the srcu_data structure associated with * the current CPU and the specified srcu_struct structure, initiating @@ -838,13 +874,6 @@ static void srcu_leak_callback(struct rcu_head *rhp) static void __call_srcu(struct srcu_struct *ssp, struct rcu_head *rhp, rcu_callback_t func, bool do_norm) { - unsigned long flags; - int idx; - bool needexp = false; - bool needgp = false; - unsigned long s; - struct srcu_data *sdp; - check_init_srcu_struct(ssp); if (debug_rcu_head_queue(rhp)) { /* Probable double call_srcu(), so leak the callback. */ @@ -853,28 +882,7 @@ static void __call_srcu(struct srcu_struct *ssp, struct rcu_head *rhp, return; } rhp->func = func; - idx = srcu_read_lock(ssp); - sdp = raw_cpu_ptr(ssp->sda); - spin_lock_irqsave_rcu_node(sdp, flags); - rcu_segcblist_enqueue(&sdp->srcu_cblist, rhp); - rcu_segcblist_advance(&sdp->srcu_cblist, - rcu_seq_current(&ssp->srcu_gp_seq)); - s = rcu_seq_snap(&ssp->srcu_gp_seq); - (void)rcu_segcblist_accelerate(&sdp->srcu_cblist, s); - if (ULONG_CMP_LT(sdp->srcu_gp_seq_needed, s)) { - sdp->srcu_gp_seq_needed = s; - needgp = true; - } - if (!do_norm && ULONG_CMP_LT(sdp->srcu_gp_seq_needed_exp, s)) { - sdp->srcu_gp_seq_needed_exp = s; - needexp = true; - } - spin_unlock_irqrestore_rcu_node(sdp, flags); - if (needgp) - srcu_funnel_gp_start(ssp, sdp, s, do_norm); - else if (needexp) - srcu_funnel_exp_start(ssp, sdp->mynode, s); - srcu_read_unlock(ssp, idx); + srcu_gp_start_if_needed(ssp, rhp, do_norm); } /** -- cgit v1.2.3 From 8b5bd67cf6422b63ee100d76d8de8960ca2df7f0 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 13 Nov 2020 12:54:48 -0800 Subject: srcu: Provide polling interfaces for Tiny SRCU grace periods There is a need for a polling interface for SRCU grace periods, so this commit supplies get_state_synchronize_srcu(), start_poll_synchronize_srcu(), and poll_state_synchronize_srcu() for this purpose. The first can be used if future grace periods are inevitable (perhaps due to a later call_srcu() invocation), the second if future grace periods might not otherwise happen, and the third to check if a grace period has elapsed since the corresponding call to either of the first two. As with get_state_synchronize_rcu() and cond_synchronize_rcu(), the return value from either get_state_synchronize_srcu() or start_poll_synchronize_srcu() must be passed in to a later call to poll_state_synchronize_srcu(). 
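As an illustration of the intended calling pattern, here is a minimal sketch (not part of this patch): an updater records a cookie when retiring an object and later polls that cookie before freeing, without blocking and without queuing a callback. The struct my_obj, my_obj_retire(), my_obj_try_free(), and my_obj_free() names are made up for this example.

#include <linux/srcu.h>

struct my_obj {
	unsigned long gp_cookie;	/* Cookie from start_poll_synchronize_srcu(). */
	/* ... payload ... */
};

extern void my_obj_free(struct my_obj *p);	/* Assumed to be defined elsewhere. */

/* Retire an object: make sure a grace period gets started, but do not wait for it. */
static void my_obj_retire(struct srcu_struct *ssp, struct my_obj *p)
{
	p->gp_cookie = start_poll_synchronize_srcu(ssp);
}

/* Later, for example from a periodic worker: free the object once all pre-existing readers are done. */
static bool my_obj_try_free(struct srcu_struct *ssp, struct my_obj *p)
{
	if (!poll_state_synchronize_srcu(ssp, p->gp_cookie))
		return false;	/* Grace period not yet over; poll again later. */
	my_obj_free(p);
	return true;
}

If a later call_srcu() invocation is guaranteed, get_state_synchronize_srcu() can be used in place of start_poll_synchronize_srcu(), avoiding the cost of starting a grace period at this point.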
Link: https://lore.kernel.org/rcu/20201112201547.GF3365678@moria.home.lan/ Reported-by: Kent Overstreet [ paulmck: Add EXPORT_SYMBOL_GPL() per kernel test robot feedback. ] [ paulmck: Apply feedback from Neeraj Upadhyay. ] Link: https://lore.kernel.org/lkml/20201117004017.GA7444@paulmck-ThinkPad-P72/ Reviewed-by: Neeraj Upadhyay Signed-off-by: Paul E. McKenney --- include/linux/rcupdate.h | 2 ++ include/linux/srcu.h | 3 +++ include/linux/srcutiny.h | 1 + kernel/rcu/srcutiny.c | 55 ++++++++++++++++++++++++++++++++++++++++++++++-- 4 files changed, 59 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index de0826411311..e09c0d87b3c3 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -33,6 +33,8 @@ #define ULONG_CMP_GE(a, b) (ULONG_MAX / 2 >= (a) - (b)) #define ULONG_CMP_LT(a, b) (ULONG_MAX / 2 < (a) - (b)) #define ulong2long(a) (*(long *)(&(a))) +#define USHORT_CMP_GE(a, b) (USHRT_MAX / 2 >= (unsigned short)((a) - (b))) +#define USHORT_CMP_LT(a, b) (USHRT_MAX / 2 < (unsigned short)((a) - (b))) /* Exported common interfaces */ void call_rcu(struct rcu_head *head, rcu_callback_t func); diff --git a/include/linux/srcu.h b/include/linux/srcu.h index e432cc92c73d..a0895bbf71ce 100644 --- a/include/linux/srcu.h +++ b/include/linux/srcu.h @@ -60,6 +60,9 @@ void cleanup_srcu_struct(struct srcu_struct *ssp); int __srcu_read_lock(struct srcu_struct *ssp) __acquires(ssp); void __srcu_read_unlock(struct srcu_struct *ssp, int idx) __releases(ssp); void synchronize_srcu(struct srcu_struct *ssp); +unsigned long get_state_synchronize_srcu(struct srcu_struct *ssp); +unsigned long start_poll_synchronize_srcu(struct srcu_struct *ssp); +bool poll_state_synchronize_srcu(struct srcu_struct *ssp, unsigned long cookie); #ifdef CONFIG_DEBUG_LOCK_ALLOC diff --git a/include/linux/srcutiny.h b/include/linux/srcutiny.h index b8b42d0a24f1..0e0cf4d6a72a 100644 --- a/include/linux/srcutiny.h +++ b/include/linux/srcutiny.h @@ -16,6 +16,7 @@ struct srcu_struct { short srcu_lock_nesting[2]; /* srcu_read_lock() nesting depth. */ unsigned short srcu_idx; /* Current reader array element in bit 0x2. */ + unsigned short srcu_idx_max; /* Furthest future srcu_idx request. */ u8 srcu_gp_running; /* GP workqueue running? */ u8 srcu_gp_waiting; /* GP waiting for readers? */ struct swait_queue_head srcu_wq; diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c index 3bac1db85a85..26344dc6483b 100644 --- a/kernel/rcu/srcutiny.c +++ b/kernel/rcu/srcutiny.c @@ -34,6 +34,7 @@ static int init_srcu_struct_fields(struct srcu_struct *ssp) ssp->srcu_gp_running = false; ssp->srcu_gp_waiting = false; ssp->srcu_idx = 0; + ssp->srcu_idx_max = 0; INIT_WORK(&ssp->srcu_work, srcu_drive_gp); INIT_LIST_HEAD(&ssp->srcu_work.entry); return 0; @@ -84,6 +85,8 @@ void cleanup_srcu_struct(struct srcu_struct *ssp) WARN_ON(ssp->srcu_gp_waiting); WARN_ON(ssp->srcu_cb_head); WARN_ON(&ssp->srcu_cb_head != ssp->srcu_cb_tail); + WARN_ON(ssp->srcu_idx != ssp->srcu_idx_max); + WARN_ON(ssp->srcu_idx & 0x1); } EXPORT_SYMBOL_GPL(cleanup_srcu_struct); @@ -114,7 +117,7 @@ void srcu_drive_gp(struct work_struct *wp) struct srcu_struct *ssp; ssp = container_of(wp, struct srcu_struct, srcu_work); - if (ssp->srcu_gp_running || !READ_ONCE(ssp->srcu_cb_head)) + if (ssp->srcu_gp_running || USHORT_CMP_GE(ssp->srcu_idx, READ_ONCE(ssp->srcu_idx_max))) return; /* Already running or nothing to do. */ /* Remove recently arrived callbacks and wait for readers. 
*/ @@ -147,13 +150,19 @@ void srcu_drive_gp(struct work_struct *wp) * straighten that out. */ WRITE_ONCE(ssp->srcu_gp_running, false); - if (READ_ONCE(ssp->srcu_cb_head)) + if (USHORT_CMP_LT(ssp->srcu_idx, READ_ONCE(ssp->srcu_idx_max))) schedule_work(&ssp->srcu_work); } EXPORT_SYMBOL_GPL(srcu_drive_gp); static void srcu_gp_start_if_needed(struct srcu_struct *ssp) { + unsigned short cookie; + + cookie = get_state_synchronize_srcu(ssp); + if (USHORT_CMP_GE(READ_ONCE(ssp->srcu_idx_max), cookie)) + return; + WRITE_ONCE(ssp->srcu_idx_max, cookie); if (!READ_ONCE(ssp->srcu_gp_running)) { if (likely(srcu_init_done)) schedule_work(&ssp->srcu_work); @@ -196,6 +205,48 @@ void synchronize_srcu(struct srcu_struct *ssp) } EXPORT_SYMBOL_GPL(synchronize_srcu); +/* + * get_state_synchronize_srcu - Provide an end-of-grace-period cookie + */ +unsigned long get_state_synchronize_srcu(struct srcu_struct *ssp) +{ + unsigned long ret; + + barrier(); + ret = (READ_ONCE(ssp->srcu_idx) + 3) & ~0x1; + barrier(); + return ret & USHRT_MAX; +} +EXPORT_SYMBOL_GPL(get_state_synchronize_srcu); + +/* + * start_poll_synchronize_srcu - Provide cookie and start grace period + * + * The difference between this and get_state_synchronize_srcu() is that + * this function ensures that the poll_state_synchronize_srcu() will + * eventually return the value true. + */ +unsigned long start_poll_synchronize_srcu(struct srcu_struct *ssp) +{ + unsigned long ret = get_state_synchronize_srcu(ssp); + + srcu_gp_start_if_needed(ssp); + return ret; +} +EXPORT_SYMBOL_GPL(start_poll_synchronize_srcu); + +/* + * poll_state_synchronize_srcu - Has cookie's grace period ended? + */ +bool poll_state_synchronize_srcu(struct srcu_struct *ssp, unsigned long cookie) +{ + bool ret = USHORT_CMP_GE(READ_ONCE(ssp->srcu_idx), cookie); + + barrier(); + return ret; +} +EXPORT_SYMBOL_GPL(poll_state_synchronize_srcu); + /* Lockdep diagnostics. */ void __init rcu_scheduler_starting(void) { -- cgit v1.2.3 From 5358c9fa54b09b5d3d7811b033aa0838c1bbaaf2 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 13 Nov 2020 17:31:55 -0800 Subject: srcu: Provide polling interfaces for Tree SRCU grace periods There is a need for a polling interface for SRCU grace periods, so this commit supplies get_state_synchronize_srcu(), start_poll_synchronize_srcu(), and poll_state_synchronize_srcu() for this purpose. The first can be used if future grace periods are inevitable (perhaps due to a later call_srcu() invocation), the second if future grace periods might not otherwise happen, and the third to check if a grace period has elapsed since the corresponding call to either of the first two. As with get_state_synchronize_rcu() and cond_synchronize_rcu(), the return value from either get_state_synchronize_srcu() or start_poll_synchronize_srcu() must be passed in to a later call to poll_state_synchronize_srcu(). Link: https://lore.kernel.org/rcu/20201112201547.GF3365678@moria.home.lan/ Reported-by: Kent Overstreet [ paulmck: Add EXPORT_SYMBOL_GPL() per kernel test robot feedback. ] [ paulmck: Apply feedback from Neeraj Upadhyay. ] Link: https://lore.kernel.org/lkml/20201117004017.GA7444@paulmck-ThinkPad-P72/ Reviewed-by: Neeraj Upadhyay Signed-off-by: Paul E. 
McKenney --- kernel/rcu/srcutree.c | 67 ++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 63 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 9a7b6505fdc8..c5d0c036fab5 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -810,7 +810,8 @@ static void srcu_leak_callback(struct rcu_head *rhp) /* * Start an SRCU grace period, and also queue the callback if non-NULL. */ -static void srcu_gp_start_if_needed(struct srcu_struct *ssp, struct rcu_head *rhp, bool do_norm) +static unsigned long srcu_gp_start_if_needed(struct srcu_struct *ssp, + struct rcu_head *rhp, bool do_norm) { unsigned long flags; int idx; @@ -819,10 +820,12 @@ static void srcu_gp_start_if_needed(struct srcu_struct *ssp, struct rcu_head *rh unsigned long s; struct srcu_data *sdp; + check_init_srcu_struct(ssp); idx = srcu_read_lock(ssp); sdp = raw_cpu_ptr(ssp->sda); spin_lock_irqsave_rcu_node(sdp, flags); - rcu_segcblist_enqueue(&sdp->srcu_cblist, rhp); + if (rhp) + rcu_segcblist_enqueue(&sdp->srcu_cblist, rhp); rcu_segcblist_advance(&sdp->srcu_cblist, rcu_seq_current(&ssp->srcu_gp_seq)); s = rcu_seq_snap(&ssp->srcu_gp_seq); @@ -841,6 +844,7 @@ static void srcu_gp_start_if_needed(struct srcu_struct *ssp, struct rcu_head *rh else if (needexp) srcu_funnel_exp_start(ssp, sdp->mynode, s); srcu_read_unlock(ssp, idx); + return s; } /* @@ -874,7 +878,6 @@ static void srcu_gp_start_if_needed(struct srcu_struct *ssp, struct rcu_head *rh static void __call_srcu(struct srcu_struct *ssp, struct rcu_head *rhp, rcu_callback_t func, bool do_norm) { - check_init_srcu_struct(ssp); if (debug_rcu_head_queue(rhp)) { /* Probable double call_srcu(), so leak the callback. */ WRITE_ONCE(rhp->func, srcu_leak_callback); @@ -882,7 +885,7 @@ static void __call_srcu(struct srcu_struct *ssp, struct rcu_head *rhp, return; } rhp->func = func; - srcu_gp_start_if_needed(ssp, rhp, do_norm); + (void)srcu_gp_start_if_needed(ssp, rhp, do_norm); } /** @@ -1011,6 +1014,62 @@ void synchronize_srcu(struct srcu_struct *ssp) } EXPORT_SYMBOL_GPL(synchronize_srcu); +/** + * get_state_synchronize_srcu - Provide an end-of-grace-period cookie + * @ssp: srcu_struct to provide cookie for. + * + * This function returns a cookie that can be passed to + * poll_state_synchronize_srcu(), which will return true if a full grace + * period has elapsed in the meantime. It is the caller's responsibility + * to make sure that grace period happens, for example, by invoking + * call_srcu() after return from get_state_synchronize_srcu(). + */ +unsigned long get_state_synchronize_srcu(struct srcu_struct *ssp) +{ + // Any prior manipulation of SRCU-protected data must happen + // before the load from ->srcu_gp_seq. + smp_mb(); + return rcu_seq_snap(&ssp->srcu_gp_seq); +} +EXPORT_SYMBOL_GPL(get_state_synchronize_srcu); + +/** + * start_poll_synchronize_srcu - Provide cookie and start grace period + * @ssp: srcu_struct to provide cookie for. + * + * This function returns a cookie that can be passed to + * poll_state_synchronize_srcu(), which will return true if a full grace + * period has elapsed in the meantime. Unlike get_state_synchronize_srcu(), + * this function also ensures that any needed SRCU grace period will be + * started. This convenience does come at a cost in terms of CPU overhead. 
+ */ +unsigned long start_poll_synchronize_srcu(struct srcu_struct *ssp) +{ + return srcu_gp_start_if_needed(ssp, NULL, true); +} +EXPORT_SYMBOL_GPL(start_poll_synchronize_srcu); + +/** + * poll_state_synchronize_srcu - Has cookie's grace period ended? + * @ssp: srcu_struct to provide cookie for. + * @cookie: Return value from get_state_synchronize_srcu() or start_poll_synchronize_srcu(). + * + * This function takes the cookie that was returned from either + * get_state_synchronize_srcu() or start_poll_synchronize_srcu(), and + * returns @true if an SRCU grace period elapsed since the time that the + * cookie was created. + */ +bool poll_state_synchronize_srcu(struct srcu_struct *ssp, unsigned long cookie) +{ + if (!rcu_seq_done(&ssp->srcu_gp_seq, cookie)) + return false; + // Ensure that the end of the SRCU grace period happens before + // any subsequent code that the caller might execute. + smp_mb(); // ^^^ + return true; +} +EXPORT_SYMBOL_GPL(poll_state_synchronize_srcu); + /* * Callback function for srcu_barrier() use. */ -- cgit v1.2.3 From 4e7ccfae52b39aeee93ed39d4184d50ea201fbef Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 15 Nov 2020 20:33:38 -0800 Subject: srcu: Add comment explaining cookie overflow/wrap This commit adds to the poll_state_synchronize_srcu() header comment describing the issues surrounding SRCU cookie overflow/wrap for the different kernel configurations. Link: https://lore.kernel.org/rcu/20201112201547.GF3365678@moria.home.lan/ Reported-by: Kent Overstreet Signed-off-by: Paul E. McKenney --- kernel/rcu/srcutree.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index c5d0c036fab5..119938d381e2 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -1058,6 +1058,21 @@ EXPORT_SYMBOL_GPL(start_poll_synchronize_srcu); * get_state_synchronize_srcu() or start_poll_synchronize_srcu(), and * returns @true if an SRCU grace period elapsed since the time that the * cookie was created. + * + * Because cookies are finite in size, wrapping/overflow is possible. + * This is more pronounced on 32-bit systems where cookies are 32 bits, + * where in theory wrapping could happen in about 14 hours assuming + * 25-microsecond expedited SRCU grace periods. However, a more likely + * overflow lower bound is on the order of 24 days in the case of + * one-millisecond SRCU grace periods. Of course, wrapping in a 64-bit + * system requires geologic timespans, as in more than seven million years + * even for expedited SRCU grace periods. + * + * Wrapping/overflow is much more of an issue for CONFIG_SMP=n systems + * that also have CONFIG_PREEMPTION=n, which selects Tiny SRCU. This uses + * a 16-bit cookie, which rcutorture routinely wraps in a matter of a + * few minutes. If this proves to be a problem, this counter will be + * expanded to the same size as for Tree SRCU. */ bool poll_state_synchronize_srcu(struct srcu_struct *ssp, unsigned long cookie) { -- cgit v1.2.3 From fd56f64b4e3b9c53fbb12ef74c6f1f5fde4cc1c8 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 13 Nov 2020 20:14:27 -0800 Subject: rcutorture: Prepare for ->start_gp_poll and ->poll_gp_state The new get_state_synchronize_srcu(), start_poll_synchronize_srcu() and poll_state_synchronize_srcu() functions need to be tested, and so this commit prepares by renaming the rcu_torture_ops field ->get_state to ->get_gp_state in order to be consistent with the upcoming ->start_gp_poll and ->poll_gp_state fields. 
Link: https://lore.kernel.org/rcu/20201112201547.GF3365678@moria.home.lan/ Reported-by: Kent Overstreet Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 528ed10b78fd..bcea23ceceb6 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -311,7 +311,7 @@ struct rcu_torture_ops { void (*deferred_free)(struct rcu_torture *p); void (*sync)(void); void (*exp_sync)(void); - unsigned long (*get_state)(void); + unsigned long (*get_gp_state)(void); void (*cond_sync)(unsigned long oldstate); call_rcu_func_t call; void (*cb_barrier)(void); @@ -461,7 +461,7 @@ static struct rcu_torture_ops rcu_ops = { .deferred_free = rcu_torture_deferred_free, .sync = synchronize_rcu, .exp_sync = synchronize_rcu_expedited, - .get_state = get_state_synchronize_rcu, + .get_gp_state = get_state_synchronize_rcu, .cond_sync = cond_synchronize_rcu, .call = call_rcu, .cb_barrier = rcu_barrier, @@ -1050,10 +1050,10 @@ rcu_torture_writer(void *arg) /* Initialize synctype[] array. If none set, take default. */ if (!gp_cond1 && !gp_exp1 && !gp_normal1 && !gp_sync1) gp_cond1 = gp_exp1 = gp_normal1 = gp_sync1 = true; - if (gp_cond1 && cur_ops->get_state && cur_ops->cond_sync) { + if (gp_cond1 && cur_ops->get_gp_state && cur_ops->cond_sync) { synctype[nsynctypes++] = RTWS_COND_GET; pr_info("%s: Testing conditional GPs.\n", __func__); - } else if (gp_cond && (!cur_ops->get_state || !cur_ops->cond_sync)) { + } else if (gp_cond && (!cur_ops->get_gp_state || !cur_ops->cond_sync)) { pr_alert("%s: gp_cond without primitives.\n", __func__); } if (gp_exp1 && cur_ops->exp_sync) { @@ -1119,7 +1119,7 @@ rcu_torture_writer(void *arg) break; case RTWS_COND_GET: rcu_torture_writer_state = RTWS_COND_GET; - gp_snap = cur_ops->get_state(); + gp_snap = cur_ops->get_gp_state(); i = torture_random(&rand) % 16; if (i != 0) schedule_timeout_interruptible(i); -- cgit v1.2.3 From 0fd0548db13346bfb3bb23860ab270a32d6e385a Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 13 Nov 2020 20:43:59 -0800 Subject: rcutorture: Add writer-side tests of polling grace-period API This commit adds writer-side testing of the polling grace-period API. One test verifies that the polling API sees a grace period caused by some other mechanism. Another test verifies that using the polling API to wait for a grace period does not result in too-short grace periods. A third test verifies that the polling API does not report completion within a read-side critical section. A fourth and final test verifies that the polling API does report completion given an intervening grace period. Link: https://lore.kernel.org/rcu/20201112201547.GF3365678@moria.home.lan/ Reported-by: Kent Overstreet Signed-off-by: Paul E. 
McKenney --- kernel/rcu/rcutorture.c | 79 ++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 72 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index bcea23ceceb6..78ba95dfe05d 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -85,6 +85,7 @@ torture_param(bool, gp_cond, false, "Use conditional/async GP wait primitives"); torture_param(bool, gp_exp, false, "Use expedited GP wait primitives"); torture_param(bool, gp_normal, false, "Use normal (non-expedited) GP wait primitives"); +torture_param(bool, gp_poll, false, "Use polling GP wait primitives"); torture_param(bool, gp_sync, false, "Use synchronous GP wait primitives"); torture_param(int, irqreader, 1, "Allow RCU readers from irq handlers"); torture_param(int, leakpointer, 0, "Leak pointer dereferences from readers"); @@ -183,9 +184,11 @@ static int rcu_torture_writer_state; #define RTWS_EXP_SYNC 4 #define RTWS_COND_GET 5 #define RTWS_COND_SYNC 6 -#define RTWS_SYNC 7 -#define RTWS_STUTTER 8 -#define RTWS_STOPPING 9 +#define RTWS_POLL_GET 7 +#define RTWS_POLL_WAIT 8 +#define RTWS_SYNC 9 +#define RTWS_STUTTER 10 +#define RTWS_STOPPING 11 static const char * const rcu_torture_writer_state_names[] = { "RTWS_FIXED_DELAY", "RTWS_DELAY", @@ -194,6 +197,8 @@ static const char * const rcu_torture_writer_state_names[] = { "RTWS_EXP_SYNC", "RTWS_COND_GET", "RTWS_COND_SYNC", + "RTWS_POLL_GET", + "RTWS_POLL_WAIT", "RTWS_SYNC", "RTWS_STUTTER", "RTWS_STOPPING", @@ -312,6 +317,8 @@ struct rcu_torture_ops { void (*sync)(void); void (*exp_sync)(void); unsigned long (*get_gp_state)(void); + unsigned long (*start_gp_poll)(void); + bool (*poll_gp_state)(unsigned long oldstate); void (*cond_sync)(unsigned long oldstate); call_rcu_func_t call; void (*cb_barrier)(void); @@ -570,6 +577,21 @@ static void srcu_torture_synchronize(void) synchronize_srcu(srcu_ctlp); } +static unsigned long srcu_torture_get_gp_state(void) +{ + return get_state_synchronize_srcu(srcu_ctlp); +} + +static unsigned long srcu_torture_start_gp_poll(void) +{ + return start_poll_synchronize_srcu(srcu_ctlp); +} + +static bool srcu_torture_poll_gp_state(unsigned long oldstate) +{ + return poll_state_synchronize_srcu(srcu_ctlp, oldstate); +} + static void srcu_torture_call(struct rcu_head *head, rcu_callback_t func) { @@ -601,6 +623,9 @@ static struct rcu_torture_ops srcu_ops = { .deferred_free = srcu_torture_deferred_free, .sync = srcu_torture_synchronize, .exp_sync = srcu_torture_synchronize_expedited, + .get_gp_state = srcu_torture_get_gp_state, + .start_gp_poll = srcu_torture_start_gp_poll, + .poll_gp_state = srcu_torture_poll_gp_state, .call = srcu_torture_call, .cb_barrier = srcu_torture_barrier, .stats = srcu_torture_stats, @@ -1027,18 +1052,20 @@ static int rcu_torture_writer(void *arg) { bool can_expedite = !rcu_gp_is_expedited() && !rcu_gp_is_normal(); + unsigned long cookie; int expediting = 0; unsigned long gp_snap; bool gp_cond1 = gp_cond, gp_exp1 = gp_exp, gp_normal1 = gp_normal; - bool gp_sync1 = gp_sync; + bool gp_poll1 = gp_poll, gp_sync1 = gp_sync; int i; + int idx; int oldnice = task_nice(current); struct rcu_torture *rp; struct rcu_torture *old_rp; static DEFINE_TORTURE_RANDOM(rand); bool stutter_waited; int synctype[] = { RTWS_DEF_FREE, RTWS_EXP_SYNC, - RTWS_COND_GET, RTWS_SYNC }; + RTWS_COND_GET, RTWS_POLL_GET, RTWS_SYNC }; int nsynctypes = 0; VERBOSE_TOROUT_STRING("rcu_torture_writer task started"); @@ -1048,8 +1075,8 @@ rcu_torture_writer(void *arg) torture_type, 
cur_ops->name); /* Initialize synctype[] array. If none set, take default. */ - if (!gp_cond1 && !gp_exp1 && !gp_normal1 && !gp_sync1) - gp_cond1 = gp_exp1 = gp_normal1 = gp_sync1 = true; + if (!gp_cond1 && !gp_exp1 && !gp_normal1 && !gp_poll1 && !gp_sync1) + gp_cond1 = gp_exp1 = gp_normal1 = gp_poll1 = gp_sync1 = true; if (gp_cond1 && cur_ops->get_gp_state && cur_ops->cond_sync) { synctype[nsynctypes++] = RTWS_COND_GET; pr_info("%s: Testing conditional GPs.\n", __func__); @@ -1068,6 +1095,12 @@ rcu_torture_writer(void *arg) } else if (gp_normal && !cur_ops->deferred_free) { pr_alert("%s: gp_normal without primitives.\n", __func__); } + if (gp_poll1 && cur_ops->start_gp_poll && cur_ops->poll_gp_state) { + synctype[nsynctypes++] = RTWS_POLL_GET; + pr_info("%s: Testing polling GPs.\n", __func__); + } else if (gp_poll && (!cur_ops->start_gp_poll || !cur_ops->poll_gp_state)) { + pr_alert("%s: gp_poll without primitives.\n", __func__); + } if (gp_sync1 && cur_ops->sync) { synctype[nsynctypes++] = RTWS_SYNC; pr_info("%s: Testing normal GPs.\n", __func__); @@ -1107,6 +1140,18 @@ rcu_torture_writer(void *arg) atomic_inc(&rcu_torture_wcount[i]); WRITE_ONCE(old_rp->rtort_pipe_count, old_rp->rtort_pipe_count + 1); + if (cur_ops->get_gp_state && cur_ops->poll_gp_state) { + idx = cur_ops->readlock(); + cookie = cur_ops->get_gp_state(); + WARN_ONCE(rcu_torture_writer_state != RTWS_DEF_FREE && + cur_ops->poll_gp_state(cookie), + "%s: Cookie check 1 failed %s(%d) %lu->%lu\n", + __func__, + rcu_torture_writer_state_getname(), + rcu_torture_writer_state, + cookie, cur_ops->get_gp_state()); + cur_ops->readunlock(idx); + } switch (synctype[torture_random(&rand) % nsynctypes]) { case RTWS_DEF_FREE: rcu_torture_writer_state = RTWS_DEF_FREE; @@ -1128,6 +1173,18 @@ rcu_torture_writer(void *arg) cur_ops->cond_sync(gp_snap); rcu_torture_pipe_update(old_rp); break; + case RTWS_POLL_GET: + rcu_torture_writer_state = RTWS_POLL_GET; + gp_snap = cur_ops->start_gp_poll(); + rcu_torture_writer_state = RTWS_POLL_WAIT; + while (!cur_ops->poll_gp_state(gp_snap)) { + i = torture_random(&rand) % 16; + if (i != 0) + schedule_timeout_interruptible(i); + udelay(torture_random(&rand) % 1000); + } + rcu_torture_pipe_update(old_rp); + break; case RTWS_SYNC: rcu_torture_writer_state = RTWS_SYNC; cur_ops->sync(); @@ -1137,6 +1194,14 @@ rcu_torture_writer(void *arg) WARN_ON_ONCE(1); break; } + if (cur_ops->get_gp_state && cur_ops->poll_gp_state) + WARN_ONCE(rcu_torture_writer_state != RTWS_DEF_FREE && + !cur_ops->poll_gp_state(cookie), + "%s: Cookie check 2 failed %s(%d) %lu->%lu\n", + __func__, + rcu_torture_writer_state_getname(), + rcu_torture_writer_state, + cookie, cur_ops->get_gp_state()); } WRITE_ONCE(rcu_torture_current_version, rcu_torture_current_version + 1); -- cgit v1.2.3 From bc480a6354ef2e15c26c3bdbd0db647026e788a7 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 15 Nov 2020 12:45:57 -0800 Subject: rcutorture: Add reader-side tests of polling grace-period API This commit adds reader-side testing of the polling grace-period API. This testing verifies that a cookie obtained in an SRCU read-side critical section does not get a true return from poll_state_synchronize_srcu() within that same critical section. Link: https://lore.kernel.org/rcu/20201112201547.GF3365678@moria.home.lan/ Reported-by: Kent Overstreet Signed-off-by: Paul E. 
McKenney --- kernel/rcu/rcutorture.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 78ba95dfe05d..96d55f05e344 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1429,6 +1429,7 @@ rcutorture_loop_extend(int *readstate, struct torture_random_state *trsp, */ static bool rcu_torture_one_read(struct torture_random_state *trsp) { + unsigned long cookie; int i; unsigned long started; unsigned long completed; @@ -1444,6 +1445,8 @@ static bool rcu_torture_one_read(struct torture_random_state *trsp) WARN_ON_ONCE(!rcu_is_watching()); newstate = rcutorture_extend_mask(readstate, trsp); rcutorture_one_extend(&readstate, newstate, trsp, rtrsp++); + if (cur_ops->get_gp_state && cur_ops->poll_gp_state) + cookie = cur_ops->get_gp_state(); started = cur_ops->get_gp_seq(); ts = rcu_trace_clock_local(); p = rcu_dereference_check(rcu_torture_current, @@ -1480,6 +1483,13 @@ static bool rcu_torture_one_read(struct torture_random_state *trsp) } __this_cpu_inc(rcu_torture_batch[completed]); preempt_enable(); + if (cur_ops->get_gp_state && cur_ops->poll_gp_state) + WARN_ONCE(cur_ops->poll_gp_state(cookie), + "%s: Cookie check 3 failed %s(%d) %lu->%lu\n", + __func__, + rcu_torture_writer_state_getname(), + rcu_torture_writer_state, + cookie, cur_ops->get_gp_state()); rcutorture_one_extend(&readstate, 0, trsp, rtrsp); WARN_ON_ONCE(readstate & RCUTORTURE_RDR_MASK); // This next splat is expected behavior if leakpointer, especially -- cgit v1.2.3 From 00504537f44422a99d97f615f2b3ee17cfba194d Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 29 Oct 2020 15:08:57 -0700 Subject: rcutorture: Add testing for RCU's global memory ordering RCU guarantees that anything seen by a given reader will also be seen after any grace period that must wait on that reader. This is very likely to hold based on inspection, but the advantage of having rcutorture do the inspecting is that rcutorture doesn't mind inspecting frequently and often. This commit therefore adds code to test RCU's global memory ordering. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 98 ++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 92 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 96d55f05e344..338e1182b4b5 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -143,11 +143,22 @@ static struct task_struct *read_exit_task; #define RCU_TORTURE_PIPE_LEN 10 +// Mailbox-like structure to check RCU global memory ordering. +struct rcu_torture_reader_check { + unsigned long rtc_myloops; + int rtc_chkrdr; + unsigned long rtc_chkloops; + int rtc_ready; + struct rcu_torture_reader_check *rtc_assigner; +} ____cacheline_internodealigned_in_smp; + +// Update-side data structure used to check RCU readers. 
struct rcu_torture { struct rcu_head rtort_rcu; int rtort_pipe_count; struct list_head rtort_free; int rtort_mbtest; + struct rcu_torture_reader_check *rtort_chkp; }; static LIST_HEAD(rcu_torture_freelist); @@ -158,10 +169,13 @@ static DEFINE_SPINLOCK(rcu_torture_lock); static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_count); static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_batch); static atomic_t rcu_torture_wcount[RCU_TORTURE_PIPE_LEN + 1]; +static struct rcu_torture_reader_check *rcu_torture_reader_mbchk; static atomic_t n_rcu_torture_alloc; static atomic_t n_rcu_torture_alloc_fail; static atomic_t n_rcu_torture_free; static atomic_t n_rcu_torture_mberror; +static atomic_t n_rcu_torture_mbchk_fail; +static atomic_t n_rcu_torture_mbchk_tries; static atomic_t n_rcu_torture_error; static long n_rcu_torture_barrier_error; static long n_rcu_torture_boost_ktrerror; @@ -393,7 +407,12 @@ static bool rcu_torture_pipe_update_one(struct rcu_torture *rp) { int i; + struct rcu_torture_reader_check *rtrcp = READ_ONCE(rp->rtort_chkp); + if (rtrcp) { + WRITE_ONCE(rp->rtort_chkp, NULL); + smp_store_release(&rtrcp->rtc_ready, 1); // Pair with smp_load_acquire(). + } i = READ_ONCE(rp->rtort_pipe_count); if (i > RCU_TORTURE_PIPE_LEN) i = RCU_TORTURE_PIPE_LEN; @@ -1292,6 +1311,62 @@ static void rcu_torture_timer_cb(struct rcu_head *rhp) kfree(rhp); } +// Set up and carry out testing of RCU's global memory ordering +static void rcu_torture_reader_do_mbchk(long myid, struct rcu_torture *rtp, + struct torture_random_state *trsp) +{ + unsigned long loops; + int noc = num_online_cpus(); + int rdrchked; + int rdrchker; + struct rcu_torture_reader_check *rtrcp; // Me. + struct rcu_torture_reader_check *rtrcp_assigner; // Assigned us to do checking. + struct rcu_torture_reader_check *rtrcp_chked; // Reader being checked. + struct rcu_torture_reader_check *rtrcp_chker; // Reader doing checking when not me. + + if (myid < 0) + return; // Don't try this from timer handlers. + + // Increment my counter. + rtrcp = &rcu_torture_reader_mbchk[myid]; + WRITE_ONCE(rtrcp->rtc_myloops, rtrcp->rtc_myloops + 1); + + // Attempt to assign someone else some checking work. + rdrchked = torture_random(trsp) % nrealreaders; + rtrcp_chked = &rcu_torture_reader_mbchk[rdrchked]; + rdrchker = torture_random(trsp) % nrealreaders; + rtrcp_chker = &rcu_torture_reader_mbchk[rdrchker]; + if (rdrchked != myid && rdrchked != rdrchker && noc >= rdrchked && noc >= rdrchker && + smp_load_acquire(&rtrcp->rtc_chkrdr) < 0 && // Pairs with smp_store_release below. + !READ_ONCE(rtp->rtort_chkp) && + !smp_load_acquire(&rtrcp_chker->rtc_assigner)) { // Pairs with smp_store_release below. + rtrcp->rtc_chkloops = READ_ONCE(rtrcp_chked->rtc_myloops); + WARN_ON_ONCE(rtrcp->rtc_chkrdr >= 0); + rtrcp->rtc_chkrdr = rdrchked; + WARN_ON_ONCE(rtrcp->rtc_ready); // This gets set after the grace period ends. + if (cmpxchg_relaxed(&rtrcp_chker->rtc_assigner, NULL, rtrcp) || + cmpxchg_relaxed(&rtp->rtort_chkp, NULL, rtrcp)) + (void)cmpxchg_relaxed(&rtrcp_chker->rtc_assigner, rtrcp, NULL); // Back out. + } + + // If assigned some completed work, do it! + rtrcp_assigner = READ_ONCE(rtrcp->rtc_assigner); + if (!rtrcp_assigner || !smp_load_acquire(&rtrcp_assigner->rtc_ready)) + return; // No work or work not yet ready. 
+ rdrchked = rtrcp_assigner->rtc_chkrdr; + if (WARN_ON_ONCE(rdrchked < 0)) + return; + rtrcp_chked = &rcu_torture_reader_mbchk[rdrchked]; + loops = READ_ONCE(rtrcp_chked->rtc_myloops); + atomic_inc(&n_rcu_torture_mbchk_tries); + if (ULONG_CMP_LT(loops, rtrcp_assigner->rtc_chkloops)) + atomic_inc(&n_rcu_torture_mbchk_fail); + rtrcp_assigner->rtc_chkloops = loops + ULONG_MAX / 2; + rtrcp_assigner->rtc_ready = 0; + smp_store_release(&rtrcp->rtc_assigner, NULL); // Someone else can assign us work. + smp_store_release(&rtrcp_assigner->rtc_chkrdr, -1); // Assigner can again assign. +} + /* * Do one extension of an RCU read-side critical section using the * current reader state in readstate (set to zero for initial entry @@ -1427,7 +1502,7 @@ rcutorture_loop_extend(int *readstate, struct torture_random_state *trsp, * no data to read. Can be invoked both from process context and * from a timer handler. */ -static bool rcu_torture_one_read(struct torture_random_state *trsp) +static bool rcu_torture_one_read(struct torture_random_state *trsp, long myid) { unsigned long cookie; int i; @@ -1462,6 +1537,7 @@ static bool rcu_torture_one_read(struct torture_random_state *trsp) } if (p->rtort_mbtest == 0) atomic_inc(&n_rcu_torture_mberror); + rcu_torture_reader_do_mbchk(myid, p, trsp); rtrsp = rcutorture_loop_extend(&readstate, trsp, rtrsp); preempt_disable(); pipe_count = READ_ONCE(p->rtort_pipe_count); @@ -1518,7 +1594,7 @@ static DEFINE_TORTURE_RANDOM_PERCPU(rcu_torture_timer_rand); static void rcu_torture_timer(struct timer_list *unused) { atomic_long_inc(&n_rcu_torture_timers); - (void)rcu_torture_one_read(this_cpu_ptr(&rcu_torture_timer_rand)); + (void)rcu_torture_one_read(this_cpu_ptr(&rcu_torture_timer_rand), -1); /* Test call_rcu() invocation from interrupt handler. */ if (cur_ops->call) { @@ -1554,7 +1630,7 @@ rcu_torture_reader(void *arg) if (!timer_pending(&t)) mod_timer(&t, jiffies + 1); } - if (!rcu_torture_one_read(&rand) && !torture_must_stop()) + if (!rcu_torture_one_read(&rand, myid) && !torture_must_stop()) schedule_timeout_interruptible(HZ); if (time_after(jiffies, lastsleep) && !torture_must_stop()) { schedule_timeout_interruptible(1); @@ -1614,8 +1690,9 @@ rcu_torture_stats_print(void) atomic_read(&n_rcu_torture_alloc), atomic_read(&n_rcu_torture_alloc_fail), atomic_read(&n_rcu_torture_free)); - pr_cont("rtmbe: %d rtbe: %ld rtbke: %ld rtbre: %ld ", + pr_cont("rtmbe: %d rtmbkf: %d/%d rtbe: %ld rtbke: %ld rtbre: %ld ", atomic_read(&n_rcu_torture_mberror), + atomic_read(&n_rcu_torture_mbchk_fail), atomic_read(&n_rcu_torture_mbchk_tries), n_rcu_torture_barrier_error, n_rcu_torture_boost_ktrerror, n_rcu_torture_boost_rterror); @@ -1632,12 +1709,14 @@ rcu_torture_stats_print(void) pr_alert("%s%s ", torture_type, TORTURE_FLAG); if (atomic_read(&n_rcu_torture_mberror) || + atomic_read(&n_rcu_torture_mbchk_fail) || n_rcu_torture_barrier_error || n_rcu_torture_boost_ktrerror || n_rcu_torture_boost_rterror || n_rcu_torture_boost_failure || i > 1) { pr_cont("%s", "!!! "); atomic_inc(&n_rcu_torture_error); WARN_ON_ONCE(atomic_read(&n_rcu_torture_mberror)); + WARN_ON_ONCE(atomic_read(&n_rcu_torture_mbchk_fail)); WARN_ON_ONCE(n_rcu_torture_barrier_error); // rcu_barrier() WARN_ON_ONCE(n_rcu_torture_boost_ktrerror); // no boost kthread WARN_ON_ONCE(n_rcu_torture_boost_rterror); // can't set RT prio @@ -2467,7 +2546,7 @@ static int rcu_torture_read_exit_child(void *trsp_in) // Minimize time between reading and exiting. 
while (!kthread_should_stop()) schedule_timeout_uninterruptible(1); - (void)rcu_torture_one_read(trsp); + (void)rcu_torture_one_read(trsp, -1); return 0; } @@ -2582,6 +2661,8 @@ rcu_torture_cleanup(void) kfree(reader_tasks); reader_tasks = NULL; } + kfree(rcu_torture_reader_mbchk); + rcu_torture_reader_mbchk = NULL; if (fakewriter_tasks) { for (i = 0; i < nfakewriters; i++) @@ -2785,6 +2866,8 @@ rcu_torture_init(void) atomic_set(&n_rcu_torture_alloc_fail, 0); atomic_set(&n_rcu_torture_free, 0); atomic_set(&n_rcu_torture_mberror, 0); + atomic_set(&n_rcu_torture_mbchk_fail, 0); + atomic_set(&n_rcu_torture_mbchk_tries, 0); atomic_set(&n_rcu_torture_error, 0); n_rcu_torture_barrier_error = 0; n_rcu_torture_boost_ktrerror = 0; @@ -2826,12 +2909,15 @@ rcu_torture_init(void) } reader_tasks = kcalloc(nrealreaders, sizeof(reader_tasks[0]), GFP_KERNEL); - if (reader_tasks == NULL) { + rcu_torture_reader_mbchk = kcalloc(nrealreaders, sizeof(*rcu_torture_reader_mbchk), + GFP_KERNEL); + if (!reader_tasks || !rcu_torture_reader_mbchk) { VERBOSE_TOROUT_ERRSTRING("out of memory"); firsterr = -ENOMEM; goto unwind; } for (i = 0; i < nrealreaders; i++) { + rcu_torture_reader_mbchk[i].rtc_chkrdr = -1; firsterr = torture_create_kthread(rcu_torture_reader, (void *)i, reader_tasks[i]); if (firsterr) -- cgit v1.2.3 From f3ea978b712f768a02137e867aced5bfdcea670e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 11 Nov 2020 10:12:05 -0800 Subject: scftorture: Add debug output for wrong-CPU warning This commit adds the desired CPU, the actual CPU, and nr_cpu_ids to the wrong-CPU warning in scftorture_invoker(), the better to help with debugging. Signed-off-by: Paul E. McKenney --- kernel/scftorture.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/scftorture.c b/kernel/scftorture.c index d55a9f8cda3d..2377cbb32474 100644 --- a/kernel/scftorture.c +++ b/kernel/scftorture.c @@ -398,6 +398,7 @@ static void scftorture_invoke_one(struct scf_statistics *scfp, struct torture_ra static int scftorture_invoker(void *arg) { int cpu; + int curcpu; DEFINE_TORTURE_RANDOM(rand); struct scf_statistics *scfp = (struct scf_statistics *)arg; bool was_offline = false; @@ -412,7 +413,10 @@ static int scftorture_invoker(void *arg) VERBOSE_SCFTORTOUT("scftorture_invoker %d: Waiting for all SCF torturers from cpu %d", scfp->cpu, smp_processor_id()); // Make sure that the CPU is affinitized appropriately during testing. - WARN_ON_ONCE(smp_processor_id() != scfp->cpu); + curcpu = smp_processor_id(); + WARN_ONCE(curcpu != scfp->cpu % nr_cpu_ids, + "%s: Wanted CPU %d, running on %d, nr_cpu_ids = %d\n", + __func__, scfp->cpu, curcpu, nr_cpu_ids); if (!atomic_dec_return(&n_started)) while (atomic_read_acquire(&n_started)) { -- cgit v1.2.3 From b08ea1de6a8f8929c7dafd6f708799365fa90c11 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 6 Nov 2020 13:52:31 -0800 Subject: rcu: Mark obtuse portion of stall warning as internal debug There is a rather obtuse string that can be printed as part of an expedited RCU CPU stall-warning message that starts with "blocking rcu_node structures". Under normal conditions, most of this message is just repeating the list of CPUs blocking the current expedited grace period, but in a manner that is rather difficult to read. This commit therefore marks this message as "(internal RCU debug)" in an effort to give people the option of avoiding wasting time attempting to extract nonexistent additional meaning from this portion of the message. 
Reported-by: Jonathan Lemon Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_exp.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 8760b6ead770..6c6ff06d4ae6 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -545,7 +545,7 @@ static void synchronize_rcu_expedited_wait(void) data_race(rnp_root->expmask), ".T"[!!data_race(rnp_root->exp_tasks)]); if (ndetected) { - pr_err("blocking rcu_node structures:"); + pr_err("blocking rcu_node structures (internal RCU debug):"); rcu_for_each_node_breadth_first(rnp) { if (rnp == rnp_root) continue; /* printed unconditionally */ -- cgit v1.2.3 From 243027a3c80564bf96e40437ffac46efb9f5f2b5 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 11 Nov 2020 16:08:01 -0800 Subject: rcu: For RCU grace-period kthread starvation, dump last CPU it ran on When the RCU CPU stall-warning code detects that the RCU grace-period kthread is being starved, it dumps that kthread's stack. This can sometimes be useful, but it is also useful to know what is running on the CPU that this kthread is attempting to run on. This commit therefore adds a stack trace of this CPU in order to help track down whatever it is that might be preventing RCU's grace-period kthread from running. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_stall.h | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index 70d48c52fabc..35c1355d44e7 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -449,20 +449,27 @@ static void print_cpu_stall_info(int cpu) /* Complain about starvation of grace-period kthread. */ static void rcu_check_gp_kthread_starvation(void) { + int cpu; struct task_struct *gpk = rcu_state.gp_kthread; unsigned long j; if (rcu_is_gp_kthread_starving(&j)) { + cpu = gpk ? task_cpu(gpk) : -1; pr_err("%s kthread starved for %ld jiffies! g%ld f%#x %s(%d) ->state=%#lx ->cpu=%d\n", rcu_state.name, j, (long)rcu_seq_current(&rcu_state.gp_seq), data_race(rcu_state.gp_flags), gp_state_getname(rcu_state.gp_state), rcu_state.gp_state, - gpk ? gpk->state : ~0, gpk ? task_cpu(gpk) : -1); + gpk ? gpk->state : ~0, cpu); if (gpk) { pr_err("\tUnless %s kthread gets sufficient CPU time, OOM is now expected behavior.\n", rcu_state.name); pr_err("RCU grace-period kthread stack dump:\n"); sched_show_task(gpk); + if (cpu >= 0) { + pr_err("Stack dump where RCU grace-period kthread last ran:\n"); + if (!trigger_single_cpu_backtrace(cpu)) + dump_cpu_task(cpu); + } wake_up_process(gpk); } } -- cgit v1.2.3 From 725969ac11d7fa50aa701321daa600ce421fc21b Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 12 Nov 2020 12:19:47 -0800 Subject: rcu: Do not NMI offline CPUs Currently, RCU CPU stall warning messages will NMI whatever CPU looks like it is blocking either the current grace period or the grace-period kthread. This can produce confusing output if the target CPU is offline. This commit therefore checks for offline CPUs. Signed-off-by: Paul E. 
McKenney --- kernel/rcu/tree_stall.h | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index 35c1355d44e7..29cf096b5d20 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -333,9 +333,12 @@ static void rcu_dump_cpu_stacks(void) rcu_for_each_leaf_node(rnp) { raw_spin_lock_irqsave_rcu_node(rnp, flags); for_each_leaf_node_possible_cpu(rnp, cpu) - if (rnp->qsmask & leaf_node_cpu_bit(rnp, cpu)) - if (!trigger_single_cpu_backtrace(cpu)) + if (rnp->qsmask & leaf_node_cpu_bit(rnp, cpu)) { + if (cpu_is_offline(cpu)) + pr_err("Offline CPU %d blocking current GP.\n", cpu); + else if (!trigger_single_cpu_backtrace(cpu)) dump_cpu_task(cpu); + } raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } } @@ -466,9 +469,13 @@ static void rcu_check_gp_kthread_starvation(void) pr_err("RCU grace-period kthread stack dump:\n"); sched_show_task(gpk); if (cpu >= 0) { - pr_err("Stack dump where RCU grace-period kthread last ran:\n"); - if (!trigger_single_cpu_backtrace(cpu)) - dump_cpu_task(cpu); + if (cpu_is_offline(cpu)) { + pr_err("RCU GP kthread last ran on offline CPU %d.\n", cpu); + } else { + pr_err("Stack dump where RCU GP kthread last ran:\n"); + if (!trigger_single_cpu_backtrace(cpu)) + dump_cpu_task(cpu); + } } wake_up_process(gpk); } -- cgit v1.2.3 From 71a076f4a61a6c779794ad286f356b39725edc3b Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Tue, 24 Nov 2020 12:02:09 +0100 Subject: kcsan: Rewrite kcsan_prandom_u32_max() without prandom_u32_state() Rewrite kcsan_prandom_u32_max() to not depend on code that might be instrumented, removing any dependency on lib/random32.c. The rewrite implements a simple linear congruential generator, that is sufficient for our purposes (for udelay() and skip_watch counter randomness). The initial motivation for this was to allow enabling KCSAN for kernel/sched (remove KCSAN_SANITIZE := n from kernel/sched/Makefile), with CONFIG_DEBUG_PREEMPT=y. Without this change, we could observe recursion: check_access() [via instrumentation] kcsan_setup_watchpoint() reset_kcsan_skip() kcsan_prandom_u32_max() get_cpu_var() preempt_disable() preempt_count_add() [in kernel/sched/core.c] check_access() [via instrumentation] Note, while this currently does not affect an unmodified kernel, it'd be good to keep a KCSAN kernel working when KCSAN_SANITIZE := n is removed from kernel/sched/Makefile to permit testing scheduler code with KCSAN if desired. Fixes: cd290ec24633 ("kcsan: Use tracing-safe version of prandom") Signed-off-by: Marco Elver Signed-off-by: Paul E. McKenney --- kernel/kcsan/core.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/kcsan/core.c b/kernel/kcsan/core.c index 3994a217bde7..3bf98db9c702 100644 --- a/kernel/kcsan/core.c +++ b/kernel/kcsan/core.c @@ -12,7 +12,6 @@ #include #include #include -#include #include #include @@ -101,7 +100,7 @@ static atomic_long_t watchpoints[CONFIG_KCSAN_NUM_WATCHPOINTS + NUM_SLOTS-1]; static DEFINE_PER_CPU(long, kcsan_skip); /* For kcsan_prandom_u32_max(). */ -static DEFINE_PER_CPU(struct rnd_state, kcsan_rand_state); +static DEFINE_PER_CPU(u32, kcsan_rand_state); static __always_inline atomic_long_t *find_watchpoint(unsigned long addr, size_t size, @@ -275,20 +274,17 @@ should_watch(const volatile void *ptr, size_t size, int type, struct kcsan_ctx * } /* - * Returns a pseudo-random number in interval [0, ep_ro). 
See prandom_u32_max() - * for more details. - * - * The open-coded version here is using only safe primitives for all contexts - * where we can have KCSAN instrumentation. In particular, we cannot use - * prandom_u32() directly, as its tracepoint could cause recursion. + * Returns a pseudo-random number in interval [0, ep_ro). Simple linear + * congruential generator, using constants from "Numerical Recipes". */ static u32 kcsan_prandom_u32_max(u32 ep_ro) { - struct rnd_state *state = &get_cpu_var(kcsan_rand_state); - const u32 res = prandom_u32_state(state); + u32 state = this_cpu_read(kcsan_rand_state); + + state = 1664525 * state + 1013904223; + this_cpu_write(kcsan_rand_state, state); - put_cpu_var(kcsan_rand_state); - return (u32)(((u64) res * ep_ro) >> 32); + return state % ep_ro; } static inline void reset_kcsan_skip(void) @@ -639,10 +635,14 @@ static __always_inline void check_access(const volatile void *ptr, size_t size, void __init kcsan_init(void) { + int cpu; + BUG_ON(!in_task()); kcsan_debugfs_init(); - prandom_seed_full_state(&kcsan_rand_state); + + for_each_possible_cpu(cpu) + per_cpu(kcsan_rand_state, cpu) = (u32)get_cycles(); /* * We are in the init task, and no other tasks should be running; -- cgit v1.2.3 From 1b7af295541d75535374325fd617944534853919 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 29 Aug 2020 10:22:24 -0700 Subject: sched/core: Allow try_invoke_on_locked_down_task() with irqs disabled The try_invoke_on_locked_down_task() function currently requires that interrupts be enabled, but it is called with interrupts disabled from rcu_print_task_stall(), resulting in an "IRQs not enabled as expected" diagnostic. This commit therefore updates try_invoke_on_locked_down_task() to use raw_spin_lock_irqsave() instead of raw_spin_lock_irq(), thus allowing use from either context. Link: https://lore.kernel.org/lkml/000000000000903d5805ab908fc4@google.com/ Link: https://lore.kernel.org/lkml/20200928075729.GC2611@hirez.programming.kicks-ass.net/ Reported-by: syzbot+cb3b69ae80afd6535b0e@syzkaller.appspotmail.com Signed-off-by: Peter Zijlstra Signed-off-by: Paul E. McKenney --- kernel/sched/core.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index e7e453492cff..f768bb0df74b 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2989,7 +2989,7 @@ out: /** * try_invoke_on_locked_down_task - Invoke a function on task in fixed state - * @p: Process for which the function is to be invoked. + * @p: Process for which the function is to be invoked, can be @current. * @func: Function to invoke. * @arg: Argument to function. 
* @@ -3007,12 +3007,11 @@ out: */ bool try_invoke_on_locked_down_task(struct task_struct *p, bool (*func)(struct task_struct *t, void *arg), void *arg) { - bool ret = false; struct rq_flags rf; + bool ret = false; struct rq *rq; - lockdep_assert_irqs_enabled(); - raw_spin_lock_irq(&p->pi_lock); + raw_spin_lock_irqsave(&p->pi_lock, rf.flags); if (p->on_rq) { rq = __task_rq_lock(p, &rf); if (task_rq(p) == rq) @@ -3029,7 +3028,7 @@ bool try_invoke_on_locked_down_task(struct task_struct *p, bool (*func)(struct t ret = func(p, arg); } } - raw_spin_unlock_irq(&p->pi_lock); + raw_spin_unlock_irqrestore(&p->pi_lock, rf.flags); return ret; } -- cgit v1.2.3 From c5586e32dfe258925c5dbb599bea3eadf34e79c1 Mon Sep 17 00:00:00 2001 From: Wang Qing Date: Sat, 7 Nov 2020 16:24:03 +0800 Subject: locking: Remove duplicate include of percpu-rwsem.h This commit removes an unnecessary #include. Signed-off-by: Wang Qing Signed-off-by: Paul E. McKenney --- kernel/locking/locktorture.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c index 62d215b2e39f..af99e9ca285a 100644 --- a/kernel/locking/locktorture.c +++ b/kernel/locking/locktorture.c @@ -27,7 +27,6 @@ #include #include #include -#include #include MODULE_LICENSE("GPL"); -- cgit v1.2.3 From a649d25dcc671a33b9cc3176411920fdc5fbd98e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 19 Nov 2020 10:13:06 -0800 Subject: rcu: Add lockdep_assert_irqs_disabled() to rcu_sched_clock_irq() and callees This commit adds a number of lockdep_assert_irqs_disabled() calls to rcu_sched_clock_irq() and a number of the functions that it calls. The point of this is to help track down a situation where lockdep appears to be insisting that interrupts are enabled within these functions, which should only ever be invoked from the scheduling-clock interrupt handler. Link: https://lore.kernel.org/lkml/20201111133813.GA81547@elver.google.com/ Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 4 ++++ kernel/rcu/tree_plugin.h | 1 + kernel/rcu/tree_stall.h | 8 ++++++++ 3 files changed, 13 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index bd04b09b84b3..f70634f7c3aa 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2553,6 +2553,7 @@ static void rcu_do_batch(struct rcu_data *rdp) void rcu_sched_clock_irq(int user) { trace_rcu_utilization(TPS("Start scheduler-tick")); + lockdep_assert_irqs_disabled(); raw_cpu_inc(rcu_data.ticks_this_gp); /* The load-acquire pairs with the store-release setting to true. */ if (smp_load_acquire(this_cpu_ptr(&rcu_data.rcu_urgent_qs))) { @@ -2566,6 +2567,7 @@ void rcu_sched_clock_irq(int user) rcu_flavor_sched_clock_irq(user); if (rcu_pending(user)) invoke_rcu_core(); + lockdep_assert_irqs_disabled(); trace_rcu_utilization(TPS("End scheduler-tick")); } @@ -3690,6 +3692,8 @@ static int rcu_pending(int user) struct rcu_data *rdp = this_cpu_ptr(&rcu_data); struct rcu_node *rnp = rdp->mynode; + lockdep_assert_irqs_disabled(); + /* Check for CPU stalls, if enabled. 
*/ check_cpu_stall(rdp); diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index fd8a52e9a887..cb76e70da8ca 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -682,6 +682,7 @@ static void rcu_flavor_sched_clock_irq(int user) { struct task_struct *t = current; + lockdep_assert_irqs_disabled(); if (user || rcu_is_cpu_rrupt_from_idle()) { rcu_note_voluntary_context_switch(current); } diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index ca21d28a0f98..4024dcc78aac 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -260,6 +260,7 @@ static int rcu_print_task_stall(struct rcu_node *rnp, unsigned long flags) struct task_struct *t; struct task_struct *ts[8]; + lockdep_assert_irqs_disabled(); if (!rcu_preempt_blocked_readers_cgp(rnp)) return 0; pr_err("\tTasks blocked on level-%d rcu_node (CPUs %d-%d):", @@ -284,6 +285,7 @@ static int rcu_print_task_stall(struct rcu_node *rnp, unsigned long flags) ".q"[rscr.rs.b.need_qs], ".e"[rscr.rs.b.exp_hint], ".l"[rscr.on_blkd_list]); + lockdep_assert_irqs_disabled(); put_task_struct(t); ndetected++; } @@ -472,6 +474,8 @@ static void print_other_cpu_stall(unsigned long gp_seq, unsigned long gps) struct rcu_node *rnp; long totqlen = 0; + lockdep_assert_irqs_disabled(); + /* Kick and suppress, if so configured. */ rcu_stall_kick_kthreads(); if (rcu_stall_is_suppressed()) @@ -493,6 +497,7 @@ static void print_other_cpu_stall(unsigned long gp_seq, unsigned long gps) } } ndetected += rcu_print_task_stall(rnp, flags); // Releases rnp->lock. + lockdep_assert_irqs_disabled(); } for_each_possible_cpu(cpu) @@ -538,6 +543,8 @@ static void print_cpu_stall(unsigned long gps) struct rcu_node *rnp = rcu_get_root(); long totqlen = 0; + lockdep_assert_irqs_disabled(); + /* Kick and suppress, if so configured. */ rcu_stall_kick_kthreads(); if (rcu_stall_is_suppressed()) @@ -592,6 +599,7 @@ static void check_cpu_stall(struct rcu_data *rdp) unsigned long js; struct rcu_node *rnp; + lockdep_assert_irqs_disabled(); if ((rcu_stall_is_suppressed() && !READ_ONCE(rcu_kick_kthreads)) || !rcu_gp_in_progress()) return; -- cgit v1.2.3 From 7dffe01765d9309b8bd5505503933ec0ec53d192 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 19 Nov 2020 13:30:33 -0800 Subject: rcu: Add lockdep_assert_irqs_disabled() to raw_spin_unlock_rcu_node() macros This commit adds a lockdep_assert_irqs_disabled() call to the helper macros that release the rcu_node structure's ->lock, namely to raw_spin_unlock_rcu_node(), raw_spin_unlock_irq_rcu_node() and raw_spin_unlock_irqrestore_rcu_node(). The point of this is to help track down a situation where lockdep appears to be insisting that interrupts are enabled while holding an rcu_node structure's ->lock. Link: https://lore.kernel.org/lkml/20201111133813.GA81547@elver.google.com/ Signed-off-by: Paul E. 
McKenney --- kernel/rcu/rcu.h | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index e01cba5e4b52..839f5be65244 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -378,7 +378,11 @@ do { \ smp_mb__after_unlock_lock(); \ } while (0) -#define raw_spin_unlock_rcu_node(p) raw_spin_unlock(&ACCESS_PRIVATE(p, lock)) +#define raw_spin_unlock_rcu_node(p) \ +do { \ + lockdep_assert_irqs_disabled(); \ + raw_spin_unlock(&ACCESS_PRIVATE(p, lock)); \ +} while (0) #define raw_spin_lock_irq_rcu_node(p) \ do { \ @@ -387,7 +391,10 @@ do { \ } while (0) #define raw_spin_unlock_irq_rcu_node(p) \ - raw_spin_unlock_irq(&ACCESS_PRIVATE(p, lock)) +do { \ + lockdep_assert_irqs_disabled(); \ + raw_spin_unlock_irq(&ACCESS_PRIVATE(p, lock)); \ +} while (0) #define raw_spin_lock_irqsave_rcu_node(p, flags) \ do { \ @@ -396,7 +403,10 @@ do { \ } while (0) #define raw_spin_unlock_irqrestore_rcu_node(p, flags) \ - raw_spin_unlock_irqrestore(&ACCESS_PRIVATE(p, lock), flags) +do { \ + lockdep_assert_irqs_disabled(); \ + raw_spin_unlock_irqrestore(&ACCESS_PRIVATE(p, lock), flags); \ +} while (0) #define raw_spin_trylock_rcu_node(p) \ ({ \ -- cgit v1.2.3 From bfba7ed084f8ab0269a5a1d2f51b07865456c334 Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Wed, 9 Dec 2020 21:27:32 +0100 Subject: rcu-tasks: Add RCU-tasks self tests This commit adds self tests for early-boot use of RCU-tasks grace periods. It tests all three variants (Rude, Tasks, and Tasks Trace) and covers both synchronous (e.g., synchronize_rcu_tasks()) and asynchronous (e.g., call_rcu_tasks()) grace-period APIs. Self-tests are run only in kernels built with CONFIG_PROVE_RCU=y. Signed-off-by: Uladzislau Rezki (Sony) [ paulmck: Handle CONFIG_PROVE_RCU=n and identify test cases' callbacks. ] Signed-off-by: Paul E. McKenney --- kernel/rcu/tasks.h | 79 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 79 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 73bbe792fe1e..74767d365752 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -1231,6 +1231,82 @@ void show_rcu_tasks_gp_kthreads(void) } #endif /* #ifndef CONFIG_TINY_RCU */ +#ifdef CONFIG_PROVE_RCU +struct rcu_tasks_test_desc { + struct rcu_head rh; + const char *name; + bool notrun; +}; + +static struct rcu_tasks_test_desc tests[] = { + { + .name = "call_rcu_tasks()", + /* If not defined, the test is skipped. */ + .notrun = !IS_ENABLED(CONFIG_TASKS_RCU), + }, + { + .name = "call_rcu_tasks_rude()", + /* If not defined, the test is skipped. */ + .notrun = !IS_ENABLED(CONFIG_TASKS_RUDE_RCU), + }, + { + .name = "call_rcu_tasks_trace()", + /* If not defined, the test is skipped. 
*/ + .notrun = !IS_ENABLED(CONFIG_TASKS_TRACE_RCU) + } +}; + +static void test_rcu_tasks_callback(struct rcu_head *rhp) +{ + struct rcu_tasks_test_desc *rttd = + container_of(rhp, struct rcu_tasks_test_desc, rh); + + pr_info("Callback from %s invoked.\n", rttd->name); + + rttd->notrun = true; +} + +static void rcu_tasks_initiate_self_tests(void) +{ + pr_info("Running RCU-tasks wait API self tests\n"); +#ifdef CONFIG_TASKS_RCU + synchronize_rcu_tasks(); + call_rcu_tasks(&tests[0].rh, test_rcu_tasks_callback); +#endif + +#ifdef CONFIG_TASKS_RUDE_RCU + synchronize_rcu_tasks_rude(); + call_rcu_tasks_rude(&tests[1].rh, test_rcu_tasks_callback); +#endif + +#ifdef CONFIG_TASKS_TRACE_RCU + synchronize_rcu_tasks_trace(); + call_rcu_tasks_trace(&tests[2].rh, test_rcu_tasks_callback); +#endif +} + +static int rcu_tasks_verify_self_tests(void) +{ + int ret = 0; + int i; + + for (i = 0; i < ARRAY_SIZE(tests); i++) { + if (!tests[i].notrun) { // still hanging. + pr_err("%s has been failed.\n", tests[i].name); + ret = -1; + } + } + + if (ret) + WARN_ON(1); + + return ret; +} +late_initcall(rcu_tasks_verify_self_tests); +#else /* #ifdef CONFIG_PROVE_RCU */ +static void rcu_tasks_initiate_self_tests(void) { } +#endif /* #else #ifdef CONFIG_PROVE_RCU */ + void __init rcu_init_tasks_generic(void) { #ifdef CONFIG_TASKS_RCU @@ -1244,6 +1320,9 @@ void __init rcu_init_tasks_generic(void) #ifdef CONFIG_TASKS_TRACE_RCU rcu_spawn_tasks_trace_kthread(); #endif + + // Run the self-tests. + rcu_tasks_initiate_self_tests(); } #else /* #ifdef CONFIG_TASKS_RCU_GENERIC */ -- cgit v1.2.3 From c26165efac41bce0c7764262b21f5897e771f34f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 21 Dec 2020 21:00:18 -0800 Subject: rcu: Make TASKS_TRACE_RCU select IRQ_WORK Tasks Trace RCU uses irq_work_queue() to safely awaken its grace-period kthread, so this commit therefore causes the TASKS_TRACE_RCU Kconfig option select the IRQ_WORK Kconfig option. Reported-by: kernel test robot Acked-by: Randy Dunlap # build-tested Signed-off-by: Paul E. McKenney --- kernel/rcu/Kconfig | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig index b71e21f73c40..84dfa8dae1b2 100644 --- a/kernel/rcu/Kconfig +++ b/kernel/rcu/Kconfig @@ -95,6 +95,7 @@ config TASKS_RUDE_RCU config TASKS_TRACE_RCU def_bool 0 + select IRQ_WORK help This option enables a task-based RCU implementation that uses explicit rcu_read_lock_trace() read-side markers, and allows -- cgit v1.2.3 From f2485a2dc9f0f30fbdd013ad5772975100c71360 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 13 Jun 2020 00:08:44 -0400 Subject: elf_prstatus: collect the common part (everything before pr_reg) into a struct Preparations to doing i386 compat elf_prstatus sanely - rather than duplicating the beginning of compat_elf_prstatus, take these fields into a separate structure (compat_elf_prstatus_common), so that it could be reused. Due to the incestous relationship between binfmt_elf.c and compat_binfmt_elf.c we need the same shape change done to native struct elf_prstatus, gathering the fields prior to pr_reg into a new structure (struct elf_prstatus_common). Fortunately, offset of pr_reg is always a multiple of 16 with no padding right before it, so it's possible to turn all the stuff prior to it into a single member without disturbing the layout. 
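A condensed sketch of the reshaping described above (not the patch text itself; the field list is abbreviated, and the authoritative definitions are in the include/linux/elfcore.h and include/linux/elfcore-compat.h hunks below):

struct elf_prstatus_common {
	struct elf_siginfo pr_info;		/* Info associated with signal */
	short pr_cursig;			/* Current signal */
	/* ... pending/held signal sets, the pids and the four timevals
	 *     follow here, exactly as they did before the split ... */
};

struct elf_prstatus {
	struct elf_prstatus_common common;	/* everything that used to precede pr_reg */
	elf_gregset_t pr_reg;			/* GP registers */
	int pr_fpvalid;				/* True if math co-processor being used */
};

Since there is no padding in front of pr_reg, embedding the prefix as a single member keeps the note layout byte-for-byte identical.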
[build fix from Geert Uytterhoeven folded in] Signed-off-by: Al Viro --- arch/ia64/kernel/crash.c | 2 +- arch/mips/kernel/binfmt_elfn32.c | 7 ++++++- arch/mips/kernel/binfmt_elfo32.c | 6 +++++- arch/powerpc/platforms/powernv/opal-core.c | 6 +++--- arch/s390/kernel/crash_dump.c | 2 +- fs/binfmt_elf.c | 8 ++++---- fs/binfmt_elf_fdpic.c | 22 +++++----------------- fs/compat_binfmt_elf.c | 1 + include/linux/elfcore-compat.h | 7 ++++++- include/linux/elfcore.h | 7 ++++++- kernel/kexec_core.c | 2 +- 11 files changed, 39 insertions(+), 31 deletions(-) (limited to 'kernel') diff --git a/arch/ia64/kernel/crash.c b/arch/ia64/kernel/crash.c index fec70d662d0c..4f47741005d2 100644 --- a/arch/ia64/kernel/crash.c +++ b/arch/ia64/kernel/crash.c @@ -43,7 +43,7 @@ crash_save_this_cpu(void) elf_greg_t *dst = (elf_greg_t *)&(prstatus->pr_reg); memset(prstatus, 0, sizeof(*prstatus)); - prstatus->pr_pid = current->pid; + prstatus->common.pr_pid = current->pid; ia64_dump_cpu_regs(dst); cfm = dst[43]; diff --git a/arch/mips/kernel/binfmt_elfn32.c b/arch/mips/kernel/binfmt_elfn32.c index 6ee3f7218c67..136dc0c9300d 100644 --- a/arch/mips/kernel/binfmt_elfn32.c +++ b/arch/mips/kernel/binfmt_elfn32.c @@ -44,7 +44,8 @@ typedef elf_fpreg_t elf_fpregset_t[ELF_NFPREG]; #include #define elf_prstatus elf_prstatus32 -struct elf_prstatus32 +#define elf_prstatus_common elf_prstatus32_common +struct elf_prstatus32_common { struct elf_siginfo pr_info; /* Info associated with signal */ short pr_cursig; /* Current signal */ @@ -58,6 +59,10 @@ struct elf_prstatus32 struct old_timeval32 pr_stime; /* System time */ struct old_timeval32 pr_cutime;/* Cumulative user time */ struct old_timeval32 pr_cstime;/* Cumulative system time */ +}; +struct elf_prstatus32 +{ + struct elf_prstatus32_common common; elf_gregset_t pr_reg; /* GP registers */ int pr_fpvalid; /* True if math co-processor being used. */ }; diff --git a/arch/mips/kernel/binfmt_elfo32.c b/arch/mips/kernel/binfmt_elfo32.c index 6dd103d3cebb..b1f4b8f1dee7 100644 --- a/arch/mips/kernel/binfmt_elfo32.c +++ b/arch/mips/kernel/binfmt_elfo32.c @@ -49,7 +49,7 @@ typedef elf_fpreg_t elf_fpregset_t[ELF_NFPREG]; #include #define elf_prstatus elf_prstatus32 -struct elf_prstatus32 +struct elf_prstatus32_common { struct elf_siginfo pr_info; /* Info associated with signal */ short pr_cursig; /* Current signal */ @@ -63,6 +63,10 @@ struct elf_prstatus32 struct old_timeval32 pr_stime; /* System time */ struct old_timeval32 pr_cutime;/* Cumulative user time */ struct old_timeval32 pr_cstime;/* Cumulative system time */ +}; +struct elf_prstatus32 +{ + struct elf_prstatus32_common common; elf_gregset_t pr_reg; /* GP registers */ int pr_fpvalid; /* True if math co-processor being used. */ }; diff --git a/arch/powerpc/platforms/powernv/opal-core.c b/arch/powerpc/platforms/powernv/opal-core.c index 23571f0b555a..0d9ba70f7251 100644 --- a/arch/powerpc/platforms/powernv/opal-core.c +++ b/arch/powerpc/platforms/powernv/opal-core.c @@ -119,8 +119,8 @@ static void fill_prstatus(struct elf_prstatus *prstatus, int pir, * As a PIR value could also be '0', add an offset of '100' * to every PIR to avoid misinterpretations in GDB. */ - prstatus->pr_pid = cpu_to_be32(100 + pir); - prstatus->pr_ppid = cpu_to_be32(1); + prstatus->common.pr_pid = cpu_to_be32(100 + pir); + prstatus->common.pr_ppid = cpu_to_be32(1); /* * Indicate SIGUSR1 for crash initiated from kernel. @@ -130,7 +130,7 @@ static void fill_prstatus(struct elf_prstatus *prstatus, int pir, short sig; sig = kernel_initiated ?
SIGUSR1 : SIGTERM; - prstatus->pr_cursig = cpu_to_be16(sig); + prstatus->common.pr_cursig = cpu_to_be16(sig); } } diff --git a/arch/s390/kernel/crash_dump.c b/arch/s390/kernel/crash_dump.c index 205b2e2648aa..0e36dfc9ccd6 100644 --- a/arch/s390/kernel/crash_dump.c +++ b/arch/s390/kernel/crash_dump.c @@ -365,7 +365,7 @@ static void *fill_cpu_elf_notes(void *ptr, int cpu, struct save_area *sa) memcpy(&nt_prstatus.pr_reg.gprs, sa->gprs, sizeof(sa->gprs)); memcpy(&nt_prstatus.pr_reg.psw, sa->psw, sizeof(sa->psw)); memcpy(&nt_prstatus.pr_reg.acrs, sa->acrs, sizeof(sa->acrs)); - nt_prstatus.pr_pid = cpu; + nt_prstatus.common.pr_pid = cpu; /* Prepare fpregset (floating point) note */ memset(&nt_fpregset, 0, sizeof(nt_fpregset)); memcpy(&nt_fpregset.fpc, &sa->fpc, sizeof(sa->fpc)); diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 8380478d3d92..4c1550b13899 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -1495,7 +1495,7 @@ static void fill_note(struct memelfnote *note, const char *name, int type, * fill up all the fields in prstatus from the given task struct, except * registers which need to be filled up separately. */ -static void fill_prstatus(struct elf_prstatus *prstatus, +static void fill_prstatus(struct elf_prstatus_common *prstatus, struct task_struct *p, long signr) { prstatus->pr_info.si_signo = prstatus->pr_cursig = signr; @@ -1736,7 +1736,7 @@ static int fill_thread_core_info(struct elf_thread_core_info *t, * than being the whole note contents. We fill the reset in here. * We assume that regset 0 is NT_PRSTATUS. */ - fill_prstatus(&t->prstatus, t->task, signr); + fill_prstatus(&t->prstatus.common, t->task, signr); regset_get(t->task, &view->regsets[0], sizeof(t->prstatus.pr_reg), &t->prstatus.pr_reg); @@ -1958,7 +1958,7 @@ static int elf_dump_thread_status(long signr, struct elf_thread_status *t) struct task_struct *p = t->thread; t->num_notes = 0; - fill_prstatus(&t->prstatus, p, signr); + fill_prstatus(&t->prstatus.common, p, signr); elf_core_copy_task_regs(p, &t->prstatus.pr_reg); fill_note(&t->notes[0], "CORE", NT_PRSTATUS, sizeof(t->prstatus), @@ -2037,7 +2037,7 @@ static int fill_note_info(struct elfhdr *elf, int phdrs, } /* now collect the dump for the current */ memset(info->prstatus, 0, sizeof(*info->prstatus)); - fill_prstatus(info->prstatus, current, siginfo->si_signo); + fill_prstatus(&info->prstatus->common, current, siginfo->si_signo); elf_core_copy_regs(&info->prstatus->pr_reg, regs); /* Set up header */ diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index be4062b8ba75..03d81a14bcbf 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -1191,18 +1191,7 @@ static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *params, struct elf_prstatus_fdpic { - struct elf_siginfo pr_info; /* Info associated with signal */ - short pr_cursig; /* Current signal */ - unsigned long pr_sigpend; /* Set of pending signals */ - unsigned long pr_sighold; /* Set of held signals */ - pid_t pr_pid; - pid_t pr_ppid; - pid_t pr_pgrp; - pid_t pr_sid; - struct __kernel_old_timeval pr_utime; /* User time */ - struct __kernel_old_timeval pr_stime; /* System time */ - struct __kernel_old_timeval pr_cutime; /* Cumulative user time */ - struct __kernel_old_timeval pr_cstime; /* Cumulative system time */ + struct elf_prstatus_common common; elf_gregset_t pr_reg; /* GP registers */ /* When using FDPIC, the loadmap addresses need to be communicated * to GDB in order for GDB to do the necessary relocations. 
The @@ -1301,7 +1290,7 @@ static inline void fill_note(struct memelfnote *note, const char *name, int type * fill up all the fields in prstatus from the given task struct, except * registers which need to be filled up separately. */ -static void fill_prstatus(struct elf_prstatus_fdpic *prstatus, +static void fill_prstatus(struct elf_prstatus_common *prstatus, struct task_struct *p, long signr) { prstatus->pr_info.si_signo = prstatus->pr_cursig = signr; @@ -1332,9 +1321,6 @@ static void fill_prstatus(struct elf_prstatus_fdpic *prstatus, } prstatus->pr_cutime = ns_to_kernel_old_timeval(p->signal->cutime); prstatus->pr_cstime = ns_to_kernel_old_timeval(p->signal->cstime); - - prstatus->pr_exec_fdpic_loadmap = p->mm->context.exec_fdpic_loadmap; - prstatus->pr_interp_fdpic_loadmap = p->mm->context.interp_fdpic_loadmap; } static int fill_psinfo(struct elf_prpsinfo *psinfo, struct task_struct *p, @@ -1405,7 +1391,9 @@ static struct elf_thread_status *elf_dump_thread_status(long signr, struct task_ if (!t) return t; - fill_prstatus(&t->prstatus, p, signr); + fill_prstatus(&t->prstatus.common, p, signr); + t->prstatus.pr_exec_fdpic_loadmap = p->mm->context.exec_fdpic_loadmap; + t->prstatus.pr_interp_fdpic_loadmap = p->mm->context.interp_fdpic_loadmap; regset_get(p, &view->regsets[0], sizeof(t->prstatus.pr_reg), &t->prstatus.pr_reg); diff --git a/fs/compat_binfmt_elf.c b/fs/compat_binfmt_elf.c index 962df845ed51..feb48a5c2d44 100644 --- a/fs/compat_binfmt_elf.c +++ b/fs/compat_binfmt_elf.c @@ -50,6 +50,7 @@ * which requires asm/elf.h to define compat_elf_gregset_t et al. */ #define elf_prstatus compat_elf_prstatus +#define elf_prstatus_common compat_elf_prstatus_common #define elf_prpsinfo compat_elf_prpsinfo #undef ns_to_kernel_old_timeval diff --git a/include/linux/elfcore-compat.h b/include/linux/elfcore-compat.h index 10485f0c9740..4aeda5f1f038 100644 --- a/include/linux/elfcore-compat.h +++ b/include/linux/elfcore-compat.h @@ -17,7 +17,7 @@ struct compat_elf_siginfo compat_int_t si_errno; }; -struct compat_elf_prstatus +struct compat_elf_prstatus_common { struct compat_elf_siginfo pr_info; short pr_cursig; @@ -31,6 +31,11 @@ struct compat_elf_prstatus struct old_timeval32 pr_stime; struct old_timeval32 pr_cutime; struct old_timeval32 pr_cstime; +}; + +struct compat_elf_prstatus +{ + struct compat_elf_prstatus_common common; compat_elf_gregset_t pr_reg; compat_int_t pr_fpvalid; }; diff --git a/include/linux/elfcore.h b/include/linux/elfcore.h index de51c1bef27d..2aaa15779d50 100644 --- a/include/linux/elfcore.h +++ b/include/linux/elfcore.h @@ -29,7 +29,7 @@ struct elf_siginfo * the SVR4 structure, but more Linuxy, with things that Linux does * not support and which gdb doesn't really use excluded. */ -struct elf_prstatus +struct elf_prstatus_common { struct elf_siginfo pr_info; /* Info associated with signal */ short pr_cursig; /* Current signal */ @@ -43,6 +43,11 @@ struct elf_prstatus struct __kernel_old_timeval pr_stime; /* System time */ struct __kernel_old_timeval pr_cutime; /* Cumulative user time */ struct __kernel_old_timeval pr_cstime; /* Cumulative system time */ +}; + +struct elf_prstatus +{ + struct elf_prstatus_common common; elf_gregset_t pr_reg; /* GP registers */ int pr_fpvalid; /* True if math co-processor being used. 
*/ }; diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index 4f8efc278aa7..80905e5aa8ae 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -1076,7 +1076,7 @@ void crash_save_cpu(struct pt_regs *regs, int cpu) if (!buf) return; memset(&prstatus, 0, sizeof(prstatus)); - prstatus.pr_pid = current->pid; + prstatus.common.pr_pid = current->pid; elf_core_copy_kernel_regs(&prstatus.pr_reg, regs); buf = append_elf_note(buf, KEXEC_CORE_NOTE_NAME, NT_PRSTATUS, &prstatus, sizeof(prstatus)); -- cgit v1.2.3 From 00b8c557d096f0930d5c07df618223d3d06902d6 Mon Sep 17 00:00:00 2001 From: Matthias Maennich Date: Wed, 6 Jan 2021 15:52:01 +0000 Subject: staging: ION: remove some references to CONFIG_ION With commit e722a295cf49 ("staging: ion: remove from the tree"), ION and its corresponding config CONFIG_ION is gone. Remove stale references from drivers/staging/media/atomisp/pci and from the recommended Android kernel config. Fixes: e722a295cf49 ("staging: ion: remove from the tree") Cc: Hridya Valsaraju Cc: Rob Herring Cc: linux-media@vger.kernel.org Cc: devel@driverdev.osuosl.org Signed-off-by: Matthias Maennich Link: https://lore.kernel.org/r/20210106155201.2845319-1-maennich@google.com Signed-off-by: Greg Kroah-Hartman --- drivers/staging/media/atomisp/pci/atomisp_subdev.c | 20 -------------------- kernel/configs/android-recommended.config | 1 - 2 files changed, 21 deletions(-) (limited to 'kernel') diff --git a/drivers/staging/media/atomisp/pci/atomisp_subdev.c b/drivers/staging/media/atomisp/pci/atomisp_subdev.c index 52b9fb18c87f..b666cb23e5ca 100644 --- a/drivers/staging/media/atomisp/pci/atomisp_subdev.c +++ b/drivers/staging/media/atomisp/pci/atomisp_subdev.c @@ -1062,26 +1062,6 @@ static const struct v4l2_ctrl_config ctrl_select_isp_version = { .def = 0, }; -#if 0 /* #ifdef CONFIG_ION */ -/* - * Control for ISP ion device fd - * - * userspace will open ion device and pass the fd to kernel. - * this fd will be used to map shared fd to buffer. - */ -/* V4L2_CID_ATOMISP_ION_DEVICE_FD is not defined */ -static const struct v4l2_ctrl_config ctrl_ion_dev_fd = { - .ops = &ctrl_ops, - .id = V4L2_CID_ATOMISP_ION_DEVICE_FD, - .type = V4L2_CTRL_TYPE_INTEGER, - .name = "Ion Device Fd", - .min = -1, - .max = 1024, - .step = 1, - .def = ION_FD_UNSET -}; -#endif - static void atomisp_init_subdev_pipe(struct atomisp_sub_device *asd, struct atomisp_video_pipe *pipe, enum v4l2_buf_type buf_type) { diff --git a/kernel/configs/android-recommended.config b/kernel/configs/android-recommended.config index 53d688bdd894..eb0029c9a6a6 100644 --- a/kernel/configs/android-recommended.config +++ b/kernel/configs/android-recommended.config @@ -81,7 +81,6 @@ CONFIG_INPUT_JOYSTICK=y CONFIG_INPUT_MISC=y CONFIG_INPUT_TABLET=y CONFIG_INPUT_UINPUT=y -CONFIG_ION=y CONFIG_JOYSTICK_XPAD=y CONFIG_JOYSTICK_XPAD_FF=y CONFIG_JOYSTICK_XPAD_LEDS=y -- cgit v1.2.3 From c2e13112e830c06825339cbadf0b3bc2bdb9a716 Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Tue, 3 Nov 2020 09:26:03 -0500 Subject: rcu/segcblist: Add additional comments to explain smp_mb() One counter-intuitive property of RCU is the fact that full memory barriers are needed both before and after updates to the full (non-segmented) length. This patch therefore helps to assist the reader's intuition by adding appropriate comments. [ paulmck: Wordsmithing. ] Signed-off-by: Joel Fernandes (Google) Signed-off-by: Paul E. 
McKenney --- kernel/rcu/rcu_segcblist.c | 68 +++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 64 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c index bb246d8c6ef1..3cff8003f707 100644 --- a/kernel/rcu/rcu_segcblist.c +++ b/kernel/rcu/rcu_segcblist.c @@ -94,17 +94,77 @@ static void rcu_segcblist_set_len(struct rcu_segcblist *rsclp, long v) * field to disagree with the actual number of callbacks on the structure. * This increase is fully ordered with respect to the callers accesses * both before and after. + * + * So why on earth is a memory barrier required both before and after + * the update to the ->len field??? + * + * The reason is that rcu_barrier() locklessly samples each CPU's ->len + * field, and if a given CPU's field is zero, avoids IPIing that CPU. + * This can of course race with both queuing and invoking of callbacks. + * Failing to correctly handle either of these races could result in + * rcu_barrier() failing to IPI a CPU that actually had callbacks queued + * which rcu_barrier() was obligated to wait on. And if rcu_barrier() + * failed to wait on such a callback, unloading certain kernel modules + * would result in calls to functions whose code was no longer present in + * the kernel, for but one example. + * + * Therefore, ->len transitions from 1->0 and 0->1 have to be carefully + * ordered with respect with both list modifications and the rcu_barrier(). + * + * The queuing case is CASE 1 and the invoking case is CASE 2. + * + * CASE 1: Suppose that CPU 0 has no callbacks queued, but invokes + * call_rcu() just as CPU 1 invokes rcu_barrier(). CPU 0's ->len field + * will transition from 0->1, which is one of the transitions that must + * be handled carefully. Without the full memory barriers after the ->len + * update and at the beginning of rcu_barrier(), the following could happen: + * + * CPU 0 CPU 1 + * + * call_rcu(). + * rcu_barrier() sees ->len as 0. + * set ->len = 1. + * rcu_barrier() does nothing. + * module is unloaded. + * callback invokes unloaded function! + * + * With the full barriers, any case where rcu_barrier() sees ->len as 0 will + * have unambiguously preceded the return from the racing call_rcu(), which + * means that this call_rcu() invocation is OK to not wait on. After all, + * you are supposed to make sure that any problematic call_rcu() invocations + * happen before the rcu_barrier(). + * + * + * CASE 2: Suppose that CPU 0 is invoking its last callback just as + * CPU 1 invokes rcu_barrier(). CPU 0's ->len field will transition from + * 1->0, which is one of the transitions that must be handled carefully. + * Without the full memory barriers before the ->len update and at the + * end of rcu_barrier(), the following could happen: + * + * CPU 0 CPU 1 + * + * start invoking last callback + * set ->len = 0 (reordered) + * rcu_barrier() sees ->len as 0 + * rcu_barrier() does nothing. + * module is unloaded + * callback executing after unloaded! + * + * With the full barriers, any case where rcu_barrier() sees ->len as 0 + * will be fully ordered after the completion of the callback function, + * so that the module unloading operation is completely safe. + * */ void rcu_segcblist_add_len(struct rcu_segcblist *rsclp, long v) { #ifdef CONFIG_RCU_NOCB_CPU - smp_mb__before_atomic(); /* Up to the caller! */ + smp_mb__before_atomic(); // Read header comment above. atomic_long_add(v, &rsclp->len); - smp_mb__after_atomic(); /* Up to the caller! 
*/ + smp_mb__after_atomic(); // Read header comment above. #else - smp_mb(); /* Up to the caller! */ + smp_mb(); // Read header comment above. WRITE_ONCE(rsclp->len, rsclp->len + v); - smp_mb(); /* Up to the caller! */ + smp_mb(); // Read header comment above. #endif } -- cgit v1.2.3 From ae5c2341ed3987bd434ed495bd4f3d8b2bc3e623 Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Wed, 23 Sep 2020 11:22:09 -0400 Subject: rcu/segcblist: Add counters to segcblist datastructure Add counting of segment lengths of segmented callback list. This will be useful for a number of things such as knowing how big the ready-to-execute segment have gotten. The immediate benefit is ability to trace how the callbacks in the segmented callback list change. Also this patch remove hacks related to using donecbs's ->len field as a temporary variable to save the segmented callback list's length. This cannot be done anymore and is not needed. Also fix SRCU: The negative counting of the unsegmented list cannot be used to adjust the segmented one. To fix this, sample the unsegmented length in advance, and use it after CB execution to adjust the segmented list's length. Reviewed-by: Frederic Weisbecker Signed-off-by: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney --- include/linux/rcu_segcblist.h | 1 + kernel/rcu/rcu_segcblist.c | 120 +++++++++++++++++++++++++++--------------- kernel/rcu/rcu_segcblist.h | 2 - kernel/rcu/srcutree.c | 5 +- 4 files changed, 82 insertions(+), 46 deletions(-) (limited to 'kernel') diff --git a/include/linux/rcu_segcblist.h b/include/linux/rcu_segcblist.h index b36afe7b22c9..6c01f09a6456 100644 --- a/include/linux/rcu_segcblist.h +++ b/include/linux/rcu_segcblist.h @@ -72,6 +72,7 @@ struct rcu_segcblist { #else long len; #endif + long seglen[RCU_CBLIST_NSEGS]; u8 enabled; u8 offloaded; }; diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c index 3cff8003f707..777780447887 100644 --- a/kernel/rcu/rcu_segcblist.c +++ b/kernel/rcu/rcu_segcblist.c @@ -7,10 +7,10 @@ * Authors: Paul E. McKenney */ -#include -#include +#include #include -#include +#include +#include #include "rcu_segcblist.h" @@ -88,6 +88,46 @@ static void rcu_segcblist_set_len(struct rcu_segcblist *rsclp, long v) #endif } +/* Get the length of a segment of the rcu_segcblist structure. */ +static long rcu_segcblist_get_seglen(struct rcu_segcblist *rsclp, int seg) +{ + return READ_ONCE(rsclp->seglen[seg]); +} + +/* Set the length of a segment of the rcu_segcblist structure. */ +static void rcu_segcblist_set_seglen(struct rcu_segcblist *rsclp, int seg, long v) +{ + WRITE_ONCE(rsclp->seglen[seg], v); +} + +/* Increase the numeric length of a segment by a specified amount. */ +static void rcu_segcblist_add_seglen(struct rcu_segcblist *rsclp, int seg, long v) +{ + WRITE_ONCE(rsclp->seglen[seg], rsclp->seglen[seg] + v); +} + +/* Move from's segment length to to's segment. */ +static void rcu_segcblist_move_seglen(struct rcu_segcblist *rsclp, int from, int to) +{ + long len; + + if (from == to) + return; + + len = rcu_segcblist_get_seglen(rsclp, from); + if (!len) + return; + + rcu_segcblist_add_seglen(rsclp, to, len); + rcu_segcblist_set_seglen(rsclp, from, 0); +} + +/* Increment segment's length. */ +static void rcu_segcblist_inc_seglen(struct rcu_segcblist *rsclp, int seg) +{ + rcu_segcblist_add_seglen(rsclp, seg, 1); +} + /* * Increase the numeric length of an rcu_segcblist structure by the * specified amount, which can be negative. 
This can cause the ->len @@ -179,26 +219,6 @@ void rcu_segcblist_inc_len(struct rcu_segcblist *rsclp) rcu_segcblist_add_len(rsclp, 1); } -/* - * Exchange the numeric length of the specified rcu_segcblist structure - * with the specified value. This can cause the ->len field to disagree - * with the actual number of callbacks on the structure. This exchange is - * fully ordered with respect to the callers accesses both before and after. - */ -static long rcu_segcblist_xchg_len(struct rcu_segcblist *rsclp, long v) -{ -#ifdef CONFIG_RCU_NOCB_CPU - return atomic_long_xchg(&rsclp->len, v); -#else - long ret = rsclp->len; - - smp_mb(); /* Up to the caller! */ - WRITE_ONCE(rsclp->len, v); - smp_mb(); /* Up to the caller! */ - return ret; -#endif -} - /* * Initialize an rcu_segcblist structure. */ @@ -209,8 +229,10 @@ void rcu_segcblist_init(struct rcu_segcblist *rsclp) BUILD_BUG_ON(RCU_NEXT_TAIL + 1 != ARRAY_SIZE(rsclp->gp_seq)); BUILD_BUG_ON(ARRAY_SIZE(rsclp->tails) != ARRAY_SIZE(rsclp->gp_seq)); rsclp->head = NULL; - for (i = 0; i < RCU_CBLIST_NSEGS; i++) + for (i = 0; i < RCU_CBLIST_NSEGS; i++) { rsclp->tails[i] = &rsclp->head; + rcu_segcblist_set_seglen(rsclp, i, 0); + } rcu_segcblist_set_len(rsclp, 0); rsclp->enabled = 1; } @@ -306,6 +328,7 @@ void rcu_segcblist_enqueue(struct rcu_segcblist *rsclp, { rcu_segcblist_inc_len(rsclp); smp_mb(); /* Ensure counts are updated before callback is enqueued. */ + rcu_segcblist_inc_seglen(rsclp, RCU_NEXT_TAIL); rhp->next = NULL; WRITE_ONCE(*rsclp->tails[RCU_NEXT_TAIL], rhp); WRITE_ONCE(rsclp->tails[RCU_NEXT_TAIL], &rhp->next); @@ -334,27 +357,13 @@ bool rcu_segcblist_entrain(struct rcu_segcblist *rsclp, for (i = RCU_NEXT_TAIL; i > RCU_DONE_TAIL; i--) if (rsclp->tails[i] != rsclp->tails[i - 1]) break; + rcu_segcblist_inc_seglen(rsclp, i); WRITE_ONCE(*rsclp->tails[i], rhp); for (; i <= RCU_NEXT_TAIL; i++) WRITE_ONCE(rsclp->tails[i], &rhp->next); return true; } -/* - * Extract only the counts from the specified rcu_segcblist structure, - * and place them in the specified rcu_cblist structure. This function - * supports both callback orphaning and invocation, hence the separation - * of counts and callbacks. (Callbacks ready for invocation must be - * orphaned and adopted separately from pending callbacks, but counts - * apply to all callbacks. Locking must be used to make sure that - * both orphaned-callbacks lists are consistent.) - */ -void rcu_segcblist_extract_count(struct rcu_segcblist *rsclp, - struct rcu_cblist *rclp) -{ - rclp->len = rcu_segcblist_xchg_len(rsclp, 0); -} - /* * Extract only those callbacks ready to be invoked from the specified * rcu_segcblist structure and place them in the specified rcu_cblist @@ -367,6 +376,7 @@ void rcu_segcblist_extract_done_cbs(struct rcu_segcblist *rsclp, if (!rcu_segcblist_ready_cbs(rsclp)) return; /* Nothing to do. */ + rclp->len = rcu_segcblist_get_seglen(rsclp, RCU_DONE_TAIL); *rclp->tail = rsclp->head; WRITE_ONCE(rsclp->head, *rsclp->tails[RCU_DONE_TAIL]); WRITE_ONCE(*rsclp->tails[RCU_DONE_TAIL], NULL); @@ -374,6 +384,7 @@ void rcu_segcblist_extract_done_cbs(struct rcu_segcblist *rsclp, for (i = RCU_CBLIST_NSEGS - 1; i >= RCU_DONE_TAIL; i--) if (rsclp->tails[i] == rsclp->tails[RCU_DONE_TAIL]) WRITE_ONCE(rsclp->tails[i], &rsclp->head); + rcu_segcblist_set_seglen(rsclp, RCU_DONE_TAIL, 0); } /* @@ -390,11 +401,15 @@ void rcu_segcblist_extract_pend_cbs(struct rcu_segcblist *rsclp, if (!rcu_segcblist_pend_cbs(rsclp)) return; /* Nothing to do. 
*/ + rclp->len = 0; *rclp->tail = *rsclp->tails[RCU_DONE_TAIL]; rclp->tail = rsclp->tails[RCU_NEXT_TAIL]; WRITE_ONCE(*rsclp->tails[RCU_DONE_TAIL], NULL); - for (i = RCU_DONE_TAIL + 1; i < RCU_CBLIST_NSEGS; i++) + for (i = RCU_DONE_TAIL + 1; i < RCU_CBLIST_NSEGS; i++) { + rclp->len += rcu_segcblist_get_seglen(rsclp, i); WRITE_ONCE(rsclp->tails[i], rsclp->tails[RCU_DONE_TAIL]); + rcu_segcblist_set_seglen(rsclp, i, 0); + } } /* @@ -405,7 +420,6 @@ void rcu_segcblist_insert_count(struct rcu_segcblist *rsclp, struct rcu_cblist *rclp) { rcu_segcblist_add_len(rsclp, rclp->len); - rclp->len = 0; } /* @@ -419,6 +433,7 @@ void rcu_segcblist_insert_done_cbs(struct rcu_segcblist *rsclp, if (!rclp->head) return; /* No callbacks to move. */ + rcu_segcblist_add_seglen(rsclp, RCU_DONE_TAIL, rclp->len); *rclp->tail = rsclp->head; WRITE_ONCE(rsclp->head, rclp->head); for (i = RCU_DONE_TAIL; i < RCU_CBLIST_NSEGS; i++) @@ -439,6 +454,8 @@ void rcu_segcblist_insert_pend_cbs(struct rcu_segcblist *rsclp, { if (!rclp->head) return; /* Nothing to do. */ + + rcu_segcblist_add_seglen(rsclp, RCU_NEXT_TAIL, rclp->len); WRITE_ONCE(*rsclp->tails[RCU_NEXT_TAIL], rclp->head); WRITE_ONCE(rsclp->tails[RCU_NEXT_TAIL], rclp->tail); } @@ -463,6 +480,7 @@ void rcu_segcblist_advance(struct rcu_segcblist *rsclp, unsigned long seq) if (ULONG_CMP_LT(seq, rsclp->gp_seq[i])) break; WRITE_ONCE(rsclp->tails[RCU_DONE_TAIL], rsclp->tails[i]); + rcu_segcblist_move_seglen(rsclp, i, RCU_DONE_TAIL); } /* If no callbacks moved, nothing more need be done. */ @@ -483,6 +501,7 @@ void rcu_segcblist_advance(struct rcu_segcblist *rsclp, unsigned long seq) if (rsclp->tails[j] == rsclp->tails[RCU_NEXT_TAIL]) break; /* No more callbacks. */ WRITE_ONCE(rsclp->tails[j], rsclp->tails[i]); + rcu_segcblist_move_seglen(rsclp, i, j); rsclp->gp_seq[j] = rsclp->gp_seq[i]; } } @@ -504,7 +523,7 @@ void rcu_segcblist_advance(struct rcu_segcblist *rsclp, unsigned long seq) */ bool rcu_segcblist_accelerate(struct rcu_segcblist *rsclp, unsigned long seq) { - int i; + int i, j; WARN_ON_ONCE(!rcu_segcblist_is_enabled(rsclp)); if (rcu_segcblist_restempty(rsclp, RCU_DONE_TAIL)) @@ -547,6 +566,10 @@ bool rcu_segcblist_accelerate(struct rcu_segcblist *rsclp, unsigned long seq) if (rcu_segcblist_restempty(rsclp, i) || ++i >= RCU_NEXT_TAIL) return false; + /* Accounting: everything below i is about to get merged into i. */ + for (j = i + 1; j <= RCU_NEXT_TAIL; j++) + rcu_segcblist_move_seglen(rsclp, j, i); + /* * Merge all later callbacks, including newly arrived callbacks, * into the segment located by the for-loop above. Assign "seq" @@ -574,13 +597,24 @@ void rcu_segcblist_merge(struct rcu_segcblist *dst_rsclp, struct rcu_cblist donecbs; struct rcu_cblist pendcbs; + lockdep_assert_cpus_held(); + rcu_cblist_init(&donecbs); rcu_cblist_init(&pendcbs); - rcu_segcblist_extract_count(src_rsclp, &donecbs); + rcu_segcblist_extract_done_cbs(src_rsclp, &donecbs); rcu_segcblist_extract_pend_cbs(src_rsclp, &pendcbs); + + /* + * No need smp_mb() before setting length to 0, because CPU hotplug + * lock excludes rcu_barrier. 
+ */ + rcu_segcblist_set_len(src_rsclp, 0); + rcu_segcblist_insert_count(dst_rsclp, &donecbs); + rcu_segcblist_insert_count(dst_rsclp, &pendcbs); rcu_segcblist_insert_done_cbs(dst_rsclp, &donecbs); rcu_segcblist_insert_pend_cbs(dst_rsclp, &pendcbs); + rcu_segcblist_init(src_rsclp); } diff --git a/kernel/rcu/rcu_segcblist.h b/kernel/rcu/rcu_segcblist.h index 1d2d61406463..cd35c9faaf51 100644 --- a/kernel/rcu/rcu_segcblist.h +++ b/kernel/rcu/rcu_segcblist.h @@ -89,8 +89,6 @@ void rcu_segcblist_enqueue(struct rcu_segcblist *rsclp, struct rcu_head *rhp); bool rcu_segcblist_entrain(struct rcu_segcblist *rsclp, struct rcu_head *rhp); -void rcu_segcblist_extract_count(struct rcu_segcblist *rsclp, - struct rcu_cblist *rclp); void rcu_segcblist_extract_done_cbs(struct rcu_segcblist *rsclp, struct rcu_cblist *rclp); void rcu_segcblist_extract_pend_cbs(struct rcu_segcblist *rsclp, diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 0f23d20d485a..79b7081143a7 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -1160,6 +1160,7 @@ static void srcu_advance_state(struct srcu_struct *ssp) */ static void srcu_invoke_callbacks(struct work_struct *work) { + long len; bool more; struct rcu_cblist ready_cbs; struct rcu_head *rhp; @@ -1182,6 +1183,7 @@ static void srcu_invoke_callbacks(struct work_struct *work) /* We are on the job! Extract and invoke ready callbacks. */ sdp->srcu_cblist_invoking = true; rcu_segcblist_extract_done_cbs(&sdp->srcu_cblist, &ready_cbs); + len = ready_cbs.len; spin_unlock_irq_rcu_node(sdp); rhp = rcu_cblist_dequeue(&ready_cbs); for (; rhp != NULL; rhp = rcu_cblist_dequeue(&ready_cbs)) { @@ -1190,13 +1192,14 @@ static void srcu_invoke_callbacks(struct work_struct *work) rhp->func(rhp); local_bh_enable(); } + WARN_ON_ONCE(ready_cbs.len); /* * Update counts, accelerate new callbacks, and if needed, * schedule another round of callback invocation. */ spin_lock_irq_rcu_node(sdp); - rcu_segcblist_insert_count(&sdp->srcu_cblist, &ready_cbs); + rcu_segcblist_add_len(&sdp->srcu_cblist, -len); (void)rcu_segcblist_accelerate(&sdp->srcu_cblist, rcu_seq_snap(&ssp->srcu_gp_seq)); sdp->srcu_cblist_invoking = false; -- cgit v1.2.3 From 68804cf1c905ce227e4e1d0bc252c216811c59fd Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Wed, 14 Oct 2020 18:21:53 -0400 Subject: rcu/tree: segcblist: Remove redundant smp_mb()s The full memory barriers in rcu_segcblist_enqueue() and in rcu_do_batch() are not needed because rcu_segcblist_add_len(), and thus also rcu_segcblist_inc_len(), already includes a memory barrier *before* and *after* the length of the list is updated. This commit therefore removes these redundant smp_mb() invocations. Reviewed-by: Frederic Weisbecker Signed-off-by: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney --- kernel/rcu/rcu_segcblist.c | 1 - kernel/rcu/tree.c | 1 - 2 files changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c index 777780447887..1e80a0a9036a 100644 --- a/kernel/rcu/rcu_segcblist.c +++ b/kernel/rcu/rcu_segcblist.c @@ -327,7 +327,6 @@ void rcu_segcblist_enqueue(struct rcu_segcblist *rsclp, struct rcu_head *rhp) { rcu_segcblist_inc_len(rsclp); - smp_mb(); /* Ensure counts are updated before callback is enqueued. 
*/ rcu_segcblist_inc_seglen(rsclp, RCU_NEXT_TAIL); rhp->next = NULL; WRITE_ONCE(*rsclp->tails[RCU_NEXT_TAIL], rhp); diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index cc6f379c7ddf..b0fb654cba80 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2523,7 +2523,6 @@ static void rcu_do_batch(struct rcu_data *rdp) /* Update counts and requeue any remaining callbacks. */ rcu_segcblist_insert_done_cbs(&rdp->cblist, &rcl); - smp_mb(); /* List handling before counting for rcu_barrier(). */ rcu_segcblist_add_len(&rdp->cblist, -count); /* Reinstate batch limit if we have worked down the excess. */ -- cgit v1.2.3 From 3afe7fa535491ecd0382c3968dc2349602bff8a2 Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Sat, 14 Nov 2020 14:31:32 -0500 Subject: rcu/trace: Add tracing for how segcb list changes This commit adds tracing to track how the segcb list changes before/after acceleration, during queuing and during dequeuing. This tracing helped discover an optimization that avoided needless GP requests when no callbacks were accelerated. The tracing overhead is minimal as each segment's length is now stored in the respective segment. Reviewed-by: Frederic Weisbecker Reviewed-by: Neeraj Upadhyay Signed-off-by: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney --- include/trace/events/rcu.h | 26 ++++++++++++++++++++++++++ kernel/rcu/tree.c | 9 +++++++++ 2 files changed, 35 insertions(+) (limited to 'kernel') diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h index 155b5cb43cfd..5fc29400e1a2 100644 --- a/include/trace/events/rcu.h +++ b/include/trace/events/rcu.h @@ -505,6 +505,32 @@ TRACE_EVENT_RCU(rcu_callback, __entry->qlen) ); +TRACE_EVENT_RCU(rcu_segcb_stats, + + TP_PROTO(struct rcu_segcblist *rs, const char *ctx), + + TP_ARGS(rs, ctx), + + TP_STRUCT__entry( + __field(const char *, ctx) + __array(unsigned long, gp_seq, RCU_CBLIST_NSEGS) + __array(long, seglen, RCU_CBLIST_NSEGS) + ), + + TP_fast_assign( + __entry->ctx = ctx; + memcpy(__entry->seglen, rs->seglen, RCU_CBLIST_NSEGS * sizeof(long)); + memcpy(__entry->gp_seq, rs->gp_seq, RCU_CBLIST_NSEGS * sizeof(unsigned long)); + + ), + + TP_printk("%s seglen: (DONE=%ld, WAIT=%ld, NEXT_READY=%ld, NEXT=%ld) " + "gp_seq: (DONE=%lu, WAIT=%lu, NEXT_READY=%lu, NEXT=%lu)", __entry->ctx, + __entry->seglen[0], __entry->seglen[1], __entry->seglen[2], __entry->seglen[3], + __entry->gp_seq[0], __entry->gp_seq[1], __entry->gp_seq[2], __entry->gp_seq[3]) + +); + /* * Tracepoint for the registration of a single RCU callback of the special * kvfree() form. The first argument is the RCU type, the second argument diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index b0fb654cba80..6bf269c91393 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1495,6 +1495,8 @@ static bool rcu_accelerate_cbs(struct rcu_node *rnp, struct rcu_data *rdp) if (!rcu_segcblist_pend_cbs(&rdp->cblist)) return false; + trace_rcu_segcb_stats(&rdp->cblist, TPS("SegCbPreAcc")); + /* * Callbacks are often registered with incomplete grace-period * information. 
Something about the fact that getting exact @@ -1515,6 +1517,8 @@ static bool rcu_accelerate_cbs(struct rcu_node *rnp, struct rcu_data *rdp) else trace_rcu_grace_period(rcu_state.name, gp_seq_req, TPS("AccReadyCB")); + trace_rcu_segcb_stats(&rdp->cblist, TPS("SegCbPostAcc")); + return ret; } @@ -2471,11 +2475,14 @@ static void rcu_do_batch(struct rcu_data *rdp) rcu_segcblist_extract_done_cbs(&rdp->cblist, &rcl); if (offloaded) rdp->qlen_last_fqs_check = rcu_segcblist_n_cbs(&rdp->cblist); + + trace_rcu_segcb_stats(&rdp->cblist, TPS("SegCbDequeued")); rcu_nocb_unlock_irqrestore(rdp, flags); /* Invoke callbacks. */ tick_dep_set_task(current, TICK_DEP_BIT_RCU); rhp = rcu_cblist_dequeue(&rcl); + for (; rhp; rhp = rcu_cblist_dequeue(&rcl)) { rcu_callback_t f; @@ -2987,6 +2994,8 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func) trace_rcu_callback(rcu_state.name, head, rcu_segcblist_n_cbs(&rdp->cblist)); + trace_rcu_segcb_stats(&rdp->cblist, TPS("SegCBQueued")); + /* Go handle any RCU core processing required. */ if (unlikely(rcu_segcblist_is_offloaded(&rdp->cblist))) { __call_rcu_nocb_wake(rdp, was_alldone, flags); /* unlocks */ -- cgit v1.2.3 From b4e6039e8af8c20dfbbdfcaebfcbd7c9d9ffe713 Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Wed, 18 Nov 2020 11:15:41 -0500 Subject: rcu/segcblist: Add debug checks for segment lengths This commit adds debug checks near the end of rcu_do_batch() that emit warnings if an empty rcu_segcblist structure has non-zero segment counts, or, conversely, if a non-empty structure has all-zero segment counts. Signed-off-by: Joel Fernandes (Google) [ paulmck: Fix queue/segment-length checks. ] Signed-off-by: Paul E. McKenney --- kernel/rcu/rcu_segcblist.c | 12 ++++++++++++ kernel/rcu/rcu_segcblist.h | 3 +++ kernel/rcu/tree.c | 8 ++++++-- 3 files changed, 21 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c index 1e80a0a9036a..89e0dff890bf 100644 --- a/kernel/rcu/rcu_segcblist.c +++ b/kernel/rcu/rcu_segcblist.c @@ -94,6 +94,18 @@ static long rcu_segcblist_get_seglen(struct rcu_segcblist *rsclp, int seg) return READ_ONCE(rsclp->seglen[seg]); } +/* Return number of callbacks in segmented callback list by summing seglen. */ +long rcu_segcblist_n_segment_cbs(struct rcu_segcblist *rsclp) +{ + long len = 0; + int i; + + for (i = RCU_DONE_TAIL; i < RCU_CBLIST_NSEGS; i++) + len += rcu_segcblist_get_seglen(rsclp, i); + + return len; +} + /* Set the length of a segment of the rcu_segcblist structure. */ static void rcu_segcblist_set_seglen(struct rcu_segcblist *rsclp, int seg, long v) { diff --git a/kernel/rcu/rcu_segcblist.h b/kernel/rcu/rcu_segcblist.h index cd35c9faaf51..18e101de8747 100644 --- a/kernel/rcu/rcu_segcblist.h +++ b/kernel/rcu/rcu_segcblist.h @@ -15,6 +15,9 @@ static inline long rcu_cblist_n_cbs(struct rcu_cblist *rclp) return READ_ONCE(rclp->len); } +/* Return number of callbacks in segmented callback list by summing seglen. 
*/ +long rcu_segcblist_n_segment_cbs(struct rcu_segcblist *rsclp); + void rcu_cblist_init(struct rcu_cblist *rclp); void rcu_cblist_enqueue(struct rcu_cblist *rclp, struct rcu_head *rhp); void rcu_cblist_flush_enqueue(struct rcu_cblist *drclp, diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 6bf269c91393..8086c0467c15 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2434,6 +2434,7 @@ int rcutree_dead_cpu(unsigned int cpu) static void rcu_do_batch(struct rcu_data *rdp) { int div; + bool __maybe_unused empty; unsigned long flags; const bool offloaded = rcu_segcblist_is_offloaded(&rdp->cblist); struct rcu_head *rhp; @@ -2548,9 +2549,12 @@ static void rcu_do_batch(struct rcu_data *rdp) * The following usually indicates a double call_rcu(). To track * this down, try building with CONFIG_DEBUG_OBJECTS_RCU_HEAD=y. */ - WARN_ON_ONCE(count == 0 && !rcu_segcblist_empty(&rdp->cblist)); + empty = rcu_segcblist_empty(&rdp->cblist); + WARN_ON_ONCE(count == 0 && !empty); WARN_ON_ONCE(!IS_ENABLED(CONFIG_RCU_NOCB_CPU) && - count != 0 && rcu_segcblist_empty(&rdp->cblist)); + count != 0 && empty); + WARN_ON_ONCE(count == 0 && rcu_segcblist_n_segment_cbs(&rdp->cblist) != 0); + WARN_ON_ONCE(!empty && rcu_segcblist_n_segment_cbs(&rdp->cblist) == 0); rcu_nocb_unlock_irqrestore(rdp, flags); -- cgit v1.2.3 From 65e560327fe68153a9ad7452d5fd3171a1927d33 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 13 Nov 2020 13:13:16 +0100 Subject: rcu/nocb: Turn enabled/offload states into a common flag This commit gathers the rcu_segcblist ->enabled and ->offloaded property field into a single ->flags bitmask to avoid further proliferation of individual u8 fields in the structure. This change prepares for the state formerly known as ->offloaded state to be modified at runtime. Cc: Josh Triplett Cc: Steven Rostedt Cc: Mathieu Desnoyers Cc: Lai Jiangshan Cc: Joel Fernandes Cc: Neeraj Upadhyay Cc: Thomas Gleixner Inspired-by: Paul E. McKenney Tested-by: Boqun Feng Signed-off-by: Frederic Weisbecker Signed-off-by: Paul E. 
McKenney --- include/linux/rcu_segcblist.h | 6 ++++-- kernel/rcu/rcu_segcblist.c | 6 +++--- kernel/rcu/rcu_segcblist.h | 23 +++++++++++++++++++++-- 3 files changed, 28 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/include/linux/rcu_segcblist.h b/include/linux/rcu_segcblist.h index 6c01f09a6456..4714b0263c76 100644 --- a/include/linux/rcu_segcblist.h +++ b/include/linux/rcu_segcblist.h @@ -63,6 +63,9 @@ struct rcu_cblist { #define RCU_NEXT_TAIL 3 #define RCU_CBLIST_NSEGS 4 +#define SEGCBLIST_ENABLED BIT(0) +#define SEGCBLIST_OFFLOADED BIT(1) + struct rcu_segcblist { struct rcu_head *head; struct rcu_head **tails[RCU_CBLIST_NSEGS]; @@ -73,8 +76,7 @@ struct rcu_segcblist { long len; #endif long seglen[RCU_CBLIST_NSEGS]; - u8 enabled; - u8 offloaded; + u8 flags; }; #define RCU_SEGCBLIST_INITIALIZER(n) \ diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c index 89e0dff890bf..934945d72ca5 100644 --- a/kernel/rcu/rcu_segcblist.c +++ b/kernel/rcu/rcu_segcblist.c @@ -246,7 +246,7 @@ void rcu_segcblist_init(struct rcu_segcblist *rsclp) rcu_segcblist_set_seglen(rsclp, i, 0); } rcu_segcblist_set_len(rsclp, 0); - rsclp->enabled = 1; + rcu_segcblist_set_flags(rsclp, SEGCBLIST_ENABLED); } /* @@ -257,7 +257,7 @@ void rcu_segcblist_disable(struct rcu_segcblist *rsclp) { WARN_ON_ONCE(!rcu_segcblist_empty(rsclp)); WARN_ON_ONCE(rcu_segcblist_n_cbs(rsclp)); - rsclp->enabled = 0; + rcu_segcblist_clear_flags(rsclp, SEGCBLIST_ENABLED); } /* @@ -266,7 +266,7 @@ void rcu_segcblist_disable(struct rcu_segcblist *rsclp) */ void rcu_segcblist_offload(struct rcu_segcblist *rsclp) { - rsclp->offloaded = 1; + rcu_segcblist_set_flags(rsclp, SEGCBLIST_OFFLOADED); } /* diff --git a/kernel/rcu/rcu_segcblist.h b/kernel/rcu/rcu_segcblist.h index 18e101de8747..ff372db09f8b 100644 --- a/kernel/rcu/rcu_segcblist.h +++ b/kernel/rcu/rcu_segcblist.h @@ -53,19 +53,38 @@ static inline long rcu_segcblist_n_cbs(struct rcu_segcblist *rsclp) #endif } +static inline void rcu_segcblist_set_flags(struct rcu_segcblist *rsclp, + int flags) +{ + rsclp->flags |= flags; +} + +static inline void rcu_segcblist_clear_flags(struct rcu_segcblist *rsclp, + int flags) +{ + rsclp->flags &= ~flags; +} + +static inline bool rcu_segcblist_test_flags(struct rcu_segcblist *rsclp, + int flags) +{ + return READ_ONCE(rsclp->flags) & flags; +} + /* * Is the specified rcu_segcblist enabled, for example, not corresponding * to an offline CPU? */ static inline bool rcu_segcblist_is_enabled(struct rcu_segcblist *rsclp) { - return rsclp->enabled; + return rcu_segcblist_test_flags(rsclp, SEGCBLIST_ENABLED); } /* Is the specified rcu_segcblist offloaded? */ static inline bool rcu_segcblist_is_offloaded(struct rcu_segcblist *rsclp) { - return IS_ENABLED(CONFIG_RCU_NOCB_CPU) && rsclp->offloaded; + return IS_ENABLED(CONFIG_RCU_NOCB_CPU) && + rcu_segcblist_test_flags(rsclp, SEGCBLIST_OFFLOADED); } /* -- cgit v1.2.3 From 8d346d438f93b5344e99d429727ec9c2f392d4ec Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 13 Nov 2020 13:13:17 +0100 Subject: rcu/nocb: Provide basic callback offloading state machine bits Offloading and de-offloading RCU callback processes must be done carefully. There must never be a time at which callback processing is disabled because the task driving the offloading or de-offloading might be preempted or otherwise stalled at that point in time, which would result in OOM due to calbacks piling up indefinitely. 
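A minimal illustrative sketch (not part of the patch; the helper name nocb_cb_may_run is made up here, while rcu_segcblist_test_flags() comes from the previous commit and the SEGCBLIST_* bits are defined in the hunks below): the "never stop invoking callbacks mid-transition" requirement is exactly what per-kthread flag bits can express, since a kthread keeps running as long as either the offloaded bit or its own acknowledgement bit is still set.

static inline bool nocb_cb_may_run(struct rcu_segcblist *rsclp)
{
	/* True while fully offloaded or still winding down de-offloading. */
	return rcu_segcblist_test_flags(rsclp,
			SEGCBLIST_OFFLOADED | SEGCBLIST_KTHREAD_CB);
}

The predicate actually added later in this series, nocb_cb_can_run(), takes the same form, just keyed off rdp->cblist.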
This implies that there will be times during which a given CPU's callbacks might be concurrently invoked by both that CPU's RCU_SOFTIRQ handler (or, equivalently, that CPU's rcuc kthread) and by that CPU's rcuo kthread. This situation could fatally confuse both rcu_barrier() and the CPU-hotplug offlining process, so these must be excluded during any concurrent-callback-invocation period. In addition, during times of concurrent callback invocation, changes to ->cblist must be protected both as needed for RCU_SOFTIRQ and as needed for the rcuo kthread. This commit therefore defines and documents the states for a state machine that coordinates offloading and deoffloading. Cc: Josh Triplett Cc: Steven Rostedt Cc: Mathieu Desnoyers Cc: Lai Jiangshan Cc: Joel Fernandes Cc: Neeraj Upadhyay Cc: Thomas Gleixner Inspired-by: Paul E. McKenney Tested-by: Boqun Feng Signed-off-by: Frederic Weisbecker Signed-off-by: Paul E. McKenney --- include/linux/rcu_segcblist.h | 115 +++++++++++++++++++++++++++++++++++++++++- kernel/rcu/rcu_segcblist.c | 1 + kernel/rcu/rcu_segcblist.h | 12 ++++- kernel/rcu/tree.c | 3 ++ 4 files changed, 128 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/include/linux/rcu_segcblist.h b/include/linux/rcu_segcblist.h index 4714b0263c76..8afe886e85f1 100644 --- a/include/linux/rcu_segcblist.h +++ b/include/linux/rcu_segcblist.h @@ -63,8 +63,121 @@ struct rcu_cblist { #define RCU_NEXT_TAIL 3 #define RCU_CBLIST_NSEGS 4 + +/* + * ==NOCB Offloading state machine== + * + * + * ---------------------------------------------------------------------------- + * | SEGCBLIST_SOFTIRQ_ONLY | + * | | + * | Callbacks processed by rcu_core() from softirqs or local | + * | rcuc kthread, without holding nocb_lock. | + * ---------------------------------------------------------------------------- + * | + * v + * ---------------------------------------------------------------------------- + * | SEGCBLIST_OFFLOADED | + * | | + * | Callbacks processed by rcu_core() from softirqs or local | + * | rcuc kthread, while holding nocb_lock. Waking up CB and GP kthreads, | + * | allowing nocb_timer to be armed. | + * ---------------------------------------------------------------------------- + * | + * v + * ----------------------------------- + * | | + * v v + * --------------------------------------- ----------------------------------| + * | SEGCBLIST_OFFLOADED | | | SEGCBLIST_OFFLOADED | | + * | SEGCBLIST_KTHREAD_CB | | SEGCBLIST_KTHREAD_GP | + * | | | | + * | | | | + * | CB kthread woke up and | | GP kthread woke up and | + * | acknowledged SEGCBLIST_OFFLOADED. | | acknowledged SEGCBLIST_OFFLOADED| + * | Processes callbacks concurrently | | | + * | with rcu_core(), holding | | | + * | nocb_lock. | | | + * --------------------------------------- ----------------------------------- + * | | + * ----------------------------------- + * | + * v + * |--------------------------------------------------------------------------| + * | SEGCBLIST_OFFLOADED | | + * | SEGCBLIST_KTHREAD_CB | | + * | SEGCBLIST_KTHREAD_GP | + * | | + * | Kthreads handle callbacks holding nocb_lock, local rcu_core() stops | + * | handling callbacks. 
| + * ---------------------------------------------------------------------------- + */ + + + +/* + * ==NOCB De-Offloading state machine== + * + * + * |--------------------------------------------------------------------------| + * | SEGCBLIST_OFFLOADED | | + * | SEGCBLIST_KTHREAD_CB | | + * | SEGCBLIST_KTHREAD_GP | + * | | + * | CB/GP kthreads handle callbacks holding nocb_lock, local rcu_core() | + * | ignores callbacks. | + * ---------------------------------------------------------------------------- + * | + * v + * |--------------------------------------------------------------------------| + * | SEGCBLIST_KTHREAD_CB | | + * | SEGCBLIST_KTHREAD_GP | + * | | + * | CB/GP kthreads and local rcu_core() handle callbacks concurrently | + * | holding nocb_lock. Wake up CB and GP kthreads if necessary. | + * ---------------------------------------------------------------------------- + * | + * v + * ----------------------------------- + * | | + * v v + * ---------------------------------------------------------------------------| + * | | + * | SEGCBLIST_KTHREAD_CB | SEGCBLIST_KTHREAD_GP | + * | | | + * | GP kthread woke up and | CB kthread woke up and | + * | acknowledged the fact that | acknowledged the fact that | + * | SEGCBLIST_OFFLOADED got cleared. | SEGCBLIST_OFFLOADED got cleared. | + * | | The CB kthread goes to sleep | + * | The callbacks from the target CPU | until it ever gets re-offloaded. | + * | will be ignored from the GP kthread | | + * | loop. | | + * ---------------------------------------------------------------------------- + * | | + * ----------------------------------- + * | + * v + * ---------------------------------------------------------------------------- + * | 0 | + * | | + * | Callbacks processed by rcu_core() from softirqs or local | + * | rcuc kthread, while holding nocb_lock. Forbid nocb_timer to be armed. | + * | Flush pending nocb_timer. Flush nocb bypass callbacks. | + * ---------------------------------------------------------------------------- + * | + * v + * ---------------------------------------------------------------------------- + * | SEGCBLIST_SOFTIRQ_ONLY | + * | | + * | Callbacks processed by rcu_core() from softirqs or local | + * | rcuc kthread, without holding nocb_lock. | + * ---------------------------------------------------------------------------- + */ #define SEGCBLIST_ENABLED BIT(0) -#define SEGCBLIST_OFFLOADED BIT(1) +#define SEGCBLIST_SOFTIRQ_ONLY BIT(1) +#define SEGCBLIST_KTHREAD_CB BIT(2) +#define SEGCBLIST_KTHREAD_GP BIT(3) +#define SEGCBLIST_OFFLOADED BIT(4) struct rcu_segcblist { struct rcu_head *head; diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c index 934945d72ca5..7fc6362625b2 100644 --- a/kernel/rcu/rcu_segcblist.c +++ b/kernel/rcu/rcu_segcblist.c @@ -266,6 +266,7 @@ void rcu_segcblist_disable(struct rcu_segcblist *rsclp) */ void rcu_segcblist_offload(struct rcu_segcblist *rsclp) { + rcu_segcblist_clear_flags(rsclp, SEGCBLIST_SOFTIRQ_ONLY); rcu_segcblist_set_flags(rsclp, SEGCBLIST_OFFLOADED); } diff --git a/kernel/rcu/rcu_segcblist.h b/kernel/rcu/rcu_segcblist.h index ff372db09f8b..e05952ab9b87 100644 --- a/kernel/rcu/rcu_segcblist.h +++ b/kernel/rcu/rcu_segcblist.h @@ -83,8 +83,16 @@ static inline bool rcu_segcblist_is_enabled(struct rcu_segcblist *rsclp) /* Is the specified rcu_segcblist offloaded? 
*/ static inline bool rcu_segcblist_is_offloaded(struct rcu_segcblist *rsclp) { - return IS_ENABLED(CONFIG_RCU_NOCB_CPU) && - rcu_segcblist_test_flags(rsclp, SEGCBLIST_OFFLOADED); + if (IS_ENABLED(CONFIG_RCU_NOCB_CPU)) { + /* + * Complete de-offloading happens only when SEGCBLIST_SOFTIRQ_ONLY + * is set. + */ + if (!rcu_segcblist_test_flags(rsclp, SEGCBLIST_SOFTIRQ_ONLY)) + return true; + } + + return false; } /* diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 8086c0467c15..7cfc2e8a2500 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -83,6 +83,9 @@ static DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_data, rcu_data) = { .dynticks_nesting = 1, .dynticks_nmi_nesting = DYNTICK_IRQ_NONIDLE, .dynticks = ATOMIC_INIT(RCU_DYNTICK_CTRL_CTR), +#ifdef CONFIG_RCU_NOCB_CPU + .cblist.flags = SEGCBLIST_SOFTIRQ_ONLY, +#endif }; static struct rcu_state rcu_state = { .level = { &rcu_state.node[0] }, -- cgit v1.2.3 From 126d9d49528dae792859e5f11f3b447ce8a9a9b4 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 13 Nov 2020 13:13:18 +0100 Subject: rcu/nocb: Always init segcblist on CPU up How the rdp->cblist enabled state is treated at CPU-hotplug time depends on whether or not that ->cblist is offloaded. 1) Not offloaded: The ->cblist is disabled when the CPU goes down. All its callbacks are migrated and none can to enqueued until after some later CPU-hotplug operation brings the CPU back up. 2) Offloaded: The ->cblist is not disabled on CPU down because the CB/GP kthreads must finish invoking the remaining callbacks. There is thus no need to re-enable it on CPU up. Since the ->cblist offloaded state is set in stone at boot, it cannot change between CPU down and CPU up. So 1) and 2) are symmetrical. However, given runtime toggling of the offloaded state, there are two additional asymmetrical scenarios: 3) The ->cblist is not offloaded when the CPU goes down. The ->cblist is later toggled to offloaded and then the CPU comes back up. 4) The ->cblist is offloaded when the CPU goes down. The ->cblist is later toggled to no longer be offloaded and then the CPU comes back up. Scenario 4) is currently handled correctly. The ->cblist remains enabled on CPU down and gets re-initialized on CPU up. The toggling operation will wait until ->cblist is empty, so ->cblist will remain empty until CPU-up time. The scenario 3) would run into trouble though, as the rdp is disabled on CPU down and not re-initialized/re-enabled on CPU up. Except that in this case, ->cblist is guaranteed to be empty because all its callbacks were migrated away at CPU-down time. And the CPU-up code already initializes and enables any empty ->cblist structures in order to handle the possibility of early-boot invocations of call_rcu() in the case where such invocations don't occur. So all that need be done is to adjust the locking. Cc: Josh Triplett Cc: Steven Rostedt Cc: Mathieu Desnoyers Cc: Lai Jiangshan Cc: Joel Fernandes Cc: Neeraj Upadhyay Cc: Thomas Gleixner Inspired-by: Paul E. McKenney Tested-by: Boqun Feng Signed-off-by: Frederic Weisbecker Signed-off-by: Paul E. 
McKenney --- kernel/rcu/tree.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 7cfc2e8a2500..83362f6f1119 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -4015,12 +4015,18 @@ int rcutree_prepare_cpu(unsigned int cpu) rdp->qlen_last_fqs_check = 0; rdp->n_force_qs_snap = rcu_state.n_force_qs; rdp->blimit = blimit; - if (rcu_segcblist_empty(&rdp->cblist) && /* No early-boot CBs? */ - !rcu_segcblist_is_offloaded(&rdp->cblist)) - rcu_segcblist_init(&rdp->cblist); /* Re-enable callbacks. */ rdp->dynticks_nesting = 1; /* CPU not up, no tearing. */ rcu_dynticks_eqs_online(); raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */ + /* + * Lock in case the CB/GP kthreads are still around handling + * old callbacks (longer term we should flush all callbacks + * before completing CPU offline) + */ + rcu_nocb_lock(rdp); + if (rcu_segcblist_empty(&rdp->cblist)) /* No early-boot CBs? */ + rcu_segcblist_init(&rdp->cblist); /* Re-enable callbacks. */ + rcu_nocb_unlock(rdp); /* * Add CPU to leaf rcu_node pending-online bitmask. Any needed -- cgit v1.2.3 From d97b078182406c0bd0aacd36fc0a693e118e608f Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 13 Nov 2020 13:13:19 +0100 Subject: rcu/nocb: De-offloading CB kthread To de-offload callback processing back onto a CPU, it is necessary to clear SEGCBLIST_OFFLOAD and notify the nocb CB kthread, which will then clear its own bit flag and go to sleep to stop handling callbacks. This commit makes that change. It will also be necessary to notify the nocb GP kthread in this same way, which is the subject of a follow-on commit. Cc: Josh Triplett Cc: Steven Rostedt Cc: Mathieu Desnoyers Cc: Lai Jiangshan Cc: Joel Fernandes Cc: Neeraj Upadhyay Cc: Thomas Gleixner Inspired-by: Paul E. McKenney Tested-by: Boqun Feng Signed-off-by: Frederic Weisbecker [ paulmck: Add export per kernel test robot feedback. ] Signed-off-by: Paul E. McKenney --- include/linux/rcupdate.h | 2 + kernel/rcu/rcu_segcblist.c | 10 ++-- kernel/rcu/rcu_segcblist.h | 2 +- kernel/rcu/tree.h | 1 + kernel/rcu/tree_plugin.h | 130 ++++++++++++++++++++++++++++++++++++++------- 5 files changed, 123 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index de0826411311..40266eb418b6 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -104,8 +104,10 @@ static inline void rcu_user_exit(void) { } #ifdef CONFIG_RCU_NOCB_CPU void rcu_init_nohz(void); +int rcu_nocb_cpu_deoffload(int cpu); #else /* #ifdef CONFIG_RCU_NOCB_CPU */ static inline void rcu_init_nohz(void) { } +static inline int rcu_nocb_cpu_deoffload(int cpu) { return 0; } #endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */ /** diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c index 7fc6362625b2..7f181c9675f7 100644 --- a/kernel/rcu/rcu_segcblist.c +++ b/kernel/rcu/rcu_segcblist.c @@ -264,10 +264,14 @@ void rcu_segcblist_disable(struct rcu_segcblist *rsclp) * Mark the specified rcu_segcblist structure as offloaded. This * structure must be empty. 
*/ -void rcu_segcblist_offload(struct rcu_segcblist *rsclp) +void rcu_segcblist_offload(struct rcu_segcblist *rsclp, bool offload) { - rcu_segcblist_clear_flags(rsclp, SEGCBLIST_SOFTIRQ_ONLY); - rcu_segcblist_set_flags(rsclp, SEGCBLIST_OFFLOADED); + if (offload) { + rcu_segcblist_clear_flags(rsclp, SEGCBLIST_SOFTIRQ_ONLY); + rcu_segcblist_set_flags(rsclp, SEGCBLIST_OFFLOADED); + } else { + rcu_segcblist_clear_flags(rsclp, SEGCBLIST_OFFLOADED); + } } /* diff --git a/kernel/rcu/rcu_segcblist.h b/kernel/rcu/rcu_segcblist.h index e05952ab9b87..28c9a5225afc 100644 --- a/kernel/rcu/rcu_segcblist.h +++ b/kernel/rcu/rcu_segcblist.h @@ -109,7 +109,7 @@ void rcu_segcblist_inc_len(struct rcu_segcblist *rsclp); void rcu_segcblist_add_len(struct rcu_segcblist *rsclp, long v); void rcu_segcblist_init(struct rcu_segcblist *rsclp); void rcu_segcblist_disable(struct rcu_segcblist *rsclp); -void rcu_segcblist_offload(struct rcu_segcblist *rsclp); +void rcu_segcblist_offload(struct rcu_segcblist *rsclp, bool offload); bool rcu_segcblist_ready_cbs(struct rcu_segcblist *rsclp); bool rcu_segcblist_pend_cbs(struct rcu_segcblist *rsclp); struct rcu_head *rcu_segcblist_first_cb(struct rcu_segcblist *rsclp); diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 7708ed161f4a..e0deb4829847 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -201,6 +201,7 @@ struct rcu_data { /* 5) Callback offloading. */ #ifdef CONFIG_RCU_NOCB_CPU struct swait_queue_head nocb_cb_wq; /* For nocb kthreads to sleep on. */ + struct swait_queue_head nocb_state_wq; /* For offloading state changes */ struct task_struct *nocb_gp_kthread; raw_spinlock_t nocb_lock; /* Guard following pair of fields. */ atomic_t nocb_lock_contended; /* Contention experienced. */ diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 7e291ce0a1d6..1b870d0d2445 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -2081,16 +2081,29 @@ static int rcu_nocb_gp_kthread(void *arg) return 0; } +static inline bool nocb_cb_can_run(struct rcu_data *rdp) +{ + u8 flags = SEGCBLIST_OFFLOADED | SEGCBLIST_KTHREAD_CB; + return rcu_segcblist_test_flags(&rdp->cblist, flags); +} + +static inline bool nocb_cb_wait_cond(struct rcu_data *rdp) +{ + return nocb_cb_can_run(rdp) && !READ_ONCE(rdp->nocb_cb_sleep); +} + /* * Invoke any ready callbacks from the corresponding no-CBs CPU, * then, if there are no more, wait for more to appear. */ static void nocb_cb_wait(struct rcu_data *rdp) { + struct rcu_segcblist *cblist = &rdp->cblist; + struct rcu_node *rnp = rdp->mynode; + bool needwake_state = false; + bool needwake_gp = false; unsigned long cur_gp_seq; unsigned long flags; - bool needwake_gp = false; - struct rcu_node *rnp = rdp->mynode; local_irq_save(flags); rcu_momentary_dyntick_idle(); @@ -2100,32 +2113,50 @@ static void nocb_cb_wait(struct rcu_data *rdp) local_bh_enable(); lockdep_assert_irqs_enabled(); rcu_nocb_lock_irqsave(rdp, flags); - if (rcu_segcblist_nextgp(&rdp->cblist, &cur_gp_seq) && + if (rcu_segcblist_nextgp(cblist, &cur_gp_seq) && rcu_seq_done(&rnp->gp_seq, cur_gp_seq) && raw_spin_trylock_rcu_node(rnp)) { /* irqs already disabled. */ needwake_gp = rcu_advance_cbs(rdp->mynode, rdp); raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. 
*/ } - if (rcu_segcblist_ready_cbs(&rdp->cblist)) { - rcu_nocb_unlock_irqrestore(rdp, flags); - if (needwake_gp) - rcu_gp_kthread_wake(); - return; - } - trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("CBSleep")); WRITE_ONCE(rdp->nocb_cb_sleep, true); + + if (rcu_segcblist_test_flags(cblist, SEGCBLIST_OFFLOADED)) { + if (rcu_segcblist_ready_cbs(cblist)) + WRITE_ONCE(rdp->nocb_cb_sleep, false); + } else { + /* + * De-offloading. Clear our flag and notify the de-offload worker. + * We won't touch the callbacks and keep sleeping until we ever + * get re-offloaded. + */ + WARN_ON_ONCE(!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB)); + rcu_segcblist_clear_flags(cblist, SEGCBLIST_KTHREAD_CB); + if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP)) + needwake_state = true; + } + + if (rdp->nocb_cb_sleep) + trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("CBSleep")); + rcu_nocb_unlock_irqrestore(rdp, flags); if (needwake_gp) rcu_gp_kthread_wake(); - swait_event_interruptible_exclusive(rdp->nocb_cb_wq, - !READ_ONCE(rdp->nocb_cb_sleep)); - if (!smp_load_acquire(&rdp->nocb_cb_sleep)) { /* VVV */ + + if (needwake_state) + swake_up_one(&rdp->nocb_state_wq); + + do { + swait_event_interruptible_exclusive(rdp->nocb_cb_wq, + nocb_cb_wait_cond(rdp)); + /* ^^^ Ensure CB invocation follows _sleep test. */ - return; - } - WARN_ON(signal_pending(current)); - trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WokeEmpty")); + if (smp_load_acquire(&rdp->nocb_cb_sleep)) { + WARN_ON(signal_pending(current)); + trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WokeEmpty")); + } + } while (!nocb_cb_can_run(rdp)); } /* @@ -2187,6 +2218,67 @@ static void do_nocb_deferred_wakeup(struct rcu_data *rdp) do_nocb_deferred_wakeup_common(rdp); } +static int __rcu_nocb_rdp_deoffload(struct rcu_data *rdp) +{ + struct rcu_segcblist *cblist = &rdp->cblist; + bool wake_cb = false; + unsigned long flags; + + printk("De-offloading %d\n", rdp->cpu); + + rcu_nocb_lock_irqsave(rdp, flags); + rcu_segcblist_offload(cblist, false); + + if (rdp->nocb_cb_sleep) { + rdp->nocb_cb_sleep = false; + wake_cb = true; + } + rcu_nocb_unlock_irqrestore(rdp, flags); + + if (wake_cb) + swake_up_one(&rdp->nocb_cb_wq); + + swait_event_exclusive(rdp->nocb_state_wq, + !rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB)); + + return 0; +} + +static long rcu_nocb_rdp_deoffload(void *arg) +{ + struct rcu_data *rdp = arg; + + WARN_ON_ONCE(rdp->cpu != raw_smp_processor_id()); + return __rcu_nocb_rdp_deoffload(rdp); +} + +int rcu_nocb_cpu_deoffload(int cpu) +{ + struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); + int ret = 0; + + if (rdp == rdp->nocb_gp_rdp) { + pr_info("Can't deoffload an rdp GP leader (yet)\n"); + return -EINVAL; + } + mutex_lock(&rcu_state.barrier_mutex); + cpus_read_lock(); + if (rcu_segcblist_is_offloaded(&rdp->cblist)) { + if (cpu_online(cpu)) { + ret = work_on_cpu(cpu, rcu_nocb_rdp_deoffload, rdp); + } else { + ret = __rcu_nocb_rdp_deoffload(rdp); + } + if (!ret) + cpumask_clear_cpu(cpu, rcu_nocb_mask); + } + cpus_read_unlock(); + mutex_unlock(&rcu_state.barrier_mutex); + + return ret; +} +EXPORT_SYMBOL_GPL(rcu_nocb_cpu_deoffload); + void __init rcu_init_nohz(void) { int cpu; @@ -2229,7 +2321,8 @@ void __init rcu_init_nohz(void) rdp = per_cpu_ptr(&rcu_data, cpu); if (rcu_segcblist_empty(&rdp->cblist)) rcu_segcblist_init(&rdp->cblist); - rcu_segcblist_offload(&rdp->cblist); + rcu_segcblist_offload(&rdp->cblist, true); + rcu_segcblist_set_flags(&rdp->cblist, SEGCBLIST_KTHREAD_CB); } rcu_organize_nocb_kthreads(); } 
@@ -2239,6 +2332,7 @@ static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp) { init_swait_queue_head(&rdp->nocb_cb_wq); init_swait_queue_head(&rdp->nocb_gp_wq); + init_swait_queue_head(&rdp->nocb_state_wq); raw_spin_lock_init(&rdp->nocb_lock); raw_spin_lock_init(&rdp->nocb_bypass_lock); raw_spin_lock_init(&rdp->nocb_gp_lock); -- cgit v1.2.3 From ef005345e6e49859e225f549c88c985e79477bb9 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 13 Nov 2020 13:13:20 +0100 Subject: rcu/nocb: Don't deoffload an offline CPU with pending work Offloaded CPUs do not migrate their callbacks, instead relying on their rcuo kthread to invoke them. But if the CPU is offline, it will be running neither its RCU_SOFTIRQ handler nor its rcuc kthread. This means that de-offloading an offline CPU that still has pending callbacks will strand those callbacks. This commit therefore refuses to toggle offline CPUs having pending callbacks. Cc: Josh Triplett Cc: Steven Rostedt Cc: Mathieu Desnoyers Cc: Lai Jiangshan Cc: Joel Fernandes Cc: Neeraj Upadhyay Cc: Thomas Gleixner Suggested-by: Paul E. McKenney Tested-by: Boqun Feng Signed-off-by: Frederic Weisbecker Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 1b870d0d2445..b70cc91a7831 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -2227,6 +2227,15 @@ static int __rcu_nocb_rdp_deoffload(struct rcu_data *rdp) printk("De-offloading %d\n", rdp->cpu); rcu_nocb_lock_irqsave(rdp, flags); + /* + * If there are still pending work offloaded, the offline + * CPU won't help much handling them. + */ + if (cpu_is_offline(rdp->cpu) && !rcu_segcblist_empty(&rdp->cblist)) { + rcu_nocb_unlock_irqrestore(rdp, flags); + return -EBUSY; + } + rcu_segcblist_offload(cblist, false); if (rdp->nocb_cb_sleep) { -- cgit v1.2.3 From 5bb39dc956f3d4f1bb75b5962b503426c45340ae Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 13 Nov 2020 13:13:21 +0100 Subject: rcu/nocb: De-offloading GP kthread To de-offload callback processing back onto a CPU, it is necessary to clear SEGCBLIST_OFFLOAD and notify the nocb GP kthread, which will then clear its own bit flag and ignore this CPU until further notice. Whichever of the nocb CB and nocb GP kthreads is last to clear its own bit notifies the de-offloading worker kthread. Once notified, this worker kthread can proceed safe in the knowledge that the nocb CB and GP kthreads will no longer be manipulating this CPU's RCU callback list. This commit makes this change. Cc: Josh Triplett Cc: Steven Rostedt Cc: Mathieu Desnoyers Cc: Lai Jiangshan Cc: Joel Fernandes Cc: Neeraj Upadhyay Cc: Thomas Gleixner Inspired-by: Paul E. McKenney Tested-by: Boqun Feng Signed-off-by: Frederic Weisbecker Signed-off-by: Paul E. 
McKenney --- kernel/rcu/tree_plugin.h | 54 +++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 51 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index b70cc91a7831..fe46e7008667 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1928,6 +1928,33 @@ static void do_nocb_bypass_wakeup_timer(struct timer_list *t) __call_rcu_nocb_wake(rdp, true, flags); } +static inline bool nocb_gp_enabled_cb(struct rcu_data *rdp) +{ + u8 flags = SEGCBLIST_OFFLOADED | SEGCBLIST_KTHREAD_GP; + + return rcu_segcblist_test_flags(&rdp->cblist, flags); +} + +static inline bool nocb_gp_update_state(struct rcu_data *rdp, bool *needwake_state) +{ + struct rcu_segcblist *cblist = &rdp->cblist; + + if (rcu_segcblist_test_flags(cblist, SEGCBLIST_OFFLOADED)) { + return true; + } else { + /* + * De-offloading. Clear our flag and notify the de-offload worker. + * We will ignore this rdp until it ever gets re-offloaded. + */ + WARN_ON_ONCE(!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP)); + rcu_segcblist_clear_flags(cblist, SEGCBLIST_KTHREAD_GP); + if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB)) + *needwake_state = true; + return false; + } +} + + /* * No-CBs GP kthreads come here to wait for additional callbacks to show up * or for grace periods to end. @@ -1956,8 +1983,17 @@ static void nocb_gp_wait(struct rcu_data *my_rdp) */ WARN_ON_ONCE(my_rdp->nocb_gp_rdp != my_rdp); for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_cb_rdp) { + bool needwake_state = false; + if (!nocb_gp_enabled_cb(rdp)) + continue; trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("Check")); rcu_nocb_lock_irqsave(rdp, flags); + if (!nocb_gp_update_state(rdp, &needwake_state)) { + rcu_nocb_unlock_irqrestore(rdp, flags); + if (needwake_state) + swake_up_one(&rdp->nocb_state_wq); + continue; + } bypass_ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass); if (bypass_ncbs && (time_after(j, READ_ONCE(rdp->nocb_bypass_first) + 1) || @@ -2221,7 +2257,8 @@ static void do_nocb_deferred_wakeup(struct rcu_data *rdp) static int __rcu_nocb_rdp_deoffload(struct rcu_data *rdp) { struct rcu_segcblist *cblist = &rdp->cblist; - bool wake_cb = false; + struct rcu_data *rdp_gp = rdp->nocb_gp_rdp; + bool wake_cb = false, wake_gp = false; unsigned long flags; printk("De-offloading %d\n", rdp->cpu); @@ -2247,9 +2284,19 @@ static int __rcu_nocb_rdp_deoffload(struct rcu_data *rdp) if (wake_cb) swake_up_one(&rdp->nocb_cb_wq); - swait_event_exclusive(rdp->nocb_state_wq, - !rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB)); + raw_spin_lock_irqsave(&rdp_gp->nocb_gp_lock, flags); + if (rdp_gp->nocb_gp_sleep) { + rdp_gp->nocb_gp_sleep = false; + wake_gp = true; + } + raw_spin_unlock_irqrestore(&rdp_gp->nocb_gp_lock, flags); + if (wake_gp) + wake_up_process(rdp_gp->nocb_gp_kthread); + + swait_event_exclusive(rdp->nocb_state_wq, + !rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB | + SEGCBLIST_KTHREAD_GP)); return 0; } @@ -2332,6 +2379,7 @@ void __init rcu_init_nohz(void) rcu_segcblist_init(&rdp->cblist); rcu_segcblist_offload(&rdp->cblist, true); rcu_segcblist_set_flags(&rdp->cblist, SEGCBLIST_KTHREAD_CB); + rcu_segcblist_set_flags(&rdp->cblist, SEGCBLIST_KTHREAD_GP); } rcu_organize_nocb_kthreads(); } -- cgit v1.2.3 From 254e11efde66ca0a0ce0c99a62c377314b5984ff Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 13 Nov 2020 13:13:22 +0100 Subject: rcu/nocb: Re-offload support To re-offload the callback processing off of a CPU, it is necessary to clear 
SEGCBLIST_SOFTIRQ_ONLY, set SEGCBLIST_OFFLOADED, and then notify both the CB and GP kthreads so that they both set their own bit flag and start processing the callbacks remotely. The re-offloading worker is then notified that it can stop the RCU_SOFTIRQ handler (or rcuc kthread, as the case may be) from processing the callbacks locally. Ordering must be carefully enforced so that the callbacks that used to be processed locally without locking will have the same ordering properties when they are invoked by the nocb CB and GP kthreads. This commit makes this change. Cc: Josh Triplett Cc: Steven Rostedt Cc: Mathieu Desnoyers Cc: Lai Jiangshan Cc: Joel Fernandes Cc: Neeraj Upadhyay Cc: Thomas Gleixner Inspired-by: Paul E. McKenney Tested-by: Boqun Feng Signed-off-by: Frederic Weisbecker [ paulmck: Export rcu_nocb_cpu_offload(). ] Signed-off-by: Paul E. McKenney --- include/linux/rcupdate.h | 2 + kernel/rcu/tree_plugin.h | 158 ++++++++++++++++++++++++++++++++++++++++------- 2 files changed, 138 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 40266eb418b6..e0ee52e2756d 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -104,9 +104,11 @@ static inline void rcu_user_exit(void) { } #ifdef CONFIG_RCU_NOCB_CPU void rcu_init_nohz(void); +int rcu_nocb_cpu_offload(int cpu); int rcu_nocb_cpu_deoffload(int cpu); #else /* #ifdef CONFIG_RCU_NOCB_CPU */ static inline void rcu_init_nohz(void) { } +static inline int rcu_nocb_cpu_offload(int cpu) { return -EINVAL; } static inline int rcu_nocb_cpu_deoffload(int cpu) { return 0; } #endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */ diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index fe46e7008667..03ae1ce4790f 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1928,6 +1928,20 @@ static void do_nocb_bypass_wakeup_timer(struct timer_list *t) __call_rcu_nocb_wake(rdp, true, flags); } +/* + * Check if we ignore this rdp. + * + * We check that without holding the nocb lock but + * we make sure not to miss a freshly offloaded rdp + * with the current ordering: + * + * rdp_offload_toggle() nocb_gp_enabled_cb() + * ------------------------- ---------------------------- + * WRITE flags LOCK nocb_gp_lock + * LOCK nocb_gp_lock READ/WRITE nocb_gp_sleep + * READ/WRITE nocb_gp_sleep UNLOCK nocb_gp_lock + * UNLOCK nocb_gp_lock READ flags + */ static inline bool nocb_gp_enabled_cb(struct rcu_data *rdp) { u8 flags = SEGCBLIST_OFFLOADED | SEGCBLIST_KTHREAD_GP; @@ -1940,6 +1954,11 @@ static inline bool nocb_gp_update_state(struct rcu_data *rdp, bool *needwake_sta struct rcu_segcblist *cblist = &rdp->cblist; if (rcu_segcblist_test_flags(cblist, SEGCBLIST_OFFLOADED)) { + if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP)) { + rcu_segcblist_set_flags(cblist, SEGCBLIST_KTHREAD_GP); + if (rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB)) + *needwake_state = true; + } return true; } else { /* @@ -2003,6 +2022,8 @@ static void nocb_gp_wait(struct rcu_data *my_rdp) bypass_ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass); } else if (!bypass_ncbs && rcu_segcblist_empty(&rdp->cblist)) { rcu_nocb_unlock_irqrestore(rdp, flags); + if (needwake_state) + swake_up_one(&rdp->nocb_state_wq); continue; /* No callbacks here, try next. 
*/ } if (bypass_ncbs) { @@ -2054,6 +2075,8 @@ static void nocb_gp_wait(struct rcu_data *my_rdp) } if (needwake_gp) rcu_gp_kthread_wake(); + if (needwake_state) + swake_up_one(&rdp->nocb_state_wq); } my_rdp->nocb_gp_bypass = bypass; @@ -2159,6 +2182,11 @@ static void nocb_cb_wait(struct rcu_data *rdp) WRITE_ONCE(rdp->nocb_cb_sleep, true); if (rcu_segcblist_test_flags(cblist, SEGCBLIST_OFFLOADED)) { + if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB)) { + rcu_segcblist_set_flags(cblist, SEGCBLIST_KTHREAD_CB); + if (rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP)) + needwake_state = true; + } if (rcu_segcblist_ready_cbs(cblist)) WRITE_ONCE(rdp->nocb_cb_sleep, false); } else { @@ -2254,35 +2282,25 @@ static void do_nocb_deferred_wakeup(struct rcu_data *rdp) do_nocb_deferred_wakeup_common(rdp); } -static int __rcu_nocb_rdp_deoffload(struct rcu_data *rdp) +static int rdp_offload_toggle(struct rcu_data *rdp, + bool offload, unsigned long flags) + __releases(rdp->nocb_lock) { struct rcu_segcblist *cblist = &rdp->cblist; struct rcu_data *rdp_gp = rdp->nocb_gp_rdp; - bool wake_cb = false, wake_gp = false; - unsigned long flags; - - printk("De-offloading %d\n", rdp->cpu); - - rcu_nocb_lock_irqsave(rdp, flags); - /* - * If there are still pending work offloaded, the offline - * CPU won't help much handling them. - */ - if (cpu_is_offline(rdp->cpu) && !rcu_segcblist_empty(&rdp->cblist)) { - rcu_nocb_unlock_irqrestore(rdp, flags); - return -EBUSY; - } + bool wake_gp = false; - rcu_segcblist_offload(cblist, false); + rcu_segcblist_offload(cblist, offload); - if (rdp->nocb_cb_sleep) { + if (rdp->nocb_cb_sleep) rdp->nocb_cb_sleep = false; - wake_cb = true; - } rcu_nocb_unlock_irqrestore(rdp, flags); - if (wake_cb) - swake_up_one(&rdp->nocb_cb_wq); + /* + * Ignore former value of nocb_cb_sleep and force wake up as it could + * have been spuriously set to false already. + */ + swake_up_one(&rdp->nocb_cb_wq); raw_spin_lock_irqsave(&rdp_gp->nocb_gp_lock, flags); if (rdp_gp->nocb_gp_sleep) { @@ -2294,10 +2312,32 @@ static int __rcu_nocb_rdp_deoffload(struct rcu_data *rdp) if (wake_gp) wake_up_process(rdp_gp->nocb_gp_kthread); + return 0; +} + +static int __rcu_nocb_rdp_deoffload(struct rcu_data *rdp) +{ + struct rcu_segcblist *cblist = &rdp->cblist; + unsigned long flags; + int ret; + + printk("De-offloading %d\n", rdp->cpu); + + rcu_nocb_lock_irqsave(rdp, flags); + /* + * If there are still pending work offloaded, the offline + * CPU won't help much handling them. + */ + if (cpu_is_offline(rdp->cpu) && !rcu_segcblist_empty(&rdp->cblist)) { + rcu_nocb_unlock_irqrestore(rdp, flags); + return -EBUSY; + } + + ret = rdp_offload_toggle(rdp, false, flags); swait_event_exclusive(rdp->nocb_state_wq, !rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB | SEGCBLIST_KTHREAD_GP)); - return 0; + return ret; } static long rcu_nocb_rdp_deoffload(void *arg) @@ -2335,6 +2375,80 @@ int rcu_nocb_cpu_deoffload(int cpu) } EXPORT_SYMBOL_GPL(rcu_nocb_cpu_deoffload); +static int __rcu_nocb_rdp_offload(struct rcu_data *rdp) +{ + struct rcu_segcblist *cblist = &rdp->cblist; + unsigned long flags; + int ret; + + /* + * For now we only support re-offload, ie: the rdp must have been + * offloaded on boot first. + */ + if (!rdp->nocb_gp_rdp) + return -EINVAL; + + printk("Offloading %d\n", rdp->cpu); + /* + * Can't use rcu_nocb_lock_irqsave() while we are in + * SEGCBLIST_SOFTIRQ_ONLY mode. 
+ */ + raw_spin_lock_irqsave(&rdp->nocb_lock, flags); + /* + * We didn't take the nocb lock while working on the + * rdp->cblist in SEGCBLIST_SOFTIRQ_ONLY mode. + * Every modifications that have been done previously on + * rdp->cblist must be visible remotely by the nocb kthreads + * upon wake up after reading the cblist flags. + * + * The layout against nocb_lock enforces that ordering: + * + * __rcu_nocb_rdp_offload() nocb_cb_wait()/nocb_gp_wait() + * ------------------------- ---------------------------- + * WRITE callbacks rcu_nocb_lock() + * rcu_nocb_lock() READ flags + * WRITE flags READ callbacks + * rcu_nocb_unlock() rcu_nocb_unlock() + */ + ret = rdp_offload_toggle(rdp, true, flags); + swait_event_exclusive(rdp->nocb_state_wq, + rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB) && + rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP)); + + return ret; +} + +static long rcu_nocb_rdp_offload(void *arg) +{ + struct rcu_data *rdp = arg; + + WARN_ON_ONCE(rdp->cpu != raw_smp_processor_id()); + return __rcu_nocb_rdp_offload(rdp); +} + +int rcu_nocb_cpu_offload(int cpu) +{ + struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); + int ret = 0; + + mutex_lock(&rcu_state.barrier_mutex); + cpus_read_lock(); + if (!rcu_segcblist_is_offloaded(&rdp->cblist)) { + if (cpu_online(cpu)) { + ret = work_on_cpu(cpu, rcu_nocb_rdp_offload, rdp); + } else { + ret = __rcu_nocb_rdp_offload(rdp); + } + if (!ret) + cpumask_set_cpu(cpu, rcu_nocb_mask); + } + cpus_read_unlock(); + mutex_unlock(&rcu_state.barrier_mutex); + + return ret; +} +EXPORT_SYMBOL_GPL(rcu_nocb_cpu_offload); + void __init rcu_init_nohz(void) { int cpu; -- cgit v1.2.3 From 69cdea873cde261586a2cae2440178df1a313bbe Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 13 Nov 2020 13:13:23 +0100 Subject: rcu/nocb: Shutdown nocb timer on de-offloading This commit ensures that the nocb timer is shut down before reaching the final de-offloaded state. The key goal is to prevent the timer handler from manipulating the callbacks without the protection of the nocb locks. Cc: Josh Triplett Cc: Steven Rostedt Cc: Mathieu Desnoyers Cc: Lai Jiangshan Cc: Joel Fernandes Cc: Neeraj Upadhyay Cc: Thomas Gleixner Inspired-by: Paul E. McKenney Tested-by: Boqun Feng Signed-off-by: Frederic Weisbecker Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.h | 1 + kernel/rcu/tree_plugin.h | 12 +++++++++++- 2 files changed, 12 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index e0deb4829847..5d359b9f9fec 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -257,6 +257,7 @@ struct rcu_data { }; /* Values for nocb_defer_wakeup field in struct rcu_data. */ +#define RCU_NOCB_WAKE_OFF -1 #define RCU_NOCB_WAKE_NOT 0 #define RCU_NOCB_WAKE 1 #define RCU_NOCB_WAKE_FORCE 2 diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 03ae1ce4790f..c88ad628595b 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1665,6 +1665,8 @@ static void wake_nocb_gp(struct rcu_data *rdp, bool force, static void wake_nocb_gp_defer(struct rcu_data *rdp, int waketype, const char *reason) { + if (rdp->nocb_defer_wakeup == RCU_NOCB_WAKE_OFF) + return; if (rdp->nocb_defer_wakeup == RCU_NOCB_WAKE_NOT) mod_timer(&rdp->nocb_timer, jiffies + 1); if (rdp->nocb_defer_wakeup < waketype) @@ -2243,7 +2245,7 @@ static int rcu_nocb_cb_kthread(void *arg) /* Is a deferred wakeup of rcu_nocb_kthread() required? 
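At this point the series exports both directions of the runtime toggle, rcu_nocb_cpu_offload() and rcu_nocb_cpu_deoffload(). As a rough illustration, a hypothetical out-of-tree test module (not part of these patches) could exercise them as follows, assuming CONFIG_RCU_NOCB_CPU=y and a CPU that was offloaded at boot via rcu_nocbs=, since re-offload is only supported for such CPUs:

#include <linux/module.h>
#include <linux/rcupdate.h>

static int demo_cpu = 1;
module_param(demo_cpu, int, 0444);
MODULE_PARM_DESC(demo_cpu, "CPU whose nocb state is toggled (must be listed in rcu_nocbs=)");

static int __init nocb_toggle_demo_init(void)
{
	int ret;

	ret = rcu_nocb_cpu_deoffload(demo_cpu);  /* back to softirq/rcuc processing */
	pr_info("nocb_toggle_demo: deoffload CPU %d: %d\n", demo_cpu, ret);
	if (ret)
		return 0;

	ret = rcu_nocb_cpu_offload(demo_cpu);    /* hand callbacks back to the rcuo kthreads */
	pr_info("nocb_toggle_demo: re-offload CPU %d: %d\n", demo_cpu, ret);
	return 0;
}

static void __exit nocb_toggle_demo_exit(void)
{
}

module_init(nocb_toggle_demo_init);
module_exit(nocb_toggle_demo_exit);
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Hypothetical demo of runtime rcu_nocb toggling");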
*/ static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp) { - return READ_ONCE(rdp->nocb_defer_wakeup); + return READ_ONCE(rdp->nocb_defer_wakeup) > RCU_NOCB_WAKE_NOT; } /* Do a deferred wakeup of rcu_nocb_kthread(). */ @@ -2337,6 +2339,12 @@ static int __rcu_nocb_rdp_deoffload(struct rcu_data *rdp) swait_event_exclusive(rdp->nocb_state_wq, !rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB | SEGCBLIST_KTHREAD_GP)); + /* Make sure nocb timer won't stay around */ + rcu_nocb_lock_irqsave(rdp, flags); + WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_OFF); + rcu_nocb_unlock_irqrestore(rdp, flags); + del_timer_sync(&rdp->nocb_timer); + return ret; } @@ -2394,6 +2402,8 @@ static int __rcu_nocb_rdp_offload(struct rcu_data *rdp) * SEGCBLIST_SOFTIRQ_ONLY mode. */ raw_spin_lock_irqsave(&rdp->nocb_lock, flags); + /* Re-enable nocb timer */ + WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT); /* * We didn't take the nocb lock while working on the * rdp->cblist in SEGCBLIST_SOFTIRQ_ONLY mode. -- cgit v1.2.3 From 314202f84ddd61e4d7576ef62570ad2e2d9db06b Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 13 Nov 2020 13:13:24 +0100 Subject: rcu/nocb: Flush bypass before setting SEGCBLIST_SOFTIRQ_ONLY This commit flushes the bypass queue and sets state to avoid its being refilled before switching to the final de-offloaded state. To avoid refilling, this commit sets SEGCBLIST_SOFTIRQ_ONLY before re-enabling IRQs. Cc: Josh Triplett Cc: Steven Rostedt Cc: Mathieu Desnoyers Cc: Lai Jiangshan Cc: Joel Fernandes Cc: Neeraj Upadhyay Cc: Thomas Gleixner Inspired-by: Paul E. McKenney Tested-by: Boqun Feng Signed-off-by: Frederic Weisbecker Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index c88ad628595b..35dc9b31e9a4 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -2339,12 +2339,21 @@ static int __rcu_nocb_rdp_deoffload(struct rcu_data *rdp) swait_event_exclusive(rdp->nocb_state_wq, !rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB | SEGCBLIST_KTHREAD_GP)); - /* Make sure nocb timer won't stay around */ rcu_nocb_lock_irqsave(rdp, flags); + /* Make sure nocb timer won't stay around */ WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_OFF); rcu_nocb_unlock_irqrestore(rdp, flags); del_timer_sync(&rdp->nocb_timer); + /* + * Flush bypass. While IRQs are disabled and once we set + * SEGCBLIST_SOFTIRQ_ONLY, no callback is supposed to be + * enqueued on bypass. + */ + rcu_nocb_lock_irqsave(rdp, flags); + rcu_nocb_flush_bypass(rdp, NULL, jiffies); + rcu_nocb_unlock_irqrestore(rdp, flags); + return ret; } -- cgit v1.2.3 From b9ced9e1ab51ed6057ac8198fd1eeb404a32a867 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 13 Nov 2020 13:13:25 +0100 Subject: rcu/nocb: Set SEGCBLIST_SOFTIRQ_ONLY at the very last stage of de-offloading This commit sets SEGCBLIST_SOFTIRQ_ONLY once toggling is otherwise fully complete, allowing further RCU callback manipulation to be carried out locklessly and locally. Cc: Josh Triplett Cc: Steven Rostedt Cc: Mathieu Desnoyers Cc: Lai Jiangshan Cc: Joel Fernandes Cc: Neeraj Upadhyay Cc: Thomas Gleixner Inspired-by: Paul E. McKenney Tested-by: Boqun Feng Signed-off-by: Frederic Weisbecker Signed-off-by: Paul E. 
McKenney --- kernel/rcu/tree_plugin.h | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 35dc9b31e9a4..8641b72f6d0d 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -2352,7 +2352,14 @@ static int __rcu_nocb_rdp_deoffload(struct rcu_data *rdp) */ rcu_nocb_lock_irqsave(rdp, flags); rcu_nocb_flush_bypass(rdp, NULL, jiffies); - rcu_nocb_unlock_irqrestore(rdp, flags); + rcu_segcblist_set_flags(cblist, SEGCBLIST_SOFTIRQ_ONLY); + /* + * With SEGCBLIST_SOFTIRQ_ONLY, we can't use + * rcu_nocb_unlock_irqrestore() anymore. Theoretically we + * could set SEGCBLIST_SOFTIRQ_ONLY with cb unlocked and IRQs + * disabled now, but let's be paranoid. + */ + raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); return ret; } -- cgit v1.2.3 From e3abe959fbd57aa751bc533677a35c411cee9b16 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 13 Nov 2020 13:13:26 +0100 Subject: rcu/nocb: Only cond_resched() from actual offloaded batch processing During a toggle operations, rcu_do_batch() may be invoked concurrently by softirqs and offloaded processing for a given CPU's callbacks. This commit therefore makes sure cond_resched() is invoked only from the offloaded context. Cc: Josh Triplett Cc: Steven Rostedt Cc: Mathieu Desnoyers Cc: Lai Jiangshan Cc: Joel Fernandes Cc: Neeraj Upadhyay Cc: Thomas Gleixner Inspired-by: Paul E. McKenney Tested-by: Boqun Feng Signed-off-by: Frederic Weisbecker Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 83362f6f1119..4ef59a5416a3 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2516,8 +2516,7 @@ static void rcu_do_batch(struct rcu_data *rdp) /* Exceeded the time limit, so leave. */ break; } - if (offloaded) { - WARN_ON_ONCE(in_serving_softirq()); + if (!in_serving_softirq()) { local_bh_enable(); lockdep_assert_irqs_enabled(); cond_resched_tasks_rcu_qs(); -- cgit v1.2.3 From 32aa2f4170d22f0b9fcb75ab05679ab122fae373 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 13 Nov 2020 13:13:27 +0100 Subject: rcu/nocb: Process batch locally as long as offloading isn't complete This commit makes sure to process the callbacks locally (via either RCU_SOFTIRQ or the rcuc kthread) whenever the segcblist isn't entirely offloaded. This ensures that callbacks are invoked one way or another while a CPU is in the middle of a toggle operation. Cc: Josh Triplett Cc: Steven Rostedt Cc: Mathieu Desnoyers Cc: Lai Jiangshan Cc: Joel Fernandes Cc: Neeraj Upadhyay Cc: Thomas Gleixner Inspired-by: Paul E. McKenney Tested-by: Boqun Feng Signed-off-by: Frederic Weisbecker Signed-off-by: Paul E. 
McKenney --- kernel/rcu/rcu_segcblist.h | 12 ++++++++++++ kernel/rcu/tree.c | 3 ++- 2 files changed, 14 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/rcu_segcblist.h b/kernel/rcu/rcu_segcblist.h index 28c9a5225afc..afad6fc6311c 100644 --- a/kernel/rcu/rcu_segcblist.h +++ b/kernel/rcu/rcu_segcblist.h @@ -95,6 +95,18 @@ static inline bool rcu_segcblist_is_offloaded(struct rcu_segcblist *rsclp) return false; } +static inline bool rcu_segcblist_completely_offloaded(struct rcu_segcblist *rsclp) +{ + int flags = SEGCBLIST_KTHREAD_CB | SEGCBLIST_KTHREAD_GP | SEGCBLIST_OFFLOADED; + + if (IS_ENABLED(CONFIG_RCU_NOCB_CPU)) { + if ((rsclp->flags & flags) == flags) + return true; + } + + return false; +} + /* * Are all segments following the specified segment of the specified * rcu_segcblist structure empty of callbacks? (The specified diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 4ef59a5416a3..ec14c017c0e3 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2700,6 +2700,7 @@ static __latent_entropy void rcu_core(void) struct rcu_data *rdp = raw_cpu_ptr(&rcu_data); struct rcu_node *rnp = rdp->mynode; const bool offloaded = rcu_segcblist_is_offloaded(&rdp->cblist); + const bool do_batch = !rcu_segcblist_completely_offloaded(&rdp->cblist); if (cpu_is_offline(smp_processor_id())) return; @@ -2729,7 +2730,7 @@ static __latent_entropy void rcu_core(void) rcu_check_gp_start_stall(rnp, rdp, rcu_jiffies_till_stall_check()); /* If there are callbacks ready, invoke them. */ - if (!offloaded && rcu_segcblist_ready_cbs(&rdp->cblist) && + if (do_batch && rcu_segcblist_ready_cbs(&rdp->cblist) && likely(READ_ONCE(rcu_scheduler_fully_active))) rcu_do_batch(rdp); -- cgit v1.2.3 From 634954c2dbf88e67aa267798f60af6b9a476cf4b Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 13 Nov 2020 13:13:28 +0100 Subject: rcu/nocb: Locally accelerate callbacks as long as offloading isn't complete The local callbacks processing checks if any callbacks need acceleration. This commit carries out this checking under nocb lock protection in the middle of toggle operations, during which time rcu_core() executes concurrently with GP/CB kthreads. Cc: Josh Triplett Cc: Steven Rostedt Cc: Mathieu Desnoyers Cc: Lai Jiangshan Cc: Joel Fernandes Cc: Neeraj Upadhyay Cc: Thomas Gleixner Inspired-by: Paul E. McKenney Tested-by: Boqun Feng Signed-off-by: Frederic Weisbecker Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index ec14c017c0e3..03810a58fdd0 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2699,7 +2699,6 @@ static __latent_entropy void rcu_core(void) unsigned long flags; struct rcu_data *rdp = raw_cpu_ptr(&rcu_data); struct rcu_node *rnp = rdp->mynode; - const bool offloaded = rcu_segcblist_is_offloaded(&rdp->cblist); const bool do_batch = !rcu_segcblist_completely_offloaded(&rdp->cblist); if (cpu_is_offline(smp_processor_id())) @@ -2720,11 +2719,11 @@ static __latent_entropy void rcu_core(void) /* No grace period and unregistered callbacks? 
*/ if (!rcu_gp_in_progress() && - rcu_segcblist_is_enabled(&rdp->cblist) && !offloaded) { - local_irq_save(flags); + rcu_segcblist_is_enabled(&rdp->cblist) && do_batch) { + rcu_nocb_lock_irqsave(rdp, flags); if (!rcu_segcblist_restempty(&rdp->cblist, RCU_NEXT_READY_TAIL)) rcu_accelerate_cbs_unlocked(rnp, rdp); - local_irq_restore(flags); + rcu_nocb_unlock_irqrestore(rdp, flags); } rcu_check_gp_start_stall(rnp, rdp, rcu_jiffies_till_stall_check()); -- cgit v1.2.3 From 43759fe5a137389e94ed6d4680c3c63c17273158 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 11 Nov 2020 23:53:13 +0100 Subject: cpu/hotplug: Add lockdep_is_cpus_held() This commit adds a lockdep_is_cpus_held() function to verify that the proper locks are held and that various operations are running in the correct context. Signed-off-by: Frederic Weisbecker Cc: Paul E. McKenney Cc: Josh Triplett Cc: Steven Rostedt Cc: Mathieu Desnoyers Cc: Lai Jiangshan Cc: Joel Fernandes Cc: Neeraj Upadhyay Cc: Thomas Gleixner Cc: Boqun Feng Signed-off-by: Paul E. McKenney --- include/linux/cpu.h | 2 ++ kernel/cpu.c | 7 +++++++ 2 files changed, 9 insertions(+) (limited to 'kernel') diff --git a/include/linux/cpu.h b/include/linux/cpu.h index d6428aaf67e7..3aaa0687e8df 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -111,6 +111,8 @@ static inline void cpu_maps_update_done(void) #endif /* CONFIG_SMP */ extern struct bus_type cpu_subsys; +extern int lockdep_is_cpus_held(void); + #ifdef CONFIG_HOTPLUG_CPU extern void cpus_write_lock(void); extern void cpus_write_unlock(void); diff --git a/kernel/cpu.c b/kernel/cpu.c index 4e11e91010e1..1b6302ecbabe 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -330,6 +330,13 @@ void lockdep_assert_cpus_held(void) percpu_rwsem_assert_held(&cpu_hotplug_lock); } +#ifdef CONFIG_LOCKDEP +int lockdep_is_cpus_held(void) +{ + return percpu_rwsem_is_held(&cpu_hotplug_lock); +} +#endif + static void lockdep_acquire_cpus_lock(void) { rwsem_acquire(&cpu_hotplug_lock.dep_map, 0, 0, _THIS_IP_); -- cgit v1.2.3 From dcd42591ebb8a25895b551a5297ea9c24414ba54 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 13 Nov 2020 13:13:33 +0100 Subject: timer: Add timer_curr_running() This commit adds a timer_curr_running() function that verifies that the current code is running in the context of the specified timer's handler. Cc: Josh Triplett Cc: Steven Rostedt Cc: Mathieu Desnoyers Cc: Lai Jiangshan Cc: Joel Fernandes Cc: Neeraj Upadhyay Cc: Thomas Gleixner Tested-by: Boqun Feng Signed-off-by: Frederic Weisbecker Signed-off-by: Paul E. 
McKenney --- include/linux/timer.h | 2 ++ kernel/time/timer.c | 13 +++++++++++++ 2 files changed, 15 insertions(+) (limited to 'kernel') diff --git a/include/linux/timer.h b/include/linux/timer.h index fda13c9d1256..4118a97e62fb 100644 --- a/include/linux/timer.h +++ b/include/linux/timer.h @@ -192,6 +192,8 @@ extern int try_to_del_timer_sync(struct timer_list *timer); #define del_singleshot_timer_sync(t) del_timer_sync(t) +extern bool timer_curr_running(struct timer_list *timer); + extern void init_timers(void); struct hrtimer; extern enum hrtimer_restart it_real_fn(struct hrtimer *); diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 8dbc008f8942..f9b2096456e5 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -1237,6 +1237,19 @@ int try_to_del_timer_sync(struct timer_list *timer) } EXPORT_SYMBOL(try_to_del_timer_sync); +bool timer_curr_running(struct timer_list *timer) +{ + int i; + + for (i = 0; i < NR_BASES; i++) { + struct timer_base *base = this_cpu_ptr(&timer_bases[i]); + if (base->running_timer == timer) + return true; + } + + return false; +} + #ifdef CONFIG_PREEMPT_RT static __init void timer_base_init_expiry_lock(struct timer_base *base) { -- cgit v1.2.3 From 2c4319bd1d14d01f5b6654a90c2b6362f3a407d8 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 23 Sep 2020 17:39:46 -0700 Subject: rcutorture: Test runtime toggling of CPUs' callback offloading Frederic Weisbecker is adding the ability to change the rcu_nocbs state of CPUs at runtime, that is, to offload and deoffload their RCU callback processing without the need to reboot. As the old saying goes, "if it ain't tested, it don't work", so this commit therefore adds prototype rcutorture testing for this capability. Signed-off-by: Paul E. McKenney Cc: Frederic Weisbecker --- Documentation/admin-guide/kernel-parameters.txt | 8 +++ kernel/rcu/rcutorture.c | 90 ++++++++++++++++++++++++- 2 files changed, 95 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index c722ec19cd00..9f8ac776ae41 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -4338,6 +4338,14 @@ stress RCU, they don't participate in the actual test, hence the "fake". + rcutorture.nocbs_nthreads= [KNL] + Set number of RCU callback-offload togglers. + Zero (the default) disables toggling. + + rcutorture.nocbs_toggle= [KNL] + Set the delay in milliseconds between successive + callback-offload toggling attempts. + rcutorture.nreaders= [KNL] Set number of RCU readers. The value -1 selects N-1, where N is the number of CPUs. 
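The timer_curr_running() helper above only reports whether the given timer is the one currently being executed on this CPU's timer bases. One plausible use, sketched here as an assumption rather than taken from this series, is a sanity check inside a timer handler; the sketch assumes built-in kernel code, since the helper is not exported to modules:

#include <linux/timer.h>
#include <linux/bug.h>
#include <linux/init.h>
#include <linux/jiffies.h>

static struct timer_list demo_timer;

static void demo_timer_fn(struct timer_list *t)
{
	/* Should always hold: we are running from this timer's own handler. */
	WARN_ON_ONCE(!timer_curr_running(t));
}

static int __init timer_curr_running_demo(void)
{
	timer_setup(&demo_timer, demo_timer_fn, 0);
	mod_timer(&demo_timer, jiffies + HZ);
	return 0;
}
late_initcall(timer_curr_running_demo);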
A value diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 528ed10b78fd..22735bc3eacc 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -97,6 +97,8 @@ torture_param(int, object_debug, 0, torture_param(int, onoff_holdoff, 0, "Time after boot before CPU hotplugs (s)"); torture_param(int, onoff_interval, 0, "Time between CPU hotplugs (jiffies), 0=disable"); +torture_param(int, nocbs_nthreads, 0, "Number of NOCB toggle threads, 0 to disable"); +torture_param(int, nocbs_toggle, 1000, "Time between toggling nocb state (ms)"); torture_param(int, read_exit_delay, 13, "Delay between read-then-exit episodes (s)"); torture_param(int, read_exit_burst, 16, @@ -127,10 +129,12 @@ static char *torture_type = "rcu"; module_param(torture_type, charp, 0444); MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, srcu, ...)"); +static int nrealnocbers; static int nrealreaders; static struct task_struct *writer_task; static struct task_struct **fakewriter_tasks; static struct task_struct **reader_tasks; +static struct task_struct **nocb_tasks; static struct task_struct *stats_task; static struct task_struct *fqs_task; static struct task_struct *boost_tasks[NR_CPUS]; @@ -174,6 +178,8 @@ static unsigned long n_read_exits; static struct list_head rcu_torture_removed; static unsigned long shutdown_jiffies; static unsigned long start_gp_seq; +static atomic_long_t n_nocb_offload; +static atomic_long_t n_nocb_deoffload; static int rcu_torture_writer_state; #define RTWS_FIXED_DELAY 0 @@ -1498,6 +1504,53 @@ rcu_torture_reader(void *arg) return 0; } +/* + * Randomly Toggle CPUs' callback-offload state. This uses hrtimers to + * increase race probabilities and fuzzes the interval between toggling. + */ +static int rcu_nocb_toggle(void *arg) +{ + int cpu; + int maxcpu = -1; + int oldnice = task_nice(current); + long r; + DEFINE_TORTURE_RANDOM(rand); + ktime_t toggle_delay; + unsigned long toggle_fuzz; + ktime_t toggle_interval = ms_to_ktime(nocbs_toggle); + + VERBOSE_TOROUT_STRING("rcu_nocb_toggle task started"); + while (!rcu_inkernel_boot_has_ended()) + schedule_timeout_interruptible(HZ / 10); + for_each_online_cpu(cpu) + maxcpu = cpu; + WARN_ON(maxcpu < 0); + if (toggle_interval > ULONG_MAX) + toggle_fuzz = ULONG_MAX >> 3; + else + toggle_fuzz = toggle_interval >> 3; + if (toggle_fuzz <= 0) + toggle_fuzz = NSEC_PER_USEC; + do { + r = torture_random(&rand); + cpu = (r >> 4) % (maxcpu + 1); + if (r & 0x1) { + rcu_nocb_cpu_offload(cpu); + atomic_long_inc(&n_nocb_offload); + } else { + rcu_nocb_cpu_deoffload(cpu); + atomic_long_inc(&n_nocb_deoffload); + } + toggle_delay = torture_random(&rand) % toggle_fuzz + toggle_interval; + set_current_state(TASK_INTERRUPTIBLE); + schedule_hrtimeout(&toggle_delay, HRTIMER_MODE_REL); + if (stutter_wait("rcu_nocb_toggle")) + sched_set_normal(current, oldnice); + } while (!torture_must_stop()); + torture_kthread_stopping("rcu_nocb_toggle"); + return 0; +} + /* * Print torture statistics. Caller must ensure that there is only * one call to this function at a given time!!! 
This is normally @@ -1553,7 +1606,9 @@ rcu_torture_stats_print(void) data_race(n_barrier_successes), data_race(n_barrier_attempts), data_race(n_rcu_torture_barrier_error)); - pr_cont("read-exits: %ld\n", data_race(n_read_exits)); + pr_cont("read-exits: %ld ", data_race(n_read_exits)); + pr_cont("nocb-toggles: %ld:%ld\n", + atomic_long_read(&n_nocb_offload), atomic_long_read(&n_nocb_deoffload)); pr_alert("%s%s ", torture_type, TORTURE_FLAG); if (atomic_read(&n_rcu_torture_mberror) || @@ -1647,7 +1702,8 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag) "stall_cpu_block=%d " "n_barrier_cbs=%d " "onoff_interval=%d onoff_holdoff=%d " - "read_exit_delay=%d read_exit_burst=%d\n", + "read_exit_delay=%d read_exit_burst=%d " + "nocbs_nthreads=%d nocbs_toggle=%d\n", torture_type, tag, nrealreaders, nfakewriters, stat_interval, verbose, test_no_idle_hz, shuffle_interval, stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter, @@ -1657,7 +1713,8 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag) stall_cpu_block, n_barrier_cbs, onoff_interval, onoff_holdoff, - read_exit_delay, read_exit_burst); + read_exit_delay, read_exit_burst, + nocbs_nthreads, nocbs_toggle); } static int rcutorture_booster_cleanup(unsigned int cpu) @@ -2500,6 +2557,13 @@ rcu_torture_cleanup(void) torture_stop_kthread(rcu_torture_stall, stall_task); torture_stop_kthread(rcu_torture_writer, writer_task); + if (nocb_tasks) { + for (i = 0; i < nrealnocbers; i++) + torture_stop_kthread(rcu_nocb_toggle, nocb_tasks[i]); + kfree(nocb_tasks); + nocb_tasks = NULL; + } + if (reader_tasks) { for (i = 0; i < nrealreaders; i++) torture_stop_kthread(rcu_torture_reader, @@ -2762,6 +2826,26 @@ rcu_torture_init(void) if (firsterr) goto unwind; } + nrealnocbers = nocbs_nthreads; + if (WARN_ON(nrealnocbers < 0)) + nrealnocbers = 1; + if (WARN_ON(nocbs_toggle < 0)) + nocbs_toggle = HZ; + if (nrealnocbers > 0) { + nocb_tasks = kcalloc(nrealnocbers, sizeof(nocb_tasks[0]), GFP_KERNEL); + if (nocb_tasks == NULL) { + VERBOSE_TOROUT_ERRSTRING("out of memory"); + firsterr = -ENOMEM; + goto unwind; + } + } else { + nocb_tasks = NULL; + } + for (i = 0; i < nrealnocbers; i++) { + firsterr = torture_create_kthread(rcu_nocb_toggle, NULL, nocb_tasks[i]); + if (firsterr) + goto unwind; + } if (stat_interval > 0) { firsterr = torture_create_kthread(rcu_torture_stats, NULL, stats_task); -- cgit v1.2.3 From 341690611f8d488859f42a761f5d7cbac6ba2940 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 18 Dec 2020 10:20:34 -0800 Subject: rcu/nocb: Add grace period and task state to show_rcu_nocb_state() output This commit improves debuggability by indicating which grace period each batch of nocb callbacks is waiting on and by showing the task state and last CPU for reach nocb kthread. [ paulmck: Handle !SMP CB offloading per kernel test robot feedback. ] Signed-off-by: Paul E. McKenney --- kernel/rcu/rcu_segcblist.h | 11 +++++++++++ kernel/rcu/tree_plugin.h | 39 ++++++++++++++++++++++++++++++++------- 2 files changed, 43 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcu_segcblist.h b/kernel/rcu/rcu_segcblist.h index afad6fc6311c..311060279f1a 100644 --- a/kernel/rcu/rcu_segcblist.h +++ b/kernel/rcu/rcu_segcblist.h @@ -117,6 +117,17 @@ static inline bool rcu_segcblist_restempty(struct rcu_segcblist *rsclp, int seg) return !READ_ONCE(*READ_ONCE(rsclp->tails[seg])); } +/* + * Is the specified segment of the specified rcu_segcblist structure + * empty of callbacks? 
+ */ +static inline bool rcu_segcblist_segempty(struct rcu_segcblist *rsclp, int seg) +{ + if (seg == RCU_DONE_TAIL) + return &rsclp->head == rsclp->tails[RCU_DONE_TAIL]; + return rsclp->tails[seg - 1] == rsclp->tails[seg]; +} + void rcu_segcblist_inc_len(struct rcu_segcblist *rsclp); void rcu_segcblist_add_len(struct rcu_segcblist *rsclp, long v); void rcu_segcblist_init(struct rcu_segcblist *rsclp); diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 8641b72f6d0d..5ee1113fbf39 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -2672,6 +2672,19 @@ void rcu_bind_current_to_nocb(void) } EXPORT_SYMBOL_GPL(rcu_bind_current_to_nocb); +// The ->on_cpu field is available only in CONFIG_SMP=y, so... +#ifdef CONFIG_SMP +static char *show_rcu_should_be_on_cpu(struct task_struct *tsp) +{ + return tsp && tsp->state == TASK_RUNNING && !tsp->on_cpu ? "!" : ""; +} +#else // #ifdef CONFIG_SMP +static char *show_rcu_should_be_on_cpu(struct task_struct *tsp) +{ + return ""; +} +#endif // #else #ifdef CONFIG_SMP + /* * Dump out nocb grace-period kthread state for the specified rcu_data * structure. @@ -2680,7 +2693,7 @@ static void show_rcu_nocb_gp_state(struct rcu_data *rdp) { struct rcu_node *rnp = rdp->mynode; - pr_info("nocb GP %d %c%c%c%c%c%c %c[%c%c] %c%c:%ld rnp %d:%d %lu\n", + pr_info("nocb GP %d %c%c%c%c%c%c %c[%c%c] %c%c:%ld rnp %d:%d %lu %c CPU %d%s\n", rdp->cpu, "kK"[!!rdp->nocb_gp_kthread], "lL"[raw_spin_is_locked(&rdp->nocb_gp_lock)], @@ -2694,12 +2707,17 @@ static void show_rcu_nocb_gp_state(struct rcu_data *rdp) ".B"[!!rdp->nocb_gp_bypass], ".G"[!!rdp->nocb_gp_gp], (long)rdp->nocb_gp_seq, - rnp->grplo, rnp->grphi, READ_ONCE(rdp->nocb_gp_loops)); + rnp->grplo, rnp->grphi, READ_ONCE(rdp->nocb_gp_loops), + rdp->nocb_gp_kthread ? task_state_to_char(rdp->nocb_gp_kthread) : '.', + rdp->nocb_cb_kthread ? (int)task_cpu(rdp->nocb_gp_kthread) : -1, + show_rcu_should_be_on_cpu(rdp->nocb_cb_kthread)); } /* Dump out nocb kthread state for the specified rcu_data structure. */ static void show_rcu_nocb_state(struct rcu_data *rdp) { + char bufw[20]; + char bufr[20]; struct rcu_segcblist *rsclp = &rdp->cblist; bool waslocked; bool wastimer; @@ -2708,7 +2726,9 @@ static void show_rcu_nocb_state(struct rcu_data *rdp) if (rdp->nocb_gp_rdp == rdp) show_rcu_nocb_gp_state(rdp); - pr_info(" CB %d->%d %c%c%c%c%c%c F%ld L%ld C%d %c%c%c%c%c q%ld\n", + sprintf(bufw, "%ld", rsclp->gp_seq[RCU_WAIT_TAIL]); + sprintf(bufr, "%ld", rsclp->gp_seq[RCU_NEXT_READY_TAIL]); + pr_info(" CB %d^%d->%d %c%c%c%c%c%c F%ld L%ld C%d %c%c%s%c%s%c%c q%ld %c CPU %d%s\n", rdp->cpu, rdp->nocb_gp_rdp->cpu, "kK"[!!rdp->nocb_cb_kthread], "bB"[raw_spin_is_locked(&rdp->nocb_bypass_lock)], @@ -2720,11 +2740,16 @@ static void show_rcu_nocb_state(struct rcu_data *rdp) jiffies - rdp->nocb_nobypass_last, rdp->nocb_nobypass_count, ".D"[rcu_segcblist_ready_cbs(rsclp)], - ".W"[!rcu_segcblist_restempty(rsclp, RCU_DONE_TAIL)], - ".R"[!rcu_segcblist_restempty(rsclp, RCU_WAIT_TAIL)], - ".N"[!rcu_segcblist_restempty(rsclp, RCU_NEXT_READY_TAIL)], + ".W"[!rcu_segcblist_segempty(rsclp, RCU_WAIT_TAIL)], + rcu_segcblist_segempty(rsclp, RCU_WAIT_TAIL) ? "" : bufw, + ".R"[!rcu_segcblist_segempty(rsclp, RCU_NEXT_READY_TAIL)], + rcu_segcblist_segempty(rsclp, RCU_NEXT_READY_TAIL) ? "" : bufr, + ".N"[!rcu_segcblist_segempty(rsclp, RCU_NEXT_TAIL)], ".B"[!!rcu_cblist_n_cbs(&rdp->nocb_bypass)], - rcu_segcblist_n_cbs(&rdp->cblist)); + rcu_segcblist_n_cbs(&rdp->cblist), + rdp->nocb_cb_kthread ? 
task_state_to_char(rdp->nocb_cb_kthread) : '.', + rdp->nocb_cb_kthread ? (int)task_cpu(rdp->nocb_gp_kthread) : -1, + show_rcu_should_be_on_cpu(rdp->nocb_cb_kthread)); /* It is OK for GP kthreads to have GP state. */ if (rdp->nocb_gp_rdp == rdp) -- cgit v1.2.3 From 3d0cef50f32e2bc69f60909584c18623bba9a6c6 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 18 Dec 2020 13:17:37 -0800 Subject: rcu/nocb: Add nocb CB kthread list to show_rcu_nocb_state() output This commit improves debuggability by indicating laying out the order in which rcuoc kthreads appear in the ->nocb_next_cb_rdp list. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 5ee1113fbf39..bc63a6b9d532 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -2730,6 +2730,7 @@ static void show_rcu_nocb_state(struct rcu_data *rdp) sprintf(bufr, "%ld", rsclp->gp_seq[RCU_NEXT_READY_TAIL]); pr_info(" CB %d^%d->%d %c%c%c%c%c%c F%ld L%ld C%d %c%c%s%c%s%c%c q%ld %c CPU %d%s\n", rdp->cpu, rdp->nocb_gp_rdp->cpu, + rdp->nocb_next_cb_rdp ? rdp->nocb_next_cb_rdp->cpu : -1, "kK"[!!rdp->nocb_cb_kthread], "bB"[raw_spin_is_locked(&rdp->nocb_bypass_lock)], "cC"[!!atomic_read(&rdp->nocb_lock_contended)], -- cgit v1.2.3 From f759081e8f5ac640df1c7125540759bbcb4eb0e2 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 21 Dec 2020 11:17:16 -0800 Subject: rcu/nocb: Code-style nits in callback-offloading toggling This commit addresses a few code-style nits in callback-offloading toggling, including one that predates this toggling. Cc: Frederic Weisbecker Signed-off-by: Paul E. McKenney --- kernel/rcu/rcu_segcblist.h | 19 ++++++------------- kernel/rcu/rcutorture.c | 2 +- kernel/rcu/tree_plugin.h | 45 ++++++++++++++++++++++----------------------- kernel/time/timer.c | 1 + 4 files changed, 30 insertions(+), 37 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcu_segcblist.h b/kernel/rcu/rcu_segcblist.h index 311060279f1a..9a19328ff251 100644 --- a/kernel/rcu/rcu_segcblist.h +++ b/kernel/rcu/rcu_segcblist.h @@ -80,17 +80,12 @@ static inline bool rcu_segcblist_is_enabled(struct rcu_segcblist *rsclp) return rcu_segcblist_test_flags(rsclp, SEGCBLIST_ENABLED); } -/* Is the specified rcu_segcblist offloaded? */ +/* Is the specified rcu_segcblist offloaded, or is SEGCBLIST_SOFTIRQ_ONLY set? */ static inline bool rcu_segcblist_is_offloaded(struct rcu_segcblist *rsclp) { - if (IS_ENABLED(CONFIG_RCU_NOCB_CPU)) { - /* - * Complete de-offloading happens only when SEGCBLIST_SOFTIRQ_ONLY - * is set. 
- */ - if (!rcu_segcblist_test_flags(rsclp, SEGCBLIST_SOFTIRQ_ONLY)) - return true; - } + if (IS_ENABLED(CONFIG_RCU_NOCB_CPU) && + !rcu_segcblist_test_flags(rsclp, SEGCBLIST_SOFTIRQ_ONLY)) + return true; return false; } @@ -99,10 +94,8 @@ static inline bool rcu_segcblist_completely_offloaded(struct rcu_segcblist *rscl { int flags = SEGCBLIST_KTHREAD_CB | SEGCBLIST_KTHREAD_GP | SEGCBLIST_OFFLOADED; - if (IS_ENABLED(CONFIG_RCU_NOCB_CPU)) { - if ((rsclp->flags & flags) == flags) - return true; - } + if (IS_ENABLED(CONFIG_RCU_NOCB_CPU) && (rsclp->flags & flags) == flags) + return true; return false; } diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 22735bc3eacc..b9dd63c166b9 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1606,7 +1606,7 @@ rcu_torture_stats_print(void) data_race(n_barrier_successes), data_race(n_barrier_attempts), data_race(n_rcu_torture_barrier_error)); - pr_cont("read-exits: %ld ", data_race(n_read_exits)); + pr_cont("read-exits: %ld ", data_race(n_read_exits)); // Statistic. pr_cont("nocb-toggles: %ld:%ld\n", atomic_long_read(&n_nocb_offload), atomic_long_read(&n_nocb_deoffload)); diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index bc63a6b9d532..6f56f9e51e67 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1962,17 +1962,17 @@ static inline bool nocb_gp_update_state(struct rcu_data *rdp, bool *needwake_sta *needwake_state = true; } return true; - } else { - /* - * De-offloading. Clear our flag and notify the de-offload worker. - * We will ignore this rdp until it ever gets re-offloaded. - */ - WARN_ON_ONCE(!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP)); - rcu_segcblist_clear_flags(cblist, SEGCBLIST_KTHREAD_GP); - if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB)) - *needwake_state = true; - return false; } + + /* + * De-offloading. Clear our flag and notify the de-offload worker. + * We will ignore this rdp until it ever gets re-offloaded. + */ + WARN_ON_ONCE(!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP)); + rcu_segcblist_clear_flags(cblist, SEGCBLIST_KTHREAD_GP); + if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB)) + *needwake_state = true; + return false; } @@ -2005,6 +2005,7 @@ static void nocb_gp_wait(struct rcu_data *my_rdp) WARN_ON_ONCE(my_rdp->nocb_gp_rdp != my_rdp); for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_cb_rdp) { bool needwake_state = false; + if (!nocb_gp_enabled_cb(rdp)) continue; trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("Check")); @@ -2160,11 +2161,11 @@ static inline bool nocb_cb_wait_cond(struct rcu_data *rdp) static void nocb_cb_wait(struct rcu_data *rdp) { struct rcu_segcblist *cblist = &rdp->cblist; - struct rcu_node *rnp = rdp->mynode; - bool needwake_state = false; - bool needwake_gp = false; unsigned long cur_gp_seq; unsigned long flags; + bool needwake_state = false; + bool needwake_gp = false; + struct rcu_node *rnp = rdp->mynode; local_irq_save(flags); rcu_momentary_dyntick_idle(); @@ -2217,8 +2218,8 @@ static void nocb_cb_wait(struct rcu_data *rdp) swait_event_interruptible_exclusive(rdp->nocb_cb_wq, nocb_cb_wait_cond(rdp)); - /* ^^^ Ensure CB invocation follows _sleep test. */ - if (smp_load_acquire(&rdp->nocb_cb_sleep)) { + // VVV Ensure CB invocation follows _sleep test. 
+ if (smp_load_acquire(&rdp->nocb_cb_sleep)) { // ^^^ WARN_ON(signal_pending(current)); trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WokeEmpty")); } @@ -2323,7 +2324,7 @@ static int __rcu_nocb_rdp_deoffload(struct rcu_data *rdp) unsigned long flags; int ret; - printk("De-offloading %d\n", rdp->cpu); + pr_info("De-offloading %d\n", rdp->cpu); rcu_nocb_lock_irqsave(rdp, flags); /* @@ -2384,11 +2385,10 @@ int rcu_nocb_cpu_deoffload(int cpu) mutex_lock(&rcu_state.barrier_mutex); cpus_read_lock(); if (rcu_segcblist_is_offloaded(&rdp->cblist)) { - if (cpu_online(cpu)) { + if (cpu_online(cpu)) ret = work_on_cpu(cpu, rcu_nocb_rdp_deoffload, rdp); - } else { + else ret = __rcu_nocb_rdp_deoffload(rdp); - } if (!ret) cpumask_clear_cpu(cpu, rcu_nocb_mask); } @@ -2412,7 +2412,7 @@ static int __rcu_nocb_rdp_offload(struct rcu_data *rdp) if (!rdp->nocb_gp_rdp) return -EINVAL; - printk("Offloading %d\n", rdp->cpu); + pr_info("Offloading %d\n", rdp->cpu); /* * Can't use rcu_nocb_lock_irqsave() while we are in * SEGCBLIST_SOFTIRQ_ONLY mode. @@ -2460,11 +2460,10 @@ int rcu_nocb_cpu_offload(int cpu) mutex_lock(&rcu_state.barrier_mutex); cpus_read_lock(); if (!rcu_segcblist_is_offloaded(&rdp->cblist)) { - if (cpu_online(cpu)) { + if (cpu_online(cpu)) ret = work_on_cpu(cpu, rcu_nocb_rdp_offload, rdp); - } else { + else ret = __rcu_nocb_rdp_offload(rdp); - } if (!ret) cpumask_set_cpu(cpu, rcu_nocb_mask); } diff --git a/kernel/time/timer.c b/kernel/time/timer.c index f9b2096456e5..f475f1a027c8 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -1243,6 +1243,7 @@ bool timer_curr_running(struct timer_list *timer) for (i = 0; i < NR_BASES; i++) { struct timer_base *base = this_cpu_ptr(&timer_bases[i]); + if (base->running_timer == timer) return true; } -- cgit v1.2.3 From 147c6852d34563b87ff0e67383c2bf675e8248f6 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 22 Dec 2020 16:49:11 -0800 Subject: rcu: Do any deferred nocb wakeups at CPU offline time Because the need to wake a nocb GP kthread ("rcuog") is sometimes detected when wakeups cannot be done, these wakeups can be deferred. The wakeups are then carried out by calls to do_nocb_deferred_wakeup() at various safe points in the code, including RCU's idle hooks. However, when a CPU goes offline, it invokes arch_cpu_idle_dead() without invoking any of RCU's idle hooks. This commit therefore adds a call to do_nocb_deferred_wakeup() in rcu_report_dead() in order to handle any deferred wakeups that have been requested by the outgoing CPU. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 03810a58fdd0..e6dee714efe0 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -4178,6 +4178,9 @@ void rcu_report_dead(unsigned int cpu) struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu); struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */ + // Do any dangling deferred wakeups. + do_nocb_deferred_wakeup(rdp); + /* QS for any half-done expedited grace period. */ preempt_disable(); rcu_report_exp_rdp(this_cpu_ptr(&rcu_data)); -- cgit v1.2.3 From 683954e55c981467bfd4688417e914bafc40959f Mon Sep 17 00:00:00 2001 From: Neeraj Upadhyay Date: Mon, 16 Nov 2020 21:36:00 +0530 Subject: rcu: Check and report missed fqs timer wakeup on RCU stall For a new grace period request, the RCU GP kthread transitions through following states: a. 
[RCU_GP_WAIT_GPS] -> [RCU_GP_DONE_GPS] The RCU_GP_WAIT_GPS state is where the GP kthread waits for a request for a new GP. Once it receives a request (for example, when a new RCU callback is queued), the GP kthread transitions to RCU_GP_DONE_GPS. b. [RCU_GP_DONE_GPS] -> [RCU_GP_ONOFF] Grace period initialization starts in rcu_gp_init(), which records the start of new GP in rcu_state.gp_seq and transitions to RCU_GP_ONOFF. c. [RCU_GP_ONOFF] -> [RCU_GP_INIT] The purpose of the RCU_GP_ONOFF state is to apply the online/offline information that was buffered for any CPUs that recently came online or went offline. This state is maintained in per-leaf rcu_node bitmasks, with the buffered state in ->qsmaskinitnext and the state for the upcoming GP in ->qsmaskinit. At the end of this RCU_GP_ONOFF state, each bit in ->qsmaskinit will correspond to a CPU that must pass through a quiescent state before the upcoming grace period is allowed to complete. However, a leaf rcu_node structure with an all-zeroes ->qsmaskinit cannot necessarily be ignored. In preemptible RCU, there might well be tasks still in RCU read-side critical sections that were first preempted while running on one of the CPUs managed by this structure. Such tasks will be queued on this structure's ->blkd_tasks list. Only after this list fully drains can this leaf rcu_node structure be ignored, and even then only if none of its CPUs have come back online in the meantime. Once that happens, the ->qsmaskinit masks further up the tree will be updated to exclude this leaf rcu_node structure. Once the ->qsmaskinitnext and ->qsmaskinit fields have been updated as needed, the GP kthread transitions to RCU_GP_INIT. d. [RCU_GP_INIT] -> [RCU_GP_WAIT_FQS] The purpose of the RCU_GP_INIT state is to copy each ->qsmaskinit to the ->qsmask field within each rcu_node structure. This copying is done breadth-first from the root to the leaves. Why not just copy directly from ->qsmaskinitnext to ->qsmask? Because the ->qsmaskinitnext masks can change in the meantime as additional CPUs come online or go offline. Such changes would result in inconsistencies in the ->qsmask fields up and down the tree, which could in turn result in too-short grace periods or grace-period hangs. These issues are avoided by snapshotting the leaf rcu_node structures' ->qsmaskinitnext fields into their ->qsmaskinit counterparts, generating a consistent set of ->qsmaskinit fields throughout the tree, and only then copying these consistent ->qsmaskinit fields to their ->qsmask counterparts. Once this initialization step is complete, the GP kthread transitions to RCU_GP_WAIT_FQS, where it waits to do a force-quiescent-state scan on the one hand or for the end of the grace period on the other. e. [RCU_GP_WAIT_FQS] -> [RCU_GP_DOING_FQS] The RCU_GP_WAIT_FQS state waits for one of three things: (1) An explicit request to do a force-quiescent-state scan, (2) The end of the grace period, or (3) A short interval of time, after which it will do a force-quiescent-state (FQS) scan. The explicit request can come from rcutorture or from any CPU that has too many RCU callbacks queued (see the qhimark kernel parameter and the RCU_GP_FLAG_OVLD flag). The aforementioned "short period of time" is specified by the jiffies_till_first_fqs boot parameter for a given grace period's first FQS scan and by the jiffies_till_next_fqs for later FQS scans. Either way, once the wait is over, the GP kthread transitions to RCU_GP_DOING_FQS. f. 
[RCU_GP_DOING_FQS] -> [RCU_GP_CLEANUP] The RCU_GP_DOING_FQS state performs an FQS scan. Each such scan carries out two functions for any CPU whose bit is still set in its leaf rcu_node structure's ->qsmask field, that is, for any CPU that has not yet reported a quiescent state for the current grace period: i. Report quiescent states on behalf of CPUs that have been observed to be idle (from an RCU perspective) since the beginning of the grace period. ii. If the current grace period is too old, take various actions to encourage holdout CPUs to pass through quiescent states, including enlisting the aid of any calls to cond_resched() and might_sleep(), and even including IPIing the holdout CPUs. These checks are skipped for any leaf rcu_node structure with a all-zero ->qsmask field, however such structures are subject to RCU priority boosting if there are tasks on a given structure blocking the current grace period. The end of the grace period is detected when the root rcu_node structure's ->qsmask is zero and when there are no longer any preempted tasks blocking the current grace period. (No, this last check is not redundant. To see this, consider an rcu_node tree having exactly one structure that serves as both root and leaf.) Once the end of the grace period is detected, the GP kthread transitions to RCU_GP_CLEANUP. g. [RCU_GP_CLEANUP] -> [RCU_GP_CLEANED] The RCU_GP_CLEANUP state marks the end of grace period by updating the rcu_state structure's ->gp_seq field and also all rcu_node structures' ->gp_seq field. As before, the rcu_node tree is traversed in breadth first order. Once this update is complete, the GP kthread transitions to the RCU_GP_CLEANED state. i. [RCU_GP_CLEANED] -> [RCU_GP_INIT] Once in the RCU_GP_CLEANED state, the GP kthread immediately transitions into the RCU_GP_INIT state. j. The role of timers. If there is at least one idle CPU, and if timers are not firing, the transition from RCU_GP_DOING_FQS to RCU_GP_CLEANUP will never happen. Timers can fail to fire for a number of reasons, including issues in timer configuration, issues in the timer framework, and failure to handle softirqs (for example, when there is a storm of interrupts). Whatever the reason, if the timers fail to fire, the GP kthread will never be awakened, resulting in RCU CPU stall warnings and eventually in OOM. However, an RCU CPU stall warning has a large number of potential causes, as documented in Documentation/RCU/stallwarn.rst. This commit therefore adds analysis to the RCU CPU stall-warning code to emit an additional message if the cause of the stall is likely to be timer failure. Signed-off-by: Neeraj Upadhyay Signed-off-by: Paul E. McKenney --- Documentation/RCU/stallwarn.rst | 23 ++++++++++++++++++++++- kernel/rcu/tree.c | 25 +++++++++++++++---------- kernel/rcu/tree_stall.h | 32 ++++++++++++++++++++++++++++++++ 3 files changed, 69 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/Documentation/RCU/stallwarn.rst b/Documentation/RCU/stallwarn.rst index c9ab6af4d3be..d53856cc6390 100644 --- a/Documentation/RCU/stallwarn.rst +++ b/Documentation/RCU/stallwarn.rst @@ -92,7 +92,9 @@ warnings: buggy timer hardware through bugs in the interrupt or exception path (whether hardware, firmware, or software) through bugs in Linux's timer subsystem through bugs in the scheduler, and, - yes, even including bugs in RCU itself. + yes, even including bugs in RCU itself. 
It can also result in + the ``rcu_.*timer wakeup didn't happen for`` console-log message, + which will include additional debugging information. - A bug in the RCU implementation. @@ -292,6 +294,25 @@ kthread is waiting for a short timeout, the "state" precedes value of the task_struct ->state field, and the "cpu" indicates that the grace-period kthread last ran on CPU 5. +If the relevant grace-period kthread does not wake from FQS wait in a +reasonable time, then the following additional line is printed:: + + kthread timer wakeup didn't happen for 23804 jiffies! g7076 f0x0 RCU_GP_WAIT_FQS(5) ->state=0x402 + +The "23804" indicates that kthread's timer expired more than 23 thousand +jiffies ago. The rest of the line has meaning similar to the kthread +starvation case. + +Additionally, the following line is printed:: + + Possible timer handling issue on cpu=4 timer-softirq=11142 + +Here "cpu" indicates that the grace-period kthread last ran on CPU 4, +where it queued the fqs timer. The number following the "timer-softirq" +is the current ``TIMER_SOFTIRQ`` count on cpu 4. If this value does not +change on successive RCU CPU stall warnings, there is further reason to +suspect a timer problem. + Multiple Warnings From One Stall ================================ diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 40e5e3dd253e..e918f100cc34 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1765,7 +1765,7 @@ static bool rcu_gp_init(void) * go offline later. Please also refer to "Hotplug CPU" section * of RCU's Requirements documentation. */ - rcu_state.gp_state = RCU_GP_ONOFF; + WRITE_ONCE(rcu_state.gp_state, RCU_GP_ONOFF); rcu_for_each_leaf_node(rnp) { smp_mb(); // Pair with barriers used when updating ->ofl_seq to odd values. firstseq = READ_ONCE(rnp->ofl_seq); @@ -1831,7 +1831,7 @@ static bool rcu_gp_init(void) * The grace period cannot complete until the initialization * process finishes, because this kthread handles both. */ - rcu_state.gp_state = RCU_GP_INIT; + WRITE_ONCE(rcu_state.gp_state, RCU_GP_INIT); rcu_for_each_node_breadth_first(rnp) { rcu_gp_slow(gp_init_delay); raw_spin_lock_irqsave_rcu_node(rnp, flags); @@ -1930,17 +1930,22 @@ static void rcu_gp_fqs_loop(void) ret = 0; for (;;) { if (!ret) { - rcu_state.jiffies_force_qs = jiffies + j; + WRITE_ONCE(rcu_state.jiffies_force_qs, jiffies + j); + /* + * jiffies_force_qs before RCU_GP_WAIT_FQS state + * update; required for stall checks. + */ + smp_wmb(); WRITE_ONCE(rcu_state.jiffies_kick_kthreads, jiffies + (j ? 3 * j : 2)); } trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq, TPS("fqswait")); - rcu_state.gp_state = RCU_GP_WAIT_FQS; + WRITE_ONCE(rcu_state.gp_state, RCU_GP_WAIT_FQS); ret = swait_event_idle_timeout_exclusive( rcu_state.gp_wq, rcu_gp_fqs_check_wake(&gf), j); rcu_gp_torture_wait(); - rcu_state.gp_state = RCU_GP_DOING_FQS; + WRITE_ONCE(rcu_state.gp_state, RCU_GP_DOING_FQS); /* Locking provides needed memory barriers. */ /* If grace period done, leave loop. */ if (!READ_ONCE(rnp->qsmask) && @@ -2054,7 +2059,7 @@ static void rcu_gp_cleanup(void) trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq, TPS("end")); rcu_seq_end(&rcu_state.gp_seq); ASSERT_EXCLUSIVE_WRITER(rcu_state.gp_seq); - rcu_state.gp_state = RCU_GP_IDLE; + WRITE_ONCE(rcu_state.gp_state, RCU_GP_IDLE); /* Check for GP requests since above loop. 
*/ rdp = this_cpu_ptr(&rcu_data); if (!needgp && ULONG_CMP_LT(rnp->gp_seq, rnp->gp_seq_needed)) { @@ -2093,12 +2098,12 @@ static int __noreturn rcu_gp_kthread(void *unused) for (;;) { trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq, TPS("reqwait")); - rcu_state.gp_state = RCU_GP_WAIT_GPS; + WRITE_ONCE(rcu_state.gp_state, RCU_GP_WAIT_GPS); swait_event_idle_exclusive(rcu_state.gp_wq, READ_ONCE(rcu_state.gp_flags) & RCU_GP_FLAG_INIT); rcu_gp_torture_wait(); - rcu_state.gp_state = RCU_GP_DONE_GPS; + WRITE_ONCE(rcu_state.gp_state, RCU_GP_DONE_GPS); /* Locking provides needed memory barrier. */ if (rcu_gp_init()) break; @@ -2113,9 +2118,9 @@ static int __noreturn rcu_gp_kthread(void *unused) rcu_gp_fqs_loop(); /* Handle grace-period end. */ - rcu_state.gp_state = RCU_GP_CLEANUP; + WRITE_ONCE(rcu_state.gp_state, RCU_GP_CLEANUP); rcu_gp_cleanup(); - rcu_state.gp_state = RCU_GP_CLEANED; + WRITE_ONCE(rcu_state.gp_state, RCU_GP_CLEANED); } } diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index 29cf096b5d20..353da30bd558 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -482,6 +482,36 @@ static void rcu_check_gp_kthread_starvation(void) } } +/* Complain about missing wakeups from expired fqs wait timer */ +static void rcu_check_gp_kthread_expired_fqs_timer(void) +{ + struct task_struct *gpk = rcu_state.gp_kthread; + short gp_state; + unsigned long jiffies_fqs; + int cpu; + + /* + * Order reads of .gp_state and .jiffies_force_qs. + * Matching smp_wmb() is present in rcu_gp_fqs_loop(). + */ + gp_state = smp_load_acquire(&rcu_state.gp_state); + jiffies_fqs = READ_ONCE(rcu_state.jiffies_force_qs); + + if (gp_state == RCU_GP_WAIT_FQS && + time_after(jiffies, jiffies_fqs + RCU_STALL_MIGHT_MIN) && + gpk && !READ_ONCE(gpk->on_rq)) { + cpu = task_cpu(gpk); + pr_err("%s kthread timer wakeup didn't happen for %ld jiffies! g%ld f%#x %s(%d) ->state=%#lx\n", + rcu_state.name, (jiffies - jiffies_fqs), + (long)rcu_seq_current(&rcu_state.gp_seq), + data_race(rcu_state.gp_flags), + gp_state_getname(RCU_GP_WAIT_FQS), RCU_GP_WAIT_FQS, + gpk->state); + pr_err("\tPossible timer handling issue on cpu=%d timer-softirq=%u\n", + cpu, kstat_softirqs_cpu(TIMER_SOFTIRQ, cpu)); + } +} + static void print_other_cpu_stall(unsigned long gp_seq, unsigned long gps) { int cpu; @@ -543,6 +573,7 @@ static void print_other_cpu_stall(unsigned long gp_seq, unsigned long gps) WRITE_ONCE(rcu_state.jiffies_stall, jiffies + 3 * rcu_jiffies_till_stall_check() + 3); + rcu_check_gp_kthread_expired_fqs_timer(); rcu_check_gp_kthread_starvation(); panic_on_rcu_stall(); @@ -578,6 +609,7 @@ static void print_cpu_stall(unsigned long gps) jiffies - gps, (long)rcu_seq_current(&rcu_state.gp_seq), totqlen); + rcu_check_gp_kthread_expired_fqs_timer(); rcu_check_gp_kthread_starvation(); rcu_dump_cpu_stacks(); -- cgit v1.2.3 From e76506f0e85129d726c487c873a2245c92446515 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 15 Nov 2020 10:24:52 -0800 Subject: refscale: Allow summarization of verbose output The refscale test prints enough per-kthread console output to provoke RCU CPU stall warnings on large systems. This commit therefore allows this output to be summarized. For example, the refscale.verbose_batched=32 boot parameter would causes only every 32nd line of output to be logged. Signed-off-by: Paul E. 
McKenney --- Documentation/admin-guide/kernel-parameters.txt | 6 ++++++ kernel/rcu/refscale.c | 21 ++++++++++++++++----- 2 files changed, 22 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index c722ec19cd00..3244f9e04a64 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -4557,6 +4557,12 @@ refscale.verbose= [KNL] Enable additional printk() statements. + refscale.verbose_batched= [KNL] + Batch the additional printk() statements. If zero + (the default) or negative, print everything. Otherwise, + print every Nth verbose statement, where N is the value + specified. + relax_domain_level= [KNL, SMP] Set scheduler's default relax_domain_level. See Documentation/admin-guide/cgroup-v1/cpusets.rst. diff --git a/kernel/rcu/refscale.c b/kernel/rcu/refscale.c index 23ff36a66f97..3da246f2ccf5 100644 --- a/kernel/rcu/refscale.c +++ b/kernel/rcu/refscale.c @@ -46,6 +46,16 @@ #define VERBOSE_SCALEOUT(s, x...) \ do { if (verbose) pr_alert("%s" SCALE_FLAG s, scale_type, ## x); } while (0) +static atomic_t verbose_batch_ctr; + +#define VERBOSE_SCALEOUT_BATCH(s, x...) \ +do { \ + if (verbose && \ + (verbose_batched <= 0 || \ + !(atomic_inc_return(&verbose_batch_ctr) % verbose_batched))) \ + pr_alert("%s" SCALE_FLAG s, scale_type, ## x); \ +} while (0) + #define VERBOSE_SCALEOUT_ERRSTRING(s, x...) \ do { if (verbose) pr_alert("%s" SCALE_FLAG "!!! " s, scale_type, ## x); } while (0) @@ -57,6 +67,7 @@ module_param(scale_type, charp, 0444); MODULE_PARM_DESC(scale_type, "Type of test (rcu, srcu, refcnt, rwsem, rwlock."); torture_param(int, verbose, 0, "Enable verbose debugging printk()s"); +torture_param(int, verbose_batched, 0, "Batch verbose debugging printk()s"); // Wait until there are multiple CPUs before starting test. torture_param(int, holdoff, IS_BUILTIN(CONFIG_RCU_REF_SCALE_TEST) ? 10 : 0, @@ -368,14 +379,14 @@ ref_scale_reader(void *arg) u64 start; s64 duration; - VERBOSE_SCALEOUT("ref_scale_reader %ld: task started", me); + VERBOSE_SCALEOUT_BATCH("ref_scale_reader %ld: task started", me); set_cpus_allowed_ptr(current, cpumask_of(me % nr_cpu_ids)); set_user_nice(current, MAX_NICE); atomic_inc(&n_init); if (holdoff) schedule_timeout_interruptible(holdoff * HZ); repeat: - VERBOSE_SCALEOUT("ref_scale_reader %ld: waiting to start next experiment on cpu %d", me, smp_processor_id()); + VERBOSE_SCALEOUT_BATCH("ref_scale_reader %ld: waiting to start next experiment on cpu %d", me, smp_processor_id()); // Wait for signal that this reader can start. wait_event(rt->wq, (atomic_read(&nreaders_exp) && smp_load_acquire(&rt->start_reader)) || @@ -392,7 +403,7 @@ repeat: while (atomic_read_acquire(&n_started)) cpu_relax(); - VERBOSE_SCALEOUT("ref_scale_reader %ld: experiment %d started", me, exp_idx); + VERBOSE_SCALEOUT_BATCH("ref_scale_reader %ld: experiment %d started", me, exp_idx); // To reduce noise, do an initial cache-warming invocation, check @@ -421,8 +432,8 @@ repeat: if (atomic_dec_and_test(&nreaders_exp)) wake_up(&main_wq); - VERBOSE_SCALEOUT("ref_scale_reader %ld: experiment %d ended, (readers remaining=%d)", - me, exp_idx, atomic_read(&nreaders_exp)); + VERBOSE_SCALEOUT_BATCH("ref_scale_reader %ld: experiment %d ended, (readers remaining=%d)", + me, exp_idx, atomic_read(&nreaders_exp)); if (!torture_must_stop()) goto repeat; -- cgit v1.2.3 From 12a910e3cd3d11e00b2a2df24ea995ffa3e27ae5 Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Mon, 16 Nov 2020 16:01:50 -0800 Subject: rcutorture: Require entire stutter period be post-boot Currently, the rcu_torture_writer() function checks that all required grace periods elapse during a stutter interval, which is a multi-second time period during which the test load is removed. However, this check is suppressed during early boot (that is, before init is spawned) in order to avoid false positives that otherwise occur due to heavy load on the single boot CPU. Unfortunately, this approach is insufficient. It is possible that the stutter interval might end just as init is spawned, so that early boot conditions prevailed during almost the entire stutter interval. This commit therefore takes a snapshot of boot-complete state just before the stutter interval, thus suppressing the check for failure to complete grace periods unless the entire stutter interval took place after early boot. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 338e1182b4b5..1930d92f4d15 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1070,6 +1070,7 @@ rcu_torture_fqs(void *arg) static int rcu_torture_writer(void *arg) { + bool boot_ended; bool can_expedite = !rcu_gp_is_expedited() && !rcu_gp_is_normal(); unsigned long cookie; int expediting = 0; @@ -1239,12 +1240,13 @@ rcu_torture_writer(void *arg) !rcu_gp_is_normal(); } rcu_torture_writer_state = RTWS_STUTTER; + boot_ended = rcu_inkernel_boot_has_ended(); stutter_waited = stutter_wait("rcu_torture_writer"); if (stutter_waited && !READ_ONCE(rcu_fwd_cb_nodelay) && !cur_ops->slow_gps && !torture_must_stop() && - rcu_inkernel_boot_has_ended()) + boot_ended) for (i = 0; i < ARRAY_SIZE(rcu_tortures); i++) if (list_empty(&rcu_tortures[i].rtort_free) && rcu_access_pointer(rcu_torture_current) != -- cgit v1.2.3 From 18fbf307b7319af3725c36e16af6ae9f35a8699c Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 16 Nov 2020 16:46:06 -0800 Subject: rcutorture: Make synctype[] and nsynctype be static global Full testing of the new SRCU polling API requires that the fake writers also use it in order to test concurrent calls to all of the API members, especially start_poll_synchronize_srcu(). This commit prepares the ground for this by making the synctype[] and nsynctype variables be static globals so that the rcu_torture_fakewriter() function can access them. Initialization of these variables is moved from rcu_torture_writer() to a new rcu_torture_write_types() function that is invoked from rcu_torture_init() just before the first writer kthread is spawned. Link: https://lore.kernel.org/rcu/20201112201547.GF3365678@moria.home.lan/ Reported-by: Kent Overstreet Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 62 ++++++++++++++++++++++++++++--------------------- 1 file changed, 36 insertions(+), 26 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 1930d92f4d15..0d257e2dc9c6 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1062,37 +1062,18 @@ rcu_torture_fqs(void *arg) return 0; } +// Used by writers to randomly choose from the available grace-period +// primitives. The only purpose of the initialization is to size the array. +static int synctype[] = { RTWS_DEF_FREE, RTWS_EXP_SYNC, RTWS_COND_GET, RTWS_POLL_GET, RTWS_SYNC }; +static int nsynctypes; + /* - * RCU torture writer kthread. 
Repeatedly substitutes a new structure - * for that pointed to by rcu_torture_current, freeing the old structure - * after a series of grace periods (the "pipeline"). + * Determine which grace-period primitives are available. */ -static int -rcu_torture_writer(void *arg) +static void rcu_torture_write_types(void) { - bool boot_ended; - bool can_expedite = !rcu_gp_is_expedited() && !rcu_gp_is_normal(); - unsigned long cookie; - int expediting = 0; - unsigned long gp_snap; bool gp_cond1 = gp_cond, gp_exp1 = gp_exp, gp_normal1 = gp_normal; bool gp_poll1 = gp_poll, gp_sync1 = gp_sync; - int i; - int idx; - int oldnice = task_nice(current); - struct rcu_torture *rp; - struct rcu_torture *old_rp; - static DEFINE_TORTURE_RANDOM(rand); - bool stutter_waited; - int synctype[] = { RTWS_DEF_FREE, RTWS_EXP_SYNC, - RTWS_COND_GET, RTWS_POLL_GET, RTWS_SYNC }; - int nsynctypes = 0; - - VERBOSE_TOROUT_STRING("rcu_torture_writer task started"); - if (!can_expedite) - pr_alert("%s" TORTURE_FLAG - " GP expediting controlled from boot/sysfs for %s.\n", - torture_type, cur_ops->name); /* Initialize synctype[] array. If none set, take default. */ if (!gp_cond1 && !gp_exp1 && !gp_normal1 && !gp_poll1 && !gp_sync1) @@ -1127,6 +1108,34 @@ rcu_torture_writer(void *arg) } else if (gp_sync && !cur_ops->sync) { pr_alert("%s: gp_sync without primitives.\n", __func__); } +} + +/* + * RCU torture writer kthread. Repeatedly substitutes a new structure + * for that pointed to by rcu_torture_current, freeing the old structure + * after a series of grace periods (the "pipeline"). + */ +static int +rcu_torture_writer(void *arg) +{ + bool boot_ended; + bool can_expedite = !rcu_gp_is_expedited() && !rcu_gp_is_normal(); + unsigned long cookie; + int expediting = 0; + unsigned long gp_snap; + int i; + int idx; + int oldnice = task_nice(current); + struct rcu_torture *rp; + struct rcu_torture *old_rp; + static DEFINE_TORTURE_RANDOM(rand); + bool stutter_waited; + + VERBOSE_TOROUT_STRING("rcu_torture_writer task started"); + if (!can_expedite) + pr_alert("%s" TORTURE_FLAG + " GP expediting controlled from boot/sysfs for %s.\n", + torture_type, cur_ops->name); if (WARN_ONCE(nsynctypes == 0, "rcu_torture_writer: No update-side primitives.\n")) { /* @@ -2889,6 +2898,7 @@ rcu_torture_init(void) /* Start up the kthreads. */ + rcu_torture_write_types(); firsterr = torture_create_kthread(rcu_torture_writer, NULL, writer_task); if (firsterr) -- cgit v1.2.3 From 682189a3f874db57b3e755512f2a2953f61fc54e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 16 Nov 2020 17:10:39 -0800 Subject: rcutorture: Make rcu_torture_fakewriter() use blocking wait primitives Full testing of the new SRCU polling API requires that the fake writers also use it in order to test concurrent calls to all of the API members, especially start_poll_synchronize_srcu(). This commit makes rcu_torture_fakewriter() use all available blocking grace-period-wait primitives available from the RCU flavor under test. Link: https://lore.kernel.org/rcu/20201112201547.GF3365678@moria.home.lan/ Reported-by: Kent Overstreet Signed-off-by: Paul E. 
McKenney --- kernel/rcu/rcutorture.c | 40 ++++++++++++++++++++++++++++++++-------- 1 file changed, 32 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 0d257e2dc9c6..03bdf6752fe4 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1289,6 +1289,8 @@ rcu_torture_writer(void *arg) static int rcu_torture_fakewriter(void *arg) { + unsigned long gp_snap; + int i; DEFINE_TORTURE_RANDOM(rand); VERBOSE_TOROUT_STRING("rcu_torture_fakewriter task started"); @@ -1300,15 +1302,37 @@ rcu_torture_fakewriter(void *arg) if (cur_ops->cb_barrier != NULL && torture_random(&rand) % (nfakewriters * 8) == 0) { cur_ops->cb_barrier(); - } else if (gp_normal == gp_exp) { - if (cur_ops->sync && torture_random(&rand) & 0x80) - cur_ops->sync(); - else if (cur_ops->exp_sync) + } else { + switch (synctype[torture_random(&rand) % nsynctypes]) { + case RTWS_DEF_FREE: + break; + case RTWS_EXP_SYNC: cur_ops->exp_sync(); - } else if (gp_normal && cur_ops->sync) { - cur_ops->sync(); - } else if (cur_ops->exp_sync) { - cur_ops->exp_sync(); + break; + case RTWS_COND_GET: + gp_snap = cur_ops->get_gp_state(); + i = torture_random(&rand) % 16; + if (i != 0) + schedule_timeout_interruptible(i); + udelay(torture_random(&rand) % 1000); + cur_ops->cond_sync(gp_snap); + break; + case RTWS_POLL_GET: + gp_snap = cur_ops->start_gp_poll(); + while (!cur_ops->poll_gp_state(gp_snap)) { + i = torture_random(&rand) % 16; + if (i != 0) + schedule_timeout_interruptible(i); + udelay(torture_random(&rand) % 1000); + } + break; + case RTWS_SYNC: + cur_ops->sync(); + break; + default: + WARN_ON_ONCE(1); + break; + } } stutter_wait("rcu_torture_fakewriter"); } while (!torture_must_stop()); -- cgit v1.2.3 From ae19aaafae95a5487469433e9cae4c208f8d15cd Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 17 Nov 2020 11:30:18 -0800 Subject: torture: Add fuzzed hrtimer-based sleep functions This commit adds torture_hrtimeout_ns(), torture_hrtimeout_us(), torture_hrtimeout_ms(), torture_hrtimeout_jiffies(), and torture_hrtimeout_s(), each of which uses hrtimers to block for a fuzzed time interval. These functions are intended to be used by the various torture tests to decouple wakeups from the timer wheel, thus providing more opportunity for Murphy to insert destructive race conditions. Signed-off-by: Paul E. McKenney --- include/linux/torture.h | 7 +++++ kernel/torture.c | 75 +++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 82 insertions(+) (limited to 'kernel') diff --git a/include/linux/torture.h b/include/linux/torture.h index 7f65bd1dd307..32941f8106b2 100644 --- a/include/linux/torture.h +++ b/include/linux/torture.h @@ -61,6 +61,13 @@ static inline void torture_random_init(struct torture_random_state *trsp) trsp->trs_count = 0; } +/* Definitions for high-resolution-timer sleeps. */ +int torture_hrtimeout_ns(ktime_t baset_ns, u32 fuzzt_ns, struct torture_random_state *trsp); +int torture_hrtimeout_us(u32 baset_us, u32 fuzzt_ns, struct torture_random_state *trsp); +int torture_hrtimeout_ms(u32 baset_ms, u32 fuzzt_us, struct torture_random_state *trsp); +int torture_hrtimeout_jiffies(u32 baset_j, struct torture_random_state *trsp); +int torture_hrtimeout_s(u32 baset_s, u32 fuzzt_ms, struct torture_random_state *trsp); + /* Task shuffler, which causes CPUs to occasionally go idle. 
*/ void torture_shuffle_task_register(struct task_struct *tp); int torture_shuffle_init(long shuffint); diff --git a/kernel/torture.c b/kernel/torture.c index 8562ac18d2eb..7548634e8a89 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -58,6 +58,81 @@ static int verbose; static int fullstop = FULLSTOP_RMMOD; static DEFINE_MUTEX(fullstop_mutex); +/* + * Schedule a high-resolution-timer sleep in nanoseconds, with a 32-bit + * nanosecond random fuzz. This function and its friends desynchronize + * testing from the timer wheel. + */ +int torture_hrtimeout_ns(ktime_t baset_ns, u32 fuzzt_ns, struct torture_random_state *trsp) +{ + ktime_t hto = baset_ns; + + if (trsp) + hto += (torture_random(trsp) >> 3) % fuzzt_ns; + set_current_state(TASK_UNINTERRUPTIBLE); + return schedule_hrtimeout(&hto, HRTIMER_MODE_REL); +} +EXPORT_SYMBOL_GPL(torture_hrtimeout_ns); + +/* + * Schedule a high-resolution-timer sleep in microseconds, with a 32-bit + * nanosecond (not microsecond!) random fuzz. + */ +int torture_hrtimeout_us(u32 baset_us, u32 fuzzt_ns, struct torture_random_state *trsp) +{ + ktime_t baset_ns = baset_us * NSEC_PER_USEC; + + return torture_hrtimeout_ns(baset_ns, fuzzt_ns, trsp); +} +EXPORT_SYMBOL_GPL(torture_hrtimeout_us); + +/* + * Schedule a high-resolution-timer sleep in milliseconds, with a 32-bit + * microsecond (not millisecond!) random fuzz. + */ +int torture_hrtimeout_ms(u32 baset_ms, u32 fuzzt_us, struct torture_random_state *trsp) +{ + ktime_t baset_ns = baset_ms * NSEC_PER_MSEC; + u32 fuzzt_ns; + + if ((u32)~0U / NSEC_PER_USEC < fuzzt_us) + fuzzt_ns = (u32)~0U; + else + fuzzt_ns = fuzzt_us * NSEC_PER_USEC; + return torture_hrtimeout_ns(baset_ns, fuzzt_ns, trsp); +} +EXPORT_SYMBOL_GPL(torture_hrtimeout_ms); + +/* + * Schedule a high-resolution-timer sleep in jiffies, with an + * implied one-jiffy random fuzz. This is intended to replace calls to + * schedule_timeout_interruptible() and friends. + */ +int torture_hrtimeout_jiffies(u32 baset_j, struct torture_random_state *trsp) +{ + ktime_t baset_ns = jiffies_to_nsecs(baset_j); + + return torture_hrtimeout_ns(baset_ns, jiffies_to_nsecs(1), trsp); +} +EXPORT_SYMBOL_GPL(torture_hrtimeout_jiffies); + +/* + * Schedule a high-resolution-timer sleep in milliseconds, with a 32-bit + * millisecond (not second!) random fuzz. + */ +int torture_hrtimeout_s(u32 baset_s, u32 fuzzt_ms, struct torture_random_state *trsp) +{ + ktime_t baset_ns = baset_s * NSEC_PER_SEC; + u32 fuzzt_ns; + + if ((u32)~0U / NSEC_PER_MSEC < fuzzt_ms) + fuzzt_ns = (u32)~0U; + else + fuzzt_ns = fuzzt_ms * NSEC_PER_MSEC; + return torture_hrtimeout_ns(baset_ns, fuzzt_ns, trsp); +} +EXPORT_SYMBOL_GPL(torture_hrtimeout_s); + #ifdef CONFIG_HOTPLUG_CPU /* -- cgit v1.2.3 From ea31fd9ca87399ac4e03cd6c215451fa7dc366e4 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 17 Nov 2020 11:32:54 -0800 Subject: rcutorture: Use torture_hrtimeout_jiffies() to avoid busy-waits Because rcu_torture_writer() and rcu_torture_fakewriter() predate hrtimers, they do timer-wheel-decoupled timed waits by using the timer-wheel-based schedule_timeout_interruptible() functions in conjunction with a random udelay()-based wait. This latter unnecessarily burns CPU time, so this commit instead uses torture_hrtimeout_jiffies() to decouple from the timer wheels without busy-waiting. Signed-off-by: Paul E. 
McKenney --- kernel/rcu/rcutorture.c | 26 +++++++------------------- 1 file changed, 7 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 03bdf6752fe4..9414e3027a04 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1194,10 +1194,7 @@ rcu_torture_writer(void *arg) case RTWS_COND_GET: rcu_torture_writer_state = RTWS_COND_GET; gp_snap = cur_ops->get_gp_state(); - i = torture_random(&rand) % 16; - if (i != 0) - schedule_timeout_interruptible(i); - udelay(torture_random(&rand) % 1000); + torture_hrtimeout_jiffies(torture_random(&rand) % 16, &rand); rcu_torture_writer_state = RTWS_COND_SYNC; cur_ops->cond_sync(gp_snap); rcu_torture_pipe_update(old_rp); @@ -1206,12 +1203,9 @@ rcu_torture_writer(void *arg) rcu_torture_writer_state = RTWS_POLL_GET; gp_snap = cur_ops->start_gp_poll(); rcu_torture_writer_state = RTWS_POLL_WAIT; - while (!cur_ops->poll_gp_state(gp_snap)) { - i = torture_random(&rand) % 16; - if (i != 0) - schedule_timeout_interruptible(i); - udelay(torture_random(&rand) % 1000); - } + while (!cur_ops->poll_gp_state(gp_snap)) + torture_hrtimeout_jiffies(torture_random(&rand) % 16, + &rand); rcu_torture_pipe_update(old_rp); break; case RTWS_SYNC: @@ -1290,7 +1284,6 @@ static int rcu_torture_fakewriter(void *arg) { unsigned long gp_snap; - int i; DEFINE_TORTURE_RANDOM(rand); VERBOSE_TOROUT_STRING("rcu_torture_fakewriter task started"); @@ -1311,19 +1304,14 @@ rcu_torture_fakewriter(void *arg) break; case RTWS_COND_GET: gp_snap = cur_ops->get_gp_state(); - i = torture_random(&rand) % 16; - if (i != 0) - schedule_timeout_interruptible(i); - udelay(torture_random(&rand) % 1000); + torture_hrtimeout_jiffies(torture_random(&rand) % 16, &rand); cur_ops->cond_sync(gp_snap); break; case RTWS_POLL_GET: gp_snap = cur_ops->start_gp_poll(); while (!cur_ops->poll_gp_state(gp_snap)) { - i = torture_random(&rand) % 16; - if (i != 0) - schedule_timeout_interruptible(i); - udelay(torture_random(&rand) % 1000); + torture_hrtimeout_jiffies(torture_random(&rand) % 16, + &rand); } break; case RTWS_SYNC: -- cgit v1.2.3 From ed24affa71f7abf7d81698a99b6c2623491a35b0 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 17 Nov 2020 12:17:42 -0800 Subject: torture: Make stutter use torture_hrtimeout_*() functions This commit saves a few lines of code by making the stutter_wait() and torture_stutter() functions use torture_hrtimeout_jiffies() and torture_hrtimeout_us(). Signed-off-by: Paul E. 
McKenney --- kernel/torture.c | 20 +++++--------------- 1 file changed, 5 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/torture.c b/kernel/torture.c index 7548634e8a89..7ad5c9681580 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -677,7 +677,6 @@ static int stutter_gap; */ bool stutter_wait(const char *title) { - ktime_t delay; unsigned int i = 0; bool ret = false; int spt; @@ -693,11 +692,8 @@ bool stutter_wait(const char *title) schedule_timeout_interruptible(1); } else if (spt == 2) { while (READ_ONCE(stutter_pause_test)) { - if (!(i++ & 0xffff)) { - set_current_state(TASK_INTERRUPTIBLE); - delay = 10 * NSEC_PER_USEC; - schedule_hrtimeout(&delay, HRTIMER_MODE_REL); - } + if (!(i++ & 0xffff)) + torture_hrtimeout_us(10, 0, NULL); cond_resched(); } } else { @@ -715,7 +711,6 @@ EXPORT_SYMBOL_GPL(stutter_wait); */ static int torture_stutter(void *arg) { - ktime_t delay; DEFINE_TORTURE_RANDOM(rand); int wtime; @@ -726,20 +721,15 @@ static int torture_stutter(void *arg) if (stutter > 2) { WRITE_ONCE(stutter_pause_test, 1); wtime = stutter - 3; - delay = ktime_divns(NSEC_PER_SEC * wtime, HZ); - delay += (torture_random(&rand) >> 3) % NSEC_PER_MSEC; - set_current_state(TASK_INTERRUPTIBLE); - schedule_hrtimeout(&delay, HRTIMER_MODE_REL); + torture_hrtimeout_jiffies(wtime, &rand); wtime = 2; } WRITE_ONCE(stutter_pause_test, 2); - delay = ktime_divns(NSEC_PER_SEC * wtime, HZ); - set_current_state(TASK_INTERRUPTIBLE); - schedule_hrtimeout(&delay, HRTIMER_MODE_REL); + torture_hrtimeout_jiffies(wtime, NULL); } WRITE_ONCE(stutter_pause_test, 0); if (!torture_must_stop()) - schedule_timeout_interruptible(stutter_gap); + torture_hrtimeout_jiffies(stutter_gap, NULL); torture_shutdown_absorb("torture_stutter"); } while (!torture_must_stop()); torture_kthread_stopping("torture_stutter"); -- cgit v1.2.3 From 1eba0ef981fd3b5d5e94243aeced8884f43aef50 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 17 Nov 2020 14:12:24 -0800 Subject: rcutorture: Use hrtimers for reader and writer delays This commit replaces schedule_timeout_uninterruptible() and schedule_timeout_interruptible() with torture_hrtimeout_us() and torture_hrtimeout_jiffies() to avoid timer-wheel synchronization. Signed-off-by: Paul E. 
McKenney --- kernel/rcu/rcutorture.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 9414e3027a04..007595d4783f 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1149,7 +1149,7 @@ rcu_torture_writer(void *arg) do { rcu_torture_writer_state = RTWS_FIXED_DELAY; - schedule_timeout_uninterruptible(1); + torture_hrtimeout_us(500, 1000, &rand); rp = rcu_torture_alloc(); if (rp == NULL) continue; @@ -1290,8 +1290,7 @@ rcu_torture_fakewriter(void *arg) set_user_nice(current, MAX_NICE); do { - schedule_timeout_uninterruptible(1 + torture_random(&rand)%10); - udelay(torture_random(&rand) & 0x3ff); + torture_hrtimeout_jiffies(torture_random(&rand) % 10, &rand); if (cur_ops->cb_barrier != NULL && torture_random(&rand) % (nfakewriters * 8) == 0) { cur_ops->cb_barrier(); @@ -1656,7 +1655,7 @@ rcu_torture_reader(void *arg) if (!rcu_torture_one_read(&rand, myid) && !torture_must_stop()) schedule_timeout_interruptible(HZ); if (time_after(jiffies, lastsleep) && !torture_must_stop()) { - schedule_timeout_interruptible(1); + torture_hrtimeout_us(500, 1000, &rand); lastsleep = jiffies + 10; } while (num_online_cpus() < mynumonline && !torture_must_stop()) -- cgit v1.2.3 From 414c116e016584137118067f506125f6ace6128c Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 25 Nov 2020 10:50:35 -0800 Subject: torture: Make refscale throttle high-rate printk()s This commit adds a short delay for verbose_batched-throttled printk()s to further decrease console flooding. Signed-off-by: Paul E. McKenney --- kernel/rcu/refscale.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/refscale.c b/kernel/rcu/refscale.c index 3da246f2ccf5..02dd9767b559 100644 --- a/kernel/rcu/refscale.c +++ b/kernel/rcu/refscale.c @@ -52,8 +52,10 @@ static atomic_t verbose_batch_ctr; do { \ if (verbose && \ (verbose_batched <= 0 || \ - !(atomic_inc_return(&verbose_batch_ctr) % verbose_batched))) \ + !(atomic_inc_return(&verbose_batch_ctr) % verbose_batched))) { \ + schedule_timeout_uninterruptible(1); \ pr_alert("%s" SCALE_FLAG s, scale_type, ## x); \ + } \ } while (0) #define VERBOSE_SCALEOUT_ERRSTRING(s, x...) \ -- cgit v1.2.3 From 8a67a20bf257ca378d6e5588fbe4382966395ac8 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 25 Nov 2020 13:00:04 -0800 Subject: torture: Throttle VERBOSE_TOROUT_*() output This commit adds kernel boot parameters torture.verbose_sleep_frequency and torture.verbose_sleep_duration, which allow VERBOSE_TOROUT_*() output to be throttled with periodic sleeps on large systems. Signed-off-by: Paul E. McKenney --- Documentation/admin-guide/kernel-parameters.txt | 8 ++++++++ include/linux/torture.h | 15 +++++++++++++-- kernel/torture.c | 20 ++++++++++++++++++++ 3 files changed, 41 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 3244f9e04a64..18f3aa759e54 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -5337,6 +5337,14 @@ are running concurrently, especially on systems with rotating-rust storage. + torture.verbose_sleep_frequency= [KNL] + Specifies how many verbose printk()s should be + emitted between each sleep. The default of zero + disables verbose-printk() sleeping. 
+ + torture.verbose_sleep_duration= [KNL] + Duration of each verbose-printk() sleep in jiffies. + tp720= [HW,PS2] tpm_suspend_pcr=[HW,TPM] diff --git a/include/linux/torture.h b/include/linux/torture.h index 32941f8106b2..d62d13c8c69a 100644 --- a/include/linux/torture.h +++ b/include/linux/torture.h @@ -32,9 +32,20 @@ #define TOROUT_STRING(s) \ pr_alert("%s" TORTURE_FLAG " %s\n", torture_type, s) #define VERBOSE_TOROUT_STRING(s) \ - do { if (verbose) pr_alert("%s" TORTURE_FLAG " %s\n", torture_type, s); } while (0) +do { \ + if (verbose) { \ + verbose_torout_sleep(); \ + pr_alert("%s" TORTURE_FLAG " %s\n", torture_type, s); \ + } \ +} while (0) #define VERBOSE_TOROUT_ERRSTRING(s) \ - do { if (verbose) pr_alert("%s" TORTURE_FLAG "!!! %s\n", torture_type, s); } while (0) +do { \ + if (verbose) { \ + verbose_torout_sleep(); \ + pr_alert("%s" TORTURE_FLAG "!!! %s\n", torture_type, s); \ + } \ +} while (0) +void verbose_torout_sleep(void); /* Definitions for online/offline exerciser. */ typedef void torture_ofl_func(void); diff --git a/kernel/torture.c b/kernel/torture.c index 7ad5c9681580..93eeeb2b3d88 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -48,6 +48,12 @@ module_param(disable_onoff_at_boot, bool, 0444); static bool ftrace_dump_at_shutdown; module_param(ftrace_dump_at_shutdown, bool, 0444); +static int verbose_sleep_frequency; +module_param(verbose_sleep_frequency, int, 0444); + +static int verbose_sleep_duration = 1; +module_param(verbose_sleep_duration, int, 0444); + static char *torture_type; static int verbose; @@ -58,6 +64,20 @@ static int verbose; static int fullstop = FULLSTOP_RMMOD; static DEFINE_MUTEX(fullstop_mutex); +static atomic_t verbose_sleep_counter; + +/* + * Sleep if needed from VERBOSE_TOROUT*(). + */ +void verbose_torout_sleep(void) +{ + if (verbose_sleep_frequency > 0 && + verbose_sleep_duration > 0 && + !(atomic_inc_return(&verbose_sleep_counter) % verbose_sleep_frequency)) + schedule_timeout_uninterruptible(verbose_sleep_duration); +} +EXPORT_SYMBOL_GPL(verbose_torout_sleep); + /* * Schedule a high-resolution-timer sleep in nanoseconds, with a 32-bit * nanosecond random fuzz. This function and its friends desynchronize -- cgit v1.2.3 From edf7b8417834c89d00ef88355ea507b0b0a630ae Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 2 Dec 2020 17:52:07 -0800 Subject: rcutorture: Make object_debug also double call_rcu() heap object This commit provides a test for call_rcu() printing the allocation address of a double-freed callback by double-freeing a callback allocated via kmalloc(). However, this commit does not depend on any other commit. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 007595d4783f..76c838696366 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -2782,6 +2782,7 @@ static void rcu_test_debug_objects(void) #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD struct rcu_head rh1; struct rcu_head rh2; + struct rcu_head *rhp = kmalloc(sizeof(*rhp), GFP_KERNEL); init_rcu_head_on_stack(&rh1); init_rcu_head_on_stack(&rh2); @@ -2794,6 +2795,10 @@ static void rcu_test_debug_objects(void) local_irq_disable(); /* Make it harder to start a new grace period. */ call_rcu(&rh2, rcu_torture_leak_cb); call_rcu(&rh2, rcu_torture_err_cb); /* Duplicate callback. */ + if (rhp) { + call_rcu(rhp, rcu_torture_leak_cb); + call_rcu(rhp, rcu_torture_err_cb); /* Another duplicate callback. 
*/ + } local_irq_enable(); rcu_read_unlock(); preempt_enable(); -- cgit v1.2.3 From 0b962c8fe0e5c72a252b236814a6b6e9df799061 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 19 Dec 2020 07:05:58 -0800 Subject: torture: Clean up after torture-test CPU hotplugging This commit puts all CPUs back online at the end of a torture test, and also unconditionally puts them online at the beginning of the test, rather than just in the case of built-in tests. This allows torture tests to behave in a predictable manner, whether built-in or based on modules. Signed-off-by: Paul E. McKenney --- kernel/torture.c | 36 ++++++++++++++++++++++-------------- 1 file changed, 22 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/torture.c b/kernel/torture.c index 93eeeb2b3d88..507a20be6950 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -291,6 +291,26 @@ bool torture_online(int cpu, long *n_onl_attempts, long *n_onl_successes, } EXPORT_SYMBOL_GPL(torture_online); +/* + * Get everything online at the beginning and ends of tests. + */ +static void torture_online_all(char *phase) +{ + int cpu; + int ret; + + for_each_possible_cpu(cpu) { + if (cpu_online(cpu)) + continue; + ret = add_cpu(cpu); + if (ret && verbose) { + pr_alert("%s" TORTURE_FLAG + "%s: %s online %d: errno %d\n", + __func__, phase, torture_type, cpu, ret); + } + } +} + /* * Execute random CPU-hotplug operations at the interval specified * by the onoff_interval. @@ -301,25 +321,12 @@ torture_onoff(void *arg) int cpu; int maxcpu = -1; DEFINE_TORTURE_RANDOM(rand); - int ret; VERBOSE_TOROUT_STRING("torture_onoff task started"); for_each_online_cpu(cpu) maxcpu = cpu; WARN_ON(maxcpu < 0); - if (!IS_MODULE(CONFIG_TORTURE_TEST)) { - for_each_possible_cpu(cpu) { - if (cpu_online(cpu)) - continue; - ret = add_cpu(cpu); - if (ret && verbose) { - pr_alert("%s" TORTURE_FLAG - "%s: Initial online %d: errno %d\n", - __func__, torture_type, cpu, ret); - } - } - } - + torture_online_all("Initial"); if (maxcpu == 0) { VERBOSE_TOROUT_STRING("Only one CPU, so CPU-hotplug testing is disabled"); goto stop; @@ -347,6 +354,7 @@ torture_onoff(void *arg) stop: torture_kthread_stopping("torture_onoff"); + torture_online_all("Final"); return 0; } -- cgit v1.2.3 From 1afb95fee0342b8d9e05b0433e8e44a6dfd7c4a3 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 19 Dec 2020 07:34:35 -0800 Subject: torture: Maintain torture-specific set of CPUs-online books The TREE01 rcutorture scenario intentionally creates confusion as to the number of available CPUs by specifying the "maxcpus=8 nr_cpus=43" kernel boot parameters. This can disable rcutorture's load shedding, which currently uses num_online_cpus(), which would count the extra 35 CPUs. However, the rcutorture guest OS will be provisioned with only 8 CPUs, which means that rcutorture will present full load even when all but one of the original 8 CPUs are offline. This can result in spurious errors due to extreme overloading of that single remaining CPU. This commit therefore keeps a separate set of books on the number of usable online CPUs, so that torture_num_online_cpus() is used for load shedding instead of num_online_cpus(). Note that initial sizing must use num_online_cpus() because torture_num_online_cpus() will return NR_CPUS until shortly after torture_onoff_init() is invoked. Reported-by: Frederic Weisbecker [ paulmck: Apply feedback from kernel test robot. ] Signed-off-by: Paul E. 
McKenney --- include/linux/torture.h | 5 +++++ kernel/rcu/rcutorture.c | 4 ++-- kernel/torture.c | 16 ++++++++++++++++ 3 files changed, 23 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/include/linux/torture.h b/include/linux/torture.h index d62d13c8c69a..0910c5803f35 100644 --- a/include/linux/torture.h +++ b/include/linux/torture.h @@ -48,6 +48,11 @@ do { \ void verbose_torout_sleep(void); /* Definitions for online/offline exerciser. */ +#ifdef CONFIG_HOTPLUG_CPU +int torture_num_online_cpus(void); +#else /* #ifdef CONFIG_HOTPLUG_CPU */ +static inline int torture_num_online_cpus(void) { return 1; } +#endif /* #else #ifdef CONFIG_HOTPLUG_CPU */ typedef void torture_ofl_func(void); bool torture_offline(int cpu, long *n_onl_attempts, long *n_onl_successes, unsigned long *sum_offl, int *min_onl, int *max_onl); diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 76c838696366..a816df4e86e0 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1338,7 +1338,7 @@ static void rcu_torture_reader_do_mbchk(long myid, struct rcu_torture *rtp, struct torture_random_state *trsp) { unsigned long loops; - int noc = num_online_cpus(); + int noc = torture_num_online_cpus(); int rdrchked; int rdrchker; struct rcu_torture_reader_check *rtrcp; // Me. @@ -1658,7 +1658,7 @@ rcu_torture_reader(void *arg) torture_hrtimeout_us(500, 1000, &rand); lastsleep = jiffies + 10; } - while (num_online_cpus() < mynumonline && !torture_must_stop()) + while (torture_num_online_cpus() < mynumonline && !torture_must_stop()) schedule_timeout_interruptible(HZ / 5); stutter_wait("rcu_torture_reader"); } while (!torture_must_stop()); diff --git a/kernel/torture.c b/kernel/torture.c index 507a20be6950..01e336f1e5b2 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -175,6 +175,19 @@ static unsigned long sum_online; static int min_online = -1; static int max_online; +static int torture_online_cpus = NR_CPUS; + +/* + * Some torture testing leverages confusion as to the number of online + * CPUs. This function returns the torture-testing view of this number, + * which allows torture tests to load-balance appropriately. + */ +int torture_num_online_cpus(void) +{ + return READ_ONCE(torture_online_cpus); +} +EXPORT_SYMBOL_GPL(torture_num_online_cpus); + /* * Attempt to take a CPU offline. Return false if the CPU is already * offline or if it is not subject to CPU-hotplug operations. The @@ -229,6 +242,8 @@ bool torture_offline(int cpu, long *n_offl_attempts, long *n_offl_successes, *min_offl = delta; if (*max_offl < delta) *max_offl = delta; + WRITE_ONCE(torture_online_cpus, torture_online_cpus - 1); + WARN_ON_ONCE(torture_online_cpus <= 0); } return true; @@ -285,6 +300,7 @@ bool torture_online(int cpu, long *n_onl_attempts, long *n_onl_successes, *min_onl = delta; if (*max_onl < delta) *max_onl = delta; + WRITE_ONCE(torture_online_cpus, torture_online_cpus + 1); } return true; -- cgit v1.2.3 From 35d0b389f3b23439ad15b610d6e43fc72fc75779 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 5 Jan 2021 11:32:43 -0700 Subject: task_work: unconditionally run task_work from get_signal() Song reported a boot regression in a kvm image with 5.11-rc, and bisected it down to the below patch. Debugging this issue, turns out that the boot stalled when a task is waiting on a pipe being released. As we no longer run task_work from get_signal() unless it's queued with TWA_SIGNAL, the task goes idle without running the task_work. 
This prevents ->release() from being called on the pipe, which another boot task is waiting on. For now, re-instate the unconditional task_work run from get_signal(). For 5.12, we'll collapse TWA_RESUME and TWA_SIGNAL, as it no longer makes sense to have a distinction between the two. This will turn task_work notification into a simple boolean, whether to notify or not. Fixes: 98b89b649fce ("signal: kill JOBCTL_TASK_WORK") Reported-by: Song Liu Tested-by: John Stultz Tested-by: Douglas Anderson Tested-by: Sedat Dilek # LLVM/Clang version 11.0.1 Signed-off-by: Jens Axboe --- kernel/signal.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 5736c55aaa1a..6b9c431da08f 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2550,6 +2550,9 @@ bool get_signal(struct ksignal *ksig) struct signal_struct *signal = current->signal; int signr; + if (unlikely(current->task_works)) + task_work_run(); + /* * For non-generic architectures, check for TIF_NOTIFY_SIGNAL so * that the arch handlers don't all have to do it. If we get here -- cgit v1.2.3 From 619775c3cfd2bc8559abc4395bf7d85b72bd723f Mon Sep 17 00:00:00 2001 From: Leah Neukirchen Date: Wed, 16 Dec 2020 11:03:06 +0100 Subject: bpf: Remove unnecessary include from preload/iterators This program does not use argp (which is a glibcism). Instead include directly, which was pulled in by . Signed-off-by: Leah Neukirchen Signed-off-by: Daniel Borkmann Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20201216100306.30942-1-leah@vuxu.org Signed-off-by: Alexei Starovoitov --- kernel/bpf/preload/iterators/iterators.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/preload/iterators/iterators.c b/kernel/bpf/preload/iterators/iterators.c index b7ff87939172..5d872a705470 100644 --- a/kernel/bpf/preload/iterators/iterators.c +++ b/kernel/bpf/preload/iterators/iterators.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* Copyright (c) 2020 Facebook */ -#include +#include #include #include #include -- cgit v1.2.3 From adc5d8757288a3a5628436d16e78fb696d802e39 Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Mon, 7 Dec 2020 01:02:52 +0100 Subject: signal: Add missing __user annotation to copy_siginfo_from_user_any copy_siginfo_from_user_any() takes a userspace pointer as second argument; annotate the parameter type accordingly. Signed-off-by: Jann Horn Link: https://lore.kernel.org/r/20201207000252.138564-1-jannh@google.com Signed-off-by: Christian Brauner --- kernel/signal.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 5736c55aaa1a..546b860c6514 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -3701,7 +3701,8 @@ static bool access_pidfd_pidns(struct pid *pid) return true; } -static int copy_siginfo_from_user_any(kernel_siginfo_t *kinfo, siginfo_t *info) +static int copy_siginfo_from_user_any(kernel_siginfo_t *kinfo, + siginfo_t __user *info) { #ifdef CONFIG_COMPAT /* -- cgit v1.2.3 From 96e1e9846c6691f90009ae4d8e486e0ce5c628a7 Mon Sep 17 00:00:00 2001 From: Alexander Guril Date: Sat, 26 Dec 2020 12:40:21 +0100 Subject: Kernel: fork.c: Fix coding style: Do not use {} around single-line statements Fixed two coding style issues in kernel/fork.c Do not use {} around single-line statements. 
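As a standalone illustration of that rule (a made-up helper, not code taken from fork.c): when the body of a loop or conditional is a single statement, the braces are simply dropped.

/* Illustrative sketch only; the function and its array are hypothetical. */
static void zero_limits(unsigned long *limits, int n)
{
        int i;

        for (i = 0; i < n; i++)
                limits[i] = 0;  /* single-statement body, so no braces */
}
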
Cc: linux-kernel@vger.kernel.org Acked-by: Christian Brauner Signed-off-by: Alexander Guril Link: https://lore.kernel.org/r/20201226114021.2589-1-alexander.guril02@gmail.com Signed-off-by: Christian Brauner --- kernel/fork.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 37720a6d04ea..d66cd1014211 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -819,9 +819,8 @@ void __init fork_init(void) init_task.signal->rlim[RLIMIT_SIGPENDING] = init_task.signal->rlim[RLIMIT_NPROC]; - for (i = 0; i < UCOUNT_COUNTS; i++) { + for (i = 0; i < UCOUNT_COUNTS; i++) init_user_ns.ucount_max[i] = max_threads/2; - } #ifdef CONFIG_VMAP_STACK cpuhp_setup_state(CPUHP_BP_PREPARE_DYN, "fork:vm_stack_cache", @@ -1654,9 +1653,8 @@ static inline void init_task_pid_links(struct task_struct *task) { enum pid_type type; - for (type = PIDTYPE_PID; type < PIDTYPE_MAX; ++type) { + for (type = PIDTYPE_PID; type < PIDTYPE_MAX; ++type) INIT_HLIST_NODE(&task->pid_links[type]); - } } static inline void -- cgit v1.2.3 From cb5021ca622fe83923e0789f99fe7227cbcd3f68 Mon Sep 17 00:00:00 2001 From: Yanfei Xu Date: Mon, 11 Jan 2021 18:48:07 +0800 Subject: kthread: remove comments about old _do_fork() helper The old _do_fork() helper has been removed in favor of kernel_clone(). Here correct some comments which still contain _do_fork() Link: https://lore.kernel.org/r/20210111104807.18022-1-yanfei.xu@windriver.com Cc: christian@brauner.io Cc: linux-kernel@vger.kernel.org Acked-by: Christian Brauner Signed-off-by: Yanfei Xu Signed-off-by: Christian Brauner --- include/trace/events/sched.h | 2 +- kernel/kthread.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index 5039af667645..cbe3e152d24c 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -366,7 +366,7 @@ TRACE_EVENT(sched_process_wait, ); /* - * Tracepoint for do_fork: + * Tracepoint for kernel_clone: */ TRACE_EVENT(sched_process_fork, diff --git a/kernel/kthread.c b/kernel/kthread.c index a5eceecd4513..fb9c3dcbb68d 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -294,7 +294,7 @@ static int kthread(void *_create) do_exit(ret); } -/* called from do_fork() to get node information for about to be created task */ +/* called from kernel_clone() to get node information for about to be created task */ int tsk_fork_get_node(struct task_struct *tsk) { #ifdef CONFIG_NUMA -- cgit v1.2.3 From 7bb83f6fc4ee84e95d0ac0d14452c2619fb3fe70 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Fri, 8 Jan 2021 13:19:38 +0900 Subject: tracing/kprobes: Do the notrace functions check without kprobes on ftrace Enable the notrace function check on the architecture which doesn't support kprobes on ftrace but support dynamic ftrace. This notrace function check is not only for the kprobes on ftrace but also sw-breakpoint based kprobes. Thus there is no reason to limit this check for the arch which supports kprobes on ftrace. This also changes the dependency of Kconfig. Because kprobe event uses the function tracer's address list for identifying notrace function, if the CONFIG_DYNAMIC_FTRACE=n, it can not check whether the target function is notrace or not. 
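The shape of the change is a compile-time gate: the notrace check is built whenever dynamic ftrace can supply the list of traceable functions, and stubbed out otherwise. The following standalone sketch shows that gating pattern; the macro values and the placeholder predicate are invented for illustration and are not the kernel's implementation.

#include <stdbool.h>
#include <stdio.h>

/* Pretend Kconfig results; in the kernel these come from the build system. */
#define CONFIG_DYNAMIC_FTRACE 1
/* CONFIG_KPROBE_EVENTS_ON_NOTRACE intentionally left undefined. */

#if defined(CONFIG_DYNAMIC_FTRACE) && !defined(CONFIG_KPROBE_EVENTS_ON_NOTRACE)
/* With an address list available, a probe on a notrace function can be
 * detected and refused; this stub predicate stands in for the real lookup. */
static bool within_notrace_func(unsigned long addr)
{
        return addr == 0xdead;
}
#else
/* Without dynamic ftrace there is no list to consult, so never refuse. */
static bool within_notrace_func(unsigned long addr)
{
        return false;
}
#endif

int main(void)
{
        printf("probe at 0xbeef refused: %d\n", within_notrace_func(0xbeef));
        printf("probe at 0xdead refused: %d\n", within_notrace_func(0xdead));
        return 0;
}
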
Link: https://lkml.kernel.org/r/20210105065730.2634785-1-naveen.n.rao@linux.vnet.ibm.com Link: https://lkml.kernel.org/r/161007957862.114704.4512260007555399463.stgit@devnote2 Cc: stable@vger.kernel.org Fixes: 45408c4f92506 ("tracing: kprobes: Prohibit probing on notrace function") Acked-by: Naveen N. Rao Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/Kconfig | 2 +- kernel/trace/trace_kprobe.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index d5a19413d4f8..c1a62ae7e812 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -538,7 +538,7 @@ config KPROBE_EVENTS config KPROBE_EVENTS_ON_NOTRACE bool "Do NOT protect notrace function from kprobe events" depends on KPROBE_EVENTS - depends on KPROBES_ON_FTRACE + depends on DYNAMIC_FTRACE default n help This is only for the developers who want to debug ftrace itself diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 9c31f42245e9..e6fba1798771 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -434,7 +434,7 @@ static int disable_trace_kprobe(struct trace_event_call *call, return 0; } -#if defined(CONFIG_KPROBES_ON_FTRACE) && \ +#if defined(CONFIG_DYNAMIC_FTRACE) && \ !defined(CONFIG_KPROBE_EVENTS_ON_NOTRACE) static bool __within_notrace_func(unsigned long addr) { -- cgit v1.2.3 From 5541075a348b6ca6ac668653f7d2c423ae8e00b6 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Mon, 11 Jan 2021 20:16:50 +0100 Subject: bpf: Prevent double bpf_prog_put call from bpf_tracing_prog_attach MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The bpf_tracing_prog_attach error path calls bpf_prog_put on prog, which causes refcount underflow when it's called from link_create function. link_create prog = bpf_prog_get <-- get ... tracing_bpf_link_attach(prog.. bpf_tracing_prog_attach(prog.. out_put_prog: bpf_prog_put(prog); <-- put if (ret < 0) bpf_prog_put(prog); <-- put Removing bpf_prog_put call from bpf_tracing_prog_attach and making sure its callers call it instead. 
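Reduced to a minimal standalone sketch (assumed names, a plain integer in place of the prog refcount), the error path decrements twice for a single get, leaving the owner's reference consumed:

	#include <stdio.h>

	static int prog_refcnt = 1;		/* reference held by the prog's owner (its fd) */

	static void prog_put(void)
	{
		prog_refcnt--;			/* stand-in for bpf_prog_put() */
	}

	static int attach(void)
	{
		/* ... attaching fails ... */
		prog_put();			/* callee drops the reference on its error path */
		return -1;
	}

	int main(void)
	{
		prog_refcnt++;			/* stand-in for bpf_prog_get() in link_create */
		if (attach() < 0)
			prog_put();		/* caller drops it again on the same error */
		printf("refcnt = %d\n", prog_refcnt);	/* prints 0: the owner's ref is gone */
		return 0;
	}

With the change, only the caller releases the reference it took, so the count returns to its pre-call value when attaching fails.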
Fixes: 4a1e7c0c63e0 ("bpf: Support attaching freplace programs to multiple attach points") Signed-off-by: Jiri Olsa Signed-off-by: Daniel Borkmann Acked-by: Toke Høiland-Jørgensen Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210111191650.1241578-1-jolsa@kernel.org --- kernel/bpf/syscall.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index c3bb03c8371f..e5999d86c76e 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2712,7 +2712,6 @@ out_unlock: out_put_prog: if (tgt_prog_fd && tgt_prog) bpf_prog_put(tgt_prog); - bpf_prog_put(prog); return err; } @@ -2825,7 +2824,10 @@ static int bpf_raw_tracepoint_open(const union bpf_attr *attr) tp_name = prog->aux->attach_func_name; break; } - return bpf_tracing_prog_attach(prog, 0, 0); + err = bpf_tracing_prog_attach(prog, 0, 0); + if (err >= 0) + return err; + goto out_put_prog; case BPF_PROG_TYPE_RAW_TRACEPOINT: case BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE: if (strncpy_from_user(buf, -- cgit v1.2.3 From 1a9c72ad4c26821e215a396167c14959cf24a7f1 Mon Sep 17 00:00:00 2001 From: KP Singh Date: Tue, 12 Jan 2021 07:55:24 +0000 Subject: bpf: Local storage helpers should check nullness of owner ptr passed The verifier allows ARG_PTR_TO_BTF_ID helper arguments to be NULL, so helper implementations need to check this before dereferencing them. This was already fixed for the socket storage helpers but not for task and inode. The issue can be reproduced by attaching an LSM program to inode_rename hook (called when moving files) which tries to get the inode of the new file without checking for its nullness and then trying to move an existing file to a new path: mv existing_file new_file_does_not_exist The report including the sample program and the steps for reproducing the bug: https://lore.kernel.org/bpf/CANaYP3HWkH91SN=wTNO9FL_2ztHfqcXKX38SSE-JJ2voh+vssw@mail.gmail.com Fixes: 4cf1bc1f1045 ("bpf: Implement task local storage") Fixes: 8ea636848aca ("bpf: Implement bpf_local_storage for inodes") Reported-by: Gilad Reti Signed-off-by: KP Singh Signed-off-by: Daniel Borkmann Acked-by: Martin KaFai Lau Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20210112075525.256820-3-kpsingh@kernel.org --- kernel/bpf/bpf_inode_storage.c | 5 ++++- kernel/bpf/bpf_task_storage.c | 5 ++++- 2 files changed, 8 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/bpf_inode_storage.c b/kernel/bpf/bpf_inode_storage.c index 6edff97ad594..dbc1dbdd2cbf 100644 --- a/kernel/bpf/bpf_inode_storage.c +++ b/kernel/bpf/bpf_inode_storage.c @@ -176,7 +176,7 @@ BPF_CALL_4(bpf_inode_storage_get, struct bpf_map *, map, struct inode *, inode, * bpf_local_storage_update expects the owner to have a * valid storage pointer. */ - if (!inode_storage_ptr(inode)) + if (!inode || !inode_storage_ptr(inode)) return (unsigned long)NULL; sdata = inode_storage_lookup(inode, map, true); @@ -200,6 +200,9 @@ BPF_CALL_4(bpf_inode_storage_get, struct bpf_map *, map, struct inode *, inode, BPF_CALL_2(bpf_inode_storage_delete, struct bpf_map *, map, struct inode *, inode) { + if (!inode) + return -EINVAL; + /* This helper must only called from where the inode is gurranteed * to have a refcount and cannot be freed. 
*/ diff --git a/kernel/bpf/bpf_task_storage.c b/kernel/bpf/bpf_task_storage.c index 4ef1959a78f2..e0da0258b732 100644 --- a/kernel/bpf/bpf_task_storage.c +++ b/kernel/bpf/bpf_task_storage.c @@ -218,7 +218,7 @@ BPF_CALL_4(bpf_task_storage_get, struct bpf_map *, map, struct task_struct *, * bpf_local_storage_update expects the owner to have a * valid storage pointer. */ - if (!task_storage_ptr(task)) + if (!task || !task_storage_ptr(task)) return (unsigned long)NULL; sdata = task_storage_lookup(task, map, true); @@ -243,6 +243,9 @@ BPF_CALL_4(bpf_task_storage_get, struct bpf_map *, map, struct task_struct *, BPF_CALL_2(bpf_task_storage_delete, struct bpf_map *, map, struct task_struct *, task) { + if (!task) + return -EINVAL; + /* This helper must only be called from places where the lifetime of the task * is guaranteed. Either by being refcounted or by being protected * by an RCU read-side critical section. -- cgit v1.2.3 From 84d571d46c7046a957ff3d1c916a1b9dcc7f1ce8 Mon Sep 17 00:00:00 2001 From: KP Singh Date: Tue, 12 Jan 2021 07:55:25 +0000 Subject: bpf: Fix typo in bpf_inode_storage.c Fix "gurranteed" -> "guaranteed" in bpf_inode_storage.c Suggested-by: Andrii Nakryiko Signed-off-by: KP Singh Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20210112075525.256820-4-kpsingh@kernel.org --- kernel/bpf/bpf_inode_storage.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/bpf_inode_storage.c b/kernel/bpf/bpf_inode_storage.c index dbc1dbdd2cbf..2f0597320b6d 100644 --- a/kernel/bpf/bpf_inode_storage.c +++ b/kernel/bpf/bpf_inode_storage.c @@ -183,7 +183,7 @@ BPF_CALL_4(bpf_inode_storage_get, struct bpf_map *, map, struct inode *, inode, if (sdata) return (unsigned long)sdata->data; - /* This helper must only called from where the inode is gurranteed + /* This helper must only called from where the inode is guaranteed * to have a refcount and cannot be freed. */ if (flags & BPF_LOCAL_STORAGE_GET_F_CREATE) { @@ -203,7 +203,7 @@ BPF_CALL_2(bpf_inode_storage_delete, if (!inode) return -EINVAL; - /* This helper must only called from where the inode is gurranteed + /* This helper must only called from where the inode is guaranteed * to have a refcount and cannot be freed. */ return inode_storage_delete(inode, map); -- cgit v1.2.3 From 4be34f3d0731b38a1b24566b37fbb39500aaf3a2 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Tue, 12 Jan 2021 08:28:29 -0800 Subject: bpf: Don't leak memory in bpf getsockopt when optlen == 0 optlen == 0 indicates that the kernel should ignore BPF buffer and use the original one from the user. We, however, forget to free the temporary buffer that we've allocated for BPF. 
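The pattern, as a minimal standalone sketch with assumed names (not the kernel code): the temporary buffer is handed over to the caller only on the optlen != 0 path, so the optlen == 0 path has to release it itself:

	#include <stdlib.h>

	struct sockopt_ctx {
		void *optval;	/* temporary buffer the BPF program wrote into */
		int optlen;	/* set to 0 by the program to mean "use the user's buffer" */
	};

	static int getsockopt_sketch(struct sockopt_ctx *ctx, void **kernel_optval)
	{
		/* ... BPF program has already run against ctx->optval ... */

		if (ctx->optlen != 0) {
			*kernel_optval = ctx->optval;	/* exported: caller now owns and frees it */
			return 0;
		}

		/*
		 * Nothing was exported; without this free() the temporary
		 * buffer leaks, which is the bug being fixed here.
		 */
		free(ctx->optval);
		ctx->optval = NULL;
		return 0;
	}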
Fixes: d8fe449a9c51 ("bpf: Don't return EINVAL from {get,set}sockopt when optlen > PAGE_SIZE") Reported-by: Martin KaFai Lau Signed-off-by: Stanislav Fomichev Signed-off-by: Daniel Borkmann Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20210112162829.775079-1-sdf@google.com --- kernel/bpf/cgroup.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 6ec088a96302..96555a8a2c54 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -1391,12 +1391,13 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level, if (ctx.optlen != 0) { *optlen = ctx.optlen; *kernel_optval = ctx.optval; + /* export and don't free sockopt buf */ + return 0; } } out: - if (ret) - sockopt_free_buf(&ctx); + sockopt_free_buf(&ctx); return ret; } -- cgit v1.2.3 From 0afda3a888dccf12557b41ef42eee942327d122b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 6 Jan 2021 15:36:22 +0100 Subject: locking/lockdep: Cure noinstr fail When the compiler doesn't feel like inlining, it causes a noinstr fail: vmlinux.o: warning: objtool: lock_is_held_type()+0xb: call to lockdep_enabled() leaves .noinstr.text section Fixes: 4d004099a668 ("lockdep: Fix lockdep recursion") Reported-by: Randy Dunlap Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20210106144017.592595176@infradead.org --- kernel/locking/lockdep.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index c1418b47f625..02bc5b8f1eb2 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -79,7 +79,7 @@ module_param(lock_stat, int, 0644); DEFINE_PER_CPU(unsigned int, lockdep_recursion); EXPORT_PER_CPU_SYMBOL_GPL(lockdep_recursion); -static inline bool lockdep_enabled(void) +static __always_inline bool lockdep_enabled(void) { if (!debug_locks) return false; -- cgit v1.2.3 From 77ca93a6b1223e210e58e1000c09d8d420403c94 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 6 Jan 2021 15:36:23 +0100 Subject: locking/lockdep: Avoid noinstr warning for DEBUG_LOCKDEP vmlinux.o: warning: objtool: lock_is_held_type()+0x60: call to check_flags.part.0() leaves .noinstr.text section Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20210106144017.652218215@infradead.org --- kernel/locking/lockdep.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 02bc5b8f1eb2..bdaf4829098c 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -5271,12 +5271,15 @@ static void __lock_unpin_lock(struct lockdep_map *lock, struct pin_cookie cookie /* * Check whether we follow the irq-flags state precisely: */ -static void check_flags(unsigned long flags) +static noinstr void check_flags(unsigned long flags) { #if defined(CONFIG_PROVE_LOCKING) && defined(CONFIG_DEBUG_LOCKDEP) if (!debug_locks) return; + /* Get the warning out.. 
*/ + instrumentation_begin(); + if (irqs_disabled_flags(flags)) { if (DEBUG_LOCKS_WARN_ON(lockdep_hardirqs_enabled())) { printk("possible reason: unannotated irqs-off.\n"); @@ -5304,6 +5307,8 @@ static void check_flags(unsigned long flags) if (!debug_locks) print_irqtrace_events(current); + + instrumentation_end(); #endif } -- cgit v1.2.3 From bcc5e6162d66d44f7929f30fce032f95855fc8b4 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Sat, 9 Jan 2021 23:03:40 -0800 Subject: bpf: Allow empty module BTFs Some modules don't declare any new types and end up with an empty BTF, containing only valid BTF header and no types or strings sections. This currently causes BTF validation error. There is nothing wrong with such BTF, so fix the issue by allowing module BTFs with no types or strings. Fixes: 36e68442d1af ("bpf: Load and verify kernel module BTFs") Reported-by: Christopher William Snowhill Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20210110070341.1380086-1-andrii@kernel.org --- kernel/bpf/btf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 8d6bdb4f4d61..84a36ee4a4c2 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -4172,7 +4172,7 @@ static int btf_parse_hdr(struct btf_verifier_env *env) return -ENOTSUPP; } - if (btf_data_size == hdr->hdr_len) { + if (!btf->base_btf && btf_data_size == hdr->hdr_len) { btf_verifier_log(env, "No data"); return -EINVAL; } -- cgit v1.2.3 From aba428a0c612bb259891307da12e22efd0fab14c Mon Sep 17 00:00:00 2001 From: Chunguang Xu Date: Tue, 1 Dec 2020 17:52:31 +0800 Subject: timekeeping: Remove unused get_seconds() The get_seconds() cleanup seems to have been completed, now it is time to delete the legacy interface to avoid misuse later. Signed-off-by: Chunguang Xu Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/1606816351-26900-1-git-send-email-brookxu@tencent.com --- include/linux/ktime.h | 1 - include/linux/timekeeping32.h | 14 -------------- kernel/time/timekeeping.c | 3 +-- 3 files changed, 1 insertion(+), 17 deletions(-) delete mode 100644 include/linux/timekeeping32.h (limited to 'kernel') diff --git a/include/linux/ktime.h b/include/linux/ktime.h index a12b5523cc18..73f20deb497d 100644 --- a/include/linux/ktime.h +++ b/include/linux/ktime.h @@ -230,6 +230,5 @@ static inline ktime_t ms_to_ktime(u64 ms) } # include -# include #endif diff --git a/include/linux/timekeeping32.h b/include/linux/timekeeping32.h deleted file mode 100644 index 266017fc9ee9..000000000000 --- a/include/linux/timekeeping32.h +++ /dev/null @@ -1,14 +0,0 @@ -#ifndef _LINUX_TIMEKEEPING32_H -#define _LINUX_TIMEKEEPING32_H -/* - * These interfaces are all based on the old timespec type - * and should get replaced with the timespec64 based versions - * over time so we can remove the file here. - */ - -static inline unsigned long get_seconds(void) -{ - return ktime_get_real_seconds(); -} - -#endif diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index a45cedda93a7..6aee5768c86f 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -991,8 +991,7 @@ EXPORT_SYMBOL_GPL(ktime_get_seconds); /** * ktime_get_real_seconds - Get the seconds portion of CLOCK_REALTIME * - * Returns the wall clock seconds since 1970. This replaces the - * get_seconds() interface which is not y2038 safe on 32bit systems. + * Returns the wall clock seconds since 1970. 
* * For 64bit systems the fast access to tk->xtime_sec is preserved. On * 32bit systems the access must be protected with the sequence -- cgit v1.2.3 From e3fab2f3de081e98c50b7b4ace1b040161d95310 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Mon, 11 Jan 2021 11:39:56 +0100 Subject: ntp: Fix RTC synchronization on 32-bit platforms Due to an integer overflow, RTC synchronization now happens every 2s instead of the intended 11 minutes. Fix this by forcing 64-bit arithmetic for the sync period calculation. Annotate the other place which multiplies seconds for consistency as well. Fixes: c9e6189fb03123a7 ("ntp: Make the RTC synchronization more reliable") Signed-off-by: Geert Uytterhoeven Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20210111103956.290378-1-geert+renesas@glider.be --- kernel/time/ntp.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 7404d3831527..87389b9e21ab 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -498,7 +498,7 @@ out: static void sync_hw_clock(struct work_struct *work); static DECLARE_WORK(sync_work, sync_hw_clock); static struct hrtimer sync_hrtimer; -#define SYNC_PERIOD_NS (11UL * 60 * NSEC_PER_SEC) +#define SYNC_PERIOD_NS (11ULL * 60 * NSEC_PER_SEC) static enum hrtimer_restart sync_timer_callback(struct hrtimer *timer) { @@ -512,7 +512,7 @@ static void sched_sync_hw_clock(unsigned long offset_nsec, bool retry) ktime_t exp = ktime_set(ktime_get_real_seconds(), 0); if (retry) - exp = ktime_add_ns(exp, 2 * NSEC_PER_SEC - offset_nsec); + exp = ktime_add_ns(exp, 2ULL * NSEC_PER_SEC - offset_nsec); else exp = ktime_add_ns(exp, SYNC_PERIOD_NS - offset_nsec); -- cgit v1.2.3 From 28a8add64181059034b7f281491132112cd95bb4 Mon Sep 17 00:00:00 2001 From: Brendan Jackman Date: Tue, 12 Jan 2021 12:39:13 +0000 Subject: bpf: Fix a verifier message for alloc size helper arg The error message here is misleading, the argument will be rejected unless it is a known constant. Signed-off-by: Brendan Jackman Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210112123913.2016804-1-jackmanb@google.com --- kernel/bpf/verifier.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 17270b8404f1..5534e667bdb1 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -4319,7 +4319,7 @@ skip_type_check: err = mark_chain_precision(env, regno); } else if (arg_type_is_alloc_size(arg_type)) { if (!tnum_is_const(reg->var_off)) { - verbose(env, "R%d unbounded size, use 'var &= const' or 'if (var < const)'\n", + verbose(env, "R%d is not a known constant'\n", regno); return -EACCES; } -- cgit v1.2.3 From 541c3bad8dc51b253ba8686d0cd7628e6b9b5f4c Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 11 Jan 2021 23:55:18 -0800 Subject: bpf: Support BPF ksym variables in kernel modules Add support for directly accessing kernel module variables from BPF programs using special ldimm64 instructions. This functionality builds upon vmlinux ksym support, but extends ldimm64 with src_reg=BPF_PSEUDO_BTF_ID to allow specifying kernel module BTF's FD in insn[1].imm field. During BPF program load time, verifier will resolve FD to BTF object and will take reference on BTF object itself and, for module BTFs, corresponding module as well, to make sure it won't be unloaded from under running BPF program. 
The mechanism used is similar to how bpf_prog keeps track of used bpf_maps. One interesting change is also in how per-CPU variable is determined. The logic is to find .data..percpu data section in provided BTF, but both vmlinux and module each have their own .data..percpu entries in BTF. So for module's case, the search for DATASEC record needs to look at only module's added BTF types. This is implemented with custom search function. Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Acked-by: Hao Luo Link: https://lore.kernel.org/bpf/20210112075520.4103414-6-andrii@kernel.org --- include/linux/bpf.h | 10 +++ include/linux/bpf_verifier.h | 3 + include/linux/btf.h | 3 + kernel/bpf/btf.c | 31 ++++++++- kernel/bpf/core.c | 23 +++++++ kernel/bpf/verifier.c | 154 +++++++++++++++++++++++++++++++++++-------- 6 files changed, 194 insertions(+), 30 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 6e585dbc10df..1aac2af12fed 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -761,9 +761,15 @@ struct bpf_ctx_arg_aux { u32 btf_id; }; +struct btf_mod_pair { + struct btf *btf; + struct module *module; +}; + struct bpf_prog_aux { atomic64_t refcnt; u32 used_map_cnt; + u32 used_btf_cnt; u32 max_ctx_offset; u32 max_pkt_offset; u32 max_tp_access; @@ -802,6 +808,7 @@ struct bpf_prog_aux { const struct bpf_prog_ops *ops; struct bpf_map **used_maps; struct mutex used_maps_mutex; /* mutex for used_maps and used_map_cnt */ + struct btf_mod_pair *used_btfs; struct bpf_prog *prog; struct user_struct *user; u64 load_time; /* ns since boottime */ @@ -1668,6 +1675,9 @@ bpf_base_func_proto(enum bpf_func_id func_id) } #endif /* CONFIG_BPF_SYSCALL */ +void __bpf_free_used_btfs(struct bpf_prog_aux *aux, + struct btf_mod_pair *used_btfs, u32 len); + static inline struct bpf_prog *bpf_prog_get_type(u32 ufd, enum bpf_prog_type type) { diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index e941fe1484e5..dfe6f85d97dd 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -340,6 +340,7 @@ struct bpf_insn_aux_data { }; #define MAX_USED_MAPS 64 /* max number of maps accessed by one eBPF program */ +#define MAX_USED_BTFS 64 /* max number of BTFs accessed by one BPF program */ #define BPF_VERIFIER_TMP_LOG_SIZE 1024 @@ -398,7 +399,9 @@ struct bpf_verifier_env { struct bpf_verifier_state_list **explored_states; /* search pruning optimization */ struct bpf_verifier_state_list *free_list; struct bpf_map *used_maps[MAX_USED_MAPS]; /* array of map's used by eBPF program */ + struct btf_mod_pair used_btfs[MAX_USED_BTFS]; /* array of BTF's used by BPF program */ u32 used_map_cnt; /* number of used maps */ + u32 used_btf_cnt; /* number of used BTF objects */ u32 id_gen; /* used to generate unique reg IDs */ bool allow_ptr_leaks; bool allow_ptr_to_map_access; diff --git a/include/linux/btf.h b/include/linux/btf.h index 4c200f5d242b..7fabf1428093 100644 --- a/include/linux/btf.h +++ b/include/linux/btf.h @@ -91,6 +91,9 @@ int btf_type_snprintf_show(const struct btf *btf, u32 type_id, void *obj, int btf_get_fd_by_id(u32 id); u32 btf_obj_id(const struct btf *btf); bool btf_is_kernel(const struct btf *btf); +bool btf_is_module(const struct btf *btf); +struct module *btf_try_get_module(const struct btf *btf); +u32 btf_nr_types(const struct btf *btf); bool btf_member_is_reg_int(const struct btf *btf, const struct btf_type *s, const struct btf_member *m, u32 expected_offset, u32 expected_size); diff --git 
a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 8d6bdb4f4d61..7ccc0133723a 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -458,7 +458,7 @@ static bool btf_type_is_datasec(const struct btf_type *t) return BTF_INFO_KIND(t->info) == BTF_KIND_DATASEC; } -static u32 btf_nr_types_total(const struct btf *btf) +u32 btf_nr_types(const struct btf *btf) { u32 total = 0; @@ -476,7 +476,7 @@ s32 btf_find_by_name_kind(const struct btf *btf, const char *name, u8 kind) const char *tname; u32 i, total; - total = btf_nr_types_total(btf); + total = btf_nr_types(btf); for (i = 1; i < total; i++) { t = btf_type_by_id(btf, i); if (BTF_INFO_KIND(t->info) != kind) @@ -5743,6 +5743,11 @@ bool btf_is_kernel(const struct btf *btf) return btf->kernel_btf; } +bool btf_is_module(const struct btf *btf) +{ + return btf->kernel_btf && strcmp(btf->name, "vmlinux") != 0; +} + static int btf_id_cmp_func(const void *a, const void *b) { const int *pa = a, *pb = b; @@ -5877,3 +5882,25 @@ static int __init btf_module_init(void) fs_initcall(btf_module_init); #endif /* CONFIG_DEBUG_INFO_BTF_MODULES */ + +struct module *btf_try_get_module(const struct btf *btf) +{ + struct module *res = NULL; +#ifdef CONFIG_DEBUG_INFO_BTF_MODULES + struct btf_module *btf_mod, *tmp; + + mutex_lock(&btf_module_mutex); + list_for_each_entry_safe(btf_mod, tmp, &btf_modules, list) { + if (btf_mod->btf != btf) + continue; + + if (try_module_get(btf_mod->module)) + res = btf_mod->module; + + break; + } + mutex_unlock(&btf_module_mutex); +#endif + + return res; +} diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 261f8692d0d2..69c3c308de5e 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -2119,6 +2119,28 @@ static void bpf_free_used_maps(struct bpf_prog_aux *aux) kfree(aux->used_maps); } +void __bpf_free_used_btfs(struct bpf_prog_aux *aux, + struct btf_mod_pair *used_btfs, u32 len) +{ +#ifdef CONFIG_BPF_SYSCALL + struct btf_mod_pair *btf_mod; + u32 i; + + for (i = 0; i < len; i++) { + btf_mod = &used_btfs[i]; + if (btf_mod->module) + module_put(btf_mod->module); + btf_put(btf_mod->btf); + } +#endif +} + +static void bpf_free_used_btfs(struct bpf_prog_aux *aux) +{ + __bpf_free_used_btfs(aux, aux->used_btfs, aux->used_btf_cnt); + kfree(aux->used_btfs); +} + static void bpf_prog_free_deferred(struct work_struct *work) { struct bpf_prog_aux *aux; @@ -2126,6 +2148,7 @@ static void bpf_prog_free_deferred(struct work_struct *work) aux = container_of(work, struct bpf_prog_aux, work); bpf_free_used_maps(aux); + bpf_free_used_btfs(aux); if (bpf_prog_is_dev_bound(aux)) bpf_prog_offload_destroy(aux->prog); #ifdef CONFIG_PERF_EVENTS diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 5534e667bdb1..ae2aee48cf82 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -9703,6 +9703,36 @@ process_bpf_exit: return 0; } +static int find_btf_percpu_datasec(struct btf *btf) +{ + const struct btf_type *t; + const char *tname; + int i, n; + + /* + * Both vmlinux and module each have their own ".data..percpu" + * DATASECs in BTF. So for module's case, we need to skip vmlinux BTF + * types to look at only module's own BTF types. 
+ */ + n = btf_nr_types(btf); + if (btf_is_module(btf)) + i = btf_nr_types(btf_vmlinux); + else + i = 1; + + for(; i < n; i++) { + t = btf_type_by_id(btf, i); + if (BTF_INFO_KIND(t->info) != BTF_KIND_DATASEC) + continue; + + tname = btf_name_by_offset(btf, t->name_off); + if (!strcmp(tname, ".data..percpu")) + return i; + } + + return -ENOENT; +} + /* replace pseudo btf_id with kernel symbol address */ static int check_pseudo_btf_id(struct bpf_verifier_env *env, struct bpf_insn *insn, @@ -9710,48 +9740,57 @@ static int check_pseudo_btf_id(struct bpf_verifier_env *env, { const struct btf_var_secinfo *vsi; const struct btf_type *datasec; + struct btf_mod_pair *btf_mod; const struct btf_type *t; const char *sym_name; bool percpu = false; u32 type, id = insn->imm; + struct btf *btf; s32 datasec_id; u64 addr; - int i; - - if (!btf_vmlinux) { - verbose(env, "kernel is missing BTF, make sure CONFIG_DEBUG_INFO_BTF=y is specified in Kconfig.\n"); - return -EINVAL; - } + int i, btf_fd, err; - if (insn[1].imm != 0) { - verbose(env, "reserved field (insn[1].imm) is used in pseudo_btf_id ldimm64 insn.\n"); - return -EINVAL; + btf_fd = insn[1].imm; + if (btf_fd) { + btf = btf_get_by_fd(btf_fd); + if (IS_ERR(btf)) { + verbose(env, "invalid module BTF object FD specified.\n"); + return -EINVAL; + } + } else { + if (!btf_vmlinux) { + verbose(env, "kernel is missing BTF, make sure CONFIG_DEBUG_INFO_BTF=y is specified in Kconfig.\n"); + return -EINVAL; + } + btf = btf_vmlinux; + btf_get(btf); } - t = btf_type_by_id(btf_vmlinux, id); + t = btf_type_by_id(btf, id); if (!t) { verbose(env, "ldimm64 insn specifies invalid btf_id %d.\n", id); - return -ENOENT; + err = -ENOENT; + goto err_put; } if (!btf_type_is_var(t)) { - verbose(env, "pseudo btf_id %d in ldimm64 isn't KIND_VAR.\n", - id); - return -EINVAL; + verbose(env, "pseudo btf_id %d in ldimm64 isn't KIND_VAR.\n", id); + err = -EINVAL; + goto err_put; } - sym_name = btf_name_by_offset(btf_vmlinux, t->name_off); + sym_name = btf_name_by_offset(btf, t->name_off); addr = kallsyms_lookup_name(sym_name); if (!addr) { verbose(env, "ldimm64 failed to find the address for kernel symbol '%s'.\n", sym_name); - return -ENOENT; + err = -ENOENT; + goto err_put; } - datasec_id = btf_find_by_name_kind(btf_vmlinux, ".data..percpu", - BTF_KIND_DATASEC); + datasec_id = find_btf_percpu_datasec(btf); if (datasec_id > 0) { - datasec = btf_type_by_id(btf_vmlinux, datasec_id); + datasec = btf_type_by_id(btf, datasec_id); for_each_vsi(i, datasec, vsi) { if (vsi->type == id) { percpu = true; @@ -9764,10 +9803,10 @@ static int check_pseudo_btf_id(struct bpf_verifier_env *env, insn[1].imm = addr >> 32; type = t->type; - t = btf_type_skip_modifiers(btf_vmlinux, type, NULL); + t = btf_type_skip_modifiers(btf, type, NULL); if (percpu) { aux->btf_var.reg_type = PTR_TO_PERCPU_BTF_ID; - aux->btf_var.btf = btf_vmlinux; + aux->btf_var.btf = btf; aux->btf_var.btf_id = type; } else if (!btf_type_is_struct(t)) { const struct btf_type *ret; @@ -9775,21 +9814,54 @@ static int check_pseudo_btf_id(struct bpf_verifier_env *env, u32 tsize; /* resolve the type size of ksym. 
*/ - ret = btf_resolve_size(btf_vmlinux, t, &tsize); + ret = btf_resolve_size(btf, t, &tsize); if (IS_ERR(ret)) { - tname = btf_name_by_offset(btf_vmlinux, t->name_off); + tname = btf_name_by_offset(btf, t->name_off); verbose(env, "ldimm64 unable to resolve the size of type '%s': %ld\n", tname, PTR_ERR(ret)); - return -EINVAL; + err = -EINVAL; + goto err_put; } aux->btf_var.reg_type = PTR_TO_MEM; aux->btf_var.mem_size = tsize; } else { aux->btf_var.reg_type = PTR_TO_BTF_ID; - aux->btf_var.btf = btf_vmlinux; + aux->btf_var.btf = btf; aux->btf_var.btf_id = type; } + + /* check whether we recorded this BTF (and maybe module) already */ + for (i = 0; i < env->used_btf_cnt; i++) { + if (env->used_btfs[i].btf == btf) { + btf_put(btf); + return 0; + } + } + + if (env->used_btf_cnt >= MAX_USED_BTFS) { + err = -E2BIG; + goto err_put; + } + + btf_mod = &env->used_btfs[env->used_btf_cnt]; + btf_mod->btf = btf; + btf_mod->module = NULL; + + /* if we reference variables from kernel module, bump its refcount */ + if (btf_is_module(btf)) { + btf_mod->module = btf_try_get_module(btf); + if (!btf_mod->module) { + err = -ENXIO; + goto err_put; + } + } + + env->used_btf_cnt++; + return 0; +err_put: + btf_put(btf); + return err; } static int check_map_prealloc(struct bpf_map *map) @@ -10086,6 +10158,13 @@ static void release_maps(struct bpf_verifier_env *env) env->used_map_cnt); } +/* drop refcnt of maps used by the rejected program */ +static void release_btfs(struct bpf_verifier_env *env) +{ + __bpf_free_used_btfs(env->prog->aux, env->used_btfs, + env->used_btf_cnt); +} + /* convert pseudo BPF_LD_IMM64 into generic BPF_LD_IMM64 */ static void convert_pseudo_ld_imm64(struct bpf_verifier_env *env) { @@ -12098,7 +12177,10 @@ skip_full_check: goto err_release_maps; } - if (ret == 0 && env->used_map_cnt) { + if (ret) + goto err_release_maps; + + if (env->used_map_cnt) { /* if program passed verifier, update used_maps in bpf_prog_info */ env->prog->aux->used_maps = kmalloc_array(env->used_map_cnt, sizeof(env->used_maps[0]), @@ -12112,15 +12194,29 @@ skip_full_check: memcpy(env->prog->aux->used_maps, env->used_maps, sizeof(env->used_maps[0]) * env->used_map_cnt); env->prog->aux->used_map_cnt = env->used_map_cnt; + } + if (env->used_btf_cnt) { + /* if program passed verifier, update used_btfs in bpf_prog_aux */ + env->prog->aux->used_btfs = kmalloc_array(env->used_btf_cnt, + sizeof(env->used_btfs[0]), + GFP_KERNEL); + if (!env->prog->aux->used_btfs) { + ret = -ENOMEM; + goto err_release_maps; + } + memcpy(env->prog->aux->used_btfs, env->used_btfs, + sizeof(env->used_btfs[0]) * env->used_btf_cnt); + env->prog->aux->used_btf_cnt = env->used_btf_cnt; + } + if (env->used_map_cnt || env->used_btf_cnt) { /* program is valid. Convert pseudo bpf_ld_imm64 into generic * bpf_ld_imm64 instructions */ convert_pseudo_ld_imm64(env); } - if (ret == 0) - adjust_btf_func(env); + adjust_btf_func(env); err_release_maps: if (!env->prog->aux->used_maps) @@ -12128,6 +12224,8 @@ err_release_maps: * them now. Otherwise free_used_maps() will release them. 
*/ release_maps(env); + if (!env->prog->aux->used_btfs) + release_btfs(env); /* extension progs temporarily inherit the attach_type of their targets for verification purposes, so set it back to zero before returning -- cgit v1.2.3 From ce09ccc50208c04a1b03abfd530b5d6314258fd0 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 13 Jan 2021 15:43:18 +0100 Subject: genirq: Export irq_check_status_bit() One of the users can be built modular: ERROR: modpost: "irq_check_status_bit" [drivers/perf/arm_spe_pmu.ko] undefined! Fixes: fdd029630434 ("genirq: Move status flag checks to core") Reported-by: Guenter Roeck Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20201227192049.GA195845@roeck-us.net --- kernel/irq/manage.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index ab8567f32501..dec3f73e8db9 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -2859,3 +2859,4 @@ bool irq_check_status_bit(unsigned int irq, unsigned int bitmask) rcu_read_unlock(); return res; } +EXPORT_SYMBOL_GPL(irq_check_status_bit); -- cgit v1.2.3 From 744ea4e3885eccb6d332a06fae9eb7420a622c0f Mon Sep 17 00:00:00 2001 From: Gilad Reti Date: Wed, 13 Jan 2021 07:38:07 +0200 Subject: bpf: Support PTR_TO_MEM{,_OR_NULL} register spilling Add support for pointer to mem register spilling, to allow the verifier to track pointers to valid memory addresses. Such pointers are returned for example by a successful call of the bpf_ringbuf_reserve helper. The patch was partially contributed by CyberArk Software, Inc. Fixes: 457f44363a88 ("bpf: Implement BPF ring buffer and verifier support for it") Suggested-by: Yonghong Song Signed-off-by: Gilad Reti Signed-off-by: Alexei Starovoitov Acked-by: KP Singh Link: https://lore.kernel.org/bpf/20210113053810.13518-1-gilad.reti@gmail.com --- kernel/bpf/verifier.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 17270b8404f1..36af69fac591 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2217,6 +2217,8 @@ static bool is_spillable_regtype(enum bpf_reg_type type) case PTR_TO_RDWR_BUF: case PTR_TO_RDWR_BUF_OR_NULL: case PTR_TO_PERCPU_BTF_ID: + case PTR_TO_MEM: + case PTR_TO_MEM_OR_NULL: return true; default: return false; -- cgit v1.2.3 From 7d6a905f3dd62c4502cdd772c71319de4058ec89 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Tue, 8 Dec 2020 09:46:55 +0530 Subject: sched/core: Move schedutil_cpu_util() to core.c There is nothing schedutil specific in schedutil_cpu_util(), move it to core.c and define it only for CONFIG_SMP. Signed-off-by: Viresh Kumar Signed-off-by: Peter Zijlstra (Intel) Acked-by: Rafael J. Wysocki Link: https://lkml.kernel.org/r/c921a362c78e1324f8ebc5aaa12f53e309c5a8a2.1607400596.git.viresh.kumar@linaro.org --- kernel/sched/core.c | 108 +++++++++++++++++++++++++++++++++++++++ kernel/sched/cpufreq_schedutil.c | 106 -------------------------------------- kernel/sched/sched.h | 12 +---- 3 files changed, 109 insertions(+), 117 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 15d2562118d1..d89d682d5337 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5662,6 +5662,114 @@ struct task_struct *idle_task(int cpu) return cpu_rq(cpu)->idle; } +#ifdef CONFIG_SMP +/* + * This function computes an effective utilization for the given CPU, to be + * used for frequency selection given the linear relation: f = u * f_max. 
+ * + * The scheduler tracks the following metrics: + * + * cpu_util_{cfs,rt,dl,irq}() + * cpu_bw_dl() + * + * Where the cfs,rt and dl util numbers are tracked with the same metric and + * synchronized windows and are thus directly comparable. + * + * The cfs,rt,dl utilization are the running times measured with rq->clock_task + * which excludes things like IRQ and steal-time. These latter are then accrued + * in the irq utilization. + * + * The DL bandwidth number otoh is not a measured metric but a value computed + * based on the task model parameters and gives the minimal utilization + * required to meet deadlines. + */ +unsigned long schedutil_cpu_util(int cpu, unsigned long util_cfs, + unsigned long max, enum schedutil_type type, + struct task_struct *p) +{ + unsigned long dl_util, util, irq; + struct rq *rq = cpu_rq(cpu); + + if (!uclamp_is_used() && + type == FREQUENCY_UTIL && rt_rq_is_runnable(&rq->rt)) { + return max; + } + + /* + * Early check to see if IRQ/steal time saturates the CPU, can be + * because of inaccuracies in how we track these -- see + * update_irq_load_avg(). + */ + irq = cpu_util_irq(rq); + if (unlikely(irq >= max)) + return max; + + /* + * Because the time spend on RT/DL tasks is visible as 'lost' time to + * CFS tasks and we use the same metric to track the effective + * utilization (PELT windows are synchronized) we can directly add them + * to obtain the CPU's actual utilization. + * + * CFS and RT utilization can be boosted or capped, depending on + * utilization clamp constraints requested by currently RUNNABLE + * tasks. + * When there are no CFS RUNNABLE tasks, clamps are released and + * frequency will be gracefully reduced with the utilization decay. + */ + util = util_cfs + cpu_util_rt(rq); + if (type == FREQUENCY_UTIL) + util = uclamp_rq_util_with(rq, util, p); + + dl_util = cpu_util_dl(rq); + + /* + * For frequency selection we do not make cpu_util_dl() a permanent part + * of this sum because we want to use cpu_bw_dl() later on, but we need + * to check if the CFS+RT+DL sum is saturated (ie. no idle time) such + * that we select f_max when there is no idle time. + * + * NOTE: numerical errors or stop class might cause us to not quite hit + * saturation when we should -- something for later. + */ + if (util + dl_util >= max) + return max; + + /* + * OTOH, for energy computation we need the estimated running time, so + * include util_dl and ignore dl_bw. + */ + if (type == ENERGY_UTIL) + util += dl_util; + + /* + * There is still idle time; further improve the number by using the + * irq metric. Because IRQ/steal time is hidden from the task clock we + * need to scale the task numbers: + * + * max - irq + * U' = irq + --------- * U + * max + */ + util = scale_irq_capacity(util, irq, max); + util += irq; + + /* + * Bandwidth required by DEADLINE must always be granted while, for + * FAIR and RT, we use blocked utilization of IDLE CPUs as a mechanism + * to gracefully reduce the frequency when no tasks show up for longer + * periods of time. + * + * Ideally we would like to set bw_dl as min/guaranteed freq and util + + * bw_dl as requested freq. However, cpufreq is not yet ready for such + * an interface. So, we only do the latter for now. + */ + if (type == FREQUENCY_UTIL) + util += cpu_bw_dl(rq); + + return min(max, util); +} +#endif /* CONFIG_SMP */ + /** * find_process_by_pid - find a process with a matching PID value. * @pid: the pid in question. 
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 6931f0cdeb80..1dfa69246485 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -171,112 +171,6 @@ static unsigned int get_next_freq(struct sugov_policy *sg_policy, return cpufreq_driver_resolve_freq(policy, freq); } -/* - * This function computes an effective utilization for the given CPU, to be - * used for frequency selection given the linear relation: f = u * f_max. - * - * The scheduler tracks the following metrics: - * - * cpu_util_{cfs,rt,dl,irq}() - * cpu_bw_dl() - * - * Where the cfs,rt and dl util numbers are tracked with the same metric and - * synchronized windows and are thus directly comparable. - * - * The cfs,rt,dl utilization are the running times measured with rq->clock_task - * which excludes things like IRQ and steal-time. These latter are then accrued - * in the irq utilization. - * - * The DL bandwidth number otoh is not a measured metric but a value computed - * based on the task model parameters and gives the minimal utilization - * required to meet deadlines. - */ -unsigned long schedutil_cpu_util(int cpu, unsigned long util_cfs, - unsigned long max, enum schedutil_type type, - struct task_struct *p) -{ - unsigned long dl_util, util, irq; - struct rq *rq = cpu_rq(cpu); - - if (!uclamp_is_used() && - type == FREQUENCY_UTIL && rt_rq_is_runnable(&rq->rt)) { - return max; - } - - /* - * Early check to see if IRQ/steal time saturates the CPU, can be - * because of inaccuracies in how we track these -- see - * update_irq_load_avg(). - */ - irq = cpu_util_irq(rq); - if (unlikely(irq >= max)) - return max; - - /* - * Because the time spend on RT/DL tasks is visible as 'lost' time to - * CFS tasks and we use the same metric to track the effective - * utilization (PELT windows are synchronized) we can directly add them - * to obtain the CPU's actual utilization. - * - * CFS and RT utilization can be boosted or capped, depending on - * utilization clamp constraints requested by currently RUNNABLE - * tasks. - * When there are no CFS RUNNABLE tasks, clamps are released and - * frequency will be gracefully reduced with the utilization decay. - */ - util = util_cfs + cpu_util_rt(rq); - if (type == FREQUENCY_UTIL) - util = uclamp_rq_util_with(rq, util, p); - - dl_util = cpu_util_dl(rq); - - /* - * For frequency selection we do not make cpu_util_dl() a permanent part - * of this sum because we want to use cpu_bw_dl() later on, but we need - * to check if the CFS+RT+DL sum is saturated (ie. no idle time) such - * that we select f_max when there is no idle time. - * - * NOTE: numerical errors or stop class might cause us to not quite hit - * saturation when we should -- something for later. - */ - if (util + dl_util >= max) - return max; - - /* - * OTOH, for energy computation we need the estimated running time, so - * include util_dl and ignore dl_bw. - */ - if (type == ENERGY_UTIL) - util += dl_util; - - /* - * There is still idle time; further improve the number by using the - * irq metric. Because IRQ/steal time is hidden from the task clock we - * need to scale the task numbers: - * - * max - irq - * U' = irq + --------- * U - * max - */ - util = scale_irq_capacity(util, irq, max); - util += irq; - - /* - * Bandwidth required by DEADLINE must always be granted while, for - * FAIR and RT, we use blocked utilization of IDLE CPUs as a mechanism - * to gracefully reduce the frequency when no tasks show up for longer - * periods of time. 
- * - * Ideally we would like to set bw_dl as min/guaranteed freq and util + - * bw_dl as requested freq. However, cpufreq is not yet ready for such - * an interface. So, we only do the latter for now. - */ - if (type == FREQUENCY_UTIL) - util += cpu_bw_dl(rq); - - return min(max, util); -} - static void sugov_get_util(struct sugov_cpu *sg_cpu) { struct rq *rq = cpu_rq(sg_cpu->cpu); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 12ada79d40f3..242d4c5a5efc 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2557,7 +2557,6 @@ static inline unsigned long capacity_orig_of(int cpu) { return cpu_rq(cpu)->cpu_capacity_orig; } -#endif /** * enum schedutil_type - CPU utilization type @@ -2574,8 +2573,6 @@ enum schedutil_type { ENERGY_UTIL, }; -#ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL - unsigned long schedutil_cpu_util(int cpu, unsigned long util_cfs, unsigned long max, enum schedutil_type type, struct task_struct *p); @@ -2606,14 +2603,7 @@ static inline unsigned long cpu_util_rt(struct rq *rq) { return READ_ONCE(rq->avg_rt.util_avg); } -#else /* CONFIG_CPU_FREQ_GOV_SCHEDUTIL */ -static inline unsigned long schedutil_cpu_util(int cpu, unsigned long util_cfs, - unsigned long max, enum schedutil_type type, - struct task_struct *p) -{ - return 0; -} -#endif /* CONFIG_CPU_FREQ_GOV_SCHEDUTIL */ +#endif #ifdef CONFIG_HAVE_SCHED_AVG_IRQ static inline unsigned long cpu_util_irq(struct rq *rq) -- cgit v1.2.3 From a5418be9dffe70ccbb0b4bd5ea3881c81927e965 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Tue, 8 Dec 2020 09:46:56 +0530 Subject: sched/core: Rename schedutil_cpu_util() and allow rest of the kernel to use it There is nothing schedutil specific in schedutil_cpu_util(), rename it to effective_cpu_util(). Also create and expose another wrapper sched_cpu_util() which can be used by other parts of the kernel, like thermal core (that will be done in a later commit). Signed-off-by: Viresh Kumar Signed-off-by: Peter Zijlstra (Intel) Acked-by: Rafael J. Wysocki Link: https://lkml.kernel.org/r/db011961fb3bb8bef1c0eda5cd64564637d3ef31.1607400596.git.viresh.kumar@linaro.org --- include/linux/sched.h | 5 +++++ kernel/sched/core.c | 10 ++++++++-- kernel/sched/cpufreq_schedutil.c | 2 +- kernel/sched/fair.c | 6 +++--- kernel/sched/sched.h | 10 +++++----- 5 files changed, 22 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index 6e3a5eeec509..31169e70476a 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1968,6 +1968,11 @@ extern long sched_getaffinity(pid_t pid, struct cpumask *mask); #define TASK_SIZE_OF(tsk) TASK_SIZE #endif +#ifdef CONFIG_SMP +/* Returns effective CPU energy utilization, as seen by the scheduler */ +unsigned long sched_cpu_util(int cpu, unsigned long max); +#endif /* CONFIG_SMP */ + #ifdef CONFIG_RSEQ /* diff --git a/kernel/sched/core.c b/kernel/sched/core.c index d89d682d5337..4fe4cbf0bf08 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5683,8 +5683,8 @@ struct task_struct *idle_task(int cpu) * based on the task model parameters and gives the minimal utilization * required to meet deadlines. 
*/ -unsigned long schedutil_cpu_util(int cpu, unsigned long util_cfs, - unsigned long max, enum schedutil_type type, +unsigned long effective_cpu_util(int cpu, unsigned long util_cfs, + unsigned long max, enum cpu_util_type type, struct task_struct *p) { unsigned long dl_util, util, irq; @@ -5768,6 +5768,12 @@ unsigned long schedutil_cpu_util(int cpu, unsigned long util_cfs, return min(max, util); } + +unsigned long sched_cpu_util(int cpu, unsigned long max) +{ + return effective_cpu_util(cpu, cpu_util_cfs(cpu_rq(cpu)), max, + ENERGY_UTIL, NULL); +} #endif /* CONFIG_SMP */ /** diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 1dfa69246485..41e498b0008a 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -178,7 +178,7 @@ static void sugov_get_util(struct sugov_cpu *sg_cpu) sg_cpu->max = max; sg_cpu->bw_dl = cpu_bw_dl(rq); - sg_cpu->util = schedutil_cpu_util(sg_cpu->cpu, cpu_util_cfs(rq), max, + sg_cpu->util = effective_cpu_util(sg_cpu->cpu, cpu_util_cfs(rq), max, FREQUENCY_UTIL, NULL); } diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 04a3ce20da67..39c5bda90bd4 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6543,7 +6543,7 @@ compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd) * is already enough to scale the EM reported power * consumption at the (eventually clamped) cpu_capacity. */ - sum_util += schedutil_cpu_util(cpu, util_cfs, cpu_cap, + sum_util += effective_cpu_util(cpu, util_cfs, cpu_cap, ENERGY_UTIL, NULL); /* @@ -6553,7 +6553,7 @@ compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd) * NOTE: in case RT tasks are running, by default the * FREQUENCY_UTIL's utilization can be max OPP. */ - cpu_util = schedutil_cpu_util(cpu, util_cfs, cpu_cap, + cpu_util = effective_cpu_util(cpu, util_cfs, cpu_cap, FREQUENCY_UTIL, tsk); max_util = max(max_util, cpu_util); } @@ -6651,7 +6651,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) * IOW, placing the task there would make the CPU * overutilized. Take uclamp into account to see how * much capacity we can get out of the CPU; this is - * aligned with schedutil_cpu_util(). + * aligned with sched_cpu_util(). */ util = uclamp_rq_util_with(cpu_rq(cpu), util, p); if (!fits_capacity(util, cpu_cap)) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 242d4c5a5efc..045b01064c1e 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2559,22 +2559,22 @@ static inline unsigned long capacity_orig_of(int cpu) } /** - * enum schedutil_type - CPU utilization type + * enum cpu_util_type - CPU utilization type * @FREQUENCY_UTIL: Utilization used to select frequency * @ENERGY_UTIL: Utilization used during energy calculation * * The utilization signals of all scheduling classes (CFS/RT/DL) and IRQ time * need to be aggregated differently depending on the usage made of them. This - * enum is used within schedutil_freq_util() to differentiate the types of + * enum is used within effective_cpu_util() to differentiate the types of * utilization expected by the callers, and adjust the aggregation accordingly. 
*/ -enum schedutil_type { +enum cpu_util_type { FREQUENCY_UTIL, ENERGY_UTIL, }; -unsigned long schedutil_cpu_util(int cpu, unsigned long util_cfs, - unsigned long max, enum schedutil_type type, +unsigned long effective_cpu_util(int cpu, unsigned long util_cfs, + unsigned long max, enum cpu_util_type type, struct task_struct *p); static inline unsigned long cpu_bw_dl(struct rq *rq) -- cgit v1.2.3 From e0b257c3b71bd98a4866c3daecf000998aaa4927 Mon Sep 17 00:00:00 2001 From: Anna-Maria Behnsen Date: Tue, 15 Dec 2020 11:44:00 +0100 Subject: sched: Prevent raising SCHED_SOFTIRQ when CPU is !active SCHED_SOFTIRQ is raised to trigger periodic load balancing. When CPU is not active, CPU should not participate in load balancing. The scheduler uses nohz.idle_cpus_mask to keep track of the CPUs which can do idle load balancing. When bringing a CPU up the CPU is added to the mask when it reaches the active state, but on teardown the CPU stays in the mask until it goes offline and invokes sched_cpu_dying(). When SCHED_SOFTIRQ is raised on a !active CPU, there might be a pending softirq when stopping the tick which triggers a warning in NOHZ code. The SCHED_SOFTIRQ can also be raised by the scheduler tick which has the same issue. Therefore remove the CPU from nohz.idle_cpus_mask when it is marked inactive and also prevent the scheduler_tick() from raising SCHED_SOFTIRQ after this point. Signed-off-by: Anna-Maria Behnsen Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Steven Rostedt (VMware) Link: https://lkml.kernel.org/r/20201215104400.9435-1-anna-maria@linutronix.de --- kernel/sched/core.c | 7 ++++++- kernel/sched/fair.c | 7 +++++-- 2 files changed, 11 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 4fe4cbf0bf08..06b449942adf 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7596,6 +7596,12 @@ int sched_cpu_deactivate(unsigned int cpu) struct rq_flags rf; int ret; + /* + * Remove CPU from nohz.idle_cpus_mask to prevent participating in + * load balancing when not active + */ + nohz_balance_exit_idle(rq); + set_cpu_active(cpu, false); /* * We've cleared cpu_active_mask, wait for all preempt-disabled and RCU @@ -7702,7 +7708,6 @@ int sched_cpu_dying(unsigned int cpu) calc_load_migrate(rq); update_max_interval(); - nohz_balance_exit_idle(rq); hrtick_clear(rq); return 0; } diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 39c5bda90bd4..389cb58655c0 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -10700,8 +10700,11 @@ static __latent_entropy void run_rebalance_domains(struct softirq_action *h) */ void trigger_load_balance(struct rq *rq) { - /* Don't need to rebalance while attached to NULL domain */ - if (unlikely(on_null_domain(rq))) + /* + * Don't need to rebalance while attached to NULL domain or + * runqueue CPU is not active + */ + if (unlikely(on_null_domain(rq) || !cpu_active(cpu_of(rq)))) return; if (time_after_eq(jiffies, rq->next_balance)) -- cgit v1.2.3 From 8c1f560c1ea3f19e22ba356f62680d9d449c9ec2 Mon Sep 17 00:00:00 2001 From: Xuewen Yan Date: Fri, 18 Dec 2020 17:27:52 +0800 Subject: sched/fair: Avoid stale CPU util_est value for schedutil in task dequeue CPU (root cfs_rq) estimated utilization (util_est) is currently used in dequeue_task_fair() to drive frequency selection before it is updated. 
with: CPU_util : rq->cfs.avg.util_avg CPU_util_est : rq->cfs.avg.util_est CPU_utilization : max(CPU_util, CPU_util_est) task_util : p->se.avg.util_avg task_util_est : p->se.avg.util_est dequeue_task_fair(): /* (1) CPU_util and task_util update + inform schedutil about CPU_utilization changes */ for_each_sched_entity() /* 2 loops */ (dequeue_entity() ->) update_load_avg() -> cfs_rq_util_change() -> cpufreq_update_util() ->...-> sugov_update_[shared\|single] -> sugov_get_util() -> cpu_util_cfs() /* (2) CPU_util_est and task_util_est update */ util_est_dequeue() cpu_util_cfs() uses CPU_utilization which could lead to a false (too high) utilization value for schedutil in task ramp-down or ramp-up scenarios during task dequeue. To mitigate the issue split the util_est update (2) into: (A) CPU_util_est update in util_est_dequeue() (B) task_util_est update in util_est_update() Place (A) before (1) and keep (B) where (2) is. The latter is necessary since (B) relies on task_util update in (1). Fixes: 7f65ea42eb00 ("sched/fair: Add util_est on top of PELT") Signed-off-by: Xuewen Yan Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Dietmar Eggemann Reviewed-by: Vincent Guittot Link: https://lkml.kernel.org/r/1608283672-18240-1-git-send-email-xuewen.yan94@gmail.com --- kernel/sched/fair.c | 43 ++++++++++++++++++++++++++++--------------- 1 file changed, 28 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 389cb58655c0..40d3ebfb5a48 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3943,6 +3943,22 @@ static inline void util_est_enqueue(struct cfs_rq *cfs_rq, trace_sched_util_est_cfs_tp(cfs_rq); } +static inline void util_est_dequeue(struct cfs_rq *cfs_rq, + struct task_struct *p) +{ + unsigned int enqueued; + + if (!sched_feat(UTIL_EST)) + return; + + /* Update root cfs_rq's estimated utilization */ + enqueued = cfs_rq->avg.util_est.enqueued; + enqueued -= min_t(unsigned int, enqueued, _task_util_est(p)); + WRITE_ONCE(cfs_rq->avg.util_est.enqueued, enqueued); + + trace_sched_util_est_cfs_tp(cfs_rq); +} + /* * Check if a (signed) value is within a specified (unsigned) margin, * based on the observation that: @@ -3956,23 +3972,16 @@ static inline bool within_margin(int value, int margin) return ((unsigned int)(value + margin - 1) < (2 * margin - 1)); } -static void -util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep) +static inline void util_est_update(struct cfs_rq *cfs_rq, + struct task_struct *p, + bool task_sleep) { long last_ewma_diff; struct util_est ue; - int cpu; if (!sched_feat(UTIL_EST)) return; - /* Update root cfs_rq's estimated utilization */ - ue.enqueued = cfs_rq->avg.util_est.enqueued; - ue.enqueued -= min_t(unsigned int, ue.enqueued, _task_util_est(p)); - WRITE_ONCE(cfs_rq->avg.util_est.enqueued, ue.enqueued); - - trace_sched_util_est_cfs_tp(cfs_rq); - /* * Skip update of task's estimated utilization when the task has not * yet completed an activation, e.g. being migrated. @@ -4012,8 +4021,7 @@ util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep) * To avoid overestimation of actual task utilization, skip updates if * we cannot grant there is idle time in this CPU. 
*/ - cpu = cpu_of(rq_of(cfs_rq)); - if (task_util(p) > capacity_orig_of(cpu)) + if (task_util(p) > capacity_orig_of(cpu_of(rq_of(cfs_rq)))) return; /* @@ -4096,8 +4104,11 @@ static inline void util_est_enqueue(struct cfs_rq *cfs_rq, struct task_struct *p) {} static inline void -util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, - bool task_sleep) {} +util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p) {} + +static inline void +util_est_update(struct cfs_rq *cfs_rq, struct task_struct *p, + bool task_sleep) {} static inline void update_misfit_status(struct task_struct *p, struct rq *rq) {} #endif /* CONFIG_SMP */ @@ -5609,6 +5620,8 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) int idle_h_nr_running = task_has_idle_policy(p); bool was_sched_idle = sched_idle_rq(rq); + util_est_dequeue(&rq->cfs, p); + for_each_sched_entity(se) { cfs_rq = cfs_rq_of(se); dequeue_entity(cfs_rq, se, flags); @@ -5659,7 +5672,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) rq->next_balance = jiffies; dequeue_throttle: - util_est_dequeue(&rq->cfs, p, task_sleep); + util_est_update(&rq->cfs, p, task_sleep); hrtick_update(rq); } -- cgit v1.2.3 From fc488ffd4297f661b3e9d7450dcdb9089a53df7c Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Thu, 7 Jan 2021 11:33:23 +0100 Subject: sched/fair: Skip idle cfs_rq Don't waste time checking whether an idle cfs_rq could be the busiest queue. Furthermore, this can end up selecting a cfs_rq with a high load but being idle in case of migrate_load. Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Valentin Schneider Acked-by: Mel Gorman Link: https://lkml.kernel.org/r/20210107103325.30851-2-vincent.guittot@linaro.org --- kernel/sched/fair.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 40d3ebfb5a48..13de7aede74f 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -9402,8 +9402,11 @@ static struct rq *find_busiest_queue(struct lb_env *env, if (rt > env->fbq_type) continue; - capacity = capacity_of(i); nr_running = rq->cfs.h_nr_running; + if (!nr_running) + continue; + + capacity = capacity_of(i); /* * For ASYM_CPUCAPACITY domains, don't pick a CPU that could -- cgit v1.2.3 From 8a41dfcda7a32ed4435c00d98a9dc7156b08b671 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Thu, 7 Jan 2021 11:33:24 +0100 Subject: sched/fair: Don't set LBF_ALL_PINNED unnecessarily Setting LBF_ALL_PINNED during active load balance is only valid when there is only 1 running task on the rq otherwise this ends up increasing the balance interval whereas other tasks could migrate after the next interval once they become cache-cold as an example. LBF_ALL_PINNED flag is now always set it by default. It is then cleared when we find one task that can be pulled when calling detach_tasks() or during active migration. 
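A rough standalone sketch of that flag handling (assumed names, not the scheduler code): assume every task is pinned up front, and drop the assumption as soon as one task that may run on the destination CPU is seen:

	#include <stdbool.h>
	#include <stddef.h>

	#define LBF_ALL_PINNED	0x01

	struct task { bool can_run_on_dst; };

	static unsigned int scan_tasks(const struct task *tasks, size_t nr)
	{
		unsigned int flags = LBF_ALL_PINNED;	/* pessimistic default */
		size_t i;

		for (i = 0; i < nr; i++) {
			if (!tasks[i].can_run_on_dst)
				continue;
			flags &= ~LBF_ALL_PINNED;	/* found one pullable task */
			/* ... try to detach/migrate it ... */
		}

		return flags;
	}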
Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Valentin Schneider Acked-by: Mel Gorman Link: https://lkml.kernel.org/r/20210107103325.30851-3-vincent.guittot@linaro.org --- kernel/sched/fair.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 13de7aede74f..48f99c8a52f9 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -9639,6 +9639,8 @@ redo: env.src_rq = busiest; ld_moved = 0; + /* Clear this flag as soon as we find a pullable task */ + env.flags |= LBF_ALL_PINNED; if (busiest->nr_running > 1) { /* * Attempt to move tasks. If find_busiest_group has found @@ -9646,7 +9648,6 @@ redo: * still unbalanced. ld_moved simply stays zero, so it is * correctly treated as an imbalance. */ - env.flags |= LBF_ALL_PINNED; env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running); more_balance: @@ -9772,10 +9773,12 @@ more_balance: if (!cpumask_test_cpu(this_cpu, busiest->curr->cpus_ptr)) { raw_spin_unlock_irqrestore(&busiest->lock, flags); - env.flags |= LBF_ALL_PINNED; goto out_one_pinned; } + /* Record that we found at least one task that could run on this_cpu */ + env.flags &= ~LBF_ALL_PINNED; + /* * ->active_balance synchronizes accesses to * ->active_balance_work. Once set, it's cleared -- cgit v1.2.3 From e9b9734b74656abb585a7f6fabf1d30ce00e51ea Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Thu, 7 Jan 2021 11:33:25 +0100 Subject: sched/fair: Reduce cases for active balance Active balance is triggered for a number of voluntary cases like misfit or pinned tasks cases but also after that a number of load balance attempts failed to migrate a task. There is no need to use active load balance when the group is overloaded because an overloaded state means that there is at least one waiting task. Nevertheless, the waiting task is not selected and detached until the threshold becomes higher than its load. This threshold increases with the number of failed lb (see the condition if ((load >> env->sd->nr_balance_failed) > env->imbalance) in detach_tasks()) and the waiting task will end up to be selected after a number of attempts. Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Valentin Schneider Acked-by: Mel Gorman Link: https://lkml.kernel.org/r/20210107103325.30851-4-vincent.guittot@linaro.org --- kernel/sched/fair.c | 45 +++++++++++++++++++++++---------------------- 1 file changed, 23 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 48f99c8a52f9..53802b77ce64 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -9512,13 +9512,32 @@ asym_active_balance(struct lb_env *env) } static inline bool -voluntary_active_balance(struct lb_env *env) +imbalanced_active_balance(struct lb_env *env) +{ + struct sched_domain *sd = env->sd; + + /* + * The imbalanced case includes the case of pinned tasks preventing a fair + * distribution of the load on the system but also the even distribution of the + * threads on a system with spare capacity + */ + if ((env->migration_type == migrate_task) && + (sd->nr_balance_failed > sd->cache_nice_tries+2)) + return 1; + + return 0; +} + +static int need_active_balance(struct lb_env *env) { struct sched_domain *sd = env->sd; if (asym_active_balance(env)) return 1; + if (imbalanced_active_balance(env)) + return 1; + /* * The dst_cpu is idle and the src_cpu CPU has only 1 CFS task. 
* It's worth migrating the task if the src_cpu's capacity is reduced @@ -9538,16 +9557,6 @@ voluntary_active_balance(struct lb_env *env) return 0; } -static int need_active_balance(struct lb_env *env) -{ - struct sched_domain *sd = env->sd; - - if (voluntary_active_balance(env)) - return 1; - - return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2); -} - static int active_load_balance_cpu_stop(void *data); static int should_we_balance(struct lb_env *env) @@ -9800,21 +9809,13 @@ more_balance: /* We've kicked active balancing, force task migration. */ sd->nr_balance_failed = sd->cache_nice_tries+1; } - } else + } else { sd->nr_balance_failed = 0; + } - if (likely(!active_balance) || voluntary_active_balance(&env)) { + if (likely(!active_balance) || need_active_balance(&env)) { /* We were unbalanced, so reset the balancing interval */ sd->balance_interval = sd->min_interval; - } else { - /* - * If we've begun active balancing, start to back off. This - * case may not be covered by the all_pinned logic if there - * is only 1 task on the busy runqueue (because we don't call - * detach_tasks). - */ - if (sd->balance_interval < sd->max_interval) - sd->balance_interval *= 2; } goto out; -- cgit v1.2.3 From 65bcf072e20ed7597caa902f170f293662b0af3c Mon Sep 17 00:00:00 2001 From: Hui Su Date: Sat, 31 Oct 2020 01:32:23 +0800 Subject: sched: Use task_current() instead of 'rq->curr == p' Use the task_current() function where appropriate. No functional change. Signed-off-by: Hui Su Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Steven Rostedt (VMware) Link: https://lkml.kernel.org/r/20201030173223.GA52339@rlk --- kernel/sched/deadline.c | 2 +- kernel/sched/debug.c | 2 +- kernel/sched/fair.c | 6 +++--- kernel/sched/rt.c | 2 +- 4 files changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 75686c6d4436..5421782fe897 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -2514,7 +2514,7 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p) static void prio_changed_dl(struct rq *rq, struct task_struct *p, int oldprio) { - if (task_on_rq_queued(p) || rq->curr == p) { + if (task_on_rq_queued(p) || task_current(rq, p)) { #ifdef CONFIG_SMP /* * This might be too much, but unfortunately diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 2357921580f9..486f403a778b 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -486,7 +486,7 @@ static char *task_group_path(struct task_group *tg) static void print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) { - if (rq->curr == p) + if (task_current(rq, p)) SEQ_printf(m, ">R"); else SEQ_printf(m, " %c", task_state_to_char(p)); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 53802b77ce64..197a51473e0c 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5430,7 +5430,7 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p) s64 delta = slice - ran; if (delta < 0) { - if (rq->curr == p) + if (task_current(rq, p)) resched_curr(rq); return; } @@ -10829,7 +10829,7 @@ prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio) * our priority decreased, or if we are not currently running on * this runqueue and our priority is higher than the current's */ - if (rq->curr == p) { + if (task_current(rq, p)) { if (p->prio > oldprio) resched_curr(rq); } else @@ -10962,7 +10962,7 @@ static void switched_to_fair(struct rq *rq, struct task_struct *p) * kick off the schedule if running, 
otherwise just see * if we can still preempt the current task. */ - if (rq->curr == p) + if (task_current(rq, p)) resched_curr(rq); else check_preempt_curr(rq, p, 0); diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index dbe4629cf7ba..8f720b71d13d 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -2357,7 +2357,7 @@ prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio) if (!task_on_rq_queued(p)) return; - if (rq->curr == p) { + if (task_current(rq, p)) { #ifdef CONFIG_SMP /* * If our priority decreases while running, we -- cgit v1.2.3 From dfd5e3f5fe27bda91d5cc028c86ffbb7a0614489 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 9 Dec 2020 16:06:21 +0100 Subject: locking/lockdep: Mark local_lock_t The local_lock_t's are special, because they cannot form IRQ inversions, make sure we can tell them apart from the rest of the locks. Signed-off-by: Peter Zijlstra (Intel) --- include/linux/local_lock_internal.h | 5 ++++- include/linux/lockdep.h | 15 ++++++++++++--- include/linux/lockdep_types.h | 18 ++++++++++++++---- kernel/locking/lockdep.c | 16 +++++++++------- 4 files changed, 39 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/include/linux/local_lock_internal.h b/include/linux/local_lock_internal.h index 4a8795b21d77..ded90b097e6e 100644 --- a/include/linux/local_lock_internal.h +++ b/include/linux/local_lock_internal.h @@ -18,6 +18,7 @@ typedef struct { .dep_map = { \ .name = #lockname, \ .wait_type_inner = LD_WAIT_CONFIG, \ + .lock_type = LD_LOCK_PERCPU, \ } #else # define LL_DEP_MAP_INIT(lockname) @@ -30,7 +31,9 @@ do { \ static struct lock_class_key __key; \ \ debug_check_no_locks_freed((void *)lock, sizeof(*lock));\ - lockdep_init_map_wait(&(lock)->dep_map, #lock, &__key, 0, LD_WAIT_CONFIG);\ + lockdep_init_map_type(&(lock)->dep_map, #lock, &__key, 0, \ + LD_WAIT_CONFIG, LD_WAIT_INV, \ + LD_LOCK_PERCPU); \ } while (0) #ifdef CONFIG_DEBUG_LOCK_ALLOC diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index b9e9adec73e8..7b7ebf2e28ec 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -185,12 +185,19 @@ extern void lockdep_unregister_key(struct lock_class_key *key); * to lockdep: */ -extern void lockdep_init_map_waits(struct lockdep_map *lock, const char *name, - struct lock_class_key *key, int subclass, short inner, short outer); +extern void lockdep_init_map_type(struct lockdep_map *lock, const char *name, + struct lock_class_key *key, int subclass, u8 inner, u8 outer, u8 lock_type); + +static inline void +lockdep_init_map_waits(struct lockdep_map *lock, const char *name, + struct lock_class_key *key, int subclass, u8 inner, u8 outer) +{ + lockdep_init_map_type(lock, name, key, subclass, inner, LD_WAIT_INV, LD_LOCK_NORMAL); +} static inline void lockdep_init_map_wait(struct lockdep_map *lock, const char *name, - struct lock_class_key *key, int subclass, short inner) + struct lock_class_key *key, int subclass, u8 inner) { lockdep_init_map_waits(lock, name, key, subclass, inner, LD_WAIT_INV); } @@ -340,6 +347,8 @@ static inline void lockdep_set_selftest_task(struct task_struct *task) # define lock_set_class(l, n, k, s, i) do { } while (0) # define lock_set_subclass(l, s, i) do { } while (0) # define lockdep_init() do { } while (0) +# define lockdep_init_map_type(lock, name, key, sub, inner, outer, type) \ + do { (void)(name); (void)(key); } while (0) # define lockdep_init_map_waits(lock, name, key, sub, inner, outer) \ do { (void)(name); (void)(key); } while (0) # define lockdep_init_map_wait(lock, name, key, 
sub, inner) \ diff --git a/include/linux/lockdep_types.h b/include/linux/lockdep_types.h index 9a1fd49df17f..2ec9ff5a7fff 100644 --- a/include/linux/lockdep_types.h +++ b/include/linux/lockdep_types.h @@ -30,6 +30,12 @@ enum lockdep_wait_type { LD_WAIT_MAX, /* must be last */ }; +enum lockdep_lock_type { + LD_LOCK_NORMAL = 0, /* normal, catch all */ + LD_LOCK_PERCPU, /* percpu */ + LD_LOCK_MAX, +}; + #ifdef CONFIG_LOCKDEP /* @@ -119,8 +125,10 @@ struct lock_class { int name_version; const char *name; - short wait_type_inner; - short wait_type_outer; + u8 wait_type_inner; + u8 wait_type_outer; + u8 lock_type; + /* u8 hole; */ #ifdef CONFIG_LOCK_STAT unsigned long contention_point[LOCKSTAT_POINTS]; @@ -169,8 +177,10 @@ struct lockdep_map { struct lock_class_key *key; struct lock_class *class_cache[NR_LOCKDEP_CACHING_CLASSES]; const char *name; - short wait_type_outer; /* can be taken in this context */ - short wait_type_inner; /* presents this context */ + u8 wait_type_outer; /* can be taken in this context */ + u8 wait_type_inner; /* presents this context */ + u8 lock_type; + /* u8 hole; */ #ifdef CONFIG_LOCK_STAT int cpu; unsigned long ip; diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index c1418b47f625..b061e2991700 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -1290,6 +1290,7 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) class->name_version = count_matching_names(class); class->wait_type_inner = lock->wait_type_inner; class->wait_type_outer = lock->wait_type_outer; + class->lock_type = lock->lock_type; /* * We use RCU's safe list-add method to make * parallel walking of the hash-list safe: @@ -4503,9 +4504,9 @@ print_lock_invalid_wait_context(struct task_struct *curr, */ static int check_wait_context(struct task_struct *curr, struct held_lock *next) { - short next_inner = hlock_class(next)->wait_type_inner; - short next_outer = hlock_class(next)->wait_type_outer; - short curr_inner; + u8 next_inner = hlock_class(next)->wait_type_inner; + u8 next_outer = hlock_class(next)->wait_type_outer; + u8 curr_inner; int depth; if (!curr->lockdep_depth || !next_inner || next->trylock) @@ -4528,7 +4529,7 @@ static int check_wait_context(struct task_struct *curr, struct held_lock *next) for (; depth < curr->lockdep_depth; depth++) { struct held_lock *prev = curr->held_locks + depth; - short prev_inner = hlock_class(prev)->wait_type_inner; + u8 prev_inner = hlock_class(prev)->wait_type_inner; if (prev_inner) { /* @@ -4577,9 +4578,9 @@ static inline int check_wait_context(struct task_struct *curr, /* * Initialize a lock instance's lock-class mapping info: */ -void lockdep_init_map_waits(struct lockdep_map *lock, const char *name, +void lockdep_init_map_type(struct lockdep_map *lock, const char *name, struct lock_class_key *key, int subclass, - short inner, short outer) + u8 inner, u8 outer, u8 lock_type) { int i; @@ -4602,6 +4603,7 @@ void lockdep_init_map_waits(struct lockdep_map *lock, const char *name, lock->wait_type_outer = outer; lock->wait_type_inner = inner; + lock->lock_type = lock_type; /* * No key, no joy, we need to hash something. 
@@ -4636,7 +4638,7 @@ void lockdep_init_map_waits(struct lockdep_map *lock, const char *name, raw_local_irq_restore(flags); } } -EXPORT_SYMBOL_GPL(lockdep_init_map_waits); +EXPORT_SYMBOL_GPL(lockdep_init_map_type); struct lock_class_key __lockdep_no_validate__; EXPORT_SYMBOL_GPL(__lockdep_no_validate__); -- cgit v1.2.3 From bc2dd71b283665f0a409d5b6fc603d5a6fdc219e Mon Sep 17 00:00:00 2001 From: Boqun Feng Date: Thu, 10 Dec 2020 11:02:40 +0100 Subject: locking/lockdep: Add a skip() function to __bfs() Some __bfs() walks will have additional iteration constraints (beyond the path being strong). Provide an additional function to allow terminating graph walks. Signed-off-by: Boqun Feng Signed-off-by: Peter Zijlstra (Intel) --- kernel/locking/lockdep.c | 29 +++++++++++++++++++---------- 1 file changed, 19 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index b061e2991700..f50f026772d7 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -1672,6 +1672,7 @@ static inline struct lock_list *__bfs_next(struct lock_list *lock, int offset) static enum bfs_result __bfs(struct lock_list *source_entry, void *data, bool (*match)(struct lock_list *entry, void *data), + bool (*skip)(struct lock_list *entry, void *data), struct lock_list **target_entry, int offset) { @@ -1732,7 +1733,12 @@ static enum bfs_result __bfs(struct lock_list *source_entry, /* * Step 3: we haven't visited this and there is a strong * dependency path to this, so check with @match. + * If @skip is provide and returns true, we skip this + * lock (and any path this lock is in). */ + if (skip && skip(lock, data)) + continue; + if (match(lock, data)) { *target_entry = lock; return BFS_RMATCH; @@ -1775,9 +1781,10 @@ static inline enum bfs_result __bfs_forwards(struct lock_list *src_entry, void *data, bool (*match)(struct lock_list *entry, void *data), + bool (*skip)(struct lock_list *entry, void *data), struct lock_list **target_entry) { - return __bfs(src_entry, data, match, target_entry, + return __bfs(src_entry, data, match, skip, target_entry, offsetof(struct lock_class, locks_after)); } @@ -1786,9 +1793,10 @@ static inline enum bfs_result __bfs_backwards(struct lock_list *src_entry, void *data, bool (*match)(struct lock_list *entry, void *data), + bool (*skip)(struct lock_list *entry, void *data), struct lock_list **target_entry) { - return __bfs(src_entry, data, match, target_entry, + return __bfs(src_entry, data, match, skip, target_entry, offsetof(struct lock_class, locks_before)); } @@ -2019,7 +2027,7 @@ static unsigned long __lockdep_count_forward_deps(struct lock_list *this) unsigned long count = 0; struct lock_list *target_entry; - __bfs_forwards(this, (void *)&count, noop_count, &target_entry); + __bfs_forwards(this, (void *)&count, noop_count, NULL, &target_entry); return count; } @@ -2044,7 +2052,7 @@ static unsigned long __lockdep_count_backward_deps(struct lock_list *this) unsigned long count = 0; struct lock_list *target_entry; - __bfs_backwards(this, (void *)&count, noop_count, &target_entry); + __bfs_backwards(this, (void *)&count, noop_count, NULL, &target_entry); return count; } @@ -2072,11 +2080,12 @@ unsigned long lockdep_count_backward_deps(struct lock_class *class) static noinline enum bfs_result check_path(struct held_lock *target, struct lock_list *src_entry, bool (*match)(struct lock_list *entry, void *data), + bool (*skip)(struct lock_list *entry, void *data), struct lock_list **target_entry) { enum bfs_result ret; - ret = 
__bfs_forwards(src_entry, target, match, target_entry); + ret = __bfs_forwards(src_entry, target, match, skip, target_entry); if (unlikely(bfs_error(ret))) print_bfs_bug(ret); @@ -2103,7 +2112,7 @@ check_noncircular(struct held_lock *src, struct held_lock *target, debug_atomic_inc(nr_cyclic_checks); - ret = check_path(target, &src_entry, hlock_conflict, &target_entry); + ret = check_path(target, &src_entry, hlock_conflict, NULL, &target_entry); if (unlikely(ret == BFS_RMATCH)) { if (!*trace) { @@ -2152,7 +2161,7 @@ check_redundant(struct held_lock *src, struct held_lock *target) debug_atomic_inc(nr_redundant_checks); - ret = check_path(target, &src_entry, hlock_equal, &target_entry); + ret = check_path(target, &src_entry, hlock_equal, NULL, &target_entry); if (ret == BFS_RMATCH) debug_atomic_inc(nr_redundant); @@ -2246,7 +2255,7 @@ find_usage_forwards(struct lock_list *root, unsigned long usage_mask, debug_atomic_inc(nr_find_usage_forwards_checks); - result = __bfs_forwards(root, &usage_mask, usage_match, target_entry); + result = __bfs_forwards(root, &usage_mask, usage_match, NULL, target_entry); return result; } @@ -2263,7 +2272,7 @@ find_usage_backwards(struct lock_list *root, unsigned long usage_mask, debug_atomic_inc(nr_find_usage_backwards_checks); - result = __bfs_backwards(root, &usage_mask, usage_match, target_entry); + result = __bfs_backwards(root, &usage_mask, usage_match, NULL, target_entry); return result; } @@ -2628,7 +2637,7 @@ static int check_irq_usage(struct task_struct *curr, struct held_lock *prev, */ bfs_init_rootb(&this, prev); - ret = __bfs_backwards(&this, &usage_mask, usage_accumulate, NULL); + ret = __bfs_backwards(&this, &usage_mask, usage_accumulate, NULL, NULL); if (bfs_error(ret)) { print_bfs_bug(ret); return 0; -- cgit v1.2.3 From 175b1a60e8805617d74aefe17ce0d3a32eceb55c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 10 Dec 2020 11:16:34 +0100 Subject: locking/lockdep: Clean up check_redundant() a bit In preparation for adding an TRACE_IRQFLAGS dependent skip function to check_redundant(), move it below the TRACE_IRQFLAGS #ifdef. While there, provide a stub function to reduce #ifdef usage. Signed-off-by: Peter Zijlstra (Intel) --- kernel/locking/lockdep.c | 91 ++++++++++++++++++++++++++---------------------- 1 file changed, 49 insertions(+), 42 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index f50f026772d7..f2ae8a65f667 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -2130,46 +2130,6 @@ check_noncircular(struct held_lock *src, struct held_lock *target, return ret; } -#ifdef CONFIG_LOCKDEP_SMALL -/* - * Check that the dependency graph starting at can lead to - * or not. If it can, -> dependency is already - * in the graph. - * - * Return BFS_RMATCH if it does, or BFS_RMATCH if it does not, return BFS_E* if - * any error appears in the bfs search. - */ -static noinline enum bfs_result -check_redundant(struct held_lock *src, struct held_lock *target) -{ - enum bfs_result ret; - struct lock_list *target_entry; - struct lock_list src_entry; - - bfs_init_root(&src_entry, src); - /* - * Special setup for check_redundant(). - * - * To report redundant, we need to find a strong dependency path that - * is equal to or stronger than -> . So if is E, - * we need to let __bfs() only search for a path starting at a -(E*)->, - * we achieve this by setting the initial node's ->only_xr to true in - * that case. 
And if is S, we set initial ->only_xr to false - * because both -(S*)-> (equal) and -(E*)-> (stronger) are redundant. - */ - src_entry.only_xr = src->read == 0; - - debug_atomic_inc(nr_redundant_checks); - - ret = check_path(target, &src_entry, hlock_equal, NULL, &target_entry); - - if (ret == BFS_RMATCH) - debug_atomic_inc(nr_redundant); - - return ret; -} -#endif - #ifdef CONFIG_TRACE_IRQFLAGS /* @@ -2706,6 +2666,55 @@ static inline int check_irq_usage(struct task_struct *curr, } #endif /* CONFIG_TRACE_IRQFLAGS */ +#ifdef CONFIG_LOCKDEP_SMALL +/* + * Check that the dependency graph starting at can lead to + * or not. If it can, -> dependency is already + * in the graph. + * + * Return BFS_RMATCH if it does, or BFS_RMATCH if it does not, return BFS_E* if + * any error appears in the bfs search. + */ +static noinline enum bfs_result +check_redundant(struct held_lock *src, struct held_lock *target) +{ + enum bfs_result ret; + struct lock_list *target_entry; + struct lock_list src_entry; + + bfs_init_root(&src_entry, src); + /* + * Special setup for check_redundant(). + * + * To report redundant, we need to find a strong dependency path that + * is equal to or stronger than -> . So if is E, + * we need to let __bfs() only search for a path starting at a -(E*)->, + * we achieve this by setting the initial node's ->only_xr to true in + * that case. And if is S, we set initial ->only_xr to false + * because both -(S*)-> (equal) and -(E*)-> (stronger) are redundant. + */ + src_entry.only_xr = src->read == 0; + + debug_atomic_inc(nr_redundant_checks); + + ret = check_path(target, &src_entry, hlock_equal, NULL, &target_entry); + + if (ret == BFS_RMATCH) + debug_atomic_inc(nr_redundant); + + return ret; +} + +#else + +static inline enum bfs_result +check_redundant(struct held_lock *src, struct held_lock *target) +{ + return BFS_RNOMATCH; +} + +#endif + static void inc_chains(int irq_context) { if (irq_context & LOCK_CHAIN_HARDIRQ_CONTEXT) @@ -2926,7 +2935,6 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, } } -#ifdef CONFIG_LOCKDEP_SMALL /* * Is the -> link redundant? */ @@ -2935,7 +2943,6 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, return 0; else if (ret == BFS_RMATCH) return 2; -#endif if (!*trace) { *trace = save_trace(); -- cgit v1.2.3 From 5f2962401c6e195222f320d12b3a55377b2d4653 Mon Sep 17 00:00:00 2001 From: Boqun Feng Date: Thu, 10 Dec 2020 11:15:00 +0100 Subject: locking/lockdep: Exclude local_lock_t from IRQ inversions The purpose of local_lock_t is to abstract: preempt_disable() / local_bh_disable() / local_irq_disable(). These are the traditional means of gaining access to per-cpu data, but are fundamentally non-preemptible. local_lock_t provides a per-cpu lock, that on !PREEMPT_RT reduces to no-ops, just like regular spinlocks do on UP. This gives rise to: CPU0 CPU1 local_lock(B) spin_lock_irq(A) spin_lock(A) local_lock(B) Where lockdep then figures things will lock up; which would be true if B were any other kind of lock. However this is a false positive, no such deadlock actually exists. For !RT the above local_lock(B) is preempt_disable(), and there's obviously no deadlock; alternatively, CPU0's B != CPU1's B. For RT the argument is that since local_lock() nests inside spin_lock(), it cannot be used in hardirq context, and therefore CPU0 cannot in fact happen. 
Even though B is a real lock, it is a preemptible lock and any threaded-irq would simply schedule out and let the preempted task (which holds B) continue such that the task on CPU1 can make progress, after which the threaded-irq resumes and can finish. This means that we can never form an IRQ inversion on a local_lock dependency, so terminate the graph walk when looking for IRQ inversions when we encounter one. One consequence is that (for LOCKDEP_SMALL) when we look for redundant dependencies, A -> B is not redundant in the presence of A -> L -> B. Signed-off-by: Boqun Feng [peterz: Changelog] Signed-off-by: Peter Zijlstra (Intel) --- kernel/locking/lockdep.c | 57 ++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 53 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index f2ae8a65f667..ad9afd8c7eb9 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -2200,6 +2200,44 @@ static inline bool usage_match(struct lock_list *entry, void *mask) return !!((entry->class->usage_mask & LOCKF_IRQ) & *(unsigned long *)mask); } +static inline bool usage_skip(struct lock_list *entry, void *mask) +{ + /* + * Skip local_lock() for irq inversion detection. + * + * For !RT, local_lock() is not a real lock, so it won't carry any + * dependency. + * + * For RT, an irq inversion happens when we have lock A and B, and on + * some CPU we can have: + * + * lock(A); + * + * lock(B); + * + * where lock(B) cannot sleep, and we have a dependency B -> ... -> A. + * + * Now we prove local_lock() cannot exist in that dependency. First we + * have the observation for any lock chain L1 -> ... -> Ln, for any + * 1 <= i <= n, Li.inner_wait_type <= L1.inner_wait_type, otherwise + * wait context check will complain. And since B is not a sleep lock, + * therefore B.inner_wait_type >= 2, and since the inner_wait_type of + * local_lock() is 3, which is greater than 2, therefore there is no + * way the local_lock() exists in the dependency B -> ... -> A. + * + * As a result, we will skip local_lock(), when we search for irq + * inversion bugs. + */ + if (entry->class->lock_type == LD_LOCK_PERCPU) { + if (DEBUG_LOCKS_WARN_ON(entry->class->wait_type_inner < LD_WAIT_CONFIG)) + return false; + + return true; + } + + return false; +} + /* * Find a node in the forwards-direction dependency sub-graph starting * at @root->class that matches @bit. 
@@ -2215,7 +2253,7 @@ find_usage_forwards(struct lock_list *root, unsigned long usage_mask, debug_atomic_inc(nr_find_usage_forwards_checks); - result = __bfs_forwards(root, &usage_mask, usage_match, NULL, target_entry); + result = __bfs_forwards(root, &usage_mask, usage_match, usage_skip, target_entry); return result; } @@ -2232,7 +2270,7 @@ find_usage_backwards(struct lock_list *root, unsigned long usage_mask, debug_atomic_inc(nr_find_usage_backwards_checks); - result = __bfs_backwards(root, &usage_mask, usage_match, NULL, target_entry); + result = __bfs_backwards(root, &usage_mask, usage_match, usage_skip, target_entry); return result; } @@ -2597,7 +2635,7 @@ static int check_irq_usage(struct task_struct *curr, struct held_lock *prev, */ bfs_init_rootb(&this, prev); - ret = __bfs_backwards(&this, &usage_mask, usage_accumulate, NULL, NULL); + ret = __bfs_backwards(&this, &usage_mask, usage_accumulate, usage_skip, NULL); if (bfs_error(ret)) { print_bfs_bug(ret); return 0; @@ -2664,6 +2702,12 @@ static inline int check_irq_usage(struct task_struct *curr, { return 1; } + +static inline bool usage_skip(struct lock_list *entry, void *mask) +{ + return false; +} + #endif /* CONFIG_TRACE_IRQFLAGS */ #ifdef CONFIG_LOCKDEP_SMALL @@ -2697,7 +2741,12 @@ check_redundant(struct held_lock *src, struct held_lock *target) debug_atomic_inc(nr_redundant_checks); - ret = check_path(target, &src_entry, hlock_equal, NULL, &target_entry); + /* + * Note: we skip local_lock() for redundant check, because as the + * comment in usage_skip(), A -> local_lock() -> B and A -> B are not + * the same. + */ + ret = check_path(target, &src_entry, hlock_equal, usage_skip, &target_entry); if (ret == BFS_RMATCH) debug_atomic_inc(nr_redundant); -- cgit v1.2.3 From 91c960b0056672e74627776655c926388350fa30 Mon Sep 17 00:00:00 2001 From: Brendan Jackman Date: Thu, 14 Jan 2021 18:17:44 +0000 Subject: bpf: Rename BPF_XADD and prepare to encode other atomics in .imm MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A subsequent patch will add additional atomic operations. These new operations will use the same opcode field as the existing XADD, with the immediate discriminating different operations. In preparation, rename the instruction mode BPF_ATOMIC and start calling the zero immediate BPF_ADD. This is possible (doesn't break existing valid BPF progs) because the immediate field is currently reserved MBZ and BPF_ADD is zero. All uses are removed from the tree but the BPF_XADD definition is kept around to avoid breaking builds for people including kernel headers. 
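A minimal sketch of the two spellings, using the BPF_STX_XADD and BPF_ATOMIC_OP macros from include/linux/filter.h as changed by this patch (the wrapper function and register choices are illustrative only):

	#include <linux/filter.h>

	static void example_atomic_add(struct bpf_insn *insn)
	{
		/* Legacy spelling: mode formerly called BPF_XADD. */
		insn[0] = BPF_STX_XADD(BPF_DW, BPF_REG_0, BPF_REG_1, 0);

		/* New spelling: same byte encoding, but the operation
		 * (BPF_ADD) is carried in .imm, leaving room for further
		 * atomic operations in later patches.
		 */
		insn[1] = BPF_ATOMIC_OP(BPF_DW, BPF_ADD, BPF_REG_0, BPF_REG_1, 0);
	}

After this patch the legacy macro is just an alias, so insn[0] and insn[1] are identical; and since BPF_ADD is 0x00, a pre-existing program that encoded BPF_XADD with a zero immediate is bit-for-bit the same as the new BPF_ATOMIC | BPF_ADD form.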
Signed-off-by: Brendan Jackman Signed-off-by: Alexei Starovoitov Acked-by: Björn Töpel Link: https://lore.kernel.org/bpf/20210114181751.768687-5-jackmanb@google.com --- Documentation/networking/filter.rst | 30 ++++++++------ arch/arm/net/bpf_jit_32.c | 7 ++-- arch/arm64/net/bpf_jit_comp.c | 16 ++++++-- arch/mips/net/ebpf_jit.c | 11 ++++-- arch/powerpc/net/bpf_jit_comp64.c | 25 +++++++++--- arch/riscv/net/bpf_jit_comp32.c | 20 ++++++++-- arch/riscv/net/bpf_jit_comp64.c | 16 ++++++-- arch/s390/net/bpf_jit_comp.c | 27 +++++++------ arch/sparc/net/bpf_jit_comp_64.c | 17 ++++++-- arch/x86/net/bpf_jit_comp.c | 46 ++++++++++++++++------ arch/x86/net/bpf_jit_comp32.c | 6 +-- drivers/net/ethernet/netronome/nfp/bpf/jit.c | 14 +++++-- drivers/net/ethernet/netronome/nfp/bpf/main.h | 4 +- drivers/net/ethernet/netronome/nfp/bpf/verifier.c | 15 ++++--- include/linux/filter.h | 16 ++++++-- include/uapi/linux/bpf.h | 5 ++- kernel/bpf/core.c | 31 ++++++++++----- kernel/bpf/disasm.c | 6 ++- kernel/bpf/verifier.c | 24 ++++++----- lib/test_bpf.c | 14 +++---- samples/bpf/bpf_insn.h | 4 +- samples/bpf/cookie_uid_helper_example.c | 8 ++-- samples/bpf/sock_example.c | 2 +- samples/bpf/test_cgrp2_attach.c | 5 ++- tools/include/linux/filter.h | 15 +++++-- tools/include/uapi/linux/bpf.h | 5 ++- .../selftests/bpf/prog_tests/cgroup_attach_multi.c | 4 +- tools/testing/selftests/bpf/test_cgroup_storage.c | 2 +- tools/testing/selftests/bpf/verifier/ctx.c | 7 ++-- .../selftests/bpf/verifier/direct_packet_access.c | 4 +- tools/testing/selftests/bpf/verifier/leak_ptr.c | 10 ++--- tools/testing/selftests/bpf/verifier/meta_access.c | 4 +- tools/testing/selftests/bpf/verifier/unpriv.c | 3 +- .../selftests/bpf/verifier/value_illegal_alu.c | 2 +- tools/testing/selftests/bpf/verifier/xadd.c | 18 ++++----- 35 files changed, 291 insertions(+), 152 deletions(-) (limited to 'kernel') diff --git a/Documentation/networking/filter.rst b/Documentation/networking/filter.rst index debb59e374de..1583d59d806d 100644 --- a/Documentation/networking/filter.rst +++ b/Documentation/networking/filter.rst @@ -1006,13 +1006,13 @@ Size modifier is one of ... Mode modifier is one of:: - BPF_IMM 0x00 /* used for 32-bit mov in classic BPF and 64-bit in eBPF */ - BPF_ABS 0x20 - BPF_IND 0x40 - BPF_MEM 0x60 - BPF_LEN 0x80 /* classic BPF only, reserved in eBPF */ - BPF_MSH 0xa0 /* classic BPF only, reserved in eBPF */ - BPF_XADD 0xc0 /* eBPF only, exclusive add */ + BPF_IMM 0x00 /* used for 32-bit mov in classic BPF and 64-bit in eBPF */ + BPF_ABS 0x20 + BPF_IND 0x40 + BPF_MEM 0x60 + BPF_LEN 0x80 /* classic BPF only, reserved in eBPF */ + BPF_MSH 0xa0 /* classic BPF only, reserved in eBPF */ + BPF_ATOMIC 0xc0 /* eBPF only, atomic operations */ eBPF has two non-generic instructions: (BPF_ABS | | BPF_LD) and (BPF_IND | | BPF_LD) which are used to access packet data. @@ -1044,11 +1044,19 @@ Unlike classic BPF instruction set, eBPF has generic load/store operations:: BPF_MEM | | BPF_STX: *(size *) (dst_reg + off) = src_reg BPF_MEM | | BPF_ST: *(size *) (dst_reg + off) = imm32 BPF_MEM | | BPF_LDX: dst_reg = *(size *) (src_reg + off) - BPF_XADD | BPF_W | BPF_STX: lock xadd *(u32 *)(dst_reg + off16) += src_reg - BPF_XADD | BPF_DW | BPF_STX: lock xadd *(u64 *)(dst_reg + off16) += src_reg -Where size is one of: BPF_B or BPF_H or BPF_W or BPF_DW. Note that 1 and -2 byte atomic increments are not supported. +Where size is one of: BPF_B or BPF_H or BPF_W or BPF_DW. + +It also includes atomic operations, which use the immediate field for extra +encoding. 
+ + .imm = BPF_ADD, .code = BPF_ATOMIC | BPF_W | BPF_STX: lock xadd *(u32 *)(dst_reg + off16) += src_reg + .imm = BPF_ADD, .code = BPF_ATOMIC | BPF_DW | BPF_STX: lock xadd *(u64 *)(dst_reg + off16) += src_reg + +Note that 1 and 2 byte atomic operations are not supported. + +You may encounter BPF_XADD - this is a legacy name for BPF_ATOMIC, referring to +the exclusive-add operation encoded when the immediate field is zero. eBPF has one 16-byte instruction: BPF_LD | BPF_DW | BPF_IMM which consists of two consecutive ``struct bpf_insn`` 8-byte blocks and interpreted as single diff --git a/arch/arm/net/bpf_jit_32.c b/arch/arm/net/bpf_jit_32.c index 0207b6ea6e8a..897634d0a67c 100644 --- a/arch/arm/net/bpf_jit_32.c +++ b/arch/arm/net/bpf_jit_32.c @@ -1620,10 +1620,9 @@ exit: } emit_str_r(dst_lo, tmp2, off, ctx, BPF_SIZE(code)); break; - /* STX XADD: lock *(u32 *)(dst + off) += src */ - case BPF_STX | BPF_XADD | BPF_W: - /* STX XADD: lock *(u64 *)(dst + off) += src */ - case BPF_STX | BPF_XADD | BPF_DW: + /* Atomic ops */ + case BPF_STX | BPF_ATOMIC | BPF_W: + case BPF_STX | BPF_ATOMIC | BPF_DW: goto notyet; /* STX: *(size *)(dst + off) = src */ case BPF_STX | BPF_MEM | BPF_W: diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c index ef9f1d5e989d..f7b194878a99 100644 --- a/arch/arm64/net/bpf_jit_comp.c +++ b/arch/arm64/net/bpf_jit_comp.c @@ -875,10 +875,18 @@ emit_cond_jmp: } break; - /* STX XADD: lock *(u32 *)(dst + off) += src */ - case BPF_STX | BPF_XADD | BPF_W: - /* STX XADD: lock *(u64 *)(dst + off) += src */ - case BPF_STX | BPF_XADD | BPF_DW: + case BPF_STX | BPF_ATOMIC | BPF_W: + case BPF_STX | BPF_ATOMIC | BPF_DW: + if (insn->imm != BPF_ADD) { + pr_err_once("unknown atomic op code %02x\n", insn->imm); + return -EINVAL; + } + + /* STX XADD: lock *(u32 *)(dst + off) += src + * and + * STX XADD: lock *(u64 *)(dst + off) += src + */ + if (!off) { reg = dst; } else { diff --git a/arch/mips/net/ebpf_jit.c b/arch/mips/net/ebpf_jit.c index 561154cbcc40..939dd06764bc 100644 --- a/arch/mips/net/ebpf_jit.c +++ b/arch/mips/net/ebpf_jit.c @@ -1423,8 +1423,8 @@ jeq_common: case BPF_STX | BPF_H | BPF_MEM: case BPF_STX | BPF_W | BPF_MEM: case BPF_STX | BPF_DW | BPF_MEM: - case BPF_STX | BPF_W | BPF_XADD: - case BPF_STX | BPF_DW | BPF_XADD: + case BPF_STX | BPF_W | BPF_ATOMIC: + case BPF_STX | BPF_DW | BPF_ATOMIC: if (insn->dst_reg == BPF_REG_10) { ctx->flags |= EBPF_SEEN_FP; dst = MIPS_R_SP; @@ -1438,7 +1438,12 @@ jeq_common: src = ebpf_to_mips_reg(ctx, insn, src_reg_no_fp); if (src < 0) return src; - if (BPF_MODE(insn->code) == BPF_XADD) { + if (BPF_MODE(insn->code) == BPF_ATOMIC) { + if (insn->imm != BPF_ADD) { + pr_err("ATOMIC OP %02x NOT HANDLED\n", insn->imm); + return -EINVAL; + } + /* * If mem_off does not fit within the 9 bit ll/sc * instruction immediate field, use a temp reg. 
diff --git a/arch/powerpc/net/bpf_jit_comp64.c b/arch/powerpc/net/bpf_jit_comp64.c index 022103c6a201..aaf1a887f653 100644 --- a/arch/powerpc/net/bpf_jit_comp64.c +++ b/arch/powerpc/net/bpf_jit_comp64.c @@ -683,10 +683,18 @@ emit_clear: break; /* - * BPF_STX XADD (atomic_add) + * BPF_STX ATOMIC (atomic ops) */ - /* *(u32 *)(dst + off) += src */ - case BPF_STX | BPF_XADD | BPF_W: + case BPF_STX | BPF_ATOMIC | BPF_W: + if (insn->imm != BPF_ADD) { + pr_err_ratelimited( + "eBPF filter atomic op code %02x (@%d) unsupported\n", + code, i); + return -ENOTSUPP; + } + + /* *(u32 *)(dst + off) += src */ + /* Get EA into TMP_REG_1 */ EMIT(PPC_RAW_ADDI(b2p[TMP_REG_1], dst_reg, off)); tmp_idx = ctx->idx * 4; @@ -699,8 +707,15 @@ emit_clear: /* we're done if this succeeded */ PPC_BCC_SHORT(COND_NE, tmp_idx); break; - /* *(u64 *)(dst + off) += src */ - case BPF_STX | BPF_XADD | BPF_DW: + case BPF_STX | BPF_ATOMIC | BPF_DW: + if (insn->imm != BPF_ADD) { + pr_err_ratelimited( + "eBPF filter atomic op code %02x (@%d) unsupported\n", + code, i); + return -ENOTSUPP; + } + /* *(u64 *)(dst + off) += src */ + EMIT(PPC_RAW_ADDI(b2p[TMP_REG_1], dst_reg, off)); tmp_idx = ctx->idx * 4; EMIT(PPC_RAW_LDARX(b2p[TMP_REG_2], 0, b2p[TMP_REG_1], 0)); diff --git a/arch/riscv/net/bpf_jit_comp32.c b/arch/riscv/net/bpf_jit_comp32.c index 579575f9cdae..81de865f4c7c 100644 --- a/arch/riscv/net/bpf_jit_comp32.c +++ b/arch/riscv/net/bpf_jit_comp32.c @@ -881,7 +881,7 @@ static int emit_store_r64(const s8 *dst, const s8 *src, s16 off, const s8 *rd = bpf_get_reg64(dst, tmp1, ctx); const s8 *rs = bpf_get_reg64(src, tmp2, ctx); - if (mode == BPF_XADD && size != BPF_W) + if (mode == BPF_ATOMIC && size != BPF_W) return -1; emit_imm(RV_REG_T0, off, ctx); @@ -899,7 +899,7 @@ static int emit_store_r64(const s8 *dst, const s8 *src, s16 off, case BPF_MEM: emit(rv_sw(RV_REG_T0, 0, lo(rs)), ctx); break; - case BPF_XADD: + case BPF_ATOMIC: /* Only BPF_ADD supported */ emit(rv_amoadd_w(RV_REG_ZERO, lo(rs), RV_REG_T0, 0, 0), ctx); break; @@ -1260,7 +1260,6 @@ int bpf_jit_emit_insn(const struct bpf_insn *insn, struct rv_jit_context *ctx, case BPF_STX | BPF_MEM | BPF_H: case BPF_STX | BPF_MEM | BPF_W: case BPF_STX | BPF_MEM | BPF_DW: - case BPF_STX | BPF_XADD | BPF_W: if (BPF_CLASS(code) == BPF_ST) { emit_imm32(tmp2, imm, ctx); src = tmp2; @@ -1271,8 +1270,21 @@ int bpf_jit_emit_insn(const struct bpf_insn *insn, struct rv_jit_context *ctx, return -1; break; + case BPF_STX | BPF_ATOMIC | BPF_W: + if (insn->imm != BPF_ADD) { + pr_info_once( + "bpf-jit: not supported: atomic operation %02x ***\n", + insn->imm); + return -EFAULT; + } + + if (emit_store_r64(dst, src, off, ctx, BPF_SIZE(code), + BPF_MODE(code))) + return -1; + break; + /* No hardware support for 8-byte atomics in RV32. */ - case BPF_STX | BPF_XADD | BPF_DW: + case BPF_STX | BPF_ATOMIC | BPF_DW: /* Fallthrough. 
*/ notsupported: diff --git a/arch/riscv/net/bpf_jit_comp64.c b/arch/riscv/net/bpf_jit_comp64.c index 8a56b5293117..b44ff52f84a6 100644 --- a/arch/riscv/net/bpf_jit_comp64.c +++ b/arch/riscv/net/bpf_jit_comp64.c @@ -1027,10 +1027,18 @@ out_be: emit_add(RV_REG_T1, RV_REG_T1, rd, ctx); emit_sd(RV_REG_T1, 0, rs, ctx); break; - /* STX XADD: lock *(u32 *)(dst + off) += src */ - case BPF_STX | BPF_XADD | BPF_W: - /* STX XADD: lock *(u64 *)(dst + off) += src */ - case BPF_STX | BPF_XADD | BPF_DW: + case BPF_STX | BPF_ATOMIC | BPF_W: + case BPF_STX | BPF_ATOMIC | BPF_DW: + if (insn->imm != BPF_ADD) { + pr_err("bpf-jit: not supported: atomic operation %02x ***\n", + insn->imm); + return -EINVAL; + } + + /* atomic_add: lock *(u32 *)(dst + off) += src + * atomic_add: lock *(u64 *)(dst + off) += src + */ + if (off) { if (is_12b_int(off)) { emit_addi(RV_REG_T1, rd, off, ctx); diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c index 0a4182792876..f973e2ead197 100644 --- a/arch/s390/net/bpf_jit_comp.c +++ b/arch/s390/net/bpf_jit_comp.c @@ -1205,18 +1205,23 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, jit->seen |= SEEN_MEM; break; /* - * BPF_STX XADD (atomic_add) + * BPF_ATOMIC */ - case BPF_STX | BPF_XADD | BPF_W: /* *(u32 *)(dst + off) += src */ - /* laal %w0,%src,off(%dst) */ - EMIT6_DISP_LH(0xeb000000, 0x00fa, REG_W0, src_reg, - dst_reg, off); - jit->seen |= SEEN_MEM; - break; - case BPF_STX | BPF_XADD | BPF_DW: /* *(u64 *)(dst + off) += src */ - /* laalg %w0,%src,off(%dst) */ - EMIT6_DISP_LH(0xeb000000, 0x00ea, REG_W0, src_reg, - dst_reg, off); + case BPF_STX | BPF_ATOMIC | BPF_DW: + case BPF_STX | BPF_ATOMIC | BPF_W: + if (insn->imm != BPF_ADD) { + pr_err("Unknown atomic operation %02x\n", insn->imm); + return -1; + } + + /* *(u32/u64 *)(dst + off) += src + * + * BFW_W: laal %w0,%src,off(%dst) + * BPF_DW: laalg %w0,%src,off(%dst) + */ + EMIT6_DISP_LH(0xeb000000, + BPF_SIZE(insn->code) == BPF_W ? 
0x00fa : 0x00ea, + REG_W0, src_reg, dst_reg, off); jit->seen |= SEEN_MEM; break; /* diff --git a/arch/sparc/net/bpf_jit_comp_64.c b/arch/sparc/net/bpf_jit_comp_64.c index 3364e2a00989..4b8d3c65d266 100644 --- a/arch/sparc/net/bpf_jit_comp_64.c +++ b/arch/sparc/net/bpf_jit_comp_64.c @@ -1366,12 +1366,18 @@ static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx) break; } - /* STX XADD: lock *(u32 *)(dst + off) += src */ - case BPF_STX | BPF_XADD | BPF_W: { + case BPF_STX | BPF_ATOMIC | BPF_W: { const u8 tmp = bpf2sparc[TMP_REG_1]; const u8 tmp2 = bpf2sparc[TMP_REG_2]; const u8 tmp3 = bpf2sparc[TMP_REG_3]; + if (insn->imm != BPF_ADD) { + pr_err_once("unknown atomic op %02x\n", insn->imm); + return -EINVAL; + } + + /* lock *(u32 *)(dst + off) += src */ + if (insn->dst_reg == BPF_REG_FP) ctx->saw_frame_pointer = true; @@ -1390,11 +1396,16 @@ static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx) break; } /* STX XADD: lock *(u64 *)(dst + off) += src */ - case BPF_STX | BPF_XADD | BPF_DW: { + case BPF_STX | BPF_ATOMIC | BPF_DW: { const u8 tmp = bpf2sparc[TMP_REG_1]; const u8 tmp2 = bpf2sparc[TMP_REG_2]; const u8 tmp3 = bpf2sparc[TMP_REG_3]; + if (insn->imm != BPF_ADD) { + pr_err_once("unknown atomic op %02x\n", insn->imm); + return -EINVAL; + } + if (insn->dst_reg == BPF_REG_FP) ctx->saw_frame_pointer = true; diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 93f32e0ba0ef..b1829a534da1 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -795,6 +795,33 @@ static void emit_stx(u8 **pprog, u32 size, u32 dst_reg, u32 src_reg, int off) *pprog = prog; } +static int emit_atomic(u8 **pprog, u8 atomic_op, + u32 dst_reg, u32 src_reg, s16 off, u8 bpf_size) +{ + u8 *prog = *pprog; + int cnt = 0; + + EMIT1(0xF0); /* lock prefix */ + + maybe_emit_mod(&prog, dst_reg, src_reg, bpf_size == BPF_DW); + + /* emit opcode */ + switch (atomic_op) { + case BPF_ADD: + /* lock *(u32/u64*)(dst_reg + off) = src_reg */ + EMIT1(simple_alu_opcodes[atomic_op]); + break; + default: + pr_err("bpf_jit: unknown atomic opcode %02x\n", atomic_op); + return -EFAULT; + } + + emit_insn_suffix(&prog, dst_reg, src_reg, off); + + *pprog = prog; + return 0; +} + static bool ex_handler_bpf(const struct exception_table_entry *x, struct pt_regs *regs, int trapnr, unsigned long error_code, unsigned long fault_addr) @@ -839,6 +866,7 @@ static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, int i, cnt = 0, excnt = 0; int proglen = 0; u8 *prog = temp; + int err; detect_reg_usage(insn, insn_cnt, callee_regs_used, &tail_call_seen); @@ -1250,18 +1278,12 @@ st: if (is_imm8(insn->off)) } break; - /* STX XADD: lock *(u32*)(dst_reg + off) += src_reg */ - case BPF_STX | BPF_XADD | BPF_W: - /* Emit 'lock add dword ptr [rax + off], eax' */ - if (is_ereg(dst_reg) || is_ereg(src_reg)) - EMIT3(0xF0, add_2mod(0x40, dst_reg, src_reg), 0x01); - else - EMIT2(0xF0, 0x01); - goto xadd; - case BPF_STX | BPF_XADD | BPF_DW: - EMIT3(0xF0, add_2mod(0x48, dst_reg, src_reg), 0x01); -xadd: - emit_modrm_dstoff(&prog, dst_reg, src_reg, insn->off); + case BPF_STX | BPF_ATOMIC | BPF_W: + case BPF_STX | BPF_ATOMIC | BPF_DW: + err = emit_atomic(&prog, insn->imm, dst_reg, src_reg, + insn->off, BPF_SIZE(insn->code)); + if (err) + return err; break; /* call */ diff --git a/arch/x86/net/bpf_jit_comp32.c b/arch/x86/net/bpf_jit_comp32.c index 96fde03aa987..d17b67c69f89 100644 --- a/arch/x86/net/bpf_jit_comp32.c +++ b/arch/x86/net/bpf_jit_comp32.c @@ -2243,10 +2243,8 @@ emit_jmp: return 
-EFAULT; } break; - /* STX XADD: lock *(u32 *)(dst + off) += src */ - case BPF_STX | BPF_XADD | BPF_W: - /* STX XADD: lock *(u64 *)(dst + off) += src */ - case BPF_STX | BPF_XADD | BPF_DW: + case BPF_STX | BPF_ATOMIC | BPF_W: + case BPF_STX | BPF_ATOMIC | BPF_DW: goto notyet; case BPF_JMP | BPF_EXIT: if (seen_exit) { diff --git a/drivers/net/ethernet/netronome/nfp/bpf/jit.c b/drivers/net/ethernet/netronome/nfp/bpf/jit.c index 0a721f6e8676..e31f8fbbc696 100644 --- a/drivers/net/ethernet/netronome/nfp/bpf/jit.c +++ b/drivers/net/ethernet/netronome/nfp/bpf/jit.c @@ -3109,13 +3109,19 @@ mem_xadd(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta, bool is64) return 0; } -static int mem_xadd4(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta) +static int mem_atomic4(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta) { + if (meta->insn.imm != BPF_ADD) + return -EOPNOTSUPP; + return mem_xadd(nfp_prog, meta, false); } -static int mem_xadd8(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta) +static int mem_atomic8(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta) { + if (meta->insn.imm != BPF_ADD) + return -EOPNOTSUPP; + return mem_xadd(nfp_prog, meta, true); } @@ -3475,8 +3481,8 @@ static const instr_cb_t instr_cb[256] = { [BPF_STX | BPF_MEM | BPF_H] = mem_stx2, [BPF_STX | BPF_MEM | BPF_W] = mem_stx4, [BPF_STX | BPF_MEM | BPF_DW] = mem_stx8, - [BPF_STX | BPF_XADD | BPF_W] = mem_xadd4, - [BPF_STX | BPF_XADD | BPF_DW] = mem_xadd8, + [BPF_STX | BPF_ATOMIC | BPF_W] = mem_atomic4, + [BPF_STX | BPF_ATOMIC | BPF_DW] = mem_atomic8, [BPF_ST | BPF_MEM | BPF_B] = mem_st1, [BPF_ST | BPF_MEM | BPF_H] = mem_st2, [BPF_ST | BPF_MEM | BPF_W] = mem_st4, diff --git a/drivers/net/ethernet/netronome/nfp/bpf/main.h b/drivers/net/ethernet/netronome/nfp/bpf/main.h index fac9c6f9e197..d0e17eebddd9 100644 --- a/drivers/net/ethernet/netronome/nfp/bpf/main.h +++ b/drivers/net/ethernet/netronome/nfp/bpf/main.h @@ -428,9 +428,9 @@ static inline bool is_mbpf_classic_store_pkt(const struct nfp_insn_meta *meta) return is_mbpf_classic_store(meta) && meta->ptr.type == PTR_TO_PACKET; } -static inline bool is_mbpf_xadd(const struct nfp_insn_meta *meta) +static inline bool is_mbpf_atomic(const struct nfp_insn_meta *meta) { - return (meta->insn.code & ~BPF_SIZE_MASK) == (BPF_STX | BPF_XADD); + return (meta->insn.code & ~BPF_SIZE_MASK) == (BPF_STX | BPF_ATOMIC); } static inline bool is_mbpf_mul(const struct nfp_insn_meta *meta) diff --git a/drivers/net/ethernet/netronome/nfp/bpf/verifier.c b/drivers/net/ethernet/netronome/nfp/bpf/verifier.c index e92ee510fd52..9d235c0ce46a 100644 --- a/drivers/net/ethernet/netronome/nfp/bpf/verifier.c +++ b/drivers/net/ethernet/netronome/nfp/bpf/verifier.c @@ -479,7 +479,7 @@ nfp_bpf_check_ptr(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta, pr_vlog(env, "map writes not supported\n"); return -EOPNOTSUPP; } - if (is_mbpf_xadd(meta)) { + if (is_mbpf_atomic(meta)) { err = nfp_bpf_map_mark_used(env, meta, reg, NFP_MAP_USE_ATOMIC_CNT); if (err) @@ -523,12 +523,17 @@ exit_check_ptr: } static int -nfp_bpf_check_xadd(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta, - struct bpf_verifier_env *env) +nfp_bpf_check_atomic(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta, + struct bpf_verifier_env *env) { const struct bpf_reg_state *sreg = cur_regs(env) + meta->insn.src_reg; const struct bpf_reg_state *dreg = cur_regs(env) + meta->insn.dst_reg; + if (meta->insn.imm != BPF_ADD) { + pr_vlog(env, "atomic op not implemented: %d\n", meta->insn.imm); + return -EOPNOTSUPP; + } + if 
(dreg->type != PTR_TO_MAP_VALUE) { pr_vlog(env, "atomic add not to a map value pointer: %d\n", dreg->type); @@ -655,8 +660,8 @@ int nfp_verify_insn(struct bpf_verifier_env *env, int insn_idx, if (is_mbpf_store(meta)) return nfp_bpf_check_store(nfp_prog, meta, env); - if (is_mbpf_xadd(meta)) - return nfp_bpf_check_xadd(nfp_prog, meta, env); + if (is_mbpf_atomic(meta)) + return nfp_bpf_check_atomic(nfp_prog, meta, env); if (is_mbpf_alu(meta)) return nfp_bpf_check_alu(nfp_prog, meta, env); diff --git a/include/linux/filter.h b/include/linux/filter.h index 5edf2b660881..392e94b79668 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -259,15 +259,23 @@ static inline bool insn_is_zext(const struct bpf_insn *insn) .off = OFF, \ .imm = 0 }) -/* Atomic memory add, *(uint *)(dst_reg + off16) += src_reg */ -#define BPF_STX_XADD(SIZE, DST, SRC, OFF) \ +/* + * Atomic operations: + * + * BPF_ADD *(uint *) (dst_reg + off16) += src_reg + */ + +#define BPF_ATOMIC_OP(SIZE, OP, DST, SRC, OFF) \ ((struct bpf_insn) { \ - .code = BPF_STX | BPF_SIZE(SIZE) | BPF_XADD, \ + .code = BPF_STX | BPF_SIZE(SIZE) | BPF_ATOMIC, \ .dst_reg = DST, \ .src_reg = SRC, \ .off = OFF, \ - .imm = 0 }) + .imm = OP }) + +/* Legacy alias */ +#define BPF_STX_XADD(SIZE, DST, SRC, OFF) BPF_ATOMIC_OP(SIZE, BPF_ADD, DST, SRC, OFF) /* Memory store, *(uint *) (dst_reg + off16) = imm32 */ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index a1ad32456f89..6b3996343e63 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -19,7 +19,8 @@ /* ld/ldx fields */ #define BPF_DW 0x18 /* double word (64-bit) */ -#define BPF_XADD 0xc0 /* exclusive add */ +#define BPF_ATOMIC 0xc0 /* atomic memory ops - op type in immediate */ +#define BPF_XADD 0xc0 /* exclusive add - legacy name */ /* alu/jmp fields */ #define BPF_MOV 0xb0 /* mov reg to reg */ @@ -2448,7 +2449,7 @@ union bpf_attr { * running simultaneously. * * A user should care about the synchronization by himself. - * For example, by using the **BPF_STX_XADD** instruction to alter + * For example, by using the **BPF_ATOMIC** instructions to alter * the shared data. * Return * A pointer to the local storage area. diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 69c3c308de5e..4836ebf459cf 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1309,8 +1309,8 @@ EXPORT_SYMBOL_GPL(__bpf_call_base); INSN_3(STX, MEM, H), \ INSN_3(STX, MEM, W), \ INSN_3(STX, MEM, DW), \ - INSN_3(STX, XADD, W), \ - INSN_3(STX, XADD, DW), \ + INSN_3(STX, ATOMIC, W), \ + INSN_3(STX, ATOMIC, DW), \ /* Immediate based. */ \ INSN_3(ST, MEM, B), \ INSN_3(ST, MEM, H), \ @@ -1618,13 +1618,25 @@ out: LDX_PROBE(DW, 8) #undef LDX_PROBE - STX_XADD_W: /* lock xadd *(u32 *)(dst_reg + off16) += src_reg */ - atomic_add((u32) SRC, (atomic_t *)(unsigned long) - (DST + insn->off)); + STX_ATOMIC_W: + switch (IMM) { + case BPF_ADD: + /* lock xadd *(u32 *)(dst_reg + off16) += src_reg */ + atomic_add((u32) SRC, (atomic_t *)(unsigned long) + (DST + insn->off)); + default: + goto default_label; + } CONT; - STX_XADD_DW: /* lock xadd *(u64 *)(dst_reg + off16) += src_reg */ - atomic64_add((u64) SRC, (atomic64_t *)(unsigned long) - (DST + insn->off)); + STX_ATOMIC_DW: + switch (IMM) { + case BPF_ADD: + /* lock xadd *(u64 *)(dst_reg + off16) += src_reg */ + atomic64_add((u64) SRC, (atomic64_t *)(unsigned long) + (DST + insn->off)); + default: + goto default_label; + } CONT; default_label: @@ -1634,7 +1646,8 @@ out: * * Note, verifier whitelists all opcodes in bpf_opcode_in_insntable(). 
*/ - pr_warn("BPF interpreter: unknown opcode %02x\n", insn->code); + pr_warn("BPF interpreter: unknown opcode %02x (imm: 0x%x)\n", + insn->code, insn->imm); BUG_ON(1); return 0; } diff --git a/kernel/bpf/disasm.c b/kernel/bpf/disasm.c index b44d8c447afd..37c8d6e9b4cc 100644 --- a/kernel/bpf/disasm.c +++ b/kernel/bpf/disasm.c @@ -153,14 +153,16 @@ void print_bpf_insn(const struct bpf_insn_cbs *cbs, bpf_ldst_string[BPF_SIZE(insn->code) >> 3], insn->dst_reg, insn->off, insn->src_reg); - else if (BPF_MODE(insn->code) == BPF_XADD) + else if (BPF_MODE(insn->code) == BPF_ATOMIC && + insn->imm == BPF_ADD) { verbose(cbs->private_data, "(%02x) lock *(%s *)(r%d %+d) += r%d\n", insn->code, bpf_ldst_string[BPF_SIZE(insn->code) >> 3], insn->dst_reg, insn->off, insn->src_reg); - else + } else { verbose(cbs->private_data, "BUG_%02x\n", insn->code); + } } else if (class == BPF_ST) { if (BPF_MODE(insn->code) != BPF_MEM) { verbose(cbs->private_data, "BUG_st_%02x\n", insn->code); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index ae2aee48cf82..cfc137b81ac6 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3604,13 +3604,17 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn return err; } -static int check_xadd(struct bpf_verifier_env *env, int insn_idx, struct bpf_insn *insn) +static int check_atomic(struct bpf_verifier_env *env, int insn_idx, struct bpf_insn *insn) { int err; - if ((BPF_SIZE(insn->code) != BPF_W && BPF_SIZE(insn->code) != BPF_DW) || - insn->imm != 0) { - verbose(env, "BPF_XADD uses reserved fields\n"); + if (insn->imm != BPF_ADD) { + verbose(env, "BPF_ATOMIC uses invalid atomic opcode %02x\n", insn->imm); + return -EINVAL; + } + + if (BPF_SIZE(insn->code) != BPF_W && BPF_SIZE(insn->code) != BPF_DW) { + verbose(env, "invalid atomic operand size\n"); return -EINVAL; } @@ -3633,19 +3637,19 @@ static int check_xadd(struct bpf_verifier_env *env, int insn_idx, struct bpf_ins is_pkt_reg(env, insn->dst_reg) || is_flow_key_reg(env, insn->dst_reg) || is_sk_reg(env, insn->dst_reg)) { - verbose(env, "BPF_XADD stores into R%d %s is not allowed\n", + verbose(env, "BPF_ATOMIC stores into R%d %s is not allowed\n", insn->dst_reg, reg_type_str[reg_state(env, insn->dst_reg)->type]); return -EACCES; } - /* check whether atomic_add can read the memory */ + /* check whether we can read the memory */ err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off, BPF_SIZE(insn->code), BPF_READ, -1, true); if (err) return err; - /* check whether atomic_add can write into the same memory */ + /* check whether we can write into the same memory */ return check_mem_access(env, insn_idx, insn->dst_reg, insn->off, BPF_SIZE(insn->code), BPF_WRITE, -1, true); } @@ -9524,8 +9528,8 @@ static int do_check(struct bpf_verifier_env *env) } else if (class == BPF_STX) { enum bpf_reg_type *prev_dst_type, dst_reg_type; - if (BPF_MODE(insn->code) == BPF_XADD) { - err = check_xadd(env, env->insn_idx, insn); + if (BPF_MODE(insn->code) == BPF_ATOMIC) { + err = check_atomic(env, env->insn_idx, insn); if (err) return err; env->insn_idx++; @@ -10010,7 +10014,7 @@ static int resolve_pseudo_ldimm64(struct bpf_verifier_env *env) if (BPF_CLASS(insn->code) == BPF_STX && ((BPF_MODE(insn->code) != BPF_MEM && - BPF_MODE(insn->code) != BPF_XADD) || insn->imm != 0)) { + BPF_MODE(insn->code) != BPF_ATOMIC) || insn->imm != 0)) { verbose(env, "BPF_STX uses reserved fields\n"); return -EINVAL; } diff --git a/lib/test_bpf.c b/lib/test_bpf.c index ca7d635bccd9..49ec9e8d8aed 100644 --- 
a/lib/test_bpf.c +++ b/lib/test_bpf.c @@ -4295,13 +4295,13 @@ static struct bpf_test tests[] = { { { 0, 0xffffffff } }, .stack_depth = 40, }, - /* BPF_STX | BPF_XADD | BPF_W/DW */ + /* BPF_STX | BPF_ATOMIC | BPF_W/DW */ { "STX_XADD_W: Test: 0x12 + 0x10 = 0x22", .u.insns_int = { BPF_ALU32_IMM(BPF_MOV, R0, 0x12), BPF_ST_MEM(BPF_W, R10, -40, 0x10), - BPF_STX_XADD(BPF_W, R10, R0, -40), + BPF_ATOMIC_OP(BPF_W, BPF_ADD, R10, R0, -40), BPF_LDX_MEM(BPF_W, R0, R10, -40), BPF_EXIT_INSN(), }, @@ -4316,7 +4316,7 @@ static struct bpf_test tests[] = { BPF_ALU64_REG(BPF_MOV, R1, R10), BPF_ALU32_IMM(BPF_MOV, R0, 0x12), BPF_ST_MEM(BPF_W, R10, -40, 0x10), - BPF_STX_XADD(BPF_W, R10, R0, -40), + BPF_ATOMIC_OP(BPF_W, BPF_ADD, R10, R0, -40), BPF_ALU64_REG(BPF_MOV, R0, R10), BPF_ALU64_REG(BPF_SUB, R0, R1), BPF_EXIT_INSN(), @@ -4331,7 +4331,7 @@ static struct bpf_test tests[] = { .u.insns_int = { BPF_ALU32_IMM(BPF_MOV, R0, 0x12), BPF_ST_MEM(BPF_W, R10, -40, 0x10), - BPF_STX_XADD(BPF_W, R10, R0, -40), + BPF_ATOMIC_OP(BPF_W, BPF_ADD, R10, R0, -40), BPF_EXIT_INSN(), }, INTERNAL, @@ -4352,7 +4352,7 @@ static struct bpf_test tests[] = { .u.insns_int = { BPF_ALU32_IMM(BPF_MOV, R0, 0x12), BPF_ST_MEM(BPF_DW, R10, -40, 0x10), - BPF_STX_XADD(BPF_DW, R10, R0, -40), + BPF_ATOMIC_OP(BPF_DW, BPF_ADD, R10, R0, -40), BPF_LDX_MEM(BPF_DW, R0, R10, -40), BPF_EXIT_INSN(), }, @@ -4367,7 +4367,7 @@ static struct bpf_test tests[] = { BPF_ALU64_REG(BPF_MOV, R1, R10), BPF_ALU32_IMM(BPF_MOV, R0, 0x12), BPF_ST_MEM(BPF_DW, R10, -40, 0x10), - BPF_STX_XADD(BPF_DW, R10, R0, -40), + BPF_ATOMIC_OP(BPF_DW, BPF_ADD, R10, R0, -40), BPF_ALU64_REG(BPF_MOV, R0, R10), BPF_ALU64_REG(BPF_SUB, R0, R1), BPF_EXIT_INSN(), @@ -4382,7 +4382,7 @@ static struct bpf_test tests[] = { .u.insns_int = { BPF_ALU32_IMM(BPF_MOV, R0, 0x12), BPF_ST_MEM(BPF_DW, R10, -40, 0x10), - BPF_STX_XADD(BPF_DW, R10, R0, -40), + BPF_ATOMIC_OP(BPF_DW, BPF_ADD, R10, R0, -40), BPF_EXIT_INSN(), }, INTERNAL, diff --git a/samples/bpf/bpf_insn.h b/samples/bpf/bpf_insn.h index 544237980582..db67a2847395 100644 --- a/samples/bpf/bpf_insn.h +++ b/samples/bpf/bpf_insn.h @@ -138,11 +138,11 @@ struct bpf_insn; #define BPF_STX_XADD(SIZE, DST, SRC, OFF) \ ((struct bpf_insn) { \ - .code = BPF_STX | BPF_SIZE(SIZE) | BPF_XADD, \ + .code = BPF_STX | BPF_SIZE(SIZE) | BPF_ATOMIC, \ .dst_reg = DST, \ .src_reg = SRC, \ .off = OFF, \ - .imm = 0 }) + .imm = BPF_ADD }) /* Memory store, *(uint *) (dst_reg + off16) = imm32 */ diff --git a/samples/bpf/cookie_uid_helper_example.c b/samples/bpf/cookie_uid_helper_example.c index deb0e3e0324d..c5ff7a13918c 100644 --- a/samples/bpf/cookie_uid_helper_example.c +++ b/samples/bpf/cookie_uid_helper_example.c @@ -147,12 +147,12 @@ static void prog_load(void) */ BPF_MOV64_REG(BPF_REG_9, BPF_REG_0), BPF_MOV64_IMM(BPF_REG_1, 1), - BPF_STX_XADD(BPF_DW, BPF_REG_9, BPF_REG_1, - offsetof(struct stats, packets)), + BPF_ATOMIC_OP(BPF_DW, BPF_ADD, BPF_REG_9, BPF_REG_1, + offsetof(struct stats, packets)), BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_6, offsetof(struct __sk_buff, len)), - BPF_STX_XADD(BPF_DW, BPF_REG_9, BPF_REG_1, - offsetof(struct stats, bytes)), + BPF_ATOMIC_OP(BPF_DW, BPF_ADD, BPF_REG_9, BPF_REG_1, + offsetof(struct stats, bytes)), BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_6, offsetof(struct __sk_buff, len)), BPF_EXIT_INSN(), diff --git a/samples/bpf/sock_example.c b/samples/bpf/sock_example.c index 00aae1d33fca..23d1930e1927 100644 --- a/samples/bpf/sock_example.c +++ b/samples/bpf/sock_example.c @@ -54,7 +54,7 @@ static int test_sock(void) BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 
0, 0, BPF_FUNC_map_lookup_elem), BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2), BPF_MOV64_IMM(BPF_REG_1, 1), /* r1 = 1 */ - BPF_RAW_INSN(BPF_STX | BPF_XADD | BPF_DW, BPF_REG_0, BPF_REG_1, 0, 0), /* xadd r0 += r1 */ + BPF_ATOMIC_OP(BPF_DW, BPF_ADD, BPF_REG_0, BPF_REG_1, 0), BPF_MOV64_IMM(BPF_REG_0, 0), /* r0 = 0 */ BPF_EXIT_INSN(), }; diff --git a/samples/bpf/test_cgrp2_attach.c b/samples/bpf/test_cgrp2_attach.c index 20fbd1241db3..390ff38d2ac6 100644 --- a/samples/bpf/test_cgrp2_attach.c +++ b/samples/bpf/test_cgrp2_attach.c @@ -53,7 +53,7 @@ static int prog_load(int map_fd, int verdict) BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2), BPF_MOV64_IMM(BPF_REG_1, 1), /* r1 = 1 */ - BPF_RAW_INSN(BPF_STX | BPF_XADD | BPF_DW, BPF_REG_0, BPF_REG_1, 0, 0), /* xadd r0 += r1 */ + BPF_ATOMIC_OP(BPF_DW, BPF_ADD, BPF_REG_0, BPF_REG_1, 0), /* Count bytes */ BPF_MOV64_IMM(BPF_REG_0, MAP_KEY_BYTES), /* r0 = 1 */ @@ -64,7 +64,8 @@ static int prog_load(int map_fd, int verdict) BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2), BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_6, offsetof(struct __sk_buff, len)), /* r1 = skb->len */ - BPF_RAW_INSN(BPF_STX | BPF_XADD | BPF_DW, BPF_REG_0, BPF_REG_1, 0, 0), /* xadd r0 += r1 */ + + BPF_ATOMIC_OP(BPF_DW, BPF_ADD, BPF_REG_0, BPF_REG_1, 0), BPF_MOV64_IMM(BPF_REG_0, verdict), /* r0 = verdict */ BPF_EXIT_INSN(), diff --git a/tools/include/linux/filter.h b/tools/include/linux/filter.h index ca28b6ab8db7..e870c9039f0d 100644 --- a/tools/include/linux/filter.h +++ b/tools/include/linux/filter.h @@ -169,15 +169,22 @@ .off = OFF, \ .imm = 0 }) -/* Atomic memory add, *(uint *)(dst_reg + off16) += src_reg */ +/* + * Atomic operations: + * + * BPF_ADD *(uint *) (dst_reg + off16) += src_reg + */ -#define BPF_STX_XADD(SIZE, DST, SRC, OFF) \ +#define BPF_ATOMIC_OP(SIZE, OP, DST, SRC, OFF) \ ((struct bpf_insn) { \ - .code = BPF_STX | BPF_SIZE(SIZE) | BPF_XADD, \ + .code = BPF_STX | BPF_SIZE(SIZE) | BPF_ATOMIC, \ .dst_reg = DST, \ .src_reg = SRC, \ .off = OFF, \ - .imm = 0 }) + .imm = OP }) + +/* Legacy alias */ +#define BPF_STX_XADD(SIZE, DST, SRC, OFF) BPF_ATOMIC_OP(SIZE, BPF_ADD, DST, SRC, OFF) /* Memory store, *(uint *) (dst_reg + off16) = imm32 */ diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index a1ad32456f89..6b3996343e63 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -19,7 +19,8 @@ /* ld/ldx fields */ #define BPF_DW 0x18 /* double word (64-bit) */ -#define BPF_XADD 0xc0 /* exclusive add */ +#define BPF_ATOMIC 0xc0 /* atomic memory ops - op type in immediate */ +#define BPF_XADD 0xc0 /* exclusive add - legacy name */ /* alu/jmp fields */ #define BPF_MOV 0xb0 /* mov reg to reg */ @@ -2448,7 +2449,7 @@ union bpf_attr { * running simultaneously. * * A user should care about the synchronization by himself. - * For example, by using the **BPF_STX_XADD** instruction to alter + * For example, by using the **BPF_ATOMIC** instructions to alter * the shared data. * Return * A pointer to the local storage area. 
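As a side note, the conversion above is purely mechanical; a minimal sketch (assuming only the BPF_ATOMIC_OP() and legacy BPF_STX_XADD() macros from tools/include/linux/filter.h shown above) of why no behaviour changes for existing users:

/* Both initializers produce the same instruction word:
 * BPF_STX | BPF_DW | BPF_ATOMIC with imm = BPF_ADD,
 * i.e. "lock *(u64 *)(r10 - 8) += r1". The legacy alias simply
 * expands to the new macro with OP fixed to BPF_ADD.
 */
struct bpf_insn old_way = BPF_STX_XADD(BPF_DW, BPF_REG_10, BPF_REG_1, -8);
struct bpf_insn new_way = BPF_ATOMIC_OP(BPF_DW, BPF_ADD, BPF_REG_10, BPF_REG_1, -8);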
diff --git a/tools/testing/selftests/bpf/prog_tests/cgroup_attach_multi.c b/tools/testing/selftests/bpf/prog_tests/cgroup_attach_multi.c index b549fcfacc0b..0a1fc9816cef 100644 --- a/tools/testing/selftests/bpf/prog_tests/cgroup_attach_multi.c +++ b/tools/testing/selftests/bpf/prog_tests/cgroup_attach_multi.c @@ -45,13 +45,13 @@ static int prog_load_cnt(int verdict, int val) BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2), BPF_MOV64_IMM(BPF_REG_1, val), /* r1 = 1 */ - BPF_RAW_INSN(BPF_STX | BPF_XADD | BPF_DW, BPF_REG_0, BPF_REG_1, 0, 0), /* xadd r0 += r1 */ + BPF_ATOMIC_OP(BPF_DW, BPF_ADD, BPF_REG_0, BPF_REG_1, 0), BPF_LD_MAP_FD(BPF_REG_1, cgroup_storage_fd), BPF_MOV64_IMM(BPF_REG_2, 0), BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_get_local_storage), BPF_MOV64_IMM(BPF_REG_1, val), - BPF_RAW_INSN(BPF_STX | BPF_XADD | BPF_W, BPF_REG_0, BPF_REG_1, 0, 0), + BPF_ATOMIC_OP(BPF_W, BPF_ADD, BPF_REG_0, BPF_REG_1, 0), BPF_LD_MAP_FD(BPF_REG_1, percpu_cgroup_storage_fd), BPF_MOV64_IMM(BPF_REG_2, 0), diff --git a/tools/testing/selftests/bpf/test_cgroup_storage.c b/tools/testing/selftests/bpf/test_cgroup_storage.c index d946252a25bb..0cda61da5d39 100644 --- a/tools/testing/selftests/bpf/test_cgroup_storage.c +++ b/tools/testing/selftests/bpf/test_cgroup_storage.c @@ -29,7 +29,7 @@ int main(int argc, char **argv) BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_get_local_storage), BPF_MOV64_IMM(BPF_REG_1, 1), - BPF_STX_XADD(BPF_DW, BPF_REG_0, BPF_REG_1, 0), + BPF_ATOMIC_OP(BPF_DW, BPF_ADD, BPF_REG_0, BPF_REG_1, 0), BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_0, 0), BPF_ALU64_IMM(BPF_AND, BPF_REG_1, 0x1), BPF_MOV64_REG(BPF_REG_0, BPF_REG_1), diff --git a/tools/testing/selftests/bpf/verifier/ctx.c b/tools/testing/selftests/bpf/verifier/ctx.c index 93d6b1641481..23080862aafd 100644 --- a/tools/testing/selftests/bpf/verifier/ctx.c +++ b/tools/testing/selftests/bpf/verifier/ctx.c @@ -10,14 +10,13 @@ .prog_type = BPF_PROG_TYPE_SCHED_CLS, }, { - "context stores via XADD", + "context stores via BPF_ATOMIC", .insns = { BPF_MOV64_IMM(BPF_REG_0, 0), - BPF_RAW_INSN(BPF_STX | BPF_XADD | BPF_W, BPF_REG_1, - BPF_REG_0, offsetof(struct __sk_buff, mark), 0), + BPF_ATOMIC_OP(BPF_W, BPF_ADD, BPF_REG_1, BPF_REG_0, offsetof(struct __sk_buff, mark)), BPF_EXIT_INSN(), }, - .errstr = "BPF_XADD stores into R1 ctx is not allowed", + .errstr = "BPF_ATOMIC stores into R1 ctx is not allowed", .result = REJECT, .prog_type = BPF_PROG_TYPE_SCHED_CLS, }, diff --git a/tools/testing/selftests/bpf/verifier/direct_packet_access.c b/tools/testing/selftests/bpf/verifier/direct_packet_access.c index ae72536603fe..ac1e19d0f520 100644 --- a/tools/testing/selftests/bpf/verifier/direct_packet_access.c +++ b/tools/testing/selftests/bpf/verifier/direct_packet_access.c @@ -333,7 +333,7 @@ BPF_MOV64_REG(BPF_REG_4, BPF_REG_10), BPF_ALU64_IMM(BPF_ADD, BPF_REG_4, -8), BPF_STX_MEM(BPF_DW, BPF_REG_4, BPF_REG_2, 0), - BPF_STX_XADD(BPF_DW, BPF_REG_4, BPF_REG_5, 0), + BPF_ATOMIC_OP(BPF_DW, BPF_ADD, BPF_REG_4, BPF_REG_5, 0), BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_4, 0), BPF_STX_MEM(BPF_W, BPF_REG_2, BPF_REG_5, 0), BPF_MOV64_IMM(BPF_REG_0, 0), @@ -488,7 +488,7 @@ BPF_JMP_REG(BPF_JGT, BPF_REG_0, BPF_REG_3, 11), BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_10, -8), BPF_MOV64_IMM(BPF_REG_4, 0xffffffff), - BPF_STX_XADD(BPF_DW, BPF_REG_10, BPF_REG_4, -8), + BPF_ATOMIC_OP(BPF_DW, BPF_ADD, BPF_REG_10, BPF_REG_4, -8), BPF_LDX_MEM(BPF_DW, BPF_REG_4, BPF_REG_10, -8), BPF_ALU64_IMM(BPF_RSH, BPF_REG_4, 49), 
BPF_ALU64_REG(BPF_ADD, BPF_REG_4, BPF_REG_2), diff --git a/tools/testing/selftests/bpf/verifier/leak_ptr.c b/tools/testing/selftests/bpf/verifier/leak_ptr.c index d6eec17f2cd2..73f0dea95546 100644 --- a/tools/testing/selftests/bpf/verifier/leak_ptr.c +++ b/tools/testing/selftests/bpf/verifier/leak_ptr.c @@ -5,7 +5,7 @@ BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_0, offsetof(struct __sk_buff, cb[0])), BPF_LD_MAP_FD(BPF_REG_2, 0), - BPF_STX_XADD(BPF_DW, BPF_REG_1, BPF_REG_2, + BPF_ATOMIC_OP(BPF_DW, BPF_ADD, BPF_REG_1, BPF_REG_2, offsetof(struct __sk_buff, cb[0])), BPF_EXIT_INSN(), }, @@ -13,7 +13,7 @@ .errstr_unpriv = "R2 leaks addr into mem", .result_unpriv = REJECT, .result = REJECT, - .errstr = "BPF_XADD stores into R1 ctx is not allowed", + .errstr = "BPF_ATOMIC stores into R1 ctx is not allowed", }, { "leak pointer into ctx 2", @@ -21,14 +21,14 @@ BPF_MOV64_IMM(BPF_REG_0, 0), BPF_STX_MEM(BPF_DW, BPF_REG_1, BPF_REG_0, offsetof(struct __sk_buff, cb[0])), - BPF_STX_XADD(BPF_DW, BPF_REG_1, BPF_REG_10, + BPF_ATOMIC_OP(BPF_DW, BPF_ADD, BPF_REG_1, BPF_REG_10, offsetof(struct __sk_buff, cb[0])), BPF_EXIT_INSN(), }, .errstr_unpriv = "R10 leaks addr into mem", .result_unpriv = REJECT, .result = REJECT, - .errstr = "BPF_XADD stores into R1 ctx is not allowed", + .errstr = "BPF_ATOMIC stores into R1 ctx is not allowed", }, { "leak pointer into ctx 3", @@ -56,7 +56,7 @@ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 3), BPF_MOV64_IMM(BPF_REG_3, 0), BPF_STX_MEM(BPF_DW, BPF_REG_0, BPF_REG_3, 0), - BPF_STX_XADD(BPF_DW, BPF_REG_0, BPF_REG_6, 0), + BPF_ATOMIC_OP(BPF_DW, BPF_ADD, BPF_REG_0, BPF_REG_6, 0), BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, diff --git a/tools/testing/selftests/bpf/verifier/meta_access.c b/tools/testing/selftests/bpf/verifier/meta_access.c index 205292b8dd65..b45e8af41420 100644 --- a/tools/testing/selftests/bpf/verifier/meta_access.c +++ b/tools/testing/selftests/bpf/verifier/meta_access.c @@ -171,7 +171,7 @@ BPF_MOV64_IMM(BPF_REG_5, 42), BPF_MOV64_IMM(BPF_REG_6, 24), BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_5, -8), - BPF_STX_XADD(BPF_DW, BPF_REG_10, BPF_REG_6, -8), + BPF_ATOMIC_OP(BPF_DW, BPF_ADD, BPF_REG_10, BPF_REG_6, -8), BPF_LDX_MEM(BPF_DW, BPF_REG_5, BPF_REG_10, -8), BPF_JMP_IMM(BPF_JGT, BPF_REG_5, 100, 6), BPF_ALU64_REG(BPF_ADD, BPF_REG_3, BPF_REG_5), @@ -196,7 +196,7 @@ BPF_MOV64_IMM(BPF_REG_5, 42), BPF_MOV64_IMM(BPF_REG_6, 24), BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_5, -8), - BPF_STX_XADD(BPF_DW, BPF_REG_10, BPF_REG_6, -8), + BPF_ATOMIC_OP(BPF_DW, BPF_ADD, BPF_REG_10, BPF_REG_6, -8), BPF_LDX_MEM(BPF_DW, BPF_REG_5, BPF_REG_10, -8), BPF_JMP_IMM(BPF_JGT, BPF_REG_5, 100, 6), BPF_ALU64_REG(BPF_ADD, BPF_REG_2, BPF_REG_5), diff --git a/tools/testing/selftests/bpf/verifier/unpriv.c b/tools/testing/selftests/bpf/verifier/unpriv.c index a3fe0fbaed41..ee298627abae 100644 --- a/tools/testing/selftests/bpf/verifier/unpriv.c +++ b/tools/testing/selftests/bpf/verifier/unpriv.c @@ -207,7 +207,8 @@ BPF_ALU64_IMM(BPF_ADD, BPF_REG_6, -8), BPF_STX_MEM(BPF_DW, BPF_REG_6, BPF_REG_1, 0), BPF_MOV64_IMM(BPF_REG_0, 1), - BPF_RAW_INSN(BPF_STX | BPF_XADD | BPF_DW, BPF_REG_10, BPF_REG_0, -8, 0), + BPF_RAW_INSN(BPF_STX | BPF_ATOMIC | BPF_DW, + BPF_REG_10, BPF_REG_0, -8, BPF_ADD), BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_6, 0), BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_get_hash_recalc), BPF_EXIT_INSN(), diff --git a/tools/testing/selftests/bpf/verifier/value_illegal_alu.c b/tools/testing/selftests/bpf/verifier/value_illegal_alu.c index ed1c2cea1dea..489062867218 100644 --- 
a/tools/testing/selftests/bpf/verifier/value_illegal_alu.c +++ b/tools/testing/selftests/bpf/verifier/value_illegal_alu.c @@ -82,7 +82,7 @@ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), BPF_STX_MEM(BPF_DW, BPF_REG_2, BPF_REG_0, 0), - BPF_STX_XADD(BPF_DW, BPF_REG_2, BPF_REG_3, 0), + BPF_ATOMIC_OP(BPF_DW, BPF_ADD, BPF_REG_2, BPF_REG_3, 0), BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_2, 0), BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, 22), BPF_EXIT_INSN(), diff --git a/tools/testing/selftests/bpf/verifier/xadd.c b/tools/testing/selftests/bpf/verifier/xadd.c index c5de2e62cc8b..b96ef3526815 100644 --- a/tools/testing/selftests/bpf/verifier/xadd.c +++ b/tools/testing/selftests/bpf/verifier/xadd.c @@ -3,7 +3,7 @@ .insns = { BPF_MOV64_IMM(BPF_REG_0, 1), BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -8), - BPF_STX_XADD(BPF_W, BPF_REG_10, BPF_REG_0, -7), + BPF_ATOMIC_OP(BPF_W, BPF_ADD, BPF_REG_10, BPF_REG_0, -7), BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_10, -8), BPF_EXIT_INSN(), }, @@ -22,7 +22,7 @@ BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 1), BPF_EXIT_INSN(), BPF_MOV64_IMM(BPF_REG_1, 1), - BPF_STX_XADD(BPF_W, BPF_REG_0, BPF_REG_1, 3), + BPF_ATOMIC_OP(BPF_W, BPF_ADD, BPF_REG_0, BPF_REG_1, 3), BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_0, 3), BPF_EXIT_INSN(), }, @@ -45,13 +45,13 @@ BPF_MOV64_IMM(BPF_REG_0, 1), BPF_ST_MEM(BPF_W, BPF_REG_2, 0, 0), BPF_ST_MEM(BPF_W, BPF_REG_2, 3, 0), - BPF_STX_XADD(BPF_W, BPF_REG_2, BPF_REG_0, 1), - BPF_STX_XADD(BPF_W, BPF_REG_2, BPF_REG_0, 2), + BPF_ATOMIC_OP(BPF_W, BPF_ADD, BPF_REG_2, BPF_REG_0, 1), + BPF_ATOMIC_OP(BPF_W, BPF_ADD, BPF_REG_2, BPF_REG_0, 2), BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_2, 1), BPF_EXIT_INSN(), }, .result = REJECT, - .errstr = "BPF_XADD stores into R2 pkt is not allowed", + .errstr = "BPF_ATOMIC stores into R2 pkt is not allowed", .prog_type = BPF_PROG_TYPE_XDP, .flags = F_NEEDS_EFFICIENT_UNALIGNED_ACCESS, }, @@ -62,8 +62,8 @@ BPF_MOV64_REG(BPF_REG_6, BPF_REG_0), BPF_MOV64_REG(BPF_REG_7, BPF_REG_10), BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -8), - BPF_STX_XADD(BPF_DW, BPF_REG_10, BPF_REG_0, -8), - BPF_STX_XADD(BPF_DW, BPF_REG_10, BPF_REG_0, -8), + BPF_ATOMIC_OP(BPF_DW, BPF_ADD, BPF_REG_10, BPF_REG_0, -8), + BPF_ATOMIC_OP(BPF_DW, BPF_ADD, BPF_REG_10, BPF_REG_0, -8), BPF_JMP_REG(BPF_JNE, BPF_REG_6, BPF_REG_0, 3), BPF_JMP_REG(BPF_JNE, BPF_REG_7, BPF_REG_10, 2), BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_10, -8), @@ -82,8 +82,8 @@ BPF_MOV64_REG(BPF_REG_6, BPF_REG_0), BPF_MOV64_REG(BPF_REG_7, BPF_REG_10), BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_0, -8), - BPF_STX_XADD(BPF_W, BPF_REG_10, BPF_REG_0, -8), - BPF_STX_XADD(BPF_W, BPF_REG_10, BPF_REG_0, -8), + BPF_ATOMIC_OP(BPF_W, BPF_ADD, BPF_REG_10, BPF_REG_0, -8), + BPF_ATOMIC_OP(BPF_W, BPF_ADD, BPF_REG_10, BPF_REG_0, -8), BPF_JMP_REG(BPF_JNE, BPF_REG_6, BPF_REG_0, 3), BPF_JMP_REG(BPF_JNE, BPF_REG_7, BPF_REG_10, 2), BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_10, -8), -- cgit v1.2.3 From c5bcb5eb4db632280b4123135d583a7bc8caea3e Mon Sep 17 00:00:00 2001 From: Brendan Jackman Date: Thu, 14 Jan 2021 18:17:45 +0000 Subject: bpf: Move BPF_STX reserved field check into BPF_STX verifier code I can't find a reason why this code is in resolve_pseudo_ldimm64; since I'll be modifying it in a subsequent commit, tidy it up. 
Signed-off-by: Brendan Jackman Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20210114181751.768687-6-jackmanb@google.com --- kernel/bpf/verifier.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index cfc137b81ac6..d8a85f4e5b95 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -9528,6 +9528,12 @@ static int do_check(struct bpf_verifier_env *env) } else if (class == BPF_STX) { enum bpf_reg_type *prev_dst_type, dst_reg_type; + if (((BPF_MODE(insn->code) != BPF_MEM && + BPF_MODE(insn->code) != BPF_ATOMIC) || insn->imm != 0)) { + verbose(env, "BPF_STX uses reserved fields\n"); + return -EINVAL; + } + if (BPF_MODE(insn->code) == BPF_ATOMIC) { err = check_atomic(env, env->insn_idx, insn); if (err) @@ -10012,13 +10018,6 @@ static int resolve_pseudo_ldimm64(struct bpf_verifier_env *env) return -EINVAL; } - if (BPF_CLASS(insn->code) == BPF_STX && - ((BPF_MODE(insn->code) != BPF_MEM && - BPF_MODE(insn->code) != BPF_ATOMIC) || insn->imm != 0)) { - verbose(env, "BPF_STX uses reserved fields\n"); - return -EINVAL; - } - if (insn[0].code == (BPF_LD | BPF_IMM | BPF_DW)) { struct bpf_insn_aux_data *aux; struct bpf_map *map; -- cgit v1.2.3 From 5ca419f2864a2c60940dcf4bbaeb69546200e36f Mon Sep 17 00:00:00 2001 From: Brendan Jackman Date: Thu, 14 Jan 2021 18:17:46 +0000 Subject: bpf: Add BPF_FETCH field / create atomic_fetch_add instruction The BPF_FETCH field can be set in bpf_insn.imm, for BPF_ATOMIC instructions, in order to have the previous value of the atomically-modified memory location loaded into the src register after an atomic op is carried out. Suggested-by: Yonghong Song Signed-off-by: Brendan Jackman Signed-off-by: Alexei Starovoitov Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20210114181751.768687-7-jackmanb@google.com --- arch/x86/net/bpf_jit_comp.c | 4 ++++ include/linux/filter.h | 1 + include/uapi/linux/bpf.h | 3 +++ kernel/bpf/core.c | 13 +++++++++++++ kernel/bpf/disasm.c | 7 +++++++ kernel/bpf/verifier.c | 33 ++++++++++++++++++++++++--------- tools/include/linux/filter.h | 1 + tools/include/uapi/linux/bpf.h | 3 +++ 8 files changed, 56 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index b1829a534da1..eea7d8b0bb12 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -811,6 +811,10 @@ static int emit_atomic(u8 **pprog, u8 atomic_op, /* lock *(u32/u64*)(dst_reg + off) = src_reg */ EMIT1(simple_alu_opcodes[atomic_op]); break; + case BPF_ADD | BPF_FETCH: + /* src_reg = atomic_fetch_add(dst_reg + off, src_reg); */ + EMIT2(0x0F, 0xC1); + break; default: pr_err("bpf_jit: unknown atomic opcode %02x\n", atomic_op); return -EFAULT; diff --git a/include/linux/filter.h b/include/linux/filter.h index 392e94b79668..23fca41b8540 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -264,6 +264,7 @@ static inline bool insn_is_zext(const struct bpf_insn *insn) * Atomic operations: * * BPF_ADD *(uint *) (dst_reg + off16) += src_reg + * BPF_ADD | BPF_FETCH src_reg = atomic_fetch_add(dst_reg + off16, src_reg); */ #define BPF_ATOMIC_OP(SIZE, OP, DST, SRC, OFF) \ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 6b3996343e63..ea262b009049 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -44,6 +44,9 @@ #define BPF_CALL 0x80 /* function call */ #define 
BPF_EXIT 0x90 /* function return */ +/* atomic op type fields (stored in immediate) */ +#define BPF_FETCH 0x01 /* fetch previous value into src reg */ + /* Register numbers */ enum { BPF_REG_0 = 0, diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 4836ebf459cf..28d6000463e4 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1624,16 +1624,29 @@ out: /* lock xadd *(u32 *)(dst_reg + off16) += src_reg */ atomic_add((u32) SRC, (atomic_t *)(unsigned long) (DST + insn->off)); + break; + case BPF_ADD | BPF_FETCH: + SRC = (u32) atomic_fetch_add( + (u32) SRC, + (atomic_t *)(unsigned long) (DST + insn->off)); + break; default: goto default_label; } CONT; + STX_ATOMIC_DW: switch (IMM) { case BPF_ADD: /* lock xadd *(u64 *)(dst_reg + off16) += src_reg */ atomic64_add((u64) SRC, (atomic64_t *)(unsigned long) (DST + insn->off)); + break; + case BPF_ADD | BPF_FETCH: + SRC = (u64) atomic64_fetch_add( + (u64) SRC, + (atomic64_t *)(unsigned long) (DST + insn->off)); + break; default: goto default_label; } diff --git a/kernel/bpf/disasm.c b/kernel/bpf/disasm.c index 37c8d6e9b4cc..d2e20f6d0516 100644 --- a/kernel/bpf/disasm.c +++ b/kernel/bpf/disasm.c @@ -160,6 +160,13 @@ void print_bpf_insn(const struct bpf_insn_cbs *cbs, bpf_ldst_string[BPF_SIZE(insn->code) >> 3], insn->dst_reg, insn->off, insn->src_reg); + } else if (BPF_MODE(insn->code) == BPF_ATOMIC && + insn->imm == (BPF_ADD | BPF_FETCH)) { + verbose(cbs->private_data, "(%02x) r%d = atomic%s_fetch_add((%s *)(r%d %+d), r%d)\n", + insn->code, insn->src_reg, + BPF_SIZE(insn->code) == BPF_DW ? "64" : "", + bpf_ldst_string[BPF_SIZE(insn->code) >> 3], + insn->dst_reg, insn->off, insn->src_reg); } else { verbose(cbs->private_data, "BUG_%02x\n", insn->code); } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index d8a85f4e5b95..6aa1fc919761 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3608,7 +3608,11 @@ static int check_atomic(struct bpf_verifier_env *env, int insn_idx, struct bpf_i { int err; - if (insn->imm != BPF_ADD) { + switch (insn->imm) { + case BPF_ADD: + case BPF_ADD | BPF_FETCH: + break; + default: verbose(env, "BPF_ATOMIC uses invalid atomic opcode %02x\n", insn->imm); return -EINVAL; } @@ -3650,8 +3654,20 @@ static int check_atomic(struct bpf_verifier_env *env, int insn_idx, struct bpf_i return err; /* check whether we can write into the same memory */ - return check_mem_access(env, insn_idx, insn->dst_reg, insn->off, - BPF_SIZE(insn->code), BPF_WRITE, -1, true); + err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off, + BPF_SIZE(insn->code), BPF_WRITE, -1, true); + if (err) + return err; + + if (!(insn->imm & BPF_FETCH)) + return 0; + + /* check and record load of old value into src reg */ + err = check_reg_arg(env, insn->src_reg, DST_OP); + if (err) + return err; + + return 0; } static int __check_stack_boundary(struct bpf_verifier_env *env, u32 regno, @@ -9528,12 +9544,6 @@ static int do_check(struct bpf_verifier_env *env) } else if (class == BPF_STX) { enum bpf_reg_type *prev_dst_type, dst_reg_type; - if (((BPF_MODE(insn->code) != BPF_MEM && - BPF_MODE(insn->code) != BPF_ATOMIC) || insn->imm != 0)) { - verbose(env, "BPF_STX uses reserved fields\n"); - return -EINVAL; - } - if (BPF_MODE(insn->code) == BPF_ATOMIC) { err = check_atomic(env, env->insn_idx, insn); if (err) @@ -9542,6 +9552,11 @@ static int do_check(struct bpf_verifier_env *env) continue; } + if (BPF_MODE(insn->code) != BPF_MEM || insn->imm != 0) { + verbose(env, "BPF_STX uses reserved fields\n"); + return -EINVAL; + } + /* check 
src1 operand */ err = check_reg_arg(env, insn->src_reg, SRC_OP); if (err) diff --git a/tools/include/linux/filter.h b/tools/include/linux/filter.h index e870c9039f0d..7211ce9fba53 100644 --- a/tools/include/linux/filter.h +++ b/tools/include/linux/filter.h @@ -173,6 +173,7 @@ * Atomic operations: * * BPF_ADD *(uint *) (dst_reg + off16) += src_reg + * BPF_ADD | BPF_FETCH src_reg = atomic_fetch_add(dst_reg + off16, src_reg); */ #define BPF_ATOMIC_OP(SIZE, OP, DST, SRC, OFF) \ diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 6b3996343e63..ea262b009049 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -44,6 +44,9 @@ #define BPF_CALL 0x80 /* function call */ #define BPF_EXIT 0x90 /* function return */ +/* atomic op type fields (stored in immediate) */ +#define BPF_FETCH 0x01 /* fetch previous value into src reg */ + /* Register numbers */ enum { BPF_REG_0 = 0, -- cgit v1.2.3 From 5ffa25502b5ab3d639829a2d1e316cff7f59a41e Mon Sep 17 00:00:00 2001 From: Brendan Jackman Date: Thu, 14 Jan 2021 18:17:47 +0000 Subject: bpf: Add instructions for atomic_[cmp]xchg This adds two atomic opcodes, both of which include the BPF_FETCH flag. XCHG without the BPF_FETCH flag would naturally encode atomic_set. This is not supported because it would be of limited value to userspace (it doesn't imply any barriers). CMPXCHG without BPF_FETCH would be an atomic compare-and-write. We don't have such an operation in the kernel so it isn't provided to BPF either. There are two significant design decisions made for the CMPXCHG instruction: - This operation fundamentally has 3 operands, but we only have two register fields. Therefore the operand we compare against (the kernel's API calls it 'old') is hard-coded to be R0. x86 has a similar design (and A64 doesn't have this problem). A potential alternative might be to encode the other operand's register number in the immediate field. - The kernel's atomic_cmpxchg returns the old value, while the C11 userspace APIs return a boolean indicating the comparison result. Which should BPF do? A64 returns the old value. x86 returns the old value in the hard-coded register (and also sets a flag). That means return-old-value is easier to JIT, so that's what we use; a short illustrative sequence follows below.
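To make the register convention concrete, here is a minimal illustrative sequence built with the BPF_ATOMIC_OP() macro from this series (the register choices, the values and the assumption that r2 already points at an 8-byte map value are hypothetical):

/* Unconditional exchange: swap in 42, the old value comes back in r1. */
BPF_MOV64_IMM(BPF_REG_1, 42),
BPF_ATOMIC_OP(BPF_DW, BPF_XCHG, BPF_REG_2, BPF_REG_1, 0),
/* Compare-and-exchange: change the value from 5 to 6 only if it is
 * still 5. r0 carries the expected old value in and the observed
 * old value out, exactly as described above.
 */
BPF_MOV64_IMM(BPF_REG_0, 5),
BPF_MOV64_IMM(BPF_REG_1, 6),
BPF_ATOMIC_OP(BPF_DW, BPF_CMPXCHG, BPF_REG_2, BPF_REG_1, 0),
/* If r0 is still 5 here, this program's write won the race;
 * otherwise r0 holds whatever value was actually found in memory.
 */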
Signed-off-by: Brendan Jackman Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20210114181751.768687-8-jackmanb@google.com --- arch/x86/net/bpf_jit_comp.c | 8 ++++++++ include/linux/filter.h | 2 ++ include/uapi/linux/bpf.h | 4 +++- kernel/bpf/core.c | 20 ++++++++++++++++++++ kernel/bpf/disasm.c | 15 +++++++++++++++ kernel/bpf/verifier.c | 19 +++++++++++++++++-- tools/include/linux/filter.h | 2 ++ tools/include/uapi/linux/bpf.h | 4 +++- 8 files changed, 70 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index eea7d8b0bb12..308241187582 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -815,6 +815,14 @@ static int emit_atomic(u8 **pprog, u8 atomic_op, /* src_reg = atomic_fetch_add(dst_reg + off, src_reg); */ EMIT2(0x0F, 0xC1); break; + case BPF_XCHG: + /* src_reg = atomic_xchg(dst_reg + off, src_reg); */ + EMIT1(0x87); + break; + case BPF_CMPXCHG: + /* r0 = atomic_cmpxchg(dst_reg + off, r0, src_reg); */ + EMIT2(0x0F, 0xB1); + break; default: pr_err("bpf_jit: unknown atomic opcode %02x\n", atomic_op); return -EFAULT; diff --git a/include/linux/filter.h b/include/linux/filter.h index 23fca41b8540..d563820f197d 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -265,6 +265,8 @@ static inline bool insn_is_zext(const struct bpf_insn *insn) * * BPF_ADD *(uint *) (dst_reg + off16) += src_reg * BPF_ADD | BPF_FETCH src_reg = atomic_fetch_add(dst_reg + off16, src_reg); + * BPF_XCHG src_reg = atomic_xchg(dst_reg + off16, src_reg) + * BPF_CMPXCHG r0 = atomic_cmpxchg(dst_reg + off16, r0, src_reg) */ #define BPF_ATOMIC_OP(SIZE, OP, DST, SRC, OFF) \ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index ea262b009049..c001766adcbc 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -45,7 +45,9 @@ #define BPF_EXIT 0x90 /* function return */ /* atomic op type fields (stored in immediate) */ -#define BPF_FETCH 0x01 /* fetch previous value into src reg */ +#define BPF_FETCH 0x01 /* not an opcode on its own, used to build others */ +#define BPF_XCHG (0xe0 | BPF_FETCH) /* atomic exchange */ +#define BPF_CMPXCHG (0xf0 | BPF_FETCH) /* atomic compare-and-write */ /* Register numbers */ enum { diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 28d6000463e4..4df6daba43ef 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1630,6 +1630,16 @@ out: (u32) SRC, (atomic_t *)(unsigned long) (DST + insn->off)); break; + case BPF_XCHG: + SRC = (u32) atomic_xchg( + (atomic_t *)(unsigned long) (DST + insn->off), + (u32) SRC); + break; + case BPF_CMPXCHG: + BPF_R0 = (u32) atomic_cmpxchg( + (atomic_t *)(unsigned long) (DST + insn->off), + (u32) BPF_R0, (u32) SRC); + break; default: goto default_label; } @@ -1647,6 +1657,16 @@ out: (u64) SRC, (atomic64_t *)(unsigned long) (DST + insn->off)); break; + case BPF_XCHG: + SRC = (u64) atomic64_xchg( + (atomic64_t *)(unsigned long) (DST + insn->off), + (u64) SRC); + break; + case BPF_CMPXCHG: + BPF_R0 = (u64) atomic64_cmpxchg( + (atomic64_t *)(unsigned long) (DST + insn->off), + (u64) BPF_R0, (u64) SRC); + break; default: goto default_label; } diff --git a/kernel/bpf/disasm.c b/kernel/bpf/disasm.c index d2e20f6d0516..ee8d1132767b 100644 --- a/kernel/bpf/disasm.c +++ b/kernel/bpf/disasm.c @@ -167,6 +167,21 @@ void print_bpf_insn(const struct bpf_insn_cbs *cbs, BPF_SIZE(insn->code) == BPF_DW ? 
"64" : "", bpf_ldst_string[BPF_SIZE(insn->code) >> 3], insn->dst_reg, insn->off, insn->src_reg); + } else if (BPF_MODE(insn->code) == BPF_ATOMIC && + insn->imm == BPF_CMPXCHG) { + verbose(cbs->private_data, "(%02x) r0 = atomic%s_cmpxchg((%s *)(r%d %+d), r0, r%d)\n", + insn->code, + BPF_SIZE(insn->code) == BPF_DW ? "64" : "", + bpf_ldst_string[BPF_SIZE(insn->code) >> 3], + insn->dst_reg, insn->off, + insn->src_reg); + } else if (BPF_MODE(insn->code) == BPF_ATOMIC && + insn->imm == BPF_XCHG) { + verbose(cbs->private_data, "(%02x) r%d = atomic%s_xchg((%s *)(r%d %+d), r%d)\n", + insn->code, insn->src_reg, + BPF_SIZE(insn->code) == BPF_DW ? "64" : "", + bpf_ldst_string[BPF_SIZE(insn->code) >> 3], + insn->dst_reg, insn->off, insn->src_reg); } else { verbose(cbs->private_data, "BUG_%02x\n", insn->code); } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 6aa1fc919761..89a4d154ab37 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3606,11 +3606,14 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn static int check_atomic(struct bpf_verifier_env *env, int insn_idx, struct bpf_insn *insn) { + int load_reg; int err; switch (insn->imm) { case BPF_ADD: case BPF_ADD | BPF_FETCH: + case BPF_XCHG: + case BPF_CMPXCHG: break; default: verbose(env, "BPF_ATOMIC uses invalid atomic opcode %02x\n", insn->imm); @@ -3632,6 +3635,13 @@ static int check_atomic(struct bpf_verifier_env *env, int insn_idx, struct bpf_i if (err) return err; + if (insn->imm == BPF_CMPXCHG) { + /* Check comparison of R0 with memory location */ + err = check_reg_arg(env, BPF_REG_0, SRC_OP); + if (err) + return err; + } + if (is_pointer_value(env, insn->src_reg)) { verbose(env, "R%d leaks addr into mem\n", insn->src_reg); return -EACCES; @@ -3662,8 +3672,13 @@ static int check_atomic(struct bpf_verifier_env *env, int insn_idx, struct bpf_i if (!(insn->imm & BPF_FETCH)) return 0; - /* check and record load of old value into src reg */ - err = check_reg_arg(env, insn->src_reg, DST_OP); + if (insn->imm == BPF_CMPXCHG) + load_reg = BPF_REG_0; + else + load_reg = insn->src_reg; + + /* check and record load of old value */ + err = check_reg_arg(env, load_reg, DST_OP); if (err) return err; diff --git a/tools/include/linux/filter.h b/tools/include/linux/filter.h index 7211ce9fba53..d75998b0d5ac 100644 --- a/tools/include/linux/filter.h +++ b/tools/include/linux/filter.h @@ -174,6 +174,8 @@ * * BPF_ADD *(uint *) (dst_reg + off16) += src_reg * BPF_ADD | BPF_FETCH src_reg = atomic_fetch_add(dst_reg + off16, src_reg); + * BPF_XCHG src_reg = atomic_xchg(dst_reg + off16, src_reg) + * BPF_CMPXCHG r0 = atomic_cmpxchg(dst_reg + off16, r0, src_reg) */ #define BPF_ATOMIC_OP(SIZE, OP, DST, SRC, OFF) \ diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index ea262b009049..c001766adcbc 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -45,7 +45,9 @@ #define BPF_EXIT 0x90 /* function return */ /* atomic op type fields (stored in immediate) */ -#define BPF_FETCH 0x01 /* fetch previous value into src reg */ +#define BPF_FETCH 0x01 /* not an opcode on its own, used to build others */ +#define BPF_XCHG (0xe0 | BPF_FETCH) /* atomic exchange */ +#define BPF_CMPXCHG (0xf0 | BPF_FETCH) /* atomic compare-and-write */ /* Register numbers */ enum { -- cgit v1.2.3 From 462910670e4ac91509829c5549bd0227668176fb Mon Sep 17 00:00:00 2001 From: Brendan Jackman Date: Thu, 14 Jan 2021 18:17:48 +0000 Subject: bpf: Pull out a macro for interpreting atomic ALU 
operations Since the atomic operations that are added in subsequent commits are all isomorphic with BPF_ADD, pull out a macro to avoid the interpreter becoming dominated by lines of atomic-related code. Note that this sacrificies interpreter performance (combining STX_ATOMIC_W and STX_ATOMIC_DW into single switch case means that we need an extra conditional branch to differentiate them) in favour of compact and (relatively!) simple C code. Signed-off-by: Brendan Jackman Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20210114181751.768687-9-jackmanb@google.com --- kernel/bpf/core.c | 80 +++++++++++++++++++++++++++---------------------------- 1 file changed, 39 insertions(+), 41 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 4df6daba43ef..8669e685825f 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1618,55 +1618,53 @@ out: LDX_PROBE(DW, 8) #undef LDX_PROBE - STX_ATOMIC_W: - switch (IMM) { - case BPF_ADD: - /* lock xadd *(u32 *)(dst_reg + off16) += src_reg */ - atomic_add((u32) SRC, (atomic_t *)(unsigned long) - (DST + insn->off)); - break; - case BPF_ADD | BPF_FETCH: - SRC = (u32) atomic_fetch_add( - (u32) SRC, - (atomic_t *)(unsigned long) (DST + insn->off)); - break; - case BPF_XCHG: - SRC = (u32) atomic_xchg( - (atomic_t *)(unsigned long) (DST + insn->off), - (u32) SRC); - break; - case BPF_CMPXCHG: - BPF_R0 = (u32) atomic_cmpxchg( - (atomic_t *)(unsigned long) (DST + insn->off), - (u32) BPF_R0, (u32) SRC); +#define ATOMIC_ALU_OP(BOP, KOP) \ + case BOP: \ + if (BPF_SIZE(insn->code) == BPF_W) \ + atomic_##KOP((u32) SRC, (atomic_t *)(unsigned long) \ + (DST + insn->off)); \ + else \ + atomic64_##KOP((u64) SRC, (atomic64_t *)(unsigned long) \ + (DST + insn->off)); \ + break; \ + case BOP | BPF_FETCH: \ + if (BPF_SIZE(insn->code) == BPF_W) \ + SRC = (u32) atomic_fetch_##KOP( \ + (u32) SRC, \ + (atomic_t *)(unsigned long) (DST + insn->off)); \ + else \ + SRC = (u64) atomic64_fetch_##KOP( \ + (u64) SRC, \ + (atomic64_t *)(unsigned long) (DST + insn->off)); \ break; - default: - goto default_label; - } - CONT; STX_ATOMIC_DW: + STX_ATOMIC_W: switch (IMM) { - case BPF_ADD: - /* lock xadd *(u64 *)(dst_reg + off16) += src_reg */ - atomic64_add((u64) SRC, (atomic64_t *)(unsigned long) - (DST + insn->off)); - break; - case BPF_ADD | BPF_FETCH: - SRC = (u64) atomic64_fetch_add( - (u64) SRC, - (atomic64_t *)(unsigned long) (DST + insn->off)); - break; + ATOMIC_ALU_OP(BPF_ADD, add) +#undef ATOMIC_ALU_OP + case BPF_XCHG: - SRC = (u64) atomic64_xchg( - (atomic64_t *)(unsigned long) (DST + insn->off), - (u64) SRC); + if (BPF_SIZE(insn->code) == BPF_W) + SRC = (u32) atomic_xchg( + (atomic_t *)(unsigned long) (DST + insn->off), + (u32) SRC); + else + SRC = (u64) atomic64_xchg( + (atomic64_t *)(unsigned long) (DST + insn->off), + (u64) SRC); break; case BPF_CMPXCHG: - BPF_R0 = (u64) atomic64_cmpxchg( - (atomic64_t *)(unsigned long) (DST + insn->off), - (u64) BPF_R0, (u64) SRC); + if (BPF_SIZE(insn->code) == BPF_W) + BPF_R0 = (u32) atomic_cmpxchg( + (atomic_t *)(unsigned long) (DST + insn->off), + (u32) BPF_R0, (u32) SRC); + else + BPF_R0 = (u64) atomic64_cmpxchg( + (atomic64_t *)(unsigned long) (DST + insn->off), + (u64) BPF_R0, (u64) SRC); break; + default: goto default_label; } -- cgit v1.2.3 From 981f94c3e92146705baf97fb417a5ed1ab1a79a5 Mon Sep 17 00:00:00 2001 From: Brendan Jackman Date: Thu, 14 Jan 2021 18:17:49 +0000 Subject: bpf: Add bitwise atomic instructions This adds instructions for 
atomic[64]_[fetch_]and atomic[64]_[fetch_]or atomic[64]_[fetch_]xor All these operations are isomorphic enough to implement with the same verifier, interpreter, and x86 JIT code, hence being a single commit. The main interesting thing here is that x86 doesn't directly support the fetch_ version these operations, so we need to generate a CMPXCHG loop in the JIT. This requires the use of two temporary registers, IIUC it's safe to use BPF_REG_AX and x86's AUX_REG for this purpose. Signed-off-by: Brendan Jackman Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20210114181751.768687-10-jackmanb@google.com --- arch/x86/net/bpf_jit_comp.c | 50 +++++++++++++++++++++++++++++++++++++++++++- include/linux/filter.h | 6 ++++++ kernel/bpf/core.c | 3 +++ kernel/bpf/disasm.c | 21 +++++++++++++++---- kernel/bpf/verifier.c | 6 ++++++ tools/include/linux/filter.h | 6 ++++++ 6 files changed, 87 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 308241187582..1d4d50199293 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -808,6 +808,10 @@ static int emit_atomic(u8 **pprog, u8 atomic_op, /* emit opcode */ switch (atomic_op) { case BPF_ADD: + case BPF_SUB: + case BPF_AND: + case BPF_OR: + case BPF_XOR: /* lock *(u32/u64*)(dst_reg + off) = src_reg */ EMIT1(simple_alu_opcodes[atomic_op]); break; @@ -1292,8 +1296,52 @@ st: if (is_imm8(insn->off)) case BPF_STX | BPF_ATOMIC | BPF_W: case BPF_STX | BPF_ATOMIC | BPF_DW: + if (insn->imm == (BPF_AND | BPF_FETCH) || + insn->imm == (BPF_OR | BPF_FETCH) || + insn->imm == (BPF_XOR | BPF_FETCH)) { + u8 *branch_target; + bool is64 = BPF_SIZE(insn->code) == BPF_DW; + + /* + * Can't be implemented with a single x86 insn. + * Need to do a CMPXCHG loop. + */ + + /* Will need RAX as a CMPXCHG operand so save R0 */ + emit_mov_reg(&prog, true, BPF_REG_AX, BPF_REG_0); + branch_target = prog; + /* Load old value */ + emit_ldx(&prog, BPF_SIZE(insn->code), + BPF_REG_0, dst_reg, insn->off); + /* + * Perform the (commutative) operation locally, + * put the result in the AUX_REG. + */ + emit_mov_reg(&prog, is64, AUX_REG, BPF_REG_0); + maybe_emit_mod(&prog, AUX_REG, src_reg, is64); + EMIT2(simple_alu_opcodes[BPF_OP(insn->imm)], + add_2reg(0xC0, AUX_REG, src_reg)); + /* Attempt to swap in new value */ + err = emit_atomic(&prog, BPF_CMPXCHG, + dst_reg, AUX_REG, insn->off, + BPF_SIZE(insn->code)); + if (WARN_ON(err)) + return err; + /* + * ZF tells us whether we won the race. If it's + * cleared we need to try again. 
+ */ EMIT2(X86_JNE, -(prog - branch_target) - 2); /* Return the pre-modification value */ emit_mov_reg(&prog, is64, src_reg, BPF_REG_0); /* Restore R0 after clobbering RAX */ emit_mov_reg(&prog, true, BPF_REG_0, BPF_REG_AX); break; + + } + err = emit_atomic(&prog, insn->imm, dst_reg, src_reg, - insn->off, BPF_SIZE(insn->code)); + insn->off, BPF_SIZE(insn->code)); if (err) return err; break; diff --git a/include/linux/filter.h b/include/linux/filter.h index d563820f197d..7fdce5407214 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -264,7 +264,13 @@ static inline bool insn_is_zext(const struct bpf_insn *insn) * Atomic operations: * * BPF_ADD *(uint *) (dst_reg + off16) += src_reg + * BPF_AND *(uint *) (dst_reg + off16) &= src_reg + * BPF_OR *(uint *) (dst_reg + off16) |= src_reg + * BPF_XOR *(uint *) (dst_reg + off16) ^= src_reg * BPF_ADD | BPF_FETCH src_reg = atomic_fetch_add(dst_reg + off16, src_reg); + * BPF_AND | BPF_FETCH src_reg = atomic_fetch_and(dst_reg + off16, src_reg); + * BPF_OR | BPF_FETCH src_reg = atomic_fetch_or(dst_reg + off16, src_reg); + * BPF_XOR | BPF_FETCH src_reg = atomic_fetch_xor(dst_reg + off16, src_reg); * BPF_XCHG src_reg = atomic_xchg(dst_reg + off16, src_reg) * BPF_CMPXCHG r0 = atomic_cmpxchg(dst_reg + off16, r0, src_reg) */ diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 8669e685825f..5bbd4884ff7a 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1642,6 +1642,9 @@ out: STX_ATOMIC_W: switch (IMM) { ATOMIC_ALU_OP(BPF_ADD, add) + ATOMIC_ALU_OP(BPF_AND, and) + ATOMIC_ALU_OP(BPF_OR, or) + ATOMIC_ALU_OP(BPF_XOR, xor) #undef ATOMIC_ALU_OP case BPF_XCHG: diff --git a/kernel/bpf/disasm.c b/kernel/bpf/disasm.c index ee8d1132767b..19ff8fed7f4b 100644 --- a/kernel/bpf/disasm.c +++ b/kernel/bpf/disasm.c @@ -80,6 +80,13 @@ const char *const bpf_alu_string[16] = { [BPF_END >> 4] = "endian", }; +static const char *const bpf_atomic_alu_string[16] = { + [BPF_ADD >> 4] = "add", + [BPF_AND >> 4] = "and", + [BPF_OR >> 4] = "or", + [BPF_XOR >> 4] = "xor", +}; + static const char *const bpf_ldst_string[] = { [BPF_W >> 3] = "u32", [BPF_H >> 3] = "u16", @@ -154,17 +161,23 @@ void print_bpf_insn(const struct bpf_insn_cbs *cbs, insn->dst_reg, insn->off, insn->src_reg); else if (BPF_MODE(insn->code) == BPF_ATOMIC && - insn->imm == BPF_ADD) { - verbose(cbs->private_data, "(%02x) lock *(%s *)(r%d %+d) += r%d\n", + (insn->imm == BPF_ADD || insn->imm == BPF_AND || + insn->imm == BPF_OR || insn->imm == BPF_XOR)) { + verbose(cbs->private_data, "(%02x) lock *(%s *)(r%d %+d) %s r%d\n", insn->code, bpf_ldst_string[BPF_SIZE(insn->code) >> 3], insn->dst_reg, insn->off, + bpf_alu_string[BPF_OP(insn->imm) >> 4], insn->src_reg); } else if (BPF_MODE(insn->code) == BPF_ATOMIC && - insn->imm == (BPF_ADD | BPF_FETCH)) { - verbose(cbs->private_data, "(%02x) r%d = atomic%s_fetch_add((%s *)(r%d %+d), r%d)\n", + (insn->imm == (BPF_ADD | BPF_FETCH) || + insn->imm == (BPF_AND | BPF_FETCH) || + insn->imm == (BPF_OR | BPF_FETCH) || + insn->imm == (BPF_XOR | BPF_FETCH))) { + verbose(cbs->private_data, "(%02x) r%d = atomic%s_fetch_%s((%s *)(r%d %+d), r%d)\n", insn->code, insn->src_reg, BPF_SIZE(insn->code) == BPF_DW ?
"64" : "", + bpf_atomic_alu_string[BPF_OP(insn->imm) >> 4], bpf_ldst_string[BPF_SIZE(insn->code) >> 3], insn->dst_reg, insn->off, insn->src_reg); } else if (BPF_MODE(insn->code) == BPF_ATOMIC && diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 89a4d154ab37..0f82d5d46e2c 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3612,6 +3612,12 @@ static int check_atomic(struct bpf_verifier_env *env, int insn_idx, struct bpf_i switch (insn->imm) { case BPF_ADD: case BPF_ADD | BPF_FETCH: + case BPF_AND: + case BPF_AND | BPF_FETCH: + case BPF_OR: + case BPF_OR | BPF_FETCH: + case BPF_XOR: + case BPF_XOR | BPF_FETCH: case BPF_XCHG: case BPF_CMPXCHG: break; diff --git a/tools/include/linux/filter.h b/tools/include/linux/filter.h index d75998b0d5ac..736bdeccdfe4 100644 --- a/tools/include/linux/filter.h +++ b/tools/include/linux/filter.h @@ -173,7 +173,13 @@ * Atomic operations: * * BPF_ADD *(uint *) (dst_reg + off16) += src_reg + * BPF_AND *(uint *) (dst_reg + off16) &= src_reg + * BPF_OR *(uint *) (dst_reg + off16) |= src_reg + * BPF_XOR *(uint *) (dst_reg + off16) ^= src_reg * BPF_ADD | BPF_FETCH src_reg = atomic_fetch_add(dst_reg + off16, src_reg); + * BPF_AND | BPF_FETCH src_reg = atomic_fetch_and(dst_reg + off16, src_reg); + * BPF_OR | BPF_FETCH src_reg = atomic_fetch_or(dst_reg + off16, src_reg); + * BPF_XOR | BPF_FETCH src_reg = atomic_fetch_xor(dst_reg + off16, src_reg); * BPF_XCHG src_reg = atomic_xchg(dst_reg + off16, src_reg) * BPF_CMPXCHG r0 = atomic_cmpxchg(dst_reg + off16, r0, src_reg) */ -- cgit v1.2.3 From bd7525dacd7e204f8cae061941fb9001c89d6988 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Thu, 14 Jan 2021 14:40:42 +0100 Subject: bpf: Move stack_map_get_build_id into lib Moving stack_map_get_build_id into lib with declaration in linux/buildid.h header: int build_id_parse(struct vm_area_struct *vma, unsigned char *build_id); This function returns build id for given struct vm_area_struct. There is no functional change to stack_map_get_build_id function. Signed-off-by: Jiri Olsa Signed-off-by: Alexei Starovoitov Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20210114134044.1418404-2-jolsa@kernel.org --- include/linux/buildid.h | 11 ++++ kernel/bpf/stackmap.c | 143 ++---------------------------------------------- lib/Makefile | 3 +- lib/buildid.c | 136 +++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 153 insertions(+), 140 deletions(-) create mode 100644 include/linux/buildid.h create mode 100644 lib/buildid.c (limited to 'kernel') diff --git a/include/linux/buildid.h b/include/linux/buildid.h new file mode 100644 index 000000000000..08028a212589 --- /dev/null +++ b/include/linux/buildid.h @@ -0,0 +1,11 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_BUILDID_H +#define _LINUX_BUILDID_H + +#include + +#define BUILD_ID_SIZE_MAX 20 + +int build_id_parse(struct vm_area_struct *vma, unsigned char *build_id); + +#endif diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index aea96b638473..55d254a59f07 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -7,10 +7,9 @@ #include #include #include -#include -#include #include #include +#include #include "percpu_freelist.h" #define STACK_CREATE_FLAG_MASK \ @@ -143,140 +142,6 @@ free_smap: return ERR_PTR(err); } -#define BPF_BUILD_ID 3 -/* - * Parse build id from the note segment. This logic can be shared between - * 32-bit and 64-bit system, because Elf32_Nhdr and Elf64_Nhdr are - * identical. 
- */ -static inline int stack_map_parse_build_id(void *page_addr, - unsigned char *build_id, - void *note_start, - Elf32_Word note_size) -{ - Elf32_Word note_offs = 0, new_offs; - - /* check for overflow */ - if (note_start < page_addr || note_start + note_size < note_start) - return -EINVAL; - - /* only supports note that fits in the first page */ - if (note_start + note_size > page_addr + PAGE_SIZE) - return -EINVAL; - - while (note_offs + sizeof(Elf32_Nhdr) < note_size) { - Elf32_Nhdr *nhdr = (Elf32_Nhdr *)(note_start + note_offs); - - if (nhdr->n_type == BPF_BUILD_ID && - nhdr->n_namesz == sizeof("GNU") && - nhdr->n_descsz > 0 && - nhdr->n_descsz <= BPF_BUILD_ID_SIZE) { - memcpy(build_id, - note_start + note_offs + - ALIGN(sizeof("GNU"), 4) + sizeof(Elf32_Nhdr), - nhdr->n_descsz); - memset(build_id + nhdr->n_descsz, 0, - BPF_BUILD_ID_SIZE - nhdr->n_descsz); - return 0; - } - new_offs = note_offs + sizeof(Elf32_Nhdr) + - ALIGN(nhdr->n_namesz, 4) + ALIGN(nhdr->n_descsz, 4); - if (new_offs <= note_offs) /* overflow */ - break; - note_offs = new_offs; - } - return -EINVAL; -} - -/* Parse build ID from 32-bit ELF */ -static int stack_map_get_build_id_32(void *page_addr, - unsigned char *build_id) -{ - Elf32_Ehdr *ehdr = (Elf32_Ehdr *)page_addr; - Elf32_Phdr *phdr; - int i; - - /* only supports phdr that fits in one page */ - if (ehdr->e_phnum > - (PAGE_SIZE - sizeof(Elf32_Ehdr)) / sizeof(Elf32_Phdr)) - return -EINVAL; - - phdr = (Elf32_Phdr *)(page_addr + sizeof(Elf32_Ehdr)); - - for (i = 0; i < ehdr->e_phnum; ++i) { - if (phdr[i].p_type == PT_NOTE && - !stack_map_parse_build_id(page_addr, build_id, - page_addr + phdr[i].p_offset, - phdr[i].p_filesz)) - return 0; - } - return -EINVAL; -} - -/* Parse build ID from 64-bit ELF */ -static int stack_map_get_build_id_64(void *page_addr, - unsigned char *build_id) -{ - Elf64_Ehdr *ehdr = (Elf64_Ehdr *)page_addr; - Elf64_Phdr *phdr; - int i; - - /* only supports phdr that fits in one page */ - if (ehdr->e_phnum > - (PAGE_SIZE - sizeof(Elf64_Ehdr)) / sizeof(Elf64_Phdr)) - return -EINVAL; - - phdr = (Elf64_Phdr *)(page_addr + sizeof(Elf64_Ehdr)); - - for (i = 0; i < ehdr->e_phnum; ++i) { - if (phdr[i].p_type == PT_NOTE && - !stack_map_parse_build_id(page_addr, build_id, - page_addr + phdr[i].p_offset, - phdr[i].p_filesz)) - return 0; - } - return -EINVAL; -} - -/* Parse build ID of ELF file mapped to vma */ -static int stack_map_get_build_id(struct vm_area_struct *vma, - unsigned char *build_id) -{ - Elf32_Ehdr *ehdr; - struct page *page; - void *page_addr; - int ret; - - /* only works for page backed storage */ - if (!vma->vm_file) - return -EINVAL; - - page = find_get_page(vma->vm_file->f_mapping, 0); - if (!page) - return -EFAULT; /* page not mapped */ - - ret = -EINVAL; - page_addr = kmap_atomic(page); - ehdr = (Elf32_Ehdr *)page_addr; - - /* compare magic x7f "ELF" */ - if (memcmp(ehdr->e_ident, ELFMAG, SELFMAG) != 0) - goto out; - - /* only support executable file and shared object file */ - if (ehdr->e_type != ET_EXEC && ehdr->e_type != ET_DYN) - goto out; - - if (ehdr->e_ident[EI_CLASS] == ELFCLASS32) - ret = stack_map_get_build_id_32(page_addr, build_id); - else if (ehdr->e_ident[EI_CLASS] == ELFCLASS64) - ret = stack_map_get_build_id_64(page_addr, build_id); -out: - kunmap_atomic(page_addr); - put_page(page); - return ret; -} - static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs, u64 *ips, u32 trace_nr, bool user) { @@ -317,18 +182,18 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs, 
for (i = 0; i < trace_nr; i++) { id_offs[i].status = BPF_STACK_BUILD_ID_IP; id_offs[i].ip = ips[i]; - memset(id_offs[i].build_id, 0, BPF_BUILD_ID_SIZE); + memset(id_offs[i].build_id, 0, BUILD_ID_SIZE_MAX); } return; } for (i = 0; i < trace_nr; i++) { vma = find_vma(current->mm, ips[i]); - if (!vma || stack_map_get_build_id(vma, id_offs[i].build_id)) { + if (!vma || build_id_parse(vma, id_offs[i].build_id)) { /* per entry fall back to ips */ id_offs[i].status = BPF_STACK_BUILD_ID_IP; id_offs[i].ip = ips[i]; - memset(id_offs[i].build_id, 0, BPF_BUILD_ID_SIZE); + memset(id_offs[i].build_id, 0, BUILD_ID_SIZE_MAX); continue; } id_offs[i].offset = (vma->vm_pgoff << PAGE_SHIFT) + ips[i] diff --git a/lib/Makefile b/lib/Makefile index afeff05fa8c5..a6b160c3a4fa 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -36,7 +36,8 @@ lib-y := ctype.o string.o vsprintf.o cmdline.o \ flex_proportions.o ratelimit.o show_mem.o \ is_single_threaded.o plist.o decompress.o kobject_uevent.o \ earlycpio.o seq_buf.o siphash.o dec_and_lock.o \ - nmi_backtrace.o nodemask.o win_minmax.o memcat_p.o + nmi_backtrace.o nodemask.o win_minmax.o memcat_p.o \ + buildid.o lib-$(CONFIG_PRINTK) += dump_stack.o lib-$(CONFIG_SMP) += cpumask.o diff --git a/lib/buildid.c b/lib/buildid.c new file mode 100644 index 000000000000..4a4f520c0e29 --- /dev/null +++ b/lib/buildid.c @@ -0,0 +1,136 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include + +#define BUILD_ID 3 +/* + * Parse build id from the note segment. This logic can be shared between + * 32-bit and 64-bit system, because Elf32_Nhdr and Elf64_Nhdr are + * identical. + */ +static inline int parse_build_id(void *page_addr, + unsigned char *build_id, + void *note_start, + Elf32_Word note_size) +{ + Elf32_Word note_offs = 0, new_offs; + + /* check for overflow */ + if (note_start < page_addr || note_start + note_size < note_start) + return -EINVAL; + + /* only supports note that fits in the first page */ + if (note_start + note_size > page_addr + PAGE_SIZE) + return -EINVAL; + + while (note_offs + sizeof(Elf32_Nhdr) < note_size) { + Elf32_Nhdr *nhdr = (Elf32_Nhdr *)(note_start + note_offs); + + if (nhdr->n_type == BUILD_ID && + nhdr->n_namesz == sizeof("GNU") && + nhdr->n_descsz > 0 && + nhdr->n_descsz <= BUILD_ID_SIZE_MAX) { + memcpy(build_id, + note_start + note_offs + + ALIGN(sizeof("GNU"), 4) + sizeof(Elf32_Nhdr), + nhdr->n_descsz); + memset(build_id + nhdr->n_descsz, 0, + BUILD_ID_SIZE_MAX - nhdr->n_descsz); + return 0; + } + new_offs = note_offs + sizeof(Elf32_Nhdr) + + ALIGN(nhdr->n_namesz, 4) + ALIGN(nhdr->n_descsz, 4); + if (new_offs <= note_offs) /* overflow */ + break; + note_offs = new_offs; + } + return -EINVAL; +} + +/* Parse build ID from 32-bit ELF */ +static int get_build_id_32(void *page_addr, unsigned char *build_id) +{ + Elf32_Ehdr *ehdr = (Elf32_Ehdr *)page_addr; + Elf32_Phdr *phdr; + int i; + + /* only supports phdr that fits in one page */ + if (ehdr->e_phnum > + (PAGE_SIZE - sizeof(Elf32_Ehdr)) / sizeof(Elf32_Phdr)) + return -EINVAL; + + phdr = (Elf32_Phdr *)(page_addr + sizeof(Elf32_Ehdr)); + + for (i = 0; i < ehdr->e_phnum; ++i) { + if (phdr[i].p_type == PT_NOTE && + !parse_build_id(page_addr, build_id, + page_addr + phdr[i].p_offset, + phdr[i].p_filesz)) + return 0; + } + return -EINVAL; +} + +/* Parse build ID from 64-bit ELF */ +static int get_build_id_64(void *page_addr, unsigned char *build_id) +{ + Elf64_Ehdr *ehdr = (Elf64_Ehdr *)page_addr; + Elf64_Phdr *phdr; + int i; + + /* only supports phdr that fits in one page */ + if 
(ehdr->e_phnum > + (PAGE_SIZE - sizeof(Elf64_Ehdr)) / sizeof(Elf64_Phdr)) + return -EINVAL; + + phdr = (Elf64_Phdr *)(page_addr + sizeof(Elf64_Ehdr)); + + for (i = 0; i < ehdr->e_phnum; ++i) { + if (phdr[i].p_type == PT_NOTE && + !parse_build_id(page_addr, build_id, + page_addr + phdr[i].p_offset, + phdr[i].p_filesz)) + return 0; + } + return -EINVAL; +} + +/* Parse build ID of ELF file mapped to vma */ +int build_id_parse(struct vm_area_struct *vma, unsigned char *build_id) +{ + Elf32_Ehdr *ehdr; + struct page *page; + void *page_addr; + int ret; + + /* only works for page backed storage */ + if (!vma->vm_file) + return -EINVAL; + + page = find_get_page(vma->vm_file->f_mapping, 0); + if (!page) + return -EFAULT; /* page not mapped */ + + ret = -EINVAL; + page_addr = kmap_atomic(page); + ehdr = (Elf32_Ehdr *)page_addr; + + /* compare magic x7f "ELF" */ + if (memcmp(ehdr->e_ident, ELFMAG, SELFMAG) != 0) + goto out; + + /* only support executable file and shared object file */ + if (ehdr->e_type != ET_EXEC && ehdr->e_type != ET_DYN) + goto out; + + if (ehdr->e_ident[EI_CLASS] == ELFCLASS32) + ret = get_build_id_32(page_addr, build_id); + else if (ehdr->e_ident[EI_CLASS] == ELFCLASS64) + ret = get_build_id_64(page_addr, build_id); +out: + kunmap_atomic(page_addr); + put_page(page); + return ret; +} -- cgit v1.2.3 From 921f88fc891922b325b3668cd026a386571ed602 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Thu, 14 Jan 2021 14:40:43 +0100 Subject: bpf: Add size arg to build_id_parse function It's possible to have other build id types (other than default SHA1). Currently there's also ld support for MD5 build id. Adding size argument to build_id_parse function, that returns (if defined) size of the parsed build id, so we can recognize the build id type. Signed-off-by: Jiri Olsa Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20210114134044.1418404-3-jolsa@kernel.org --- include/linux/buildid.h | 3 ++- kernel/bpf/stackmap.c | 2 +- lib/buildid.c | 29 +++++++++++++++++++++-------- 3 files changed, 24 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/include/linux/buildid.h b/include/linux/buildid.h index 08028a212589..40232f90db6e 100644 --- a/include/linux/buildid.h +++ b/include/linux/buildid.h @@ -6,6 +6,7 @@ #define BUILD_ID_SIZE_MAX 20 -int build_id_parse(struct vm_area_struct *vma, unsigned char *build_id); +int build_id_parse(struct vm_area_struct *vma, unsigned char *build_id, + __u32 *size); #endif diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index 55d254a59f07..cabaf7db8efc 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -189,7 +189,7 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs, for (i = 0; i < trace_nr; i++) { vma = find_vma(current->mm, ips[i]); - if (!vma || build_id_parse(vma, id_offs[i].build_id)) { + if (!vma || build_id_parse(vma, id_offs[i].build_id, NULL)) { /* per entry fall back to ips */ id_offs[i].status = BPF_STACK_BUILD_ID_IP; id_offs[i].ip = ips[i]; diff --git a/lib/buildid.c b/lib/buildid.c index 4a4f520c0e29..6156997c3895 100644 --- a/lib/buildid.c +++ b/lib/buildid.c @@ -12,6 +12,7 @@ */ static inline int parse_build_id(void *page_addr, unsigned char *build_id, + __u32 *size, void *note_start, Elf32_Word note_size) { @@ -38,6 +39,8 @@ static inline int parse_build_id(void *page_addr, nhdr->n_descsz); memset(build_id + nhdr->n_descsz, 0, BUILD_ID_SIZE_MAX - nhdr->n_descsz); + if (size) + *size = nhdr->n_descsz; return 0; } new_offs = note_offs + sizeof(Elf32_Nhdr) + @@ 
-50,7 +53,8 @@ static inline int parse_build_id(void *page_addr, } /* Parse build ID from 32-bit ELF */ -static int get_build_id_32(void *page_addr, unsigned char *build_id) +static int get_build_id_32(void *page_addr, unsigned char *build_id, + __u32 *size) { Elf32_Ehdr *ehdr = (Elf32_Ehdr *)page_addr; Elf32_Phdr *phdr; @@ -65,7 +69,7 @@ static int get_build_id_32(void *page_addr, unsigned char *build_id) for (i = 0; i < ehdr->e_phnum; ++i) { if (phdr[i].p_type == PT_NOTE && - !parse_build_id(page_addr, build_id, + !parse_build_id(page_addr, build_id, size, page_addr + phdr[i].p_offset, phdr[i].p_filesz)) return 0; @@ -74,7 +78,8 @@ static int get_build_id_32(void *page_addr, unsigned char *build_id) } /* Parse build ID from 64-bit ELF */ -static int get_build_id_64(void *page_addr, unsigned char *build_id) +static int get_build_id_64(void *page_addr, unsigned char *build_id, + __u32 *size) { Elf64_Ehdr *ehdr = (Elf64_Ehdr *)page_addr; Elf64_Phdr *phdr; @@ -89,7 +94,7 @@ static int get_build_id_64(void *page_addr, unsigned char *build_id) for (i = 0; i < ehdr->e_phnum; ++i) { if (phdr[i].p_type == PT_NOTE && - !parse_build_id(page_addr, build_id, + !parse_build_id(page_addr, build_id, size, page_addr + phdr[i].p_offset, phdr[i].p_filesz)) return 0; @@ -97,8 +102,16 @@ static int get_build_id_64(void *page_addr, unsigned char *build_id) return -EINVAL; } -/* Parse build ID of ELF file mapped to vma */ -int build_id_parse(struct vm_area_struct *vma, unsigned char *build_id) +/* + * Parse build ID of ELF file mapped to vma + * @vma: vma object + * @build_id: buffer to store build id, at least BUILD_ID_SIZE long + * @size: returns actual build id size in case of success + * + * Returns 0 on success, otherwise error (< 0). + */ +int build_id_parse(struct vm_area_struct *vma, unsigned char *build_id, + __u32 *size) { Elf32_Ehdr *ehdr; struct page *page; @@ -126,9 +139,9 @@ int build_id_parse(struct vm_area_struct *vma, unsigned char *build_id) goto out; if (ehdr->e_ident[EI_CLASS] == ELFCLASS32) - ret = get_build_id_32(page_addr, build_id); + ret = get_build_id_32(page_addr, build_id, size); else if (ehdr->e_ident[EI_CLASS] == ELFCLASS64) - ret = get_build_id_64(page_addr, build_id); + ret = get_build_id_64(page_addr, build_id, size); out: kunmap_atomic(page_addr); put_page(page); -- cgit v1.2.3 From 88a16a1309333e43d328621ece3e9fa37027e8eb Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Thu, 14 Jan 2021 14:40:44 +0100 Subject: perf: Add build id data in mmap2 event Adding support to carry build id data in mmap2 event. The build id data replaces maj/min/ino/ino_generation fields, which are also used to identify map's binary, so it's ok to replace them with build id data: union { struct { u32 maj; u32 min; u64 ino; u64 ino_generation; }; struct { u8 build_id_size; u8 __reserved_1; u16 __reserved_2; u8 build_id[20]; }; }; Replaced maj/min/ino/ino_generation fields give us size of 24 bytes. We use 20 bytes for build id data, 1 byte for size and rest is unused. 
There's new misc bit for mmap2 to signal there's build id data in it: #define PERF_RECORD_MISC_MMAP_BUILD_ID (1 << 14) Signed-off-by: Jiri Olsa Signed-off-by: Alexei Starovoitov Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/bpf/20210114134044.1418404-4-jolsa@kernel.org --- include/uapi/linux/perf_event.h | 42 ++++++++++++++++++++++++++++++++++++----- kernel/events/core.c | 32 +++++++++++++++++++++++++++---- 2 files changed, 65 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index b15e3447cd9f..cb6f84103560 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -386,7 +386,8 @@ struct perf_event_attr { aux_output : 1, /* generate AUX records instead of events */ cgroup : 1, /* include cgroup events */ text_poke : 1, /* include text poke events */ - __reserved_1 : 30; + build_id : 1, /* use build id in mmap2 events */ + __reserved_1 : 29; union { __u32 wakeup_events; /* wakeup every n events */ @@ -659,6 +660,22 @@ struct perf_event_mmap_page { __u64 aux_size; }; +/* + * The current state of perf_event_header::misc bits usage: + * ('|' used bit, '-' unused bit) + * + * 012 CDEF + * |||---------|||| + * + * Where: + * 0-2 CPUMODE_MASK + * + * C PROC_MAP_PARSE_TIMEOUT + * D MMAP_DATA / COMM_EXEC / FORK_EXEC / SWITCH_OUT + * E MMAP_BUILD_ID / EXACT_IP / SCHED_OUT_PREEMPT + * F (reserved) + */ + #define PERF_RECORD_MISC_CPUMODE_MASK (7 << 0) #define PERF_RECORD_MISC_CPUMODE_UNKNOWN (0 << 0) #define PERF_RECORD_MISC_KERNEL (1 << 0) @@ -690,6 +707,7 @@ struct perf_event_mmap_page { * * PERF_RECORD_MISC_EXACT_IP - PERF_RECORD_SAMPLE of precise events * PERF_RECORD_MISC_SWITCH_OUT_PREEMPT - PERF_RECORD_SWITCH* events + * PERF_RECORD_MISC_MMAP_BUILD_ID - PERF_RECORD_MMAP2 event * * * PERF_RECORD_MISC_EXACT_IP: @@ -699,9 +717,13 @@ struct perf_event_mmap_page { * * PERF_RECORD_MISC_SWITCH_OUT_PREEMPT: * Indicates that thread was preempted in TASK_RUNNING state. + * + * PERF_RECORD_MISC_MMAP_BUILD_ID: + * Indicates that mmap2 event carries build id data. 
*/ #define PERF_RECORD_MISC_EXACT_IP (1 << 14) #define PERF_RECORD_MISC_SWITCH_OUT_PREEMPT (1 << 14) +#define PERF_RECORD_MISC_MMAP_BUILD_ID (1 << 14) /* * Reserve the last bit to indicate some extended misc field */ @@ -915,10 +937,20 @@ enum perf_event_type { * u64 addr; * u64 len; * u64 pgoff; - * u32 maj; - * u32 min; - * u64 ino; - * u64 ino_generation; + * union { + * struct { + * u32 maj; + * u32 min; + * u64 ino; + * u64 ino_generation; + * }; + * struct { + * u8 build_id_size; + * u8 __reserved_1; + * u16 __reserved_2; + * u8 build_id[20]; + * }; + * }; * u32 prot, flags; * char filename[]; * struct sample_id sample_id; diff --git a/kernel/events/core.c b/kernel/events/core.c index 55d18791a72d..c37401e3e5f7 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -53,6 +53,7 @@ #include #include #include +#include #include "internal.h" @@ -397,6 +398,7 @@ static atomic_t nr_ksymbol_events __read_mostly; static atomic_t nr_bpf_events __read_mostly; static atomic_t nr_cgroup_events __read_mostly; static atomic_t nr_text_poke_events __read_mostly; +static atomic_t nr_build_id_events __read_mostly; static LIST_HEAD(pmus); static DEFINE_MUTEX(pmus_lock); @@ -4673,6 +4675,8 @@ static void unaccount_event(struct perf_event *event) dec = true; if (event->attr.mmap || event->attr.mmap_data) atomic_dec(&nr_mmap_events); + if (event->attr.build_id) + atomic_dec(&nr_build_id_events); if (event->attr.comm) atomic_dec(&nr_comm_events); if (event->attr.namespaces) @@ -8046,6 +8050,8 @@ struct perf_mmap_event { u64 ino; u64 ino_generation; u32 prot, flags; + u8 build_id[BUILD_ID_SIZE_MAX]; + u32 build_id_size; struct { struct perf_event_header header; @@ -8077,6 +8083,7 @@ static void perf_event_mmap_output(struct perf_event *event, struct perf_sample_data sample; int size = mmap_event->event_id.header.size; u32 type = mmap_event->event_id.header.type; + bool use_build_id; int ret; if (!perf_event_mmap_match(event, data)) @@ -8101,13 +8108,25 @@ static void perf_event_mmap_output(struct perf_event *event, mmap_event->event_id.pid = perf_event_pid(event, current); mmap_event->event_id.tid = perf_event_tid(event, current); + use_build_id = event->attr.build_id && mmap_event->build_id_size; + + if (event->attr.mmap2 && use_build_id) + mmap_event->event_id.header.misc |= PERF_RECORD_MISC_MMAP_BUILD_ID; + perf_output_put(&handle, mmap_event->event_id); if (event->attr.mmap2) { - perf_output_put(&handle, mmap_event->maj); - perf_output_put(&handle, mmap_event->min); - perf_output_put(&handle, mmap_event->ino); - perf_output_put(&handle, mmap_event->ino_generation); + if (use_build_id) { + u8 size[4] = { (u8) mmap_event->build_id_size, 0, 0, 0 }; + + __output_copy(&handle, size, 4); + __output_copy(&handle, mmap_event->build_id, BUILD_ID_SIZE_MAX); + } else { + perf_output_put(&handle, mmap_event->maj); + perf_output_put(&handle, mmap_event->min); + perf_output_put(&handle, mmap_event->ino); + perf_output_put(&handle, mmap_event->ino_generation); + } perf_output_put(&handle, mmap_event->prot); perf_output_put(&handle, mmap_event->flags); } @@ -8236,6 +8255,9 @@ got_name: mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size; + if (atomic_read(&nr_build_id_events)) + build_id_parse(vma, mmap_event->build_id, &mmap_event->build_id_size); + perf_iterate_sb(perf_event_mmap_output, mmap_event, NULL); @@ -11172,6 +11194,8 @@ static void account_event(struct perf_event *event) inc = true; if (event->attr.mmap || event->attr.mmap_data) atomic_inc(&nr_mmap_events); + if 
(event->attr.build_id) + atomic_inc(&nr_build_id_events); if (event->attr.comm) atomic_inc(&nr_comm_events); if (event->attr.namespaces) -- cgit v1.2.3 From 668af87f995b6d6d09595c088ad1fb5dd9ff25d2 Mon Sep 17 00:00:00 2001 From: John Ogness Date: Wed, 13 Jan 2021 15:48:34 +0106 Subject: printk: ringbuffer: fix line counting Counting text lines in a record simply involves counting the number of newline characters (+1). However, it is searching the full data block for newline characters, even though the text data can be (and often is) a subset of that area. Since the extra area in the data block was never initialized, the result is that extra newlines may be seen and counted. Restrict newline searching to the text data length. Fixes: b6cf8b3f3312 ("printk: add lockless ringbuffer") Signed-off-by: John Ogness Reviewed-by: Petr Mladek Acked-by: Sergey Senozhatsky Signed-off-by: Petr Mladek Link: https://lore.kernel.org/r/20210113144234.6545-1-john.ogness@linutronix.de --- kernel/printk/printk_ringbuffer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/printk/printk_ringbuffer.c b/kernel/printk/printk_ringbuffer.c index 74e25a1704f2..617dd6358965 100644 --- a/kernel/printk/printk_ringbuffer.c +++ b/kernel/printk/printk_ringbuffer.c @@ -1720,7 +1720,7 @@ static bool copy_data(struct prb_data_ring *data_ring, /* Caller interested in the line count? */ if (line_count) - *line_count = count_lines(data, data_size); + *line_count = count_lines(data, len); /* Caller interested in the data content? */ if (!buf || !buf_size) -- cgit v1.2.3 From 89ccf18f032f26946e2ea6258120472eec6aa745 Mon Sep 17 00:00:00 2001 From: John Ogness Date: Wed, 13 Jan 2021 17:50:13 +0106 Subject: printk: fix kmsg_dump_get_buffer length calculations kmsg_dump_get_buffer() uses @syslog to determine if the syslog prefix should be written to the buffer. However, when calculating the maximum number of records that can fit into the buffer, it always counts the bytes from the syslog prefix. Use @syslog when calculating the maximum number of records that can fit into the buffer.
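To see why counting unused prefix bytes matters, here is a small standalone sketch of the capacity calculation; the record and prefix sizes are invented numbers, and record_print_size() is a stand-in, not the kernel helper:

  #include <stddef.h>
  #include <stdio.h>

  /* Bytes one record needs in the output buffer: its text, a newline,
   * and the syslog prefix only when the caller asked for it. */
  static size_t record_print_size(size_t text_len, size_t prefix_len, int syslog)
  {
          return text_len + (syslog ? prefix_len : 0) + 1;
  }

  static int records_that_fit(const size_t *text_len, int n,
                              size_t buf_size, size_t prefix_len, int syslog)
  {
          size_t used = 0;
          int count = 0;

          for (int i = 0; i < n; i++) {
                  size_t need = record_print_size(text_len[i], prefix_len, syslog);

                  if (used + need > buf_size)
                          break;
                  used += need;
                  count++;
          }
          return count;
  }

  int main(void)
  {
          const size_t text_len[] = { 10, 20, 30, 40 };

          /* Old behaviour: prefix always counted, even when not printed. */
          printf("counted with prefix:    %d\n",
                 records_that_fit(text_len, 4, 64, 16, 1));
          /* Fixed behaviour for syslog == false: count only what is printed. */
          printf("counted without prefix: %d\n",
                 records_that_fit(text_len, 4, 64, 16, 0));
          return 0;
  }

With a 64-byte buffer the old calculation admits only two of these records, even though three fit once the prefix is not emitted.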
Fixes: e2ae715d66bf ("kmsg - kmsg_dump() use iterator to receive log buffer content") Signed-off-by: John Ogness Reviewed-by: Petr Mladek Acked-by: Sergey Senozhatsky Signed-off-by: Petr Mladek Link: https://lore.kernel.org/r/20210113164413.1599-1-john.ogness@linutronix.de --- kernel/printk/printk.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index c8847ee571f0..96e24074c962 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -3423,7 +3423,7 @@ bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog, while (prb_read_valid_info(prb, seq, &info, &line_count)) { if (r.info->seq >= dumper->next_seq) break; - l += get_record_print_text_size(&info, line_count, true, time); + l += get_record_print_text_size(&info, line_count, syslog, time); seq = r.info->seq + 1; } @@ -3433,7 +3433,7 @@ bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog, &info, &line_count)) { if (r.info->seq >= dumper->next_seq) break; - l -= get_record_print_text_size(&info, line_count, true, time); + l -= get_record_print_text_size(&info, line_count, syslog, time); seq = r.info->seq + 1; } -- cgit v1.2.3 From 61e960b07b637f0295308ad91268501d744c21b5 Mon Sep 17 00:00:00 2001 From: Chen Zhou Date: Fri, 15 Jan 2021 17:37:17 +0800 Subject: cgroup-v1: add disabled controller check in cgroup1_parse_param() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When mounting a cgroup hierarchy with disabled controller in cgroup v1, all available controllers will be attached. For example, boot with cgroup_no_v1=cpu or cgroup_disable=cpu, and then mount with "mount -t cgroup -ocpu cpu /sys/fs/cgroup/cpu", then all enabled controllers will be attached except cpu. Fix this by adding disabled controller check in cgroup1_parse_param(). If the specified controller is disabled, just return error with information "Disabled controller xx" rather than attaching all the other enabled controllers. Fixes: f5dfb5315d34 ("cgroup: take options parsing into ->parse_monolithic()") Signed-off-by: Chen Zhou Reviewed-by: Zefan Li Reviewed-by: Michal Koutný Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup-v1.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index 32596fdbcd5b..a5751784ad74 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -917,6 +917,9 @@ int cgroup1_parse_param(struct fs_context *fc, struct fs_parameter *param) for_each_subsys(ss, i) { if (strcmp(param->key, ss->legacy_name)) continue; + if (!cgroup_ssid_enabled(i) || cgroup1_ssid_disabled(i)) + return invalfc(fc, "Disabled controller '%s'", + param->key); ctx->subsys_mask |= (1 << i); return 0; } -- cgit v1.2.3 From f0e386ee0c0b71ea6f7238506a4d0965a2dbef11 Mon Sep 17 00:00:00 2001 From: John Ogness Date: Thu, 14 Jan 2021 18:10:12 +0106 Subject: printk: fix buffer overflow potential for print_text() Before the commit 896fbe20b4e2333fb55 ("printk: use the lockless ringbuffer"), msg_print_text() would only write up to size-1 bytes into the provided buffer. Some callers expect this behavior and append a terminator to returned string. In particular: arch/powerpc/xmon/xmon.c:dump_log_buf() arch/um/kernel/kmsg_dump.c:kmsg_dumper_stdout() msg_print_text() has been replaced by record_print_text(), which currently fills the full size of the buffer. This causes a buffer overflow for the above callers. 
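The caller pattern that gets broken looks roughly like this (a simplified userspace sketch, not the actual xmon or UML dumper code; fill_text() is a stand-in for the record formatting helper):

  #include <stdio.h>
  #include <string.h>

  /* Stand-in formatter: a well-behaved one must use at most size-1
   * bytes so the caller can append its own terminator. */
  static size_t fill_text(char *buf, size_t size)
  {
          const char *msg = "example log line";
          size_t len = strlen(msg);

          if (len > size - 1)
                  len = size - 1;
          memcpy(buf, msg, len);
          return len;
  }

  int main(void)
  {
          char buf[8];
          size_t len = fill_text(buf, sizeof(buf));

          /* If fill_text() had used all 8 bytes, this write would land
           * one byte past the end of buf. */
          buf[len] = '\0';
          puts(buf);
          return 0;
  }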
Change record_print_text() so that it will only use size-1 bytes for text data. Also, for paranoia sakes, add a terminator after the text data. And finally, document this behavior so that it is clear that only size-1 bytes are used and a terminator is added. Fixes: 896fbe20b4e2333fb55 ("printk: use the lockless ringbuffer") Cc: stable@vger.kernel.org # 5.10+ Signed-off-by: John Ogness Reviewed-by: Petr Mladek Acked-by: Sergey Senozhatsky Signed-off-by: Petr Mladek Link: https://lore.kernel.org/r/20210114170412.4819-1-john.ogness@linutronix.de --- kernel/printk/printk.c | 36 +++++++++++++++++++++++++++--------- 1 file changed, 27 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 96e24074c962..17fa6dc77053 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -1292,11 +1292,16 @@ static size_t info_print_prefix(const struct printk_info *info, bool syslog, * done: * * - Add prefix for each line. + * - Drop truncated lines that no longer fit into the buffer. * - Add the trailing newline that has been removed in vprintk_store(). - * - Drop truncated lines that do not longer fit into the buffer. + * - Add a string terminator. + * + * Since the produced string is always terminated, the maximum possible + * return value is @r->text_buf_size - 1; * * Return: The length of the updated/prepared text, including the added - * prefixes and the newline. The dropped line(s) are not counted. + * prefixes and the newline. The terminator is not counted. The dropped + * line(s) are not counted. */ static size_t record_print_text(struct printk_record *r, bool syslog, bool time) @@ -1339,26 +1344,31 @@ static size_t record_print_text(struct printk_record *r, bool syslog, /* * Truncate the text if there is not enough space to add the - * prefix and a trailing newline. + * prefix and a trailing newline and a terminator. */ - if (len + prefix_len + text_len + 1 > buf_size) { + if (len + prefix_len + text_len + 1 + 1 > buf_size) { /* Drop even the current line if no space. */ - if (len + prefix_len + line_len + 1 > buf_size) + if (len + prefix_len + line_len + 1 + 1 > buf_size) break; - text_len = buf_size - len - prefix_len - 1; + text_len = buf_size - len - prefix_len - 1 - 1; truncated = true; } memmove(text + prefix_len, text, text_len); memcpy(text, prefix, prefix_len); + /* + * Increment the prepared length to include the text and + * prefix that were just moved+copied. Also increment for the + * newline at the end of this line. If this is the last line, + * there is no newline, but it will be added immediately below. + */ len += prefix_len + line_len + 1; - if (text_len == line_len) { /* - * Add the trailing newline removed in - * vprintk_store(). + * This is the last line. Add the trailing newline + * removed in vprintk_store(). */ text[prefix_len + line_len] = '\n'; break; @@ -1383,6 +1393,14 @@ static size_t record_print_text(struct printk_record *r, bool syslog, text_len -= line_len + 1; } + /* + * If a buffer was provided, it will be terminated. Space for the + * string terminator is guaranteed to be available. The terminator is + * not counted in the return value. + */ + if (buf_size > 0) + text[len] = 0; + return len; } -- cgit v1.2.3 From 385aac1519417b89cb91b77c22e4ca21db563cd0 Mon Sep 17 00:00:00 2001 From: Odin Ugedal Date: Sat, 16 Jan 2021 18:36:33 +0100 Subject: cgroup: fix psi monitor for root cgroup Fix NULL pointer dereference when adding new psi monitor to the root cgroup. 
PSI files for the root cgroup were introduced in df5ba5be742 by using the system-wide psi struct when reading, but file write/monitor was not properly fixed. Since the PSI config for the root cgroup isn't initialized, the current implementation tries to lock a NULL ptr, resulting in a crash. Can be triggered by running this as root: $ tee /sys/fs/cgroup/cpu.pressure <<< "some 10000 1000000" Signed-off-by: Odin Ugedal Reviewed-by: Suren Baghdasaryan Acked-by: Dan Schatzberg Fixes: df5ba5be7425 ("kernel/sched/psi.c: expose pressure metrics on root cgroup") Acked-by: Johannes Weiner Cc: stable@vger.kernel.org # 5.2+ Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 613845769103..1ea995f801ec 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -3564,6 +3564,7 @@ static ssize_t cgroup_pressure_write(struct kernfs_open_file *of, char *buf, { struct psi_trigger *new; struct cgroup *cgrp; + struct psi_group *psi; cgrp = cgroup_kn_lock_live(of->kn, false); if (!cgrp) @@ -3572,7 +3573,8 @@ static ssize_t cgroup_pressure_write(struct kernfs_open_file *of, char *buf, cgroup_get(cgrp); cgroup_kn_unlock(of->kn); - new = psi_trigger_create(&cgrp->psi, buf, nbytes, res); + psi = cgroup_ino(cgrp) == 1 ? &psi_system : &cgrp->psi; + new = psi_trigger_create(psi, buf, nbytes, res); if (IS_ERR(new)) { cgroup_put(cgrp); return PTR_ERR(new); -- cgit v1.2.3 From 301a33d51880619d0c5a581b5a48d3a5248fa84b Mon Sep 17 00:00:00 2001 From: Mircea Cirjaliu Date: Tue, 19 Jan 2021 21:53:18 +0100 Subject: bpf: Fix helper bpf_map_peek_elem_proto pointing to wrong callback I assume this was obtained by copy/paste. Point it to bpf_map_peek_elem() instead of bpf_map_pop_elem(). In practice it may have been less likely hit when under JIT given it was shielded via 84430d4232c3 ("bpf, verifier: avoid retpoline for map push/pop/peek operation"). Fixes: f1a2e44a3aec ("bpf: add queue and stack maps") Signed-off-by: Mircea Cirjaliu Signed-off-by: Daniel Borkmann Cc: Mauricio Vasquez Link: https://lore.kernel.org/bpf/AM7PR02MB6082663DFDCCE8DA7A6DD6B1BBA30@AM7PR02MB6082.eurprd02.prod.outlook.com --- kernel/bpf/helpers.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index bd8a3183d030..41ca280b1dc1 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -108,7 +108,7 @@ BPF_CALL_2(bpf_map_peek_elem, struct bpf_map *, map, void *, value) } const struct bpf_func_proto bpf_map_peek_elem_proto = { - .func = bpf_map_pop_elem, + .func = bpf_map_peek_elem, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_CONST_MAP_PTR, -- cgit v1.2.3 From bc895e8b2a64e502fbba72748d59618272052a8b Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 20 Jan 2021 00:24:24 +0100 Subject: bpf: Fix signed_{sub,add32}_overflows type handling Fix incorrect signed_{sub,add32}_overflows() input types (and a related buggy comment). It looks like this might have slipped in via a copy/paste issue, also given prior to 3f50f132d840 ("bpf: Verifier, do explicit ALU32 bounds tracking") the signature of signed_sub_overflows() had s64 a and s64 b as its input args whereas now they are truncated to s32. Thus restore proper types. Also, the case of signed_add32_overflows() is not consistent with signed_sub32_overflows(). Both have s32 as inputs, therefore align the former.
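For reference, the overflow checks in question follow the usual compute-in-unsigned pattern; the standalone sketch below mirrors the idea with fixed-width types, is not the verifier's exact code, and relies on the common two's-complement conversion back to signed:

  #include <stdbool.h>
  #include <stdint.h>
  #include <stdio.h>

  static bool add64_overflows(int64_t a, int64_t b)
  {
          /* Add in u64 where wrap-around is well defined, then check the
           * result against the operands' signs. */
          int64_t res = (int64_t)((uint64_t)a + (uint64_t)b);

          if (b < 0)
                  return res > a;
          return res < a;
  }

  static bool add32_overflows(int32_t a, int32_t b)
  {
          /* The 32-bit variant must take s32 inputs; passing s64 here
           * would silently truncate and hide the overflow. */
          int32_t res = (int32_t)((uint32_t)a + (uint32_t)b);

          if (b < 0)
                  return res > a;
          return res < a;
  }

  int main(void)
  {
          printf("%d\n", add32_overflows(INT32_MAX, 1));  /* 1: overflows */
          printf("%d\n", add64_overflows(INT64_MAX, -1)); /* 0: fine */
          return 0;
  }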
Fixes: 3f50f132d840 ("bpf: Verifier, do explicit ALU32 bounds tracking") Reported-by: De4dCr0w Signed-off-by: Daniel Borkmann Reviewed-by: John Fastabend Acked-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 36af69fac591..e7368c5eacb7 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5313,7 +5313,7 @@ static bool signed_add_overflows(s64 a, s64 b) return res < a; } -static bool signed_add32_overflows(s64 a, s64 b) +static bool signed_add32_overflows(s32 a, s32 b) { /* Do the add in u32, where overflow is well-defined */ s32 res = (s32)((u32)a + (u32)b); @@ -5323,7 +5323,7 @@ static bool signed_add32_overflows(s64 a, s64 b) return res < a; } -static bool signed_sub_overflows(s32 a, s32 b) +static bool signed_sub_overflows(s64 a, s64 b) { /* Do the sub in u64, where overflow is well-defined */ s64 res = (s64)((u64)a - (u64)b); @@ -5335,7 +5335,7 @@ static bool signed_sub_overflows(s32 a, s32 b) static bool signed_sub32_overflows(s32 a, s32 b) { - /* Do the sub in u64, where overflow is well-defined */ + /* Do the sub in u32, where overflow is well-defined */ s32 res = (s32)((u32)a - (u32)b); if (b < 0) -- cgit v1.2.3 From 97a0e1ea7b41c2db762c1258632f6ccc22719510 Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Tue, 12 Jan 2021 19:26:12 +0100 Subject: net, xdp: Introduce __xdp_build_skb_from_frame utility routine Introduce __xdp_build_skb_from_frame utility routine to build the skb from xdp_frame. Rely on __xdp_build_skb_from_frame in cpumap code. Signed-off-by: Lorenzo Bianconi Signed-off-by: Daniel Borkmann Acked-by: Jesper Dangaard Brouer Link: https://lore.kernel.org/bpf/4f9f4c6b3dd3933770c617eb6689dbc0c6e25863.1610475660.git.lorenzo@kernel.org Signed-off-by: Alexei Starovoitov --- include/net/xdp.h | 3 +++ kernel/bpf/cpumap.c | 46 ++-------------------------------------------- net/core/xdp.c | 44 ++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 49 insertions(+), 44 deletions(-) (limited to 'kernel') diff --git a/include/net/xdp.h b/include/net/xdp.h index 0cf3976ce77c..689206dee6de 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -164,6 +164,9 @@ void xdp_warn(const char *msg, const char *func, const int line); #define XDP_WARN(msg) xdp_warn(msg, __func__, __LINE__) struct xdp_frame *xdp_convert_zc_to_xdp_frame(struct xdp_buff *xdp); +struct sk_buff *__xdp_build_skb_from_frame(struct xdp_frame *xdpf, + struct sk_buff *skb, + struct net_device *dev); static inline void xdp_convert_frame_to_buff(struct xdp_frame *frame, struct xdp_buff *xdp) diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index 747313698178..5d1469de6921 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -141,49 +141,6 @@ static void cpu_map_kthread_stop(struct work_struct *work) kthread_stop(rcpu->kthread); } -static struct sk_buff *cpu_map_build_skb(struct xdp_frame *xdpf, - struct sk_buff *skb) -{ - unsigned int hard_start_headroom; - unsigned int frame_size; - void *pkt_data_start; - - /* Part of headroom was reserved to xdpf */ - hard_start_headroom = sizeof(struct xdp_frame) + xdpf->headroom; - - /* Memory size backing xdp_frame data already have reserved - * room for build_skb to place skb_shared_info in tailroom. 
- */ - frame_size = xdpf->frame_sz; - - pkt_data_start = xdpf->data - hard_start_headroom; - skb = build_skb_around(skb, pkt_data_start, frame_size); - if (unlikely(!skb)) - return NULL; - - skb_reserve(skb, hard_start_headroom); - __skb_put(skb, xdpf->len); - if (xdpf->metasize) - skb_metadata_set(skb, xdpf->metasize); - - /* Essential SKB info: protocol and skb->dev */ - skb->protocol = eth_type_trans(skb, xdpf->dev_rx); - - /* Optional SKB info, currently missing: - * - HW checksum info (skb->ip_summed) - * - HW RX hash (skb_set_hash) - * - RX ring dev queue index (skb_record_rx_queue) - */ - - /* Until page_pool get SKB return path, release DMA here */ - xdp_release_frame(xdpf); - - /* Allow SKB to reuse area used by xdp_frame */ - xdp_scrub_frame(xdpf); - - return skb; -} - static void __cpu_map_ring_cleanup(struct ptr_ring *ring) { /* The tear-down procedure should have made sure that queue is @@ -350,7 +307,8 @@ static int cpu_map_kthread_run(void *data) struct sk_buff *skb = skbs[i]; int ret; - skb = cpu_map_build_skb(xdpf, skb); + skb = __xdp_build_skb_from_frame(xdpf, skb, + xdpf->dev_rx); if (!skb) { xdp_return_frame(xdpf); continue; diff --git a/net/core/xdp.c b/net/core/xdp.c index 3a8c9ab4ecbe..aeb09ed0704c 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -513,3 +513,47 @@ void xdp_warn(const char *msg, const char *func, const int line) WARN(1, "XDP_WARN: %s(line:%d): %s\n", func, line, msg); }; EXPORT_SYMBOL_GPL(xdp_warn); + +struct sk_buff *__xdp_build_skb_from_frame(struct xdp_frame *xdpf, + struct sk_buff *skb, + struct net_device *dev) +{ + unsigned int headroom, frame_size; + void *hard_start; + + /* Part of headroom was reserved to xdpf */ + headroom = sizeof(*xdpf) + xdpf->headroom; + + /* Memory size backing xdp_frame data already have reserved + * room for build_skb to place skb_shared_info in tailroom. + */ + frame_size = xdpf->frame_sz; + + hard_start = xdpf->data - headroom; + skb = build_skb_around(skb, hard_start, frame_size); + if (unlikely(!skb)) + return NULL; + + skb_reserve(skb, headroom); + __skb_put(skb, xdpf->len); + if (xdpf->metasize) + skb_metadata_set(skb, xdpf->metasize); + + /* Essential SKB info: protocol and skb->dev */ + skb->protocol = eth_type_trans(skb, dev); + + /* Optional SKB info, currently missing: + * - HW checksum info (skb->ip_summed) + * - HW RX hash (skb_set_hash) + * - RX ring dev queue index (skb_record_rx_queue) + */ + + /* Until page_pool get SKB return path, release DMA here */ + xdp_release_frame(xdpf); + + /* Allow SKB to reuse area used by xdp_frame */ + xdp_scrub_frame(xdpf); + + return skb; +} +EXPORT_SYMBOL_GPL(__xdp_build_skb_from_frame); -- cgit v1.2.3 From 13ca51d5eb358edcb673afccb48c3440b9fda21b Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 19 Jan 2021 07:35:18 -0800 Subject: bpf: Permit size-0 datasec llvm patch https://reviews.llvm.org/D84002 permitted to emit empty rodata datasec if the elf .rodata section contains read-only data from local variables. These local variables will be not emitted as BTF_KIND_VARs since llvm converted these local variables as static variables with private linkage without debuginfo types. Such an empty rodata datasec will make skeleton code generation easy since for skeleton a rodata struct will be generated if there is a .rodata elf section. The existence of a rodata btf datasec is also consistent with the existence of a rodata map created by libbpf. 
The btf with such an empty rodata datasec will fail in the kernel though as kernel will reject a datasec with zero vlen and zero size. For example, for the below code, int sys_enter(void *ctx) { int fmt[6] = {1, 2, 3, 4, 5, 6}; int dst[6]; bpf_probe_read(dst, sizeof(dst), fmt); return 0; } We got the below btf (bpftool btf dump ./test.o): [1] PTR '(anon)' type_id=0 [2] FUNC_PROTO '(anon)' ret_type_id=3 vlen=1 'ctx' type_id=1 [3] INT 'int' size=4 bits_offset=0 nr_bits=32 encoding=SIGNED [4] FUNC 'sys_enter' type_id=2 linkage=global [5] INT 'char' size=1 bits_offset=0 nr_bits=8 encoding=SIGNED [6] ARRAY '(anon)' type_id=5 index_type_id=7 nr_elems=4 [7] INT '__ARRAY_SIZE_TYPE__' size=4 bits_offset=0 nr_bits=32 encoding=(none) [8] VAR '_license' type_id=6, linkage=global-alloc [9] DATASEC '.rodata' size=0 vlen=0 [10] DATASEC 'license' size=0 vlen=1 type_id=8 offset=0 size=4 When loading the ./test.o to the kernel with bpftool, we see the following error: libbpf: Error loading BTF: Invalid argument(22) libbpf: magic: 0xeb9f ... [6] ARRAY (anon) type_id=5 index_type_id=7 nr_elems=4 [7] INT __ARRAY_SIZE_TYPE__ size=4 bits_offset=0 nr_bits=32 encoding=(none) [8] VAR _license type_id=6 linkage=1 [9] DATASEC .rodata size=24 vlen=0 vlen == 0 libbpf: Error loading .BTF into kernel: -22. BTF is optional, ignoring. Basically, libbpf changed .rodata datasec size to 24 since elf .rodata section size is 24. The kernel then rejected the BTF since vlen = 0. Note that the above kernel verifier failure can be worked around with changing local variable "fmt" to a static or global, optionally const, variable. This patch permits a datasec with vlen = 0 in kernel. Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20210119153519.3901963-1-yhs@fb.com --- kernel/bpf/btf.c | 5 ----- tools/testing/selftests/bpf/prog_tests/btf.c | 21 +++++++++++++++++++++ 2 files changed, 21 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 8962f988514f..756a93f534b6 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -3540,11 +3540,6 @@ static s32 btf_datasec_check_meta(struct btf_verifier_env *env, return -EINVAL; } - if (!btf_type_vlen(t)) { - btf_verifier_log_type(env, t, "vlen == 0"); - return -EINVAL; - } - if (!t->size) { btf_verifier_log_type(env, t, "size == 0"); return -EINVAL; diff --git a/tools/testing/selftests/bpf/prog_tests/btf.c b/tools/testing/selftests/bpf/prog_tests/btf.c index 8ae97e2a4b9d..055d2c0486ed 100644 --- a/tools/testing/selftests/bpf/prog_tests/btf.c +++ b/tools/testing/selftests/bpf/prog_tests/btf.c @@ -3509,6 +3509,27 @@ static struct btf_raw_test raw_tests[] = { .value_type_id = 3 /* arr_t */, .max_entries = 4, }, +/* + * elf .rodata section size 4 and btf .rodata section vlen 0. 
+ */ +{ + .descr = "datasec: vlen == 0", + .raw_types = { + /* int */ + BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), /* [1] */ + /* .rodata section */ + BTF_TYPE_ENC(NAME_NTH(1), BTF_INFO_ENC(BTF_KIND_DATASEC, 0, 0), 4), + /* [2] */ + BTF_END_RAW, + }, + BTF_STR_SEC("\0.rodata"), + .map_type = BPF_MAP_TYPE_ARRAY, + .key_size = sizeof(int), + .value_size = sizeof(int), + .key_type_id = 1, + .value_type_id = 1, + .max_entries = 1, +}, }; /* struct btf_raw_test raw_tests[] */ -- cgit v1.2.3 From 9cacf81f8161111db25f98e78a7a0e32ae142b3f Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Fri, 15 Jan 2021 08:34:59 -0800 Subject: bpf: Remove extra lock_sock for TCP_ZEROCOPY_RECEIVE Add custom implementation of getsockopt hook for TCP_ZEROCOPY_RECEIVE. We skip generic hooks for TCP_ZEROCOPY_RECEIVE and have a custom call in do_tcp_getsockopt using the on-stack data. This removes 3% overhead for locking/unlocking the socket. Without this patch: 3.38% 0.07% tcp_mmap [kernel.kallsyms] [k] __cgroup_bpf_run_filter_getsockopt | --3.30%--__cgroup_bpf_run_filter_getsockopt | --0.81%--__kmalloc With the patch applied: 0.52% 0.12% tcp_mmap [kernel.kallsyms] [k] __cgroup_bpf_run_filter_getsockopt_kern Note, exporting uapi/tcp.h requires removing netinet/tcp.h from test_progs.h because those headers have confliciting definitions. Signed-off-by: Stanislav Fomichev Signed-off-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20210115163501.805133-2-sdf@google.com --- include/linux/bpf-cgroup.h | 27 +- include/linux/indirect_call_wrapper.h | 6 + include/net/sock.h | 2 + include/net/tcp.h | 1 + kernel/bpf/cgroup.c | 46 +++ net/ipv4/tcp.c | 14 + net/ipv4/tcp_ipv4.c | 1 + net/ipv6/tcp_ipv6.c | 1 + net/socket.c | 3 + tools/include/uapi/linux/tcp.h | 357 +++++++++++++++++++++ .../testing/selftests/bpf/prog_tests/bpf_tcp_ca.c | 1 + .../selftests/bpf/prog_tests/cls_redirect.c | 1 + .../selftests/bpf/prog_tests/sockmap_basic.c | 1 + .../testing/selftests/bpf/prog_tests/sockopt_sk.c | 28 ++ tools/testing/selftests/bpf/progs/sockopt_sk.c | 23 +- tools/testing/selftests/bpf/test_progs.h | 1 - 16 files changed, 506 insertions(+), 7 deletions(-) create mode 100644 tools/include/uapi/linux/tcp.h (limited to 'kernel') diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index 72e69a0e1e8c..bcb2915e6124 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -147,6 +147,10 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level, int __user *optlen, int max_optlen, int retval); +int __cgroup_bpf_run_filter_getsockopt_kern(struct sock *sk, int level, + int optname, void *optval, + int *optlen, int retval); + static inline enum bpf_cgroup_storage_type cgroup_storage_type( struct bpf_map *map) { @@ -364,10 +368,23 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, ({ \ int __ret = retval; \ if (cgroup_bpf_enabled) \ - __ret = __cgroup_bpf_run_filter_getsockopt(sock, level, \ - optname, optval, \ - optlen, max_optlen, \ - retval); \ + if (!(sock)->sk_prot->bpf_bypass_getsockopt || \ + !INDIRECT_CALL_INET_1((sock)->sk_prot->bpf_bypass_getsockopt, \ + tcp_bpf_bypass_getsockopt, \ + level, optname)) \ + __ret = __cgroup_bpf_run_filter_getsockopt( \ + sock, level, optname, optval, optlen, \ + max_optlen, retval); \ + __ret; \ +}) + +#define BPF_CGROUP_RUN_PROG_GETSOCKOPT_KERN(sock, level, optname, optval, \ + optlen, retval) \ +({ \ + int __ret = retval; \ + if (cgroup_bpf_enabled) \ + __ret = 
__cgroup_bpf_run_filter_getsockopt_kern( \ + sock, level, optname, optval, optlen, retval); \ __ret; \ }) @@ -452,6 +469,8 @@ static inline int bpf_percpu_cgroup_storage_update(struct bpf_map *map, #define BPF_CGROUP_GETSOCKOPT_MAX_OPTLEN(optlen) ({ 0; }) #define BPF_CGROUP_RUN_PROG_GETSOCKOPT(sock, level, optname, optval, \ optlen, max_optlen, retval) ({ retval; }) +#define BPF_CGROUP_RUN_PROG_GETSOCKOPT_KERN(sock, level, optname, optval, \ + optlen, retval) ({ retval; }) #define BPF_CGROUP_RUN_PROG_SETSOCKOPT(sock, level, optname, optval, optlen, \ kernel_optval) ({ 0; }) diff --git a/include/linux/indirect_call_wrapper.h b/include/linux/indirect_call_wrapper.h index 54c02c84906a..cfcfef37b2f1 100644 --- a/include/linux/indirect_call_wrapper.h +++ b/include/linux/indirect_call_wrapper.h @@ -60,4 +60,10 @@ #define INDIRECT_CALL_INET(f, f2, f1, ...) f(__VA_ARGS__) #endif +#if IS_ENABLED(CONFIG_INET) +#define INDIRECT_CALL_INET_1(f, f1, ...) INDIRECT_CALL_1(f, f1, __VA_ARGS__) +#else +#define INDIRECT_CALL_INET_1(f, f1, ...) f(__VA_ARGS__) +#endif + #endif diff --git a/include/net/sock.h b/include/net/sock.h index 129d200bccb4..7644ea64a376 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1174,6 +1174,8 @@ struct proto { int (*backlog_rcv) (struct sock *sk, struct sk_buff *skb); + bool (*bpf_bypass_getsockopt)(int level, + int optname); void (*release_cb)(struct sock *sk); diff --git a/include/net/tcp.h b/include/net/tcp.h index 78d13c88720f..4bb42fb19711 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -403,6 +403,7 @@ __poll_t tcp_poll(struct file *file, struct socket *sock, struct poll_table_struct *wait); int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen); +bool tcp_bpf_bypass_getsockopt(int level, int optname); int tcp_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen); void tcp_set_keepalive(struct sock *sk, int val); diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 96555a8a2c54..416e7738981b 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -1486,6 +1486,52 @@ out: sockopt_free_buf(&ctx); return ret; } + +int __cgroup_bpf_run_filter_getsockopt_kern(struct sock *sk, int level, + int optname, void *optval, + int *optlen, int retval) +{ + struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); + struct bpf_sockopt_kern ctx = { + .sk = sk, + .level = level, + .optname = optname, + .retval = retval, + .optlen = *optlen, + .optval = optval, + .optval_end = optval + *optlen, + }; + int ret; + + /* Note that __cgroup_bpf_run_filter_getsockopt doesn't copy + * user data back into BPF buffer when reval != 0. This is + * done as an optimization to avoid extra copy, assuming + * kernel won't populate the data in case of an error. + * Here we always pass the data and memset() should + * be called if that data shouldn't be "exported". + */ + + ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[BPF_CGROUP_GETSOCKOPT], + &ctx, BPF_PROG_RUN); + if (!ret) + return -EPERM; + + if (ctx.optlen > *optlen) + return -EFAULT; + + /* BPF programs only allowed to set retval to 0, not some + * arbitrary value. + */ + if (ctx.retval != 0 && ctx.retval != retval) + return -EFAULT; + + /* BPF programs can shrink the buffer, export the modifications. 
+ */ + if (ctx.optlen != 0) + *optlen = ctx.optlen; + + return ctx.retval; +} #endif static ssize_t sysctl_cpy_dir(const struct ctl_dir *dir, char **bufp, diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 856ae516ac18..26aa923cf522 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -4099,6 +4099,8 @@ static int do_tcp_getsockopt(struct sock *sk, int level, return -EFAULT; lock_sock(sk); err = tcp_zerocopy_receive(sk, &zc); + err = BPF_CGROUP_RUN_PROG_GETSOCKOPT_KERN(sk, level, optname, + &zc, &len, err); release_sock(sk); if (len >= offsetofend(struct tcp_zerocopy_receive, err)) goto zerocopy_rcv_sk_err; @@ -4133,6 +4135,18 @@ zerocopy_rcv_out: return 0; } +bool tcp_bpf_bypass_getsockopt(int level, int optname) +{ + /* TCP do_tcp_getsockopt has optimized getsockopt implementation + * to avoid extra socket lock for TCP_ZEROCOPY_RECEIVE. + */ + if (level == SOL_TCP && optname == TCP_ZEROCOPY_RECEIVE) + return true; + + return false; +} +EXPORT_SYMBOL(tcp_bpf_bypass_getsockopt); + int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen) { diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 777306b5bc22..62b6fd385a47 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2793,6 +2793,7 @@ struct proto tcp_prot = { .shutdown = tcp_shutdown, .setsockopt = tcp_setsockopt, .getsockopt = tcp_getsockopt, + .bpf_bypass_getsockopt = tcp_bpf_bypass_getsockopt, .keepalive = tcp_set_keepalive, .recvmsg = tcp_recvmsg, .sendmsg = tcp_sendmsg, diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 0e1509b02cb3..8539715ff035 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -2121,6 +2121,7 @@ struct proto tcpv6_prot = { .shutdown = tcp_shutdown, .setsockopt = tcp_setsockopt, .getsockopt = tcp_getsockopt, + .bpf_bypass_getsockopt = tcp_bpf_bypass_getsockopt, .keepalive = tcp_set_keepalive, .recvmsg = tcp_recvmsg, .sendmsg = tcp_sendmsg, diff --git a/net/socket.c b/net/socket.c index 33e8b6c4e1d3..7f0617ab5437 100644 --- a/net/socket.c +++ b/net/socket.c @@ -2126,6 +2126,9 @@ SYSCALL_DEFINE5(setsockopt, int, fd, int, level, int, optname, return __sys_setsockopt(fd, level, optname, optval, optlen); } +INDIRECT_CALLABLE_DECLARE(bool tcp_bpf_bypass_getsockopt(int level, + int optname)); + /* * Get a socket option. Because we don't know the option lengths we have * to pass a user mode parameter for the protocols to sort out. diff --git a/tools/include/uapi/linux/tcp.h b/tools/include/uapi/linux/tcp.h new file mode 100644 index 000000000000..13ceeb395eb8 --- /dev/null +++ b/tools/include/uapi/linux/tcp.h @@ -0,0 +1,357 @@ +/* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * Definitions for the TCP protocol. + * + * Version: @(#)tcp.h 1.0.2 04/28/93 + * + * Author: Fred N. van Kempen, + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ +#ifndef _UAPI_LINUX_TCP_H +#define _UAPI_LINUX_TCP_H + +#include +#include +#include + +struct tcphdr { + __be16 source; + __be16 dest; + __be32 seq; + __be32 ack_seq; +#if defined(__LITTLE_ENDIAN_BITFIELD) + __u16 res1:4, + doff:4, + fin:1, + syn:1, + rst:1, + psh:1, + ack:1, + urg:1, + ece:1, + cwr:1; +#elif defined(__BIG_ENDIAN_BITFIELD) + __u16 doff:4, + res1:4, + cwr:1, + ece:1, + urg:1, + ack:1, + psh:1, + rst:1, + syn:1, + fin:1; +#else +#error "Adjust your defines" +#endif + __be16 window; + __sum16 check; + __be16 urg_ptr; +}; + +/* + * The union cast uses a gcc extension to avoid aliasing problems + * (union is compatible to any of its members) + * This means this part of the code is -fstrict-aliasing safe now. + */ +union tcp_word_hdr { + struct tcphdr hdr; + __be32 words[5]; +}; + +#define tcp_flag_word(tp) ( ((union tcp_word_hdr *)(tp))->words [3]) + +enum { + TCP_FLAG_CWR = __constant_cpu_to_be32(0x00800000), + TCP_FLAG_ECE = __constant_cpu_to_be32(0x00400000), + TCP_FLAG_URG = __constant_cpu_to_be32(0x00200000), + TCP_FLAG_ACK = __constant_cpu_to_be32(0x00100000), + TCP_FLAG_PSH = __constant_cpu_to_be32(0x00080000), + TCP_FLAG_RST = __constant_cpu_to_be32(0x00040000), + TCP_FLAG_SYN = __constant_cpu_to_be32(0x00020000), + TCP_FLAG_FIN = __constant_cpu_to_be32(0x00010000), + TCP_RESERVED_BITS = __constant_cpu_to_be32(0x0F000000), + TCP_DATA_OFFSET = __constant_cpu_to_be32(0xF0000000) +}; + +/* + * TCP general constants + */ +#define TCP_MSS_DEFAULT 536U /* IPv4 (RFC1122, RFC2581) */ +#define TCP_MSS_DESIRED 1220U /* IPv6 (tunneled), EDNS0 (RFC3226) */ + +/* TCP socket options */ +#define TCP_NODELAY 1 /* Turn off Nagle's algorithm. */ +#define TCP_MAXSEG 2 /* Limit MSS */ +#define TCP_CORK 3 /* Never send partially complete segments */ +#define TCP_KEEPIDLE 4 /* Start keeplives after this period */ +#define TCP_KEEPINTVL 5 /* Interval between keepalives */ +#define TCP_KEEPCNT 6 /* Number of keepalives before death */ +#define TCP_SYNCNT 7 /* Number of SYN retransmits */ +#define TCP_LINGER2 8 /* Life time of orphaned FIN-WAIT-2 state */ +#define TCP_DEFER_ACCEPT 9 /* Wake up listener only when data arrive */ +#define TCP_WINDOW_CLAMP 10 /* Bound advertised window */ +#define TCP_INFO 11 /* Information about this connection. */ +#define TCP_QUICKACK 12 /* Block/reenable quick acks */ +#define TCP_CONGESTION 13 /* Congestion control algorithm */ +#define TCP_MD5SIG 14 /* TCP MD5 Signature (RFC2385) */ +#define TCP_THIN_LINEAR_TIMEOUTS 16 /* Use linear timeouts for thin streams*/ +#define TCP_THIN_DUPACK 17 /* Fast retrans. 
after 1 dupack */ +#define TCP_USER_TIMEOUT 18 /* How long for loss retry before timeout */ +#define TCP_REPAIR 19 /* TCP sock is under repair right now */ +#define TCP_REPAIR_QUEUE 20 +#define TCP_QUEUE_SEQ 21 +#define TCP_REPAIR_OPTIONS 22 +#define TCP_FASTOPEN 23 /* Enable FastOpen on listeners */ +#define TCP_TIMESTAMP 24 +#define TCP_NOTSENT_LOWAT 25 /* limit number of unsent bytes in write queue */ +#define TCP_CC_INFO 26 /* Get Congestion Control (optional) info */ +#define TCP_SAVE_SYN 27 /* Record SYN headers for new connections */ +#define TCP_SAVED_SYN 28 /* Get SYN headers recorded for connection */ +#define TCP_REPAIR_WINDOW 29 /* Get/set window parameters */ +#define TCP_FASTOPEN_CONNECT 30 /* Attempt FastOpen with connect */ +#define TCP_ULP 31 /* Attach a ULP to a TCP connection */ +#define TCP_MD5SIG_EXT 32 /* TCP MD5 Signature with extensions */ +#define TCP_FASTOPEN_KEY 33 /* Set the key for Fast Open (cookie) */ +#define TCP_FASTOPEN_NO_COOKIE 34 /* Enable TFO without a TFO cookie */ +#define TCP_ZEROCOPY_RECEIVE 35 +#define TCP_INQ 36 /* Notify bytes available to read as a cmsg on read */ + +#define TCP_CM_INQ TCP_INQ + +#define TCP_TX_DELAY 37 /* delay outgoing packets by XX usec */ + + +#define TCP_REPAIR_ON 1 +#define TCP_REPAIR_OFF 0 +#define TCP_REPAIR_OFF_NO_WP -1 /* Turn off without window probes */ + +struct tcp_repair_opt { + __u32 opt_code; + __u32 opt_val; +}; + +struct tcp_repair_window { + __u32 snd_wl1; + __u32 snd_wnd; + __u32 max_window; + + __u32 rcv_wnd; + __u32 rcv_wup; +}; + +enum { + TCP_NO_QUEUE, + TCP_RECV_QUEUE, + TCP_SEND_QUEUE, + TCP_QUEUES_NR, +}; + +/* why fastopen failed from client perspective */ +enum tcp_fastopen_client_fail { + TFO_STATUS_UNSPEC, /* catch-all */ + TFO_COOKIE_UNAVAILABLE, /* if not in TFO_CLIENT_NO_COOKIE mode */ + TFO_DATA_NOT_ACKED, /* SYN-ACK did not ack SYN data */ + TFO_SYN_RETRANSMITTED, /* SYN-ACK did not ack SYN data after timeout */ +}; + +/* for TCP_INFO socket option */ +#define TCPI_OPT_TIMESTAMPS 1 +#define TCPI_OPT_SACK 2 +#define TCPI_OPT_WSCALE 4 +#define TCPI_OPT_ECN 8 /* ECN was negociated at TCP session init */ +#define TCPI_OPT_ECN_SEEN 16 /* we received at least one packet with ECT */ +#define TCPI_OPT_SYN_DATA 32 /* SYN-ACK acked data in SYN sent or rcvd */ + +/* + * Sender's congestion state indicating normal or abnormal situations + * in the last round of packets sent. The state is driven by the ACK + * information and timer events. + */ +enum tcp_ca_state { + /* + * Nothing bad has been observed recently. + * No apparent reordering, packet loss, or ECN marks. 
+ */ + TCP_CA_Open = 0, +#define TCPF_CA_Open (1< +#include #include #include "bpf_dctcp.skel.h" #include "bpf_cubic.skel.h" diff --git a/tools/testing/selftests/bpf/prog_tests/cls_redirect.c b/tools/testing/selftests/bpf/prog_tests/cls_redirect.c index 9781d85cb223..e075d03ab630 100644 --- a/tools/testing/selftests/bpf/prog_tests/cls_redirect.c +++ b/tools/testing/selftests/bpf/prog_tests/cls_redirect.c @@ -7,6 +7,7 @@ #include #include +#include #include diff --git a/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c b/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c index 85f73261fab0..b8b48cac2ac3 100644 --- a/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c +++ b/tools/testing/selftests/bpf/prog_tests/sockmap_basic.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 // Copyright (c) 2020 Cloudflare #include +#include #include "test_progs.h" #include "test_skmsg_load_helpers.skel.h" diff --git a/tools/testing/selftests/bpf/prog_tests/sockopt_sk.c b/tools/testing/selftests/bpf/prog_tests/sockopt_sk.c index b25c9c45c148..d5b44b135c00 100644 --- a/tools/testing/selftests/bpf/prog_tests/sockopt_sk.c +++ b/tools/testing/selftests/bpf/prog_tests/sockopt_sk.c @@ -2,6 +2,12 @@ #include #include "cgroup_helpers.h" +#include + +#ifndef SOL_TCP +#define SOL_TCP IPPROTO_TCP +#endif + #define SOL_CUSTOM 0xdeadbeef static int getsetsockopt(void) @@ -11,6 +17,7 @@ static int getsetsockopt(void) char u8[4]; __u32 u32; char cc[16]; /* TCP_CA_NAME_MAX */ + struct tcp_zerocopy_receive zc; } buf = {}; socklen_t optlen; char *big_buf = NULL; @@ -154,6 +161,27 @@ static int getsetsockopt(void) goto err; } + /* TCP_ZEROCOPY_RECEIVE triggers */ + memset(&buf, 0, sizeof(buf)); + optlen = sizeof(buf.zc); + err = getsockopt(fd, SOL_TCP, TCP_ZEROCOPY_RECEIVE, &buf, &optlen); + if (err) { + log_err("Unexpected getsockopt(TCP_ZEROCOPY_RECEIVE) err=%d errno=%d", + err, errno); + goto err; + } + + memset(&buf, 0, sizeof(buf)); + buf.zc.address = 12345; /* rejected by BPF */ + optlen = sizeof(buf.zc); + errno = 0; + err = getsockopt(fd, SOL_TCP, TCP_ZEROCOPY_RECEIVE, &buf, &optlen); + if (errno != EPERM) { + log_err("Unexpected getsockopt(TCP_ZEROCOPY_RECEIVE) err=%d errno=%d", + err, errno); + goto err; + } + free(big_buf); close(fd); return 0; diff --git a/tools/testing/selftests/bpf/progs/sockopt_sk.c b/tools/testing/selftests/bpf/progs/sockopt_sk.c index 712df7b49cb1..d3597f81e6e9 100644 --- a/tools/testing/selftests/bpf/progs/sockopt_sk.c +++ b/tools/testing/selftests/bpf/progs/sockopt_sk.c @@ -1,8 +1,8 @@ // SPDX-License-Identifier: GPL-2.0 #include -#include -#include +#include #include +#include #include char _license[] SEC("license") = "GPL"; @@ -12,6 +12,10 @@ __u32 _version SEC("version") = 1; #define PAGE_SIZE 4096 #endif +#ifndef SOL_TCP +#define SOL_TCP IPPROTO_TCP +#endif + #define SOL_CUSTOM 0xdeadbeef struct sockopt_sk { @@ -57,6 +61,21 @@ int _getsockopt(struct bpf_sockopt *ctx) return 1; } + if (ctx->level == SOL_TCP && ctx->optname == TCP_ZEROCOPY_RECEIVE) { + /* Verify that TCP_ZEROCOPY_RECEIVE triggers. + * It has a custom implementation for performance + * reasons. 
+ */ + + if (optval + sizeof(struct tcp_zerocopy_receive) > optval_end) + return 0; /* EPERM, bounds check */ + + if (((struct tcp_zerocopy_receive *)optval)->address != 0) + return 0; /* EPERM, unexpected data */ + + return 1; + } + if (ctx->level == SOL_IP && ctx->optname == IP_FREEBIND) { if (optval + 1 > optval_end) return 0; /* EPERM, bounds check */ diff --git a/tools/testing/selftests/bpf/test_progs.h b/tools/testing/selftests/bpf/test_progs.h index e49e2fdde942..f7c2fd89d01a 100644 --- a/tools/testing/selftests/bpf/test_progs.h +++ b/tools/testing/selftests/bpf/test_progs.h @@ -16,7 +16,6 @@ typedef __u16 __sum16; #include #include #include -#include #include #include #include -- cgit v1.2.3 From 20f2505fb436cfa674cf1f46aaa624f44d3d1d03 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Fri, 15 Jan 2021 08:35:00 -0800 Subject: bpf: Try to avoid kzalloc in cgroup/{s,g}etsockopt When we attach a bpf program to cgroup/getsockopt any other getsockopt() syscall starts incurring kzalloc/kfree cost. Let add a small buffer on the stack and use it for small (majority) {s,g}etsockopt values. The buffer is small enough to fit into the cache line and cover the majority of simple options (most of them are 4 byte ints). It seems natural to do the same for setsockopt, but it's a bit more involved when the BPF program modifies the data (where we have to kmalloc). The assumption is that for the majority of setsockopt calls (which are doing pure BPF options or apply policy) this will bring some benefit as well. Without this patch (we remove about 1% __kmalloc): 3.38% 0.07% tcp_mmap [kernel.kallsyms] [k] __cgroup_bpf_run_filter_getsockopt | --3.30%--__cgroup_bpf_run_filter_getsockopt | --0.81%--__kmalloc Signed-off-by: Stanislav Fomichev Signed-off-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20210115163501.805133-3-sdf@google.com --- include/linux/filter.h | 5 +++++ kernel/bpf/cgroup.c | 52 +++++++++++++++++++++++++++++++++++++++++++------- 2 files changed, 50 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/include/linux/filter.h b/include/linux/filter.h index 7fdce5407214..5b3137d7b690 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -1298,6 +1298,11 @@ struct bpf_sysctl_kern { u64 tmp_reg; }; +#define BPF_SOCKOPT_KERN_BUF_SIZE 32 +struct bpf_sockopt_buf { + u8 data[BPF_SOCKOPT_KERN_BUF_SIZE]; +}; + struct bpf_sockopt_kern { struct sock *sk; u8 *optval; diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 416e7738981b..ba8a1199d0ba 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -1298,7 +1298,8 @@ static bool __cgroup_bpf_prog_array_is_empty(struct cgroup *cgrp, return empty; } -static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen) +static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen, + struct bpf_sockopt_buf *buf) { if (unlikely(max_optlen < 0)) return -EINVAL; @@ -1310,6 +1311,15 @@ static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen) max_optlen = PAGE_SIZE; } + if (max_optlen <= sizeof(buf->data)) { + /* When the optval fits into BPF_SOCKOPT_KERN_BUF_SIZE + * bytes avoid the cost of kzalloc. 
+ */ + ctx->optval = buf->data; + ctx->optval_end = ctx->optval + max_optlen; + return max_optlen; + } + ctx->optval = kzalloc(max_optlen, GFP_USER); if (!ctx->optval) return -ENOMEM; @@ -1319,16 +1329,26 @@ static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen) return max_optlen; } -static void sockopt_free_buf(struct bpf_sockopt_kern *ctx) +static void sockopt_free_buf(struct bpf_sockopt_kern *ctx, + struct bpf_sockopt_buf *buf) { + if (ctx->optval == buf->data) + return; kfree(ctx->optval); } +static bool sockopt_buf_allocated(struct bpf_sockopt_kern *ctx, + struct bpf_sockopt_buf *buf) +{ + return ctx->optval != buf->data; +} + int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level, int *optname, char __user *optval, int *optlen, char **kernel_optval) { struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); + struct bpf_sockopt_buf buf = {}; struct bpf_sockopt_kern ctx = { .sk = sk, .level = *level, @@ -1350,7 +1370,7 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level, */ max_optlen = max_t(int, 16, *optlen); - max_optlen = sockopt_alloc_buf(&ctx, max_optlen); + max_optlen = sockopt_alloc_buf(&ctx, max_optlen, &buf); if (max_optlen < 0) return max_optlen; @@ -1390,14 +1410,31 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level, */ if (ctx.optlen != 0) { *optlen = ctx.optlen; - *kernel_optval = ctx.optval; + /* We've used bpf_sockopt_kern->buf as an intermediary + * storage, but the BPF program indicates that we need + * to pass this data to the kernel setsockopt handler. + * No way to export on-stack buf, have to allocate a + * new buffer. + */ + if (!sockopt_buf_allocated(&ctx, &buf)) { + void *p = kmalloc(ctx.optlen, GFP_USER); + + if (!p) { + ret = -ENOMEM; + goto out; + } + memcpy(p, ctx.optval, ctx.optlen); + *kernel_optval = p; + } else { + *kernel_optval = ctx.optval; + } /* export and don't free sockopt buf */ return 0; } } out: - sockopt_free_buf(&ctx); + sockopt_free_buf(&ctx, &buf); return ret; } @@ -1407,6 +1444,7 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level, int retval) { struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); + struct bpf_sockopt_buf buf = {}; struct bpf_sockopt_kern ctx = { .sk = sk, .level = level, @@ -1425,7 +1463,7 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level, ctx.optlen = max_optlen; - max_optlen = sockopt_alloc_buf(&ctx, max_optlen); + max_optlen = sockopt_alloc_buf(&ctx, max_optlen, &buf); if (max_optlen < 0) return max_optlen; @@ -1483,7 +1521,7 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level, ret = ctx.retval; out: - sockopt_free_buf(&ctx); + sockopt_free_buf(&ctx, &buf); return ret; } -- cgit v1.2.3 From a9ed15dae0755a0368735e0556a462d8519bdb05 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Fri, 15 Jan 2021 08:35:01 -0800 Subject: bpf: Split cgroup_bpf_enabled per attach type When we attach any cgroup hook, the rest (even if unused/unattached) start to contribute small overhead. In particular, the one we want to avoid is __cgroup_bpf_run_filter_skb which does two redirections to get to the cgroup and pushes/pulls skb. Let's split cgroup_bpf_enabled to be per-attach to make sure only used attach types trigger. I've dropped some existing high-level cgroup_bpf_enabled in some places because BPF_PROG_CGROUP_XXX_RUN macros usually have another cgroup_bpf_enabled check. 
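Stripped of the static-key machinery, the shape of the change is an array of per-type gates indexed by a compile-time-constant attach type; a minimal userspace analogue (illustrative names, plain booleans instead of jump-label static keys) is:

  #include <stdbool.h>
  #include <stdio.h>

  enum attach_type {
          ATTACH_INGRESS,
          ATTACH_EGRESS,
          ATTACH_GETSOCKOPT,
          MAX_ATTACH_TYPE,
  };

  /* One gate per attach type instead of one global gate: attaching an
   * ingress program no longer adds cost to the getsockopt path. */
  static bool hook_enabled[MAX_ATTACH_TYPE];

  #define RUN_HOOK(type, fn)                      \
          do {                                    \
                  if (hook_enabled[(type)])       \
                          fn();                   \
          } while (0)

  static void ingress_hook(void)    { puts("ingress hook ran"); }
  static void getsockopt_hook(void) { puts("getsockopt hook ran"); }

  int main(void)
  {
          hook_enabled[ATTACH_INGRESS] = true;   /* only ingress attached */

          RUN_HOOK(ATTACH_INGRESS, ingress_hook);       /* runs */
          RUN_HOOK(ATTACH_GETSOCKOPT, getsockopt_hook); /* skipped */
          return 0;
  }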
I also had to copy-paste BPF_CGROUP_RUN_SA_PROG_LOCK for GETPEERNAME/GETSOCKNAME because type for cgroup_bpf_enabled[type] has to be constant and known at compile time. Signed-off-by: Stanislav Fomichev Signed-off-by: Alexei Starovoitov Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20210115163501.805133-4-sdf@google.com --- include/linux/bpf-cgroup.h | 38 ++++++++++++++++++++------------------ kernel/bpf/cgroup.c | 14 ++++++-------- net/ipv4/af_inet.c | 9 +++++---- net/ipv4/udp.c | 7 +++---- net/ipv6/af_inet6.c | 9 +++++---- net/ipv6/udp.c | 7 +++---- 6 files changed, 42 insertions(+), 42 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index bcb2915e6124..0748fd87969e 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -23,8 +23,8 @@ struct ctl_table_header; #ifdef CONFIG_CGROUP_BPF -extern struct static_key_false cgroup_bpf_enabled_key; -#define cgroup_bpf_enabled static_branch_unlikely(&cgroup_bpf_enabled_key) +extern struct static_key_false cgroup_bpf_enabled_key[MAX_BPF_ATTACH_TYPE]; +#define cgroup_bpf_enabled(type) static_branch_unlikely(&cgroup_bpf_enabled_key[type]) DECLARE_PER_CPU(struct bpf_cgroup_storage*, bpf_cgroup_storage[MAX_BPF_CGROUP_STORAGE_TYPE]); @@ -189,7 +189,7 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, #define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk, skb) \ ({ \ int __ret = 0; \ - if (cgroup_bpf_enabled) \ + if (cgroup_bpf_enabled(BPF_CGROUP_INET_INGRESS)) \ __ret = __cgroup_bpf_run_filter_skb(sk, skb, \ BPF_CGROUP_INET_INGRESS); \ \ @@ -199,7 +199,7 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, #define BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb) \ ({ \ int __ret = 0; \ - if (cgroup_bpf_enabled && sk && sk == skb->sk) { \ + if (cgroup_bpf_enabled(BPF_CGROUP_INET_EGRESS) && sk && sk == skb->sk) { \ typeof(sk) __sk = sk_to_full_sk(sk); \ if (sk_fullsock(__sk)) \ __ret = __cgroup_bpf_run_filter_skb(__sk, skb, \ @@ -211,7 +211,7 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, #define BPF_CGROUP_RUN_SK_PROG(sk, type) \ ({ \ int __ret = 0; \ - if (cgroup_bpf_enabled) { \ + if (cgroup_bpf_enabled(type)) { \ __ret = __cgroup_bpf_run_filter_sk(sk, type); \ } \ __ret; \ @@ -232,7 +232,7 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, #define BPF_CGROUP_RUN_SA_PROG(sk, uaddr, type) \ ({ \ int __ret = 0; \ - if (cgroup_bpf_enabled) \ + if (cgroup_bpf_enabled(type)) \ __ret = __cgroup_bpf_run_filter_sock_addr(sk, uaddr, type, \ NULL); \ __ret; \ @@ -241,7 +241,7 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, #define BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, type, t_ctx) \ ({ \ int __ret = 0; \ - if (cgroup_bpf_enabled) { \ + if (cgroup_bpf_enabled(type)) { \ lock_sock(sk); \ __ret = __cgroup_bpf_run_filter_sock_addr(sk, uaddr, type, \ t_ctx); \ @@ -256,8 +256,10 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, #define BPF_CGROUP_RUN_PROG_INET6_BIND_LOCK(sk, uaddr) \ BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, BPF_CGROUP_INET6_BIND, NULL) -#define BPF_CGROUP_PRE_CONNECT_ENABLED(sk) (cgroup_bpf_enabled && \ - sk->sk_prot->pre_connect) +#define BPF_CGROUP_PRE_CONNECT_ENABLED(sk) \ + ((cgroup_bpf_enabled(BPF_CGROUP_INET4_CONNECT) || \ + cgroup_bpf_enabled(BPF_CGROUP_INET6_CONNECT)) && \ + (sk)->sk_prot->pre_connect) #define BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr) \ BPF_CGROUP_RUN_SA_PROG(sk, uaddr, BPF_CGROUP_INET4_CONNECT) @@ -301,7 +303,7 @@ int 
bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, #define BPF_CGROUP_RUN_PROG_SOCK_OPS_SK(sock_ops, sk) \ ({ \ int __ret = 0; \ - if (cgroup_bpf_enabled) \ + if (cgroup_bpf_enabled(BPF_CGROUP_SOCK_OPS)) \ __ret = __cgroup_bpf_run_filter_sock_ops(sk, \ sock_ops, \ BPF_CGROUP_SOCK_OPS); \ @@ -311,7 +313,7 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, #define BPF_CGROUP_RUN_PROG_SOCK_OPS(sock_ops) \ ({ \ int __ret = 0; \ - if (cgroup_bpf_enabled && (sock_ops)->sk) { \ + if (cgroup_bpf_enabled(BPF_CGROUP_SOCK_OPS) && (sock_ops)->sk) { \ typeof(sk) __sk = sk_to_full_sk((sock_ops)->sk); \ if (__sk && sk_fullsock(__sk)) \ __ret = __cgroup_bpf_run_filter_sock_ops(__sk, \ @@ -324,7 +326,7 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, #define BPF_CGROUP_RUN_PROG_DEVICE_CGROUP(type, major, minor, access) \ ({ \ int __ret = 0; \ - if (cgroup_bpf_enabled) \ + if (cgroup_bpf_enabled(BPF_CGROUP_DEVICE)) \ __ret = __cgroup_bpf_check_dev_permission(type, major, minor, \ access, \ BPF_CGROUP_DEVICE); \ @@ -336,7 +338,7 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, #define BPF_CGROUP_RUN_PROG_SYSCTL(head, table, write, buf, count, pos) \ ({ \ int __ret = 0; \ - if (cgroup_bpf_enabled) \ + if (cgroup_bpf_enabled(BPF_CGROUP_SYSCTL)) \ __ret = __cgroup_bpf_run_filter_sysctl(head, table, write, \ buf, count, pos, \ BPF_CGROUP_SYSCTL); \ @@ -347,7 +349,7 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, kernel_optval) \ ({ \ int __ret = 0; \ - if (cgroup_bpf_enabled) \ + if (cgroup_bpf_enabled(BPF_CGROUP_SETSOCKOPT)) \ __ret = __cgroup_bpf_run_filter_setsockopt(sock, level, \ optname, optval, \ optlen, \ @@ -358,7 +360,7 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, #define BPF_CGROUP_GETSOCKOPT_MAX_OPTLEN(optlen) \ ({ \ int __ret = 0; \ - if (cgroup_bpf_enabled) \ + if (cgroup_bpf_enabled(BPF_CGROUP_GETSOCKOPT)) \ get_user(__ret, optlen); \ __ret; \ }) @@ -367,7 +369,7 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, max_optlen, retval) \ ({ \ int __ret = retval; \ - if (cgroup_bpf_enabled) \ + if (cgroup_bpf_enabled(BPF_CGROUP_GETSOCKOPT)) \ if (!(sock)->sk_prot->bpf_bypass_getsockopt || \ !INDIRECT_CALL_INET_1((sock)->sk_prot->bpf_bypass_getsockopt, \ tcp_bpf_bypass_getsockopt, \ @@ -382,7 +384,7 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, optlen, retval) \ ({ \ int __ret = retval; \ - if (cgroup_bpf_enabled) \ + if (cgroup_bpf_enabled(BPF_CGROUP_GETSOCKOPT)) \ __ret = __cgroup_bpf_run_filter_getsockopt_kern( \ sock, level, optname, optval, optlen, retval); \ __ret; \ @@ -444,7 +446,7 @@ static inline int bpf_percpu_cgroup_storage_update(struct bpf_map *map, return 0; } -#define cgroup_bpf_enabled (0) +#define cgroup_bpf_enabled(type) (0) #define BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, type, t_ctx) ({ 0; }) #define BPF_CGROUP_PRE_CONNECT_ENABLED(sk) (0) #define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk,skb) ({ 0; }) diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index ba8a1199d0ba..da649f20d6b2 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -19,7 +19,7 @@ #include "../cgroup/cgroup-internal.h" -DEFINE_STATIC_KEY_FALSE(cgroup_bpf_enabled_key); +DEFINE_STATIC_KEY_ARRAY_FALSE(cgroup_bpf_enabled_key, MAX_BPF_ATTACH_TYPE); EXPORT_SYMBOL(cgroup_bpf_enabled_key); void cgroup_bpf_offline(struct cgroup *cgrp) @@ -128,7 +128,7 @@ static void cgroup_bpf_release(struct work_struct *work) if (pl->link) 
bpf_cgroup_link_auto_detach(pl->link); kfree(pl); - static_branch_dec(&cgroup_bpf_enabled_key); + static_branch_dec(&cgroup_bpf_enabled_key[type]); } old_array = rcu_dereference_protected( cgrp->bpf.effective[type], @@ -499,7 +499,7 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, if (old_prog) bpf_prog_put(old_prog); else - static_branch_inc(&cgroup_bpf_enabled_key); + static_branch_inc(&cgroup_bpf_enabled_key[type]); bpf_cgroup_storages_link(new_storage, cgrp, type); return 0; @@ -698,7 +698,7 @@ int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, cgrp->bpf.flags[type] = 0; if (old_prog) bpf_prog_put(old_prog); - static_branch_dec(&cgroup_bpf_enabled_key); + static_branch_dec(&cgroup_bpf_enabled_key[type]); return 0; cleanup: @@ -1360,8 +1360,7 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level, * attached to the hook so we don't waste time allocating * memory and locking the socket. */ - if (!cgroup_bpf_enabled || - __cgroup_bpf_prog_array_is_empty(cgrp, BPF_CGROUP_SETSOCKOPT)) + if (__cgroup_bpf_prog_array_is_empty(cgrp, BPF_CGROUP_SETSOCKOPT)) return 0; /* Allocate a bit more than the initial user buffer for @@ -1457,8 +1456,7 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level, * attached to the hook so we don't waste time allocating * memory and locking the socket. */ - if (!cgroup_bpf_enabled || - __cgroup_bpf_prog_array_is_empty(cgrp, BPF_CGROUP_GETSOCKOPT)) + if (__cgroup_bpf_prog_array_is_empty(cgrp, BPF_CGROUP_GETSOCKOPT)) return retval; ctx.optlen = max_optlen; diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index b94fa8eb831b..6ba2930ff49b 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -777,18 +777,19 @@ int inet_getname(struct socket *sock, struct sockaddr *uaddr, return -ENOTCONN; sin->sin_port = inet->inet_dport; sin->sin_addr.s_addr = inet->inet_daddr; + BPF_CGROUP_RUN_SA_PROG_LOCK(sk, (struct sockaddr *)sin, + BPF_CGROUP_INET4_GETPEERNAME, + NULL); } else { __be32 addr = inet->inet_rcv_saddr; if (!addr) addr = inet->inet_saddr; sin->sin_port = inet->inet_sport; sin->sin_addr.s_addr = addr; - } - if (cgroup_bpf_enabled) BPF_CGROUP_RUN_SA_PROG_LOCK(sk, (struct sockaddr *)sin, - peer ? 
BPF_CGROUP_INET4_GETPEERNAME : - BPF_CGROUP_INET4_GETSOCKNAME, + BPF_CGROUP_INET4_GETSOCKNAME, NULL); + } memset(sin->sin_zero, 0, sizeof(sin->sin_zero)); return sizeof(*sin); } diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 69ea76578abb..c67e483fce41 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1124,7 +1124,7 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) rcu_read_unlock(); } - if (cgroup_bpf_enabled && !connected) { + if (cgroup_bpf_enabled(BPF_CGROUP_UDP4_SENDMSG) && !connected) { err = BPF_CGROUP_RUN_PROG_UDP4_SENDMSG_LOCK(sk, (struct sockaddr *)usin, &ipc.addr); if (err) @@ -1858,9 +1858,8 @@ try_again: memset(sin->sin_zero, 0, sizeof(sin->sin_zero)); *addr_len = sizeof(*sin); - if (cgroup_bpf_enabled) - BPF_CGROUP_RUN_PROG_UDP4_RECVMSG_LOCK(sk, - (struct sockaddr *)sin); + BPF_CGROUP_RUN_PROG_UDP4_RECVMSG_LOCK(sk, + (struct sockaddr *)sin); } if (udp_sk(sk)->gro_enabled) diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 8e9c3e9ea36e..b9c654836b72 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -527,18 +527,19 @@ int inet6_getname(struct socket *sock, struct sockaddr *uaddr, sin->sin6_addr = sk->sk_v6_daddr; if (np->sndflow) sin->sin6_flowinfo = np->flow_label; + BPF_CGROUP_RUN_SA_PROG_LOCK(sk, (struct sockaddr *)sin, + BPF_CGROUP_INET6_GETPEERNAME, + NULL); } else { if (ipv6_addr_any(&sk->sk_v6_rcv_saddr)) sin->sin6_addr = np->saddr; else sin->sin6_addr = sk->sk_v6_rcv_saddr; sin->sin6_port = inet->inet_sport; - } - if (cgroup_bpf_enabled) BPF_CGROUP_RUN_SA_PROG_LOCK(sk, (struct sockaddr *)sin, - peer ? BPF_CGROUP_INET6_GETPEERNAME : - BPF_CGROUP_INET6_GETSOCKNAME, + BPF_CGROUP_INET6_GETSOCKNAME, NULL); + } sin->sin6_scope_id = ipv6_iface_scope_id(&sin->sin6_addr, sk->sk_bound_dev_if); return sizeof(*sin); diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index b9f3dfdd2383..a02ac875a923 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -409,9 +409,8 @@ try_again: } *addr_len = sizeof(*sin6); - if (cgroup_bpf_enabled) - BPF_CGROUP_RUN_PROG_UDP6_RECVMSG_LOCK(sk, - (struct sockaddr *)sin6); + BPF_CGROUP_RUN_PROG_UDP6_RECVMSG_LOCK(sk, + (struct sockaddr *)sin6); } if (udp_sk(sk)->gro_enabled) @@ -1462,7 +1461,7 @@ do_udp_sendmsg: fl6.saddr = np->saddr; fl6.fl6_sport = inet->inet_sport; - if (cgroup_bpf_enabled && !connected) { + if (cgroup_bpf_enabled(BPF_CGROUP_UDP6_SENDMSG) && !connected) { err = BPF_CGROUP_RUN_PROG_UDP6_SENDMSG_LOCK(sk, (struct sockaddr *)sin6, &fl6.saddr); if (err) -- cgit v1.2.3 From 997acaf6b4b59c6a9c259740312a69ea549cc684 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Mon, 11 Jan 2021 15:37:07 +0000 Subject: lockdep: report broken irq restoration We generally expect local_irq_save() and local_irq_restore() to be paired and sanely nested, and so local_irq_restore() expects to be called with irqs disabled. Thus, within local_irq_restore() we only trace irq flag changes when unmasking irqs. This means that a sequence such as: | local_irq_disable(); | local_irq_save(flags); | local_irq_enable(); | local_irq_restore(flags); ... is liable to break things, as the local_irq_restore() would mask irqs without tracing this change. Similar problems may exist for architectures whose arch_irq_restore() function depends on being called with irqs disabled. We don't consider such sequences to be a good idea, so let's define those as forbidden, and add tooling to detect such broken cases. This patch adds debug code to WARN() when raw_local_irq_restore() is called with irqs enabled. 
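For illustration only, the snippet below models the pairing rule in ordinary user-space C: a single boolean stands in for the CPU interrupt state, and the check in the restore helper plays the role that raw_check_bogus_irq_restore() plays in the patch. All names here (irqs_enabled, my_irq_save, my_irq_restore) are invented for the example and are not kernel interfaces.

#include <stdbool.h>
#include <stdio.h>

/* User-space model: one global flag instead of real CPU interrupt state. */
static bool irqs_enabled = true;

static unsigned long my_irq_save(void)
{
	unsigned long flags = irqs_enabled;	/* remember previous state */
	irqs_enabled = false;			/* "mask interrupts" */
	return flags;
}

static void my_irq_restore(unsigned long flags)
{
	/* Analogue of the new check: restore expects irqs to be disabled. */
	if (irqs_enabled)
		fprintf(stderr, "bogus restore: called with irqs enabled\n");
	irqs_enabled = flags;
}

int main(void)
{
	unsigned long flags;

	irqs_enabled = false;		/* local_irq_disable()      */
	flags = my_irq_save();		/* local_irq_save(flags)    */
	irqs_enabled = true;		/* local_irq_enable()       */
	my_irq_restore(flags);		/* masks irqs, warning fires */
	return 0;
}

Running the model executes exactly the forbidden sequence quoted above and prints the warning at the restore step.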
As raw_local_irq_restore() is expected to pair with raw_local_irq_save(), it should never be called with irqs enabled. To avoid the possibility of circular header dependencies between irqflags.h and bug.h, the warning is handled in a separate C file. The new code is all conditional on a new CONFIG_DEBUG_IRQFLAGS symbol which is independent of CONFIG_TRACE_IRQFLAGS. As noted above such cases will confuse lockdep, so CONFIG_DEBUG_LOCKDEP now selects CONFIG_DEBUG_IRQFLAGS. Signed-off-by: Mark Rutland Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20210111153707.10071-1-mark.rutland@arm.com --- include/linux/irqflags.h | 12 ++++++++++++ kernel/locking/Makefile | 1 + kernel/locking/irqflag-debug.c | 11 +++++++++++ lib/Kconfig.debug | 8 ++++++++ 4 files changed, 32 insertions(+) create mode 100644 kernel/locking/irqflag-debug.c (limited to 'kernel') diff --git a/include/linux/irqflags.h b/include/linux/irqflags.h index 8de0e1373de7..600c10da321a 100644 --- a/include/linux/irqflags.h +++ b/include/linux/irqflags.h @@ -149,6 +149,17 @@ do { \ # define start_critical_timings() do { } while (0) #endif +#ifdef CONFIG_DEBUG_IRQFLAGS +extern void warn_bogus_irq_restore(void); +#define raw_check_bogus_irq_restore() \ + do { \ + if (unlikely(!arch_irqs_disabled())) \ + warn_bogus_irq_restore(); \ + } while (0) +#else +#define raw_check_bogus_irq_restore() do { } while (0) +#endif + /* * Wrap the arch provided IRQ routines to provide appropriate checks. */ @@ -162,6 +173,7 @@ do { \ #define raw_local_irq_restore(flags) \ do { \ typecheck(unsigned long, flags); \ + raw_check_bogus_irq_restore(); \ arch_local_irq_restore(flags); \ } while (0) #define raw_local_save_flags(flags) \ diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile index 6d11cfb9b41f..8838f1d7c4a2 100644 --- a/kernel/locking/Makefile +++ b/kernel/locking/Makefile @@ -15,6 +15,7 @@ CFLAGS_REMOVE_mutex-debug.o = $(CC_FLAGS_FTRACE) CFLAGS_REMOVE_rtmutex-debug.o = $(CC_FLAGS_FTRACE) endif +obj-$(CONFIG_DEBUG_IRQFLAGS) += irqflag-debug.o obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o obj-$(CONFIG_LOCKDEP) += lockdep.o ifeq ($(CONFIG_PROC_FS),y) diff --git a/kernel/locking/irqflag-debug.c b/kernel/locking/irqflag-debug.c new file mode 100644 index 000000000000..9603d207a4ca --- /dev/null +++ b/kernel/locking/irqflag-debug.c @@ -0,0 +1,11 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include +#include +#include + +void warn_bogus_irq_restore(void) +{ + WARN_ONCE(1, "raw_local_irq_restore() called with IRQs enabled\n"); +} +EXPORT_SYMBOL(warn_bogus_irq_restore); diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index e6e58b26e888..78eadf615df8 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1343,6 +1343,7 @@ config LOCKDEP_SMALL config DEBUG_LOCKDEP bool "Lock dependency engine debugging" depends on DEBUG_KERNEL && LOCKDEP + select DEBUG_IRQFLAGS help If you say Y here, the lock dependency engine will do additional runtime checks to debug itself, at the price @@ -1431,6 +1432,13 @@ config TRACE_IRQFLAGS_NMI depends on TRACE_IRQFLAGS depends on TRACE_IRQFLAGS_NMI_SUPPORT +config DEBUG_IRQFLAGS + bool "Debug IRQ flag manipulation" + help + Enables checks for potentially unsafe enabling or disabling of + interrupts, such as calling raw_local_irq_restore() when interrupts + are enabled. 
+ config STACKTRACE bool "Stack backtrace support" depends on STACKTRACE_SUPPORT -- cgit v1.2.3 From 36c6e17bf16922935a5a0dd073d5b032d34aa73d Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Wed, 13 Jan 2021 18:31:41 +0000 Subject: sched/core: Print out straggler tasks in sched_cpu_dying() Since commit 1cf12e08bc4d ("sched/hotplug: Consolidate task migration on CPU unplug") tasks are expected to move themselves out of a out-going CPU. For most tasks this will be done automagically via BALANCE_PUSH, but percpu kthreads will have to cooperate and move themselves away one way or another. Currently, some percpu kthreads (workqueues being a notable exemple) do not cooperate nicely and can end up on an out-going CPU at the time sched_cpu_dying() is invoked. Print the dying rq's tasks to shed some light on the stragglers. Signed-off-by: Valentin Schneider Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Valentin Schneider Tested-by: Valentin Schneider Link: https://lkml.kernel.org/r/20210113183141.11974-1-valentin.schneider@arm.com --- kernel/sched/core.c | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 15d2562118d1..627534fa9196 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7574,6 +7574,25 @@ static void calc_load_migrate(struct rq *rq) atomic_long_add(delta, &calc_load_tasks); } +static void dump_rq_tasks(struct rq *rq, const char *loglvl) +{ + struct task_struct *g, *p; + int cpu = cpu_of(rq); + + lockdep_assert_held(&rq->lock); + + printk("%sCPU%d enqueued tasks (%u total):\n", loglvl, cpu, rq->nr_running); + for_each_process_thread(g, p) { + if (task_cpu(p) != cpu) + continue; + + if (!task_on_rq_queued(p)) + continue; + + printk("%s\tpid: %d, name: %s\n", loglvl, p->pid, p->comm); + } +} + int sched_cpu_dying(unsigned int cpu) { struct rq *rq = cpu_rq(cpu); @@ -7583,7 +7602,10 @@ int sched_cpu_dying(unsigned int cpu) sched_tick_stop(cpu); rq_lock_irqsave(rq, &rf); - BUG_ON(rq->nr_running != 1 || rq_has_pinned_tasks(rq)); + if (rq->nr_running != 1 || rq_has_pinned_tasks(rq)) { + WARN(true, "Dying CPU not properly vacated!"); + dump_rq_tasks(rq, KERN_WARNING); + } rq_unlock_irqrestore(rq, &rf); calc_load_migrate(rq); -- cgit v1.2.3 From 547a77d02f8cfb345631ce23b5b548d27afa0fc4 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Mon, 11 Jan 2021 23:26:33 +0800 Subject: workqueue: Use cpu_possible_mask instead of cpu_active_mask to break affinity The scheduler won't break affinity for us any more, and we should "emulate" the same behavior when the scheduler breaks affinity for us. The behavior is "changing the cpumask to cpu_possible_mask". And there might be some other CPUs online later while the worker is still running with the pending work items. The worker should be allowed to use the later online CPUs as before and process the work items ASAP. If we use cpu_active_mask here, we can't achieve this goal but using cpu_possible_mask can. Fixes: 06249738a41a ("workqueue: Manually break affinity on hotplug") Signed-off-by: Lai Jiangshan Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Valentin Schneider Acked-by: Tejun Heo Tested-by: Paul E. 
McKenney Tested-by: Valentin Schneider Link: https://lkml.kernel.org/r/20210111152638.2417-4-jiangshanlai@gmail.com --- kernel/workqueue.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 9880b6c0e272..1646331546eb 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -4920,7 +4920,7 @@ static void unbind_workers(int cpu) raw_spin_unlock_irq(&pool->lock); for_each_pool_worker(worker, pool) - WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, cpu_active_mask) < 0); + WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, cpu_possible_mask) < 0); mutex_unlock(&wq_pool_attach_mutex); -- cgit v1.2.3 From 22f667c97aadbf481e2cae2d6feabdf431e27b31 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 15 Jan 2021 18:17:45 +0100 Subject: sched: Don't run cpu-online with balance_push() enabled We don't need to push away tasks when we come online, mark the push complete right before the CPU dies. XXX hotplug state machine has trouble with rollback here. Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Valentin Schneider Tested-by: Valentin Schneider Link: https://lkml.kernel.org/r/20210121103506.415606087@infradead.org --- kernel/sched/core.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 627534fa9196..8da0fd7f3cca 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7320,10 +7320,12 @@ static void balance_push_set(int cpu, bool on) struct rq_flags rf; rq_lock_irqsave(rq, &rf); - if (on) + if (on) { + WARN_ON_ONCE(rq->balance_callback); rq->balance_callback = &balance_push_callback; - else + } else if (rq->balance_callback == &balance_push_callback) { rq->balance_callback = NULL; + } rq_unlock_irqrestore(rq, &rf); } @@ -7441,6 +7443,10 @@ int sched_cpu_activate(unsigned int cpu) struct rq *rq = cpu_rq(cpu); struct rq_flags rf; + /* + * Make sure that when the hotplug state machine does a roll-back + * we clear balance_push. Ideally that would happen earlier... + */ balance_push_set(cpu, false); #ifdef CONFIG_SCHED_SMT @@ -7608,6 +7614,12 @@ int sched_cpu_dying(unsigned int cpu) } rq_unlock_irqrestore(rq, &rf); + /* + * Now that the CPU is offline, make sure we're welcome + * to new tasks once we come back up. + */ + balance_push_set(cpu, false); + calc_load_migrate(rq); update_max_interval(); nohz_balance_exit_idle(rq); -- cgit v1.2.3 From ac687e6e8c26181a33270efd1a2e2241377924b0 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 12 Jan 2021 11:24:04 +0100 Subject: kthread: Extract KTHREAD_IS_PER_CPU There is a need to distinguish geniune per-cpu kthreads from kthreads that happen to have a single CPU affinity. Geniune per-cpu kthreads are kthreads that are CPU affine for correctness, these will obviously have PF_KTHREAD set, but must also have PF_NO_SETAFFINITY set, lest userspace modify their affinity and ruins things. However, these two things are not sufficient, PF_NO_SETAFFINITY is also set on other tasks that have their affinities controlled through other means, like for instance workqueues. Therefore another bit is needed; it turns out kthread_create_per_cpu() already has such a bit: KTHREAD_IS_PER_CPU, which is used to make kthread_park()/kthread_unpark() work correctly. Expose this flag and remove the implicit setting of it from kthread_create_on_cpu(); the io_uring usage of it seems dubious at best. 
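As a rough sketch of the flag logic (user-space C, not kernel code; struct fake_kthread and the FLAG_* constants are invented stand-ins for struct kthread, PF_NO_SETAFFINITY and KTHREAD_IS_PER_CPU), the per-CPU mark is simply a bit that a valid CPU number sets and a negative CPU clears:

#include <stdbool.h>
#include <stdio.h>

#define FLAG_NO_SETAFFINITY	(1u << 0)	/* stand-in for PF_NO_SETAFFINITY  */
#define FLAG_IS_PER_CPU		(1u << 1)	/* stand-in for KTHREAD_IS_PER_CPU */

struct fake_kthread {
	unsigned int flags;
	int cpu;
};

/* Mirrors the shape of kthread_set_per_cpu(): cpu < 0 clears the mark. */
static void set_per_cpu(struct fake_kthread *k, int cpu)
{
	if (!(k->flags & FLAG_NO_SETAFFINITY))
		fprintf(stderr, "warning: userspace may still change affinity\n");

	if (cpu < 0) {
		k->flags &= ~FLAG_IS_PER_CPU;
		return;
	}
	k->cpu = cpu;
	k->flags |= FLAG_IS_PER_CPU;
}

static bool is_per_cpu(const struct fake_kthread *k)
{
	return k->flags & FLAG_IS_PER_CPU;
}

int main(void)
{
	struct fake_kthread worker = { .flags = FLAG_NO_SETAFFINITY, .cpu = -1 };

	set_per_cpu(&worker, 3);
	printf("per-cpu: %d (cpu %d)\n", is_per_cpu(&worker), worker.cpu);
	set_per_cpu(&worker, -1);		/* unbind, as on CPU unplug */
	printf("per-cpu: %d\n", is_per_cpu(&worker));
	return 0;
}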
Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Valentin Schneider Tested-by: Valentin Schneider Link: https://lkml.kernel.org/r/20210121103506.557620262@infradead.org --- include/linux/kthread.h | 3 +++ kernel/kthread.c | 27 ++++++++++++++++++++++++++- kernel/smpboot.c | 1 + 3 files changed, 30 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/linux/kthread.h b/include/linux/kthread.h index 65b81e0c494d..2484ed97e72f 100644 --- a/include/linux/kthread.h +++ b/include/linux/kthread.h @@ -33,6 +33,9 @@ struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data), unsigned int cpu, const char *namefmt); +void kthread_set_per_cpu(struct task_struct *k, int cpu); +bool kthread_is_per_cpu(struct task_struct *k); + /** * kthread_run - create and wake a thread. * @threadfn: the function to run until signal_pending(current). diff --git a/kernel/kthread.c b/kernel/kthread.c index a5eceecd4513..e0e4a423f184 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -493,11 +493,36 @@ struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data), return p; kthread_bind(p, cpu); /* CPU hotplug need to bind once again when unparking the thread. */ - set_bit(KTHREAD_IS_PER_CPU, &to_kthread(p)->flags); to_kthread(p)->cpu = cpu; return p; } +void kthread_set_per_cpu(struct task_struct *k, int cpu) +{ + struct kthread *kthread = to_kthread(k); + if (!kthread) + return; + + WARN_ON_ONCE(!(k->flags & PF_NO_SETAFFINITY)); + + if (cpu < 0) { + clear_bit(KTHREAD_IS_PER_CPU, &kthread->flags); + return; + } + + kthread->cpu = cpu; + set_bit(KTHREAD_IS_PER_CPU, &kthread->flags); +} + +bool kthread_is_per_cpu(struct task_struct *k) +{ + struct kthread *kthread = to_kthread(k); + if (!kthread) + return false; + + return test_bit(KTHREAD_IS_PER_CPU, &kthread->flags); +} + /** * kthread_unpark - unpark a thread created by kthread_create(). * @k: thread created by kthread_create(). diff --git a/kernel/smpboot.c b/kernel/smpboot.c index 2efe1e206167..f25208e8df83 100644 --- a/kernel/smpboot.c +++ b/kernel/smpboot.c @@ -188,6 +188,7 @@ __smpboot_create_thread(struct smp_hotplug_thread *ht, unsigned int cpu) kfree(td); return PTR_ERR(tsk); } + kthread_set_per_cpu(tsk, cpu); /* * Park the thread so that it could start right on the CPU * when it is available. -- cgit v1.2.3 From 5c25b5ff89f004c30b04759dc34ace8585a4085f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 12 Jan 2021 11:26:49 +0100 Subject: workqueue: Tag bound workers with KTHREAD_IS_PER_CPU Mark the per-cpu workqueue workers as KTHREAD_IS_PER_CPU. Workqueues have unfortunate semantics in that per-cpu workers are not default flushed and parked during hotplug, however a subset does manual flush on hotplug and hard relies on them for correctness. Therefore play silly games.. 
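A condensed user-space sketch of the resulting unbind/rebind dance (all names invented; in the kernel this corresponds to unbind_workers()/rebind_workers() together with kthread_set_per_cpu() from the previous patch, and the -1 affinity stands in for cpu_possible_mask):

#include <stdio.h>

struct fake_worker {
	int per_cpu;		/* KTHREAD_IS_PER_CPU analogue                  */
	int bound_cpu;		/* -1 == may run anywhere ("cpu_possible_mask") */
};

/* CPU going down: drop the per-cpu mark and let the worker roam. */
static void unbind_worker(struct fake_worker *w)
{
	w->per_cpu = 0;
	w->bound_cpu = -1;
}

/* CPU coming back: re-mark and re-bind before it processes work items. */
static void rebind_worker(struct fake_worker *w, int cpu)
{
	w->per_cpu = 1;
	w->bound_cpu = cpu;
}

int main(void)
{
	struct fake_worker w = { .per_cpu = 1, .bound_cpu = 2 };

	unbind_worker(&w);	/* hot-unplug of CPU 2      */
	printf("unbound: per_cpu=%d cpu=%d\n", w.per_cpu, w.bound_cpu);
	rebind_worker(&w, 2);	/* CPU 2 comes back online  */
	printf("rebound: per_cpu=%d cpu=%d\n", w.per_cpu, w.bound_cpu);
	return 0;
}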
Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Valentin Schneider Tested-by: Valentin Schneider Link: https://lkml.kernel.org/r/20210121103506.693465814@infradead.org --- kernel/workqueue.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 1646331546eb..cce34336fbd7 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1861,6 +1861,8 @@ static void worker_attach_to_pool(struct worker *worker, */ if (pool->flags & POOL_DISASSOCIATED) worker->flags |= WORKER_UNBOUND; + else + kthread_set_per_cpu(worker->task, pool->cpu); list_add_tail(&worker->node, &pool->workers); worker->pool = pool; @@ -1883,6 +1885,7 @@ static void worker_detach_from_pool(struct worker *worker) mutex_lock(&wq_pool_attach_mutex); + kthread_set_per_cpu(worker->task, -1); list_del(&worker->node); worker->pool = NULL; @@ -4919,8 +4922,10 @@ static void unbind_workers(int cpu) raw_spin_unlock_irq(&pool->lock); - for_each_pool_worker(worker, pool) + for_each_pool_worker(worker, pool) { + kthread_set_per_cpu(worker->task, -1); WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, cpu_possible_mask) < 0); + } mutex_unlock(&wq_pool_attach_mutex); @@ -4972,9 +4977,11 @@ static void rebind_workers(struct worker_pool *pool) * of all workers first and then clear UNBOUND. As we're called * from CPU_ONLINE, the following shouldn't fail. */ - for_each_pool_worker(worker, pool) + for_each_pool_worker(worker, pool) { + kthread_set_per_cpu(worker->task, pool->cpu); WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, pool->attrs->cpumask) < 0); + } raw_spin_lock_irq(&pool->lock); -- cgit v1.2.3 From 640f17c82460e9724fd256f0a1f5d99e7ff0bda4 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 15 Jan 2021 19:08:36 +0100 Subject: workqueue: Restrict affinity change to rescuer create_worker() will already set the right affinity using kthread_bind_mask(), this means only the rescuer will need to change it's affinity. Howveer, while in cpu-hot-unplug a regular task is not allowed to run on online&&!active as it would be pushed away quite agressively. We need KTHREAD_IS_PER_CPU to survive in that environment. Therefore set the affinity after getting that magic flag. Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Valentin Schneider Tested-by: Valentin Schneider Link: https://lkml.kernel.org/r/20210121103506.826629830@infradead.org --- kernel/workqueue.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index cce34336fbd7..894bb885b40b 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1848,12 +1848,6 @@ static void worker_attach_to_pool(struct worker *worker, { mutex_lock(&wq_pool_attach_mutex); - /* - * set_cpus_allowed_ptr() will fail if the cpumask doesn't have any - * online CPUs. It'll be re-applied when any of the CPUs come up. - */ - set_cpus_allowed_ptr(worker->task, pool->attrs->cpumask); - /* * The wq_pool_attach_mutex ensures %POOL_DISASSOCIATED remains * stable across this function. 
See the comments above the flag @@ -1864,6 +1858,9 @@ static void worker_attach_to_pool(struct worker *worker, else kthread_set_per_cpu(worker->task, pool->cpu); + if (worker->rescue_wq) + set_cpus_allowed_ptr(worker->task, pool->attrs->cpumask); + list_add_tail(&worker->node, &pool->workers); worker->pool = pool; -- cgit v1.2.3 From 975707f227b07a8212060f94447171d15d7a681b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 20 Jan 2021 15:05:41 +0100 Subject: sched: Prepare to use balance_push in ttwu() In preparation of using the balance_push state in ttwu() we need it to provide a reliable and consistent state. The immediate problem is that rq->balance_callback gets cleared every schedule() and then re-set in the balance_push_callback() itself. This is not a reliable signal, so add a variable that stays set during the entire time. Also move setting it before the synchronize_rcu() in sched_cpu_deactivate(), such that we get guaranteed visibility to ttwu(), which is a preempt-disable region. Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Valentin Schneider Tested-by: Valentin Schneider Link: https://lkml.kernel.org/r/20210121103506.966069627@infradead.org --- kernel/sched/core.c | 11 ++++++----- kernel/sched/sched.h | 1 + 2 files changed, 7 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 8da0fd7f3cca..16946b55e8e7 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7320,6 +7320,7 @@ static void balance_push_set(int cpu, bool on) struct rq_flags rf; rq_lock_irqsave(rq, &rf); + rq->balance_push = on; if (on) { WARN_ON_ONCE(rq->balance_callback); rq->balance_callback = &balance_push_callback; @@ -7489,17 +7490,17 @@ int sched_cpu_deactivate(unsigned int cpu) int ret; set_cpu_active(cpu, false); + balance_push_set(cpu, true); + /* - * We've cleared cpu_active_mask, wait for all preempt-disabled and RCU - * users of this state to go away such that all new such users will - * observe it. + * We've cleared cpu_active_mask / set balance_push, wait for all + * preempt-disabled and RCU users of this state to go away such that + * all new such users will observe it. * * Do sync before park smpboot threads to take care the rcu boost case. */ synchronize_rcu(); - balance_push_set(cpu, true); - rq_lock_irqsave(rq, &rf); if (rq->rd) { update_rq_clock(rq); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 12ada79d40f3..bb09988451a0 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -975,6 +975,7 @@ struct rq { unsigned long cpu_capacity_orig; struct callback_head *balance_callback; + unsigned char balance_push; unsigned char nohz_idle_balance; unsigned char idle_balance; -- cgit v1.2.3 From 5ba2ffba13a1e24e7b153683e97300f9cc6f605a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 12 Jan 2021 11:28:16 +0100 Subject: sched: Fix CPU hotplug / tighten is_per_cpu_kthread() Prior to commit 1cf12e08bc4d ("sched/hotplug: Consolidate task migration on CPU unplug") we'd leave any task on the dying CPU and break affinity and force them off at the very end. This scheme had to change in order to enable migrate_disable(). One cannot wait for migrate_disable() to complete while stuck in stop_machine(). Furthermore, since we need at the very least: idle, hotplug and stop threads at any point before stop_machine, we can't break affinity and/or push those away. 
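The decision table this rework encodes can be sketched as a small stand-alone function; this is a simplification of is_cpu_allowed() as changed below, with invented struct and field names standing in for the task flags and CPU hotplug state:

#include <stdbool.h>
#include <stdio.h>

struct fake_task {
	bool in_cpumask;	/* cpu set in p->cpus_ptr        */
	bool migration_disabled;
	bool kthread;		/* PF_KTHREAD                    */
	bool per_cpu_kthread;	/* KTHREAD_IS_PER_CPU            */
};

struct fake_cpu {
	bool online;
	bool active;
	bool balance_push;	/* set while the CPU is being emptied */
};

/* Simplified analogue of is_cpu_allowed() after this patch. */
static bool cpu_allowed(const struct fake_task *p, const struct fake_cpu *cpu)
{
	if (!p->in_cpumask)
		return false;
	if (p->migration_disabled)
		return cpu->online;	/* must be allowed to finish          */
	if (!p->kthread)
		return cpu->active;	/* user tasks need an active CPU      */
	if (p->per_cpu_kthread)
		return cpu->online;	/* genuine per-cpu kthreads may stay  */
	if (cpu->balance_push)
		return false;		/* other kthreads get pushed away     */
	return cpu->online;
}

int main(void)
{
	struct fake_task kworker = { true, false, true, false };
	struct fake_cpu dying = { .online = true, .active = false, .balance_push = true };

	printf("plain kthread on dying CPU allowed? %d\n",
	       cpu_allowed(&kworker, &dying));
	return 0;
}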
Under the assumption that all per-cpu kthreads are sanely handled by CPU hotplug, the new code no long breaks affinity or migrates any of them (which then includes the critical ones above). However, there's an important difference between per-cpu kthreads and kthreads that happen to have a single CPU affinity which is lost. The latter class very much relies on the forced affinity breaking and migration semantics previously provided. Use the new kthread_is_per_cpu() infrastructure to tighten is_per_cpu_kthread() and fix the hot-unplug problems stemming from the change. Fixes: 1cf12e08bc4d ("sched/hotplug: Consolidate task migration on CPU unplug") Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Valentin Schneider Tested-by: Valentin Schneider Link: https://lkml.kernel.org/r/20210121103507.102416009@infradead.org --- kernel/sched/core.c | 39 +++++++++++++++++++++++++++++++++++---- 1 file changed, 35 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 16946b55e8e7..56b09628692a 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1796,13 +1796,28 @@ static inline bool rq_has_pinned_tasks(struct rq *rq) */ static inline bool is_cpu_allowed(struct task_struct *p, int cpu) { + /* When not in the task's cpumask, no point in looking further. */ if (!cpumask_test_cpu(cpu, p->cpus_ptr)) return false; - if (is_per_cpu_kthread(p) || is_migration_disabled(p)) + /* migrate_disabled() must be allowed to finish. */ + if (is_migration_disabled(p)) return cpu_online(cpu); - return cpu_active(cpu); + /* Non kernel threads are not allowed during either online or offline. */ + if (!(p->flags & PF_KTHREAD)) + return cpu_active(cpu); + + /* KTHREAD_IS_PER_CPU is always allowed. */ + if (kthread_is_per_cpu(p)) + return cpu_online(cpu); + + /* Regular kernel threads don't get to stay during offline. */ + if (cpu_rq(cpu)->balance_push) + return false; + + /* But are allowed during online. */ + return cpu_online(cpu); } /* @@ -3121,6 +3136,13 @@ bool cpus_share_cache(int this_cpu, int that_cpu) static inline bool ttwu_queue_cond(int cpu, int wake_flags) { + /* + * Do not complicate things with the async wake_list while the CPU is + * in hotplug state. + */ + if (!cpu_active(cpu)) + return false; + /* * If the CPU does not share cache, then queue the task on the * remote rqs wakelist to avoid accessing remote data. @@ -7276,8 +7298,14 @@ static void balance_push(struct rq *rq) /* * Both the cpu-hotplug and stop task are in this case and are * required to complete the hotplug process. + * + * XXX: the idle task does not match kthread_is_per_cpu() due to + * histerical raisins. */ - if (is_per_cpu_kthread(push_task) || is_migration_disabled(push_task)) { + if (rq->idle == push_task || + ((push_task->flags & PF_KTHREAD) && kthread_is_per_cpu(push_task)) || + is_migration_disabled(push_task)) { + /* * If this is the idle task on the outgoing CPU try to wake * up the hotplug control thread which might wait for the @@ -7309,7 +7337,7 @@ static void balance_push(struct rq *rq) /* * At this point need_resched() is true and we'll take the loop in * schedule(). The next pick is obviously going to be the stop task - * which is_per_cpu_kthread() and will push this task away. + * which kthread_is_per_cpu() and will push this task away. */ raw_spin_lock(&rq->lock); } @@ -7497,6 +7525,9 @@ int sched_cpu_deactivate(unsigned int cpu) * preempt-disabled and RCU users of this state to go away such that * all new such users will observe it. 
* + * Specifically, we rely on ttwu to no longer target this CPU, see + * ttwu_queue_cond() and is_cpu_allowed(). + * * Do sync before park smpboot threads to take care the rcu boost case. */ synchronize_rcu(); -- cgit v1.2.3 From 741ba80f6f9a4702089c122129f22df9774b3e64 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 16 Jan 2021 11:56:37 +0100 Subject: sched: Relax the set_cpus_allowed_ptr() semantics Now that we have KTHREAD_IS_PER_CPU to denote the critical per-cpu tasks to retain during CPU offline, we can relax the warning in set_cpus_allowed_ptr(). Any spurious kthread that wants to get on at the last minute will get pushed off before it can run. While during CPU online there is no harm, and actual benefit, to allowing kthreads back on early, it simplifies hotplug code and fixes a number of outstanding races. Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Lai jiangshan Reviewed-by: Valentin Schneider Tested-by: Valentin Schneider Link: https://lkml.kernel.org/r/20210121103507.240724591@infradead.org --- kernel/sched/core.c | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 56b09628692a..ff74fca39ed2 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2342,7 +2342,9 @@ static int __set_cpus_allowed_ptr(struct task_struct *p, if (p->flags & PF_KTHREAD || is_migration_disabled(p)) { /* - * Kernel threads are allowed on online && !active CPUs. + * Kernel threads are allowed on online && !active CPUs, + * however, during cpu-hot-unplug, even these might get pushed + * away if not KTHREAD_IS_PER_CPU. * * Specifically, migration_disabled() tasks must not fail the * cpumask_any_and_distribute() pick below, esp. so on @@ -2386,16 +2388,6 @@ static int __set_cpus_allowed_ptr(struct task_struct *p, __do_set_cpus_allowed(p, new_mask, flags); - if (p->flags & PF_KTHREAD) { - /* - * For kernel threads that do indeed end up on online && - * !active we want to ensure they are strict per-CPU threads. - */ - WARN_ON(cpumask_intersects(new_mask, cpu_online_mask) && - !cpumask_intersects(new_mask, cpu_active_mask) && - p->nr_cpus_allowed != 1); - } - return affine_move_task(rq, p, &rf, dest_cpu, flags); out: @@ -7518,6 +7510,13 @@ int sched_cpu_deactivate(unsigned int cpu) int ret; set_cpu_active(cpu, false); + + /* + * From this point forward, this CPU will refuse to run any task that + * is not: migrate_disable() or KTHREAD_IS_PER_CPU, and will actively + * push those tasks away until this gets cleared, see + * sched_cpu_dying(). + */ balance_push_set(cpu, true); /* -- cgit v1.2.3 From bb8b81e396f7afbe7c50d789e2107512274d2a35 Mon Sep 17 00:00:00 2001 From: Loris Reiff Date: Fri, 22 Jan 2021 17:42:31 +0100 Subject: bpf, cgroup: Fix optlen WARN_ON_ONCE toctou A toctou issue in `__cgroup_bpf_run_filter_getsockopt` can trigger a WARN_ON_ONCE in a check of `copy_from_user`. `*optlen` is checked to be non-negative in the individual getsockopt functions beforehand. Changing `*optlen` in a race to a negative value will result in a `copy_from_user(ctx.optval, optval, ctx.optlen)` with `ctx.optlen` being a negative integer. 
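The bug is a classic check-then-use race on a user-controlled length. Below is a minimal user-space model of the corrected shape; getsockopt_hook() and its parameters are invented for the example, and the volatile pointer stands in for the user page that another thread may rewrite between the caller's earlier check and the hook's own read:

#include <stdio.h>
#include <string.h>

#define MY_EFAULT 14

/* The caller validated *user_len once already; a racing thread may have
 * changed it since, so the value actually used for the copy must be
 * re-validated here. */
static int getsockopt_hook(volatile const int *user_len, char *dst,
			   const char *src, int max_len)
{
	int len = *user_len;		/* second read: TOCTOU window   */

	if (len < 0 || len > max_len)	/* the added bounds check       */
		return -MY_EFAULT;

	memcpy(dst, src, (size_t)len);
	return 0;
}

int main(void)
{
	char dst[64], src[64] = "option";
	int user_len = -1;		/* value after the racing write */

	printf("ret = %d\n", getsockopt_hook(&user_len, dst, src, sizeof(dst)));
	return 0;
}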
Fixes: 0d01da6afc54 ("bpf: implement getsockopt and setsockopt hooks") Signed-off-by: Loris Reiff Signed-off-by: Daniel Borkmann Reviewed-by: Stanislav Fomichev Link: https://lore.kernel.org/bpf/20210122164232.61770-1-loris.reiff@liblor.ch --- kernel/bpf/cgroup.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 96555a8a2c54..6ec8f02f463b 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -1442,6 +1442,11 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level, goto out; } + if (ctx.optlen < 0) { + ret = -EFAULT; + goto out; + } + if (copy_from_user(ctx.optval, optval, min(ctx.optlen, max_optlen)) != 0) { ret = -EFAULT; -- cgit v1.2.3 From f4a2da755a7e1f5d845c52aee71336cee289935a Mon Sep 17 00:00:00 2001 From: Loris Reiff Date: Fri, 22 Jan 2021 17:42:32 +0100 Subject: bpf, cgroup: Fix problematic bounds check Since ctx.optlen is signed, a larger value than max_value could be passed, as it is later on used as unsigned, which causes a WARN_ON_ONCE in the copy_to_user. Fixes: 0d01da6afc54 ("bpf: implement getsockopt and setsockopt hooks") Signed-off-by: Loris Reiff Signed-off-by: Daniel Borkmann Reviewed-by: Stanislav Fomichev Link: https://lore.kernel.org/bpf/20210122164232.61770-2-loris.reiff@liblor.ch --- kernel/bpf/cgroup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 6ec8f02f463b..6aa9e10c6335 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -1464,7 +1464,7 @@ int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level, goto out; } - if (ctx.optlen > max_optlen) { + if (ctx.optlen > max_optlen || ctx.optlen < 0) { ret = -EFAULT; goto out; } -- cgit v1.2.3 From b9557caaf872271671bdc1ef003d72f421eb72f6 Mon Sep 17 00:00:00 2001 From: Pan Bian Date: Wed, 20 Jan 2021 18:08:56 -0800 Subject: bpf, inode_storage: Put file handler if no storage was found Put file f if inode_storage_ptr() returns NULL. 
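The bug class is the familiar acquire-then-bail-without-release error path. A small stand-alone model of the corrected shape follows, with get_ref()/put_ref() as invented stand-ins for fget_raw()/fput() and has_storage for the inode_storage_ptr() check:

#include <stdbool.h>
#include <stdio.h>

struct fake_file {
	int refcount;
	bool has_storage;	/* stands in for inode_storage_ptr() != NULL */
};

static struct fake_file *get_ref(struct fake_file *f)
{
	if (f)
		f->refcount++;
	return f;
}

static void put_ref(struct fake_file *f)
{
	f->refcount--;
}

static int update_elem(struct fake_file *f)
{
	struct fake_file *ref = get_ref(f);

	if (!ref)
		return -9;		/* -EBADF: lookup failed, nothing held  */

	if (!ref->has_storage) {
		put_ref(ref);		/* the fix: drop the reference we took  */
		return -9;		/* -EBADF                               */
	}

	/* ... do the update ... */
	put_ref(ref);
	return 0;
}

int main(void)
{
	struct fake_file f = { .refcount = 1, .has_storage = false };

	update_elem(&f);
	printf("refcount after failed update: %d (expect 1)\n", f.refcount);
	return 0;
}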
Fixes: 8ea636848aca ("bpf: Implement bpf_local_storage for inodes") Signed-off-by: Pan Bian Signed-off-by: Daniel Borkmann Acked-by: KP Singh Link: https://lore.kernel.org/bpf/20210121020856.25507-1-bianpan2016@163.com --- kernel/bpf/bpf_inode_storage.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/bpf_inode_storage.c b/kernel/bpf/bpf_inode_storage.c index 2f0597320b6d..6639640523c0 100644 --- a/kernel/bpf/bpf_inode_storage.c +++ b/kernel/bpf/bpf_inode_storage.c @@ -125,8 +125,12 @@ static int bpf_fd_inode_storage_update_elem(struct bpf_map *map, void *key, fd = *(int *)key; f = fget_raw(fd); - if (!f || !inode_storage_ptr(f->f_inode)) + if (!f) + return -EBADF; + if (!inode_storage_ptr(f->f_inode)) { + fput(f); return -EBADF; + } sdata = bpf_local_storage_update(f->f_inode, (struct bpf_local_storage_map *)map, -- cgit v1.2.3 From 18b24d78d537c6ed2ff409637d714fc15053409b Mon Sep 17 00:00:00 2001 From: Tobias Klauser Date: Thu, 21 Jan 2021 18:43:24 +0100 Subject: bpf: Fix typo in scalar{,32}_min_max_rsh comments s/bounts/bounds/ Signed-off-by: Tobias Klauser Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20210121174324.24127-1-tklauser@distanz.ch --- kernel/bpf/verifier.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 785d25392ead..d0eae51b31e4 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -6266,7 +6266,7 @@ static void scalar32_min_max_rsh(struct bpf_reg_state *dst_reg, * 3) the signed bounds cross zero, so they tell us nothing * about the result * If the value in dst_reg is known nonnegative, then again the - * unsigned bounts capture the signed bounds. + * unsigned bounds capture the signed bounds. * Thus, in all cases it suffices to blow away our signed bounds * and rely on inferring new ones from the unsigned bounds and * var_off of the result. @@ -6297,7 +6297,7 @@ static void scalar_min_max_rsh(struct bpf_reg_state *dst_reg, * 3) the signed bounds cross zero, so they tell us nothing * about the result * If the value in dst_reg is known nonnegative, then again the - * unsigned bounts capture the signed bounds. + * unsigned bounds capture the signed bounds. * Thus, in all cases it suffices to blow away our signed bounds * and rely on inferring new ones from the unsigned bounds and * var_off of the result. -- cgit v1.2.3 From b4b7914a6a73fc169fd1ce2fcd78a1d83d9528a9 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 8 Dec 2020 13:45:49 -0800 Subject: rcu: Make call_rcu() print mem_dump_obj() info for double-freed callback The debug-object double-free checks in __call_rcu() print out the RCU callback function, which is usually sufficient to track down the double free. However, all uses of things like queue_rcu_work() will have the same RCU callback function (rcu_work_rcufn() in this case), so a diagnostic message for a double queue_rcu_work() needs more than just the callback function. This commit therefore calls mem_dump_obj() to dump out any additional available information on the double-freed callback. Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Andrew Morton Cc: Reported-by: Andrii Nakryiko Tested-by: Naresh Kamboju Signed-off-by: Paul E. 
McKenney --- kernel/rcu/tree.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 40e5e3dd253e..84513c52d9b0 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2941,6 +2941,7 @@ static void check_cb_ovld(struct rcu_data *rdp) static void __call_rcu(struct rcu_head *head, rcu_callback_t func) { + static atomic_t doublefrees; unsigned long flags; struct rcu_data *rdp; bool was_alldone; @@ -2954,8 +2955,10 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func) * Use rcu:rcu_callback trace event to find the previous * time callback was passed to __call_rcu(). */ - WARN_ONCE(1, "__call_rcu(): Double-freed CB %p->%pS()!!!\n", - head, head->func); + if (atomic_inc_return(&doublefrees) < 4) { + pr_err("%s(): Double-freed CB %p->%pS()!!! ", __func__, head, head->func); + mem_dump_obj(head); + } WRITE_ONCE(head->func, rcu_leak_callback); return; } -- cgit v1.2.3 From 309dca309fc39a9e3c31b916393b74bd174fd74e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 24 Jan 2021 11:02:34 +0100 Subject: block: store a block_device pointer in struct bio Replace the gendisk pointer in struct bio with a pointer to the newly improved struct block device. From that the gendisk can be trivially accessed with an extra indirection, but it also allows to directly look up all information related to partition remapping. Signed-off-by: Christoph Hellwig Acked-by: Tejun Heo Signed-off-by: Jens Axboe --- arch/m68k/emu/nfblock.c | 2 +- arch/xtensa/platforms/iss/simdisk.c | 2 +- block/bio-integrity.c | 18 +++++++++--------- block/bio.c | 31 +++++++++++-------------------- block/blk-cgroup.c | 7 ++++--- block/blk-core.c | 37 +++++++++++++++++-------------------- block/blk-crypto-fallback.c | 2 +- block/blk-crypto.c | 2 +- block/blk-merge.c | 17 ++++++++--------- block/blk-mq.c | 2 +- block/blk-throttle.c | 2 +- block/blk.h | 2 -- block/bounce.c | 2 +- block/genhd.c | 2 +- drivers/block/brd.c | 2 +- drivers/block/drbd/drbd_int.h | 4 ++-- drivers/block/drbd/drbd_req.c | 2 +- drivers/block/null_blk/main.c | 2 +- drivers/block/pktcdvd.c | 4 ++-- drivers/block/ps3vram.c | 2 +- drivers/block/rsxx/dev.c | 2 +- drivers/block/umem.c | 2 +- drivers/block/zram/zram_drv.c | 2 +- drivers/lightnvm/pblk-init.c | 2 +- drivers/md/bcache/debug.c | 2 +- drivers/md/bcache/request.c | 7 ++++--- drivers/md/dm-bio-record.h | 9 +++------ drivers/md/dm-raid1.c | 10 +++++----- drivers/md/dm.c | 14 +++++++------- drivers/md/md-linear.c | 2 +- drivers/md/md.c | 2 +- drivers/md/md.h | 6 +++--- drivers/md/raid1.c | 6 +++--- drivers/md/raid10.c | 12 ++++++------ drivers/md/raid5.c | 2 +- drivers/nvdimm/blk.c | 4 ++-- drivers/nvdimm/btt.c | 4 ++-- drivers/nvdimm/pmem.c | 4 ++-- drivers/nvme/host/core.c | 6 +++--- drivers/nvme/host/lightnvm.c | 3 +-- drivers/nvme/host/multipath.c | 6 +++--- drivers/nvme/host/rdma.c | 2 +- drivers/s390/block/dcssblk.c | 2 +- drivers/s390/block/xpram.c | 2 +- fs/btrfs/check-integrity.c | 10 +++++----- fs/btrfs/raid56.c | 7 ++----- fs/btrfs/scrub.c | 2 +- fs/direct-io.c | 2 +- fs/f2fs/data.c | 12 +----------- include/linux/bio.h | 18 ++++++++---------- include/linux/blk-mq.h | 4 ++-- include/linux/blk_types.h | 3 +-- include/linux/blkdev.h | 5 +++-- kernel/trace/blktrace.c | 16 +++++++++------- mm/page_io.c | 2 +- 55 files changed, 154 insertions(+), 184 deletions(-) (limited to 'kernel') diff --git a/arch/m68k/emu/nfblock.c b/arch/m68k/emu/nfblock.c index 92d26c812441..ba808543161a 100644 --- a/arch/m68k/emu/nfblock.c +++ 
b/arch/m68k/emu/nfblock.c @@ -61,7 +61,7 @@ struct nfhd_device { static blk_qc_t nfhd_submit_bio(struct bio *bio) { - struct nfhd_device *dev = bio->bi_disk->private_data; + struct nfhd_device *dev = bio->bi_bdev->bd_disk->private_data; struct bio_vec bvec; struct bvec_iter iter; int dir, len, shift; diff --git a/arch/xtensa/platforms/iss/simdisk.c b/arch/xtensa/platforms/iss/simdisk.c index 3447556d276d..fc09be7b1347 100644 --- a/arch/xtensa/platforms/iss/simdisk.c +++ b/arch/xtensa/platforms/iss/simdisk.c @@ -103,7 +103,7 @@ static void simdisk_transfer(struct simdisk *dev, unsigned long sector, static blk_qc_t simdisk_submit_bio(struct bio *bio) { - struct simdisk *dev = bio->bi_disk->private_data; + struct simdisk *dev = bio->bi_bdev->bd_disk->private_data; struct bio_vec bvec; struct bvec_iter iter; sector_t sector = bio->bi_iter.bi_sector; diff --git a/block/bio-integrity.c b/block/bio-integrity.c index 9ffd7e289554..c3e5abcfdc98 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -140,7 +140,7 @@ int bio_integrity_add_page(struct bio *bio, struct page *page, iv = bip->bip_vec + bip->bip_vcnt; if (bip->bip_vcnt && - bvec_gap_to_prev(bio->bi_disk->queue, + bvec_gap_to_prev(bio->bi_bdev->bd_disk->queue, &bip->bip_vec[bip->bip_vcnt - 1], offset)) return 0; @@ -162,7 +162,7 @@ EXPORT_SYMBOL(bio_integrity_add_page); static blk_status_t bio_integrity_process(struct bio *bio, struct bvec_iter *proc_iter, integrity_processing_fn *proc_fn) { - struct blk_integrity *bi = blk_get_integrity(bio->bi_disk); + struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk); struct blk_integrity_iter iter; struct bvec_iter bviter; struct bio_vec bv; @@ -171,7 +171,7 @@ static blk_status_t bio_integrity_process(struct bio *bio, void *prot_buf = page_address(bip->bip_vec->bv_page) + bip->bip_vec->bv_offset; - iter.disk_name = bio->bi_disk->disk_name; + iter.disk_name = bio->bi_bdev->bd_disk->disk_name; iter.interval = 1 << bi->interval_exp; iter.seed = proc_iter->bi_sector; iter.prot_buf = prot_buf; @@ -208,8 +208,8 @@ static blk_status_t bio_integrity_process(struct bio *bio, bool bio_integrity_prep(struct bio *bio) { struct bio_integrity_payload *bip; - struct blk_integrity *bi = blk_get_integrity(bio->bi_disk); - struct request_queue *q = bio->bi_disk->queue; + struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk); + struct request_queue *q = bio->bi_bdev->bd_disk->queue; void *buf; unsigned long start, end; unsigned int len, nr_pages; @@ -329,7 +329,7 @@ static void bio_integrity_verify_fn(struct work_struct *work) struct bio_integrity_payload *bip = container_of(work, struct bio_integrity_payload, bip_work); struct bio *bio = bip->bip_bio; - struct blk_integrity *bi = blk_get_integrity(bio->bi_disk); + struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk); /* * At the moment verify is called bio's iterator was advanced @@ -355,7 +355,7 @@ static void bio_integrity_verify_fn(struct work_struct *work) */ bool __bio_integrity_endio(struct bio *bio) { - struct blk_integrity *bi = blk_get_integrity(bio->bi_disk); + struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk); struct bio_integrity_payload *bip = bio_integrity(bio); if (bio_op(bio) == REQ_OP_READ && !bio->bi_status && @@ -381,7 +381,7 @@ bool __bio_integrity_endio(struct bio *bio) void bio_integrity_advance(struct bio *bio, unsigned int bytes_done) { struct bio_integrity_payload *bip = bio_integrity(bio); - struct blk_integrity *bi = blk_get_integrity(bio->bi_disk); + struct 
blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk); unsigned bytes = bio_integrity_bytes(bi, bytes_done >> 9); bip->bip_iter.bi_sector += bytes_done >> 9; @@ -397,7 +397,7 @@ void bio_integrity_advance(struct bio *bio, unsigned int bytes_done) void bio_integrity_trim(struct bio *bio) { struct bio_integrity_payload *bip = bio_integrity(bio); - struct blk_integrity *bi = blk_get_integrity(bio->bi_disk); + struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk); bip->bip_iter.bi_size = bio_integrity_bytes(bi, bio_sectors(bio)); } diff --git a/block/bio.c b/block/bio.c index 1f2cc1fbe283..0b70ade17da6 100644 --- a/block/bio.c +++ b/block/bio.c @@ -607,16 +607,7 @@ void bio_truncate(struct bio *bio, unsigned new_size) */ void guard_bio_eod(struct bio *bio) { - sector_t maxsector; - struct block_device *part; - - rcu_read_lock(); - part = __disk_get_part(bio->bi_disk, bio->bi_partno); - if (part) - maxsector = bdev_nr_sectors(part); - else - maxsector = get_capacity(bio->bi_disk); - rcu_read_unlock(); + sector_t maxsector = bdev_nr_sectors(bio->bi_bdev); if (!maxsector) return; @@ -676,11 +667,10 @@ void __bio_clone_fast(struct bio *bio, struct bio *bio_src) BUG_ON(bio->bi_pool && BVEC_POOL_IDX(bio)); /* - * most users will be overriding ->bi_disk with a new target, + * most users will be overriding ->bi_bdev with a new target, * so we don't set nor calculate new physical/hw segment counts here */ - bio->bi_disk = bio_src->bi_disk; - bio->bi_partno = bio_src->bi_partno; + bio->bi_bdev = bio_src->bi_bdev; bio_set_flag(bio, BIO_CLONED); if (bio_flagged(bio_src, BIO_THROTTLED)) bio_set_flag(bio, BIO_THROTTLED); @@ -730,7 +720,7 @@ EXPORT_SYMBOL(bio_clone_fast); const char *bio_devname(struct bio *bio, char *buf) { - return disk_name(bio->bi_disk, bio->bi_partno, buf); + return bdevname(bio->bi_bdev, buf); } EXPORT_SYMBOL(bio_devname); @@ -1037,7 +1027,7 @@ static int __bio_iov_append_get_pages(struct bio *bio, struct iov_iter *iter) { unsigned short nr_pages = bio->bi_max_vecs - bio->bi_vcnt; unsigned short entries_left = bio->bi_max_vecs - bio->bi_vcnt; - struct request_queue *q = bio->bi_disk->queue; + struct request_queue *q = bio->bi_bdev->bd_disk->queue; unsigned int max_append_sectors = queue_max_zone_append_sectors(q); struct bio_vec *bv = bio->bi_io_vec + bio->bi_vcnt; struct page **pages = (struct page **)bv; @@ -1145,7 +1135,8 @@ static void submit_bio_wait_endio(struct bio *bio) */ int submit_bio_wait(struct bio *bio) { - DECLARE_COMPLETION_ONSTACK_MAP(done, bio->bi_disk->lockdep_map); + DECLARE_COMPLETION_ONSTACK_MAP(done, + bio->bi_bdev->bd_disk->lockdep_map); unsigned long hang_check; bio->bi_private = &done; @@ -1422,8 +1413,8 @@ again: if (!bio_integrity_endio(bio)) return; - if (bio->bi_disk) - rq_qos_done_bio(bio->bi_disk->queue, bio); + if (bio->bi_bdev) + rq_qos_done_bio(bio->bi_bdev->bd_disk->queue, bio); /* * Need to have a real endio function for chained bios, otherwise @@ -1438,8 +1429,8 @@ again: goto again; } - if (bio->bi_disk && bio_flagged(bio, BIO_TRACE_COMPLETION)) { - trace_block_bio_complete(bio->bi_disk->queue, bio); + if (bio->bi_bdev && bio_flagged(bio, BIO_TRACE_COMPLETION)) { + trace_block_bio_complete(bio->bi_bdev->bd_disk->queue, bio); bio_clear_flag(bio, BIO_TRACE_COMPLETION); } diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 031114d454a6..3465d6ee708e 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -1800,7 +1800,8 @@ static inline struct blkcg_gq *blkg_tryget_closest(struct bio *bio, struct blkcg_gq *blkg, 
*ret_blkg = NULL; rcu_read_lock(); - blkg = blkg_lookup_create(css_to_blkcg(css), bio->bi_disk->queue); + blkg = blkg_lookup_create(css_to_blkcg(css), + bio->bi_bdev->bd_disk->queue); while (blkg) { if (blkg_tryget(blkg)) { ret_blkg = blkg; @@ -1836,8 +1837,8 @@ void bio_associate_blkg_from_css(struct bio *bio, if (css && css->parent) { bio->bi_blkg = blkg_tryget_closest(bio, css); } else { - blkg_get(bio->bi_disk->queue->root_blkg); - bio->bi_blkg = bio->bi_disk->queue->root_blkg; + blkg_get(bio->bi_bdev->bd_disk->queue->root_blkg); + bio->bi_blkg = bio->bi_bdev->bd_disk->queue->root_blkg; } } EXPORT_SYMBOL_GPL(bio_associate_blkg_from_css); diff --git a/block/blk-core.c b/block/blk-core.c index 08ff8ca32529..a3a54cd86c9c 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -476,7 +476,7 @@ int blk_queue_enter(struct request_queue *q, blk_mq_req_flags_t flags) static inline int bio_queue_enter(struct bio *bio) { - struct request_queue *q = bio->bi_disk->queue; + struct request_queue *q = bio->bi_bdev->bd_disk->queue; bool nowait = bio->bi_opf & REQ_NOWAIT; int ret; @@ -712,7 +712,7 @@ static inline bool bio_check_ro(struct bio *bio, struct block_device *part) static noinline int should_fail_bio(struct bio *bio) { - if (should_fail_request(bio->bi_disk->part0, bio->bi_iter.bi_size)) + if (should_fail_request(bdev_whole(bio->bi_bdev), bio->bi_iter.bi_size)) return -EIO; return 0; } @@ -741,13 +741,9 @@ static inline int bio_check_eod(struct bio *bio, sector_t maxsector) */ static inline int blk_partition_remap(struct bio *bio) { - struct block_device *p; + struct block_device *p = bio->bi_bdev; int ret = -EIO; - rcu_read_lock(); - p = __disk_get_part(bio->bi_disk, bio->bi_partno); - if (unlikely(!p)) - goto out; if (unlikely(should_fail_request(p, bio->bi_iter.bi_size))) goto out; if (unlikely(bio_check_ro(bio, p))) @@ -761,10 +757,9 @@ static inline int blk_partition_remap(struct bio *bio) bio->bi_iter.bi_sector - p->bd_start_sect); } - bio->bi_partno = 0; + bio->bi_bdev = bdev_whole(p); ret = 0; out: - rcu_read_unlock(); return ret; } @@ -805,7 +800,8 @@ static inline blk_status_t blk_check_zone_append(struct request_queue *q, static noinline_for_stack bool submit_bio_checks(struct bio *bio) { - struct request_queue *q = bio->bi_disk->queue; + struct block_device *bdev = bio->bi_bdev; + struct request_queue *q = bdev->bd_disk->queue; blk_status_t status = BLK_STS_IOERR; struct blk_plug *plug; @@ -825,13 +821,13 @@ static noinline_for_stack bool submit_bio_checks(struct bio *bio) if (should_fail_bio(bio)) goto end_io; - if (bio->bi_partno) { + if (bio->bi_bdev->bd_partno) { if (unlikely(blk_partition_remap(bio))) goto end_io; } else { - if (unlikely(bio_check_ro(bio, bio->bi_disk->part0))) + if (unlikely(bio_check_ro(bio, bdev_whole(bdev)))) goto end_io; - if (unlikely(bio_check_eod(bio, get_capacity(bio->bi_disk)))) + if (unlikely(bio_check_eod(bio, get_capacity(bdev->bd_disk)))) goto end_io; } @@ -924,7 +920,7 @@ end_io: static blk_qc_t __submit_bio(struct bio *bio) { - struct gendisk *disk = bio->bi_disk; + struct gendisk *disk = bio->bi_bdev->bd_disk; blk_qc_t ret = BLK_QC_T_NONE; if (blk_crypto_bio_prep(&bio)) { @@ -966,7 +962,7 @@ static blk_qc_t __submit_bio_noacct(struct bio *bio) current->bio_list = bio_list_on_stack; do { - struct request_queue *q = bio->bi_disk->queue; + struct request_queue *q = bio->bi_bdev->bd_disk->queue; struct bio_list lower, same; if (unlikely(bio_queue_enter(bio) != 0)) @@ -987,7 +983,7 @@ static blk_qc_t __submit_bio_noacct(struct bio *bio) 
bio_list_init(&lower); bio_list_init(&same); while ((bio = bio_list_pop(&bio_list_on_stack[0])) != NULL) - if (q == bio->bi_disk->queue) + if (q == bio->bi_bdev->bd_disk->queue) bio_list_add(&same, bio); else bio_list_add(&lower, bio); @@ -1012,7 +1008,7 @@ static blk_qc_t __submit_bio_noacct_mq(struct bio *bio) current->bio_list = bio_list; do { - struct gendisk *disk = bio->bi_disk; + struct gendisk *disk = bio->bi_bdev->bd_disk; if (unlikely(bio_queue_enter(bio) != 0)) continue; @@ -1055,7 +1051,7 @@ blk_qc_t submit_bio_noacct(struct bio *bio) return BLK_QC_T_NONE; } - if (!bio->bi_disk->fops->submit_bio) + if (!bio->bi_bdev->bd_disk->fops->submit_bio) return __submit_bio_noacct_mq(bio); return __submit_bio_noacct(bio); } @@ -1067,7 +1063,7 @@ EXPORT_SYMBOL(submit_bio_noacct); * * submit_bio() is used to submit I/O requests to block devices. It is passed a * fully set up &struct bio that describes the I/O that needs to be done. The - * bio will be send to the device described by the bi_disk and bi_partno fields. + * bio will be send to the device described by the bi_bdev field. * * The success/failure status of the request, along with notification of * completion, is delivered asynchronously through the ->bi_end_io() callback @@ -1087,7 +1083,8 @@ blk_qc_t submit_bio(struct bio *bio) unsigned int count; if (unlikely(bio_op(bio) == REQ_OP_WRITE_SAME)) - count = queue_logical_block_size(bio->bi_disk->queue) >> 9; + count = queue_logical_block_size( + bio->bi_bdev->bd_disk->queue) >> 9; else count = bio_sectors(bio); diff --git a/block/blk-crypto-fallback.c b/block/blk-crypto-fallback.c index c162b754efbd..8f1e18176731 100644 --- a/block/blk-crypto-fallback.c +++ b/block/blk-crypto-fallback.c @@ -167,7 +167,7 @@ static struct bio *blk_crypto_clone_bio(struct bio *bio_src) bio = bio_alloc_bioset(GFP_NOIO, bio_segments(bio_src), NULL); if (!bio) return NULL; - bio->bi_disk = bio_src->bi_disk; + bio->bi_bdev = bio_src->bi_bdev; bio->bi_opf = bio_src->bi_opf; bio->bi_ioprio = bio_src->bi_ioprio; bio->bi_write_hint = bio_src->bi_write_hint; diff --git a/block/blk-crypto.c b/block/blk-crypto.c index 5da43f0973b4..09fcb18fa778 100644 --- a/block/blk-crypto.c +++ b/block/blk-crypto.c @@ -280,7 +280,7 @@ bool __blk_crypto_bio_prep(struct bio **bio_ptr) * Success if device supports the encryption context, or if we succeeded * in falling back to the crypto API. */ - if (blk_ksm_crypto_cfg_supported(bio->bi_disk->queue->ksm, + if (blk_ksm_crypto_cfg_supported(bio->bi_bdev->bd_disk->queue->ksm, &bc_key->crypto_cfg)) return true; diff --git a/block/blk-merge.c b/block/blk-merge.c index 808768f6b174..ffb4aa0ea68b 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -298,14 +298,13 @@ split: * Split a bio into two bios, chain the two bios, submit the second half and * store a pointer to the first half in *@bio. If the second bio is still too * big it will be split by a recursive call to this function. Since this - * function may allocate a new bio from @bio->bi_disk->queue->bio_split, it is - * the responsibility of the caller to ensure that - * @bio->bi_disk->queue->bio_split is only released after processing of the - * split bio has finished. + * function may allocate a new bio from q->bio_split, it is the responsibility + * of the caller to ensure that q->bio_split is only released after processing + * of the split bio has finished. 
*/ void __blk_queue_split(struct bio **bio, unsigned int *nr_segs) { - struct request_queue *q = (*bio)->bi_disk->queue; + struct request_queue *q = (*bio)->bi_bdev->bd_disk->queue; struct bio *split = NULL; switch (bio_op(*bio)) { @@ -358,9 +357,9 @@ void __blk_queue_split(struct bio **bio, unsigned int *nr_segs) * * Split a bio into two bios, chains the two bios, submit the second half and * store a pointer to the first half in *@bio. Since this function may allocate - * a new bio from @bio->bi_disk->queue->bio_split, it is the responsibility of - * the caller to ensure that @bio->bi_disk->queue->bio_split is only released - * after processing of the split bio has finished. + * a new bio from q->bio_split, it is the responsibility of the caller to ensure + * that q->bio_split is only released after processing of the split bio has + * finished. */ void blk_queue_split(struct bio **bio) { @@ -866,7 +865,7 @@ bool blk_rq_merge_ok(struct request *rq, struct bio *bio) return false; /* must be same device */ - if (rq->rq_disk != bio->bi_disk) + if (rq->rq_disk != bio->bi_bdev->bd_disk) return false; /* only merge integrity protected bio into ditto rq */ diff --git a/block/blk-mq.c b/block/blk-mq.c index f285a9123a8b..74b17b396f4c 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2128,7 +2128,7 @@ static void blk_add_rq_to_plug(struct blk_plug *plug, struct request *rq) */ blk_qc_t blk_mq_submit_bio(struct bio *bio) { - struct request_queue *q = bio->bi_disk->queue; + struct request_queue *q = bio->bi_bdev->bd_disk->queue; const int is_sync = op_is_sync(bio->bi_opf); const int is_flush_fua = op_is_flush(bio->bi_opf); struct blk_mq_alloc_data data = { diff --git a/block/blk-throttle.c b/block/blk-throttle.c index d52cac9f3a7c..b1b22d863bdf 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -2178,7 +2178,7 @@ static inline void throtl_update_latency_buckets(struct throtl_data *td) bool blk_throtl_bio(struct bio *bio) { - struct request_queue *q = bio->bi_disk->queue; + struct request_queue *q = bio->bi_bdev->bd_disk->queue; struct blkcg_gq *blkg = bio->bi_blkg; struct throtl_qnode *qn = NULL; struct throtl_grp *tg = blkg_to_tg(blkg); diff --git a/block/blk.h b/block/blk.h index 7550364c326c..10ab7c0d0766 100644 --- a/block/blk.h +++ b/block/blk.h @@ -202,8 +202,6 @@ static inline void elevator_exit(struct request_queue *q, __elevator_exit(q, e); } -struct block_device *__disk_get_part(struct gendisk *disk, int partno); - ssize_t part_size_show(struct device *dev, struct device_attribute *attr, char *buf); ssize_t part_stat_show(struct device *dev, struct device_attribute *attr, diff --git a/block/bounce.c b/block/bounce.c index d3f51acd6e3b..a22a8a1942b2 100644 --- a/block/bounce.c +++ b/block/bounce.c @@ -246,7 +246,7 @@ static struct bio *bounce_clone_bio(struct bio *bio_src, gfp_t gfp_mask, bio = bio_alloc_bioset(gfp_mask, bio_segments(bio_src), bs); if (!bio) return NULL; - bio->bi_disk = bio_src->bi_disk; + bio->bi_bdev = bio_src->bi_bdev; bio->bi_opf = bio_src->bi_opf; bio->bi_ioprio = bio_src->bi_ioprio; bio->bi_write_hint = bio_src->bi_write_hint; diff --git a/block/genhd.c b/block/genhd.c index ca5d880af512..e536d0b4bbae 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -161,7 +161,7 @@ static void part_in_flight_rw(struct block_device *part, inflight[1] = 0; } -struct block_device *__disk_get_part(struct gendisk *disk, int partno) +static struct block_device *__disk_get_part(struct gendisk *disk, int partno) { struct disk_part_tbl *ptbl = rcu_dereference(disk->part_tbl); 
diff --git a/drivers/block/brd.c b/drivers/block/brd.c index c7c821419079..18bf99906662 100644 --- a/drivers/block/brd.c +++ b/drivers/block/brd.c @@ -284,7 +284,7 @@ out: static blk_qc_t brd_submit_bio(struct bio *bio) { - struct brd_device *brd = bio->bi_disk->private_data; + struct brd_device *brd = bio->bi_bdev->bd_disk->private_data; sector_t sector = bio->bi_iter.bi_sector; struct bio_vec bvec; struct bvec_iter iter; diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index 8f879e5c2f67..b2c93a29c251 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -1579,8 +1579,8 @@ static inline void drbd_submit_bio_noacct(struct drbd_device *device, int fault_type, struct bio *bio) { __release(local); - if (!bio->bi_disk) { - drbd_err(device, "drbd_submit_bio_noacct: bio->bi_disk == NULL\n"); + if (!bio->bi_bdev) { + drbd_err(device, "drbd_submit_bio_noacct: bio->bi_bdev == NULL\n"); bio->bi_status = BLK_STS_IOERR; bio_endio(bio); return; diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c index 330f851cb8f0..ea0f31ab3343 100644 --- a/drivers/block/drbd/drbd_req.c +++ b/drivers/block/drbd/drbd_req.c @@ -1595,7 +1595,7 @@ void do_submit(struct work_struct *ws) blk_qc_t drbd_submit_bio(struct bio *bio) { - struct drbd_device *device = bio->bi_disk->private_data; + struct drbd_device *device = bio->bi_bdev->bd_disk->private_data; unsigned long start_jif; blk_queue_split(&bio); diff --git a/drivers/block/null_blk/main.c b/drivers/block/null_blk/main.c index 5357c3a4a36f..d6c821d48090 100644 --- a/drivers/block/null_blk/main.c +++ b/drivers/block/null_blk/main.c @@ -1420,7 +1420,7 @@ static blk_qc_t null_submit_bio(struct bio *bio) { sector_t sector = bio->bi_iter.bi_sector; sector_t nr_sectors = bio_sectors(bio); - struct nullb *nullb = bio->bi_disk->private_data; + struct nullb *nullb = bio->bi_bdev->bd_disk->private_data; struct nullb_queue *nq = nullb_to_queue(nullb); struct nullb_cmd *cmd; diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c index b8bb8ec7538d..658a0981cb54 100644 --- a/drivers/block/pktcdvd.c +++ b/drivers/block/pktcdvd.c @@ -2374,7 +2374,7 @@ static blk_qc_t pkt_submit_bio(struct bio *bio) blk_queue_split(&bio); - pd = bio->bi_disk->queue->queuedata; + pd = bio->bi_bdev->bd_disk->queue->queuedata; if (!pd) { pr_err("%s incorrect request queue\n", bio_devname(bio, b)); goto end_io; @@ -2418,7 +2418,7 @@ static blk_qc_t pkt_submit_bio(struct bio *bio) split = bio; } - pkt_make_request_write(bio->bi_disk->queue, split); + pkt_make_request_write(bio->bi_bdev->bd_disk->queue, split); } while (split != bio); return BLK_QC_T_NONE; diff --git a/drivers/block/ps3vram.c b/drivers/block/ps3vram.c index b71d28372ef3..1d738999fb69 100644 --- a/drivers/block/ps3vram.c +++ b/drivers/block/ps3vram.c @@ -581,7 +581,7 @@ out: static blk_qc_t ps3vram_submit_bio(struct bio *bio) { - struct ps3_system_bus_device *dev = bio->bi_disk->private_data; + struct ps3_system_bus_device *dev = bio->bi_bdev->bd_disk->private_data; struct ps3vram_priv *priv = ps3_system_bus_get_drvdata(dev); int busy; diff --git a/drivers/block/rsxx/dev.c b/drivers/block/rsxx/dev.c index edacefff6e35..9a28322a8cd8 100644 --- a/drivers/block/rsxx/dev.c +++ b/drivers/block/rsxx/dev.c @@ -122,7 +122,7 @@ static void bio_dma_done_cb(struct rsxx_cardinfo *card, static blk_qc_t rsxx_submit_bio(struct bio *bio) { - struct rsxx_cardinfo *card = bio->bi_disk->private_data; + struct rsxx_cardinfo *card = bio->bi_bdev->bd_disk->private_data; struct 
rsxx_bio_meta *bio_meta; blk_status_t st = BLK_STS_IOERR; diff --git a/drivers/block/umem.c b/drivers/block/umem.c index 2b95d7b33b91..982732dbe82e 100644 --- a/drivers/block/umem.c +++ b/drivers/block/umem.c @@ -521,7 +521,7 @@ static int mm_check_plugged(struct cardinfo *card) static blk_qc_t mm_submit_bio(struct bio *bio) { - struct cardinfo *card = bio->bi_disk->private_data; + struct cardinfo *card = bio->bi_bdev->bd_disk->private_data; pr_debug("mm_make_request %llu %u\n", (unsigned long long)bio->bi_iter.bi_sector, diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index e2933cb7a82a..d6243dbc53cc 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -1596,7 +1596,7 @@ static void __zram_make_request(struct zram *zram, struct bio *bio) */ static blk_qc_t zram_submit_bio(struct bio *bio) { - struct zram *zram = bio->bi_disk->private_data; + struct zram *zram = bio->bi_bdev->bd_disk->private_data; if (!valid_io_request(zram, bio->bi_iter.bi_sector, bio->bi_iter.bi_size)) { diff --git a/drivers/lightnvm/pblk-init.c b/drivers/lightnvm/pblk-init.c index b6246f73895c..5924f09c217b 100644 --- a/drivers/lightnvm/pblk-init.c +++ b/drivers/lightnvm/pblk-init.c @@ -49,7 +49,7 @@ struct bio_set pblk_bio_set; static blk_qc_t pblk_submit_bio(struct bio *bio) { - struct pblk *pblk = bio->bi_disk->queue->queuedata; + struct pblk *pblk = bio->bi_bdev->bd_disk->queue->queuedata; if (bio_op(bio) == REQ_OP_DISCARD) { pblk_discard(pblk, bio); diff --git a/drivers/md/bcache/debug.c b/drivers/md/bcache/debug.c index b00fd08d696b..058dd8014428 100644 --- a/drivers/md/bcache/debug.c +++ b/drivers/md/bcache/debug.c @@ -114,7 +114,7 @@ void bch_data_verify(struct cached_dev *dc, struct bio *bio) check = bio_kmalloc(GFP_NOIO, bio_segments(bio)); if (!check) return; - check->bi_disk = bio->bi_disk; + check->bi_bdev = bio->bi_bdev; check->bi_opf = REQ_OP_READ; check->bi_iter.bi_sector = bio->bi_iter.bi_sector; check->bi_iter.bi_size = bio->bi_iter.bi_size; diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index 85b1f2a9b72d..dfc35d6d05ed 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -894,7 +894,8 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s, !(bio->bi_opf & (REQ_META|REQ_PRIO)) && s->iop.c->gc_stats.in_use < CUTOFF_CACHE_READA) reada = min_t(sector_t, dc->readahead >> 9, - get_capacity(bio->bi_disk) - bio_end_sector(bio)); + get_capacity(bio->bi_bdev->bd_disk) - + bio_end_sector(bio)); s->insert_bio_sectors = min(sectors, bio_sectors(bio) + reada); @@ -1167,7 +1168,7 @@ static void quit_max_writeback_rate(struct cache_set *c, blk_qc_t cached_dev_submit_bio(struct bio *bio) { struct search *s; - struct bcache_device *d = bio->bi_disk->private_data; + struct bcache_device *d = bio->bi_bdev->bd_disk->private_data; struct cached_dev *dc = container_of(d, struct cached_dev, disk); int rw = bio_data_dir(bio); @@ -1274,7 +1275,7 @@ blk_qc_t flash_dev_submit_bio(struct bio *bio) { struct search *s; struct closure *cl; - struct bcache_device *d = bio->bi_disk->private_data; + struct bcache_device *d = bio->bi_bdev->bd_disk->private_data; if (unlikely(d->c && test_bit(CACHE_SET_IO_DISABLE, &d->c->flags))) { bio->bi_status = BLK_STS_IOERR; diff --git a/drivers/md/dm-bio-record.h b/drivers/md/dm-bio-record.h index 2ea0360108e1..a3b71350eec8 100644 --- a/drivers/md/dm-bio-record.h +++ b/drivers/md/dm-bio-record.h @@ -18,8 +18,7 @@ */ struct dm_bio_details { - struct gendisk *bi_disk; - u8 
bi_partno; + struct block_device *bi_bdev; int __bi_remaining; unsigned long bi_flags; struct bvec_iter bi_iter; @@ -31,8 +30,7 @@ struct dm_bio_details { static inline void dm_bio_record(struct dm_bio_details *bd, struct bio *bio) { - bd->bi_disk = bio->bi_disk; - bd->bi_partno = bio->bi_partno; + bd->bi_bdev = bio->bi_bdev; bd->bi_flags = bio->bi_flags; bd->bi_iter = bio->bi_iter; bd->__bi_remaining = atomic_read(&bio->__bi_remaining); @@ -44,8 +42,7 @@ static inline void dm_bio_record(struct dm_bio_details *bd, struct bio *bio) static inline void dm_bio_restore(struct dm_bio_details *bd, struct bio *bio) { - bio->bi_disk = bd->bi_disk; - bio->bi_partno = bd->bi_partno; + bio->bi_bdev = bd->bi_bdev; bio->bi_flags = bd->bi_flags; bio->bi_iter = bd->bi_iter; atomic_set(&bio->__bi_remaining, bd->__bi_remaining); diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c index fa09bc4e4c54..b0a82f29a2e4 100644 --- a/drivers/md/dm-raid1.c +++ b/drivers/md/dm-raid1.c @@ -145,7 +145,7 @@ static void dispatch_bios(void *context, struct bio_list *bio_list) struct dm_raid1_bio_record { struct mirror *m; - /* if details->bi_disk == NULL, details were not saved */ + /* if details->bi_bdev == NULL, details were not saved */ struct dm_bio_details details; region_t write_region; }; @@ -1190,7 +1190,7 @@ static int mirror_map(struct dm_target *ti, struct bio *bio) struct dm_raid1_bio_record *bio_record = dm_per_bio_data(bio, sizeof(struct dm_raid1_bio_record)); - bio_record->details.bi_disk = NULL; + bio_record->details.bi_bdev = NULL; if (rw == WRITE) { /* Save region for mirror_end_io() handler */ @@ -1257,7 +1257,7 @@ static int mirror_end_io(struct dm_target *ti, struct bio *bio, goto out; if (unlikely(*error)) { - if (!bio_record->details.bi_disk) { + if (!bio_record->details.bi_bdev) { /* * There wasn't enough memory to record necessary * information for a retry or there was no other @@ -1282,7 +1282,7 @@ static int mirror_end_io(struct dm_target *ti, struct bio *bio, bd = &bio_record->details; dm_bio_restore(bd, bio); - bio_record->details.bi_disk = NULL; + bio_record->details.bi_bdev = NULL; bio->bi_status = 0; queue_bio(ms, bio, rw); @@ -1292,7 +1292,7 @@ static int mirror_end_io(struct dm_target *ti, struct bio *bio, } out: - bio_record->details.bi_disk = NULL; + bio_record->details.bi_bdev = NULL; return DM_ENDIO_DONE; } diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 7bac564f3faa..479ec5bea09e 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -977,16 +977,17 @@ static void clone_endio(struct bio *bio) struct mapped_device *md = tio->io->md; dm_endio_fn endio = tio->ti->type->end_io; struct bio *orig_bio = io->orig_bio; + struct request_queue *q = bio->bi_bdev->bd_disk->queue; if (unlikely(error == BLK_STS_TARGET)) { if (bio_op(bio) == REQ_OP_DISCARD && - !bio->bi_disk->queue->limits.max_discard_sectors) + !q->limits.max_discard_sectors) disable_discard(md); else if (bio_op(bio) == REQ_OP_WRITE_SAME && - !bio->bi_disk->queue->limits.max_write_same_sectors) + !q->limits.max_write_same_sectors) disable_write_same(md); else if (bio_op(bio) == REQ_OP_WRITE_ZEROES && - !bio->bi_disk->queue->limits.max_write_zeroes_sectors) + !q->limits.max_write_zeroes_sectors) disable_write_zeroes(md); } @@ -996,7 +997,7 @@ static void clone_endio(struct bio *bio) */ if (bio_op(orig_bio) == REQ_OP_ZONE_APPEND) { sector_t written_sector = bio->bi_iter.bi_sector; - struct request_queue *q = orig_bio->bi_disk->queue; + struct request_queue *q = orig_bio->bi_bdev->bd_disk->queue; u64 mask = 
(u64)blk_queue_zone_sectors(q) - 1; orig_bio->bi_iter.bi_sector += written_sector & mask; @@ -1422,8 +1423,7 @@ static int __send_empty_flush(struct clone_info *ci) */ bio_init(&flush_bio, NULL, 0); flush_bio.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC; - flush_bio.bi_disk = ci->io->md->disk; - bio_associate_blkg(&flush_bio); + bio_set_dev(&flush_bio, ci->io->md->disk->part0); ci->bio = &flush_bio; ci->sector_count = 0; @@ -1626,7 +1626,7 @@ static blk_qc_t __split_and_process_bio(struct mapped_device *md, static blk_qc_t dm_submit_bio(struct bio *bio) { - struct mapped_device *md = bio->bi_disk->private_data; + struct mapped_device *md = bio->bi_bdev->bd_disk->private_data; blk_qc_t ret = BLK_QC_T_NONE; int srcu_idx; struct dm_table *map; diff --git a/drivers/md/md-linear.c b/drivers/md/md-linear.c index 68cac7d19278..63ed8329a98d 100644 --- a/drivers/md/md-linear.c +++ b/drivers/md/md-linear.c @@ -252,7 +252,7 @@ static bool linear_make_request(struct mddev *mddev, struct bio *bio) start_sector + data_offset; if (unlikely((bio_op(bio) == REQ_OP_DISCARD) && - !blk_queue_discard(bio->bi_disk->queue))) { + !blk_queue_discard(bio->bi_bdev->bd_disk->queue))) { /* Just ignore it */ bio_endio(bio); } else { diff --git a/drivers/md/md.c b/drivers/md/md.c index 04384452a7ab..cf06dbb1aa53 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -486,7 +486,7 @@ static void md_end_io(struct bio *bio) static blk_qc_t md_submit_bio(struct bio *bio) { const int rw = bio_data_dir(bio); - struct mddev *mddev = bio->bi_disk->private_data; + struct mddev *mddev = bio->bi_bdev->bd_disk->private_data; if (mddev == NULL || mddev->pers == NULL) { bio_io_error(bio); diff --git a/drivers/md/md.h b/drivers/md/md.h index 34070ab30a8a..f13290ccc1c2 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -556,7 +556,7 @@ static inline void md_sync_acct(struct block_device *bdev, unsigned long nr_sect static inline void md_sync_acct_bio(struct bio *bio, unsigned long nr_sectors) { - atomic_add(nr_sectors, &bio->bi_disk->sync_io); + md_sync_acct(bio->bi_bdev, nr_sectors); } struct md_personality @@ -793,14 +793,14 @@ static inline void mddev_clear_unsupported_flags(struct mddev *mddev, static inline void mddev_check_writesame(struct mddev *mddev, struct bio *bio) { if (bio_op(bio) == REQ_OP_WRITE_SAME && - !bio->bi_disk->queue->limits.max_write_same_sectors) + !bio->bi_bdev->bd_disk->queue->limits.max_write_same_sectors) mddev->queue->limits.max_write_same_sectors = 0; } static inline void mddev_check_write_zeroes(struct mddev *mddev, struct bio *bio) { if (bio_op(bio) == REQ_OP_WRITE_ZEROES && - !bio->bi_disk->queue->limits.max_write_zeroes_sectors) + !bio->bi_bdev->bd_disk->queue->limits.max_write_zeroes_sectors) mddev->queue->limits.max_write_zeroes_sectors = 0; } diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index c0347997f6ff..3b19141cdb4b 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -794,13 +794,13 @@ static void flush_bio_list(struct r1conf *conf, struct bio *bio) while (bio) { /* submit pending writes */ struct bio *next = bio->bi_next; - struct md_rdev *rdev = (void *)bio->bi_disk; + struct md_rdev *rdev = (void *)bio->bi_bdev; bio->bi_next = NULL; bio_set_dev(bio, rdev->bdev); if (test_bit(Faulty, &rdev->flags)) { bio_io_error(bio); } else if (unlikely((bio_op(bio) == REQ_OP_DISCARD) && - !blk_queue_discard(bio->bi_disk->queue))) + !blk_queue_discard(bio->bi_bdev->bd_disk->queue))) /* Just ignore it */ bio_endio(bio); else @@ -1520,7 +1520,7 @@ static void raid1_write_request(struct 
mddev *mddev, struct bio *bio, trace_block_bio_remap(mbio, disk_devt(mddev->gendisk), r1_bio->sector); /* flush_pending_writes() needs access to the rdev so...*/ - mbio->bi_disk = (void *)conf->mirrors[i].rdev; + mbio->bi_bdev = (void *)conf->mirrors[i].rdev; cb = blk_check_plugged(raid1_unplug, mddev, sizeof(*plug)); if (cb) diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index c5d88ef6a45c..be8f14afb6d1 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -882,13 +882,13 @@ static void flush_pending_writes(struct r10conf *conf) while (bio) { /* submit pending writes */ struct bio *next = bio->bi_next; - struct md_rdev *rdev = (void*)bio->bi_disk; + struct md_rdev *rdev = (void*)bio->bi_bdev; bio->bi_next = NULL; bio_set_dev(bio, rdev->bdev); if (test_bit(Faulty, &rdev->flags)) { bio_io_error(bio); } else if (unlikely((bio_op(bio) == REQ_OP_DISCARD) && - !blk_queue_discard(bio->bi_disk->queue))) + !blk_queue_discard(bio->bi_bdev->bd_disk->queue))) /* Just ignore it */ bio_endio(bio); else @@ -1075,13 +1075,13 @@ static void raid10_unplug(struct blk_plug_cb *cb, bool from_schedule) while (bio) { /* submit pending writes */ struct bio *next = bio->bi_next; - struct md_rdev *rdev = (void*)bio->bi_disk; + struct md_rdev *rdev = (void*)bio->bi_bdev; bio->bi_next = NULL; bio_set_dev(bio, rdev->bdev); if (test_bit(Faulty, &rdev->flags)) { bio_io_error(bio); } else if (unlikely((bio_op(bio) == REQ_OP_DISCARD) && - !blk_queue_discard(bio->bi_disk->queue))) + !blk_queue_discard(bio->bi_bdev->bd_disk->queue))) /* Just ignore it */ bio_endio(bio); else @@ -1253,7 +1253,7 @@ static void raid10_write_one_disk(struct mddev *mddev, struct r10bio *r10_bio, trace_block_bio_remap(mbio, disk_devt(conf->mddev->gendisk), r10_bio->sector); /* flush_pending_writes() needs access to the rdev so...*/ - mbio->bi_disk = (void *)rdev; + mbio->bi_bdev = (void *)rdev; atomic_inc(&r10_bio->remaining); @@ -3003,7 +3003,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr, /* Again, very different code for resync and recovery. * Both must result in an r10bio with a list of bios that - * have bi_end_io, bi_sector, bi_disk set, + * have bi_end_io, bi_sector, bi_bdev set, * and bi_private set to the r10bio. * For recovery, we may actually create several r10bios * with 2 bios in each, that correspond to the bios in the main one. 
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 3a90cc0e43ca..f411b9e5c332 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -5310,7 +5310,7 @@ static int in_chunk_boundary(struct mddev *mddev, struct bio *bio) unsigned int chunk_sectors; unsigned int bio_sectors = bio_sectors(bio); - WARN_ON_ONCE(bio->bi_partno); + WARN_ON_ONCE(bio->bi_bdev->bd_partno); chunk_sectors = min(conf->chunk_sectors, conf->prev_chunk_sectors); return chunk_sectors >= diff --git a/drivers/nvdimm/blk.c b/drivers/nvdimm/blk.c index 22e5617b2cea..e03a1f38d750 100644 --- a/drivers/nvdimm/blk.c +++ b/drivers/nvdimm/blk.c @@ -165,7 +165,7 @@ static int nsblk_do_bvec(struct nd_namespace_blk *nsblk, static blk_qc_t nd_blk_submit_bio(struct bio *bio) { struct bio_integrity_payload *bip; - struct nd_namespace_blk *nsblk = bio->bi_disk->private_data; + struct nd_namespace_blk *nsblk = bio->bi_bdev->bd_disk->private_data; struct bvec_iter iter; unsigned long start; struct bio_vec bvec; @@ -177,7 +177,7 @@ static blk_qc_t nd_blk_submit_bio(struct bio *bio) bip = bio_integrity(bio); rw = bio_data_dir(bio); - do_acct = blk_queue_io_stat(bio->bi_disk->queue); + do_acct = blk_queue_io_stat(bio->bi_bdev->bd_disk->queue); if (do_acct) start = bio_start_io_acct(bio); bio_for_each_segment(bvec, bio, iter) { diff --git a/drivers/nvdimm/btt.c b/drivers/nvdimm/btt.c index 12ff6f8784ac..41aa1f01fc07 100644 --- a/drivers/nvdimm/btt.c +++ b/drivers/nvdimm/btt.c @@ -1442,7 +1442,7 @@ static int btt_do_bvec(struct btt *btt, struct bio_integrity_payload *bip, static blk_qc_t btt_submit_bio(struct bio *bio) { struct bio_integrity_payload *bip = bio_integrity(bio); - struct btt *btt = bio->bi_disk->private_data; + struct btt *btt = bio->bi_bdev->bd_disk->private_data; struct bvec_iter iter; unsigned long start; struct bio_vec bvec; @@ -1452,7 +1452,7 @@ static blk_qc_t btt_submit_bio(struct bio *bio) if (!bio_integrity_prep(bio)) return BLK_QC_T_NONE; - do_acct = blk_queue_io_stat(bio->bi_disk->queue); + do_acct = blk_queue_io_stat(bio->bi_bdev->bd_disk->queue); if (do_acct) start = bio_start_io_acct(bio); bio_for_each_segment(bvec, bio, iter) { diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c index 875076b0ea6c..72740835c85c 100644 --- a/drivers/nvdimm/pmem.c +++ b/drivers/nvdimm/pmem.c @@ -197,13 +197,13 @@ static blk_qc_t pmem_submit_bio(struct bio *bio) unsigned long start; struct bio_vec bvec; struct bvec_iter iter; - struct pmem_device *pmem = bio->bi_disk->private_data; + struct pmem_device *pmem = bio->bi_bdev->bd_disk->private_data; struct nd_region *nd_region = to_region(pmem); if (bio->bi_opf & REQ_PREFLUSH) ret = nvdimm_flush(nd_region, bio); - do_acct = blk_queue_io_stat(bio->bi_disk->queue); + do_acct = blk_queue_io_stat(bio->bi_bdev->bd_disk->queue); if (do_acct) start = bio_start_io_acct(bio); bio_for_each_segment(bvec, bio, iter) { diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 566788ba4e7d..a39befb4deba 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1113,7 +1113,7 @@ static int nvme_submit_user_cmd(struct request_queue *q, { bool write = nvme_is_write(cmd); struct nvme_ns *ns = q->queuedata; - struct gendisk *disk = ns ? ns->disk : NULL; + struct block_device *bdev = ns ? 
ns->disk->part0 : NULL; struct request *req; struct bio *bio = NULL; void *meta = NULL; @@ -1133,8 +1133,8 @@ static int nvme_submit_user_cmd(struct request_queue *q, if (ret) goto out; bio = req->bio; - bio->bi_disk = disk; - if (disk && meta_buffer && meta_len) { + bio->bi_bdev = bdev; + if (bdev && meta_buffer && meta_len) { meta = nvme_add_user_metadata(bio, meta_buffer, meta_len, meta_seed, write); if (IS_ERR(meta)) { diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c index 470cef3abec3..6c8eab8de288 100644 --- a/drivers/nvme/host/lightnvm.c +++ b/drivers/nvme/host/lightnvm.c @@ -757,7 +757,6 @@ static int nvme_nvm_submit_user_cmd(struct request_queue *q, { bool write = nvme_is_write((struct nvme_command *)vcmd); struct nvm_dev *dev = ns->ndev; - struct gendisk *disk = ns->disk; struct request *rq; struct bio *bio = NULL; __le64 *ppa_list = NULL; @@ -817,7 +816,7 @@ static int nvme_nvm_submit_user_cmd(struct request_queue *q, vcmd->ph_rw.metadata = cpu_to_le64(metadata_dma); } - bio->bi_disk = disk; + bio->bi_bdev = ns->disk->part0; } blk_execute_rq(q, NULL, rq, 0); diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index 9ac762b28811..a6d44e7a775f 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -296,7 +296,7 @@ static bool nvme_available_path(struct nvme_ns_head *head) blk_qc_t nvme_ns_head_submit_bio(struct bio *bio) { - struct nvme_ns_head *head = bio->bi_disk->private_data; + struct nvme_ns_head *head = bio->bi_bdev->bd_disk->private_data; struct device *dev = disk_to_dev(head->disk); struct nvme_ns *ns; blk_qc_t ret = BLK_QC_T_NONE; @@ -312,7 +312,7 @@ blk_qc_t nvme_ns_head_submit_bio(struct bio *bio) srcu_idx = srcu_read_lock(&head->srcu); ns = nvme_find_path(head); if (likely(ns)) { - bio->bi_disk = ns->disk; + bio->bi_bdev = ns->disk->part0; bio->bi_opf |= REQ_NVME_MPATH; trace_block_bio_remap(bio, disk_devt(ns->head->disk), bio->bi_iter.bi_sector); @@ -352,7 +352,7 @@ static void nvme_requeue_work(struct work_struct *work) * Reset disk to the mpath node and resubmit to select a new * path. 
*/ - bio->bi_disk = head->disk; + bio->bi_bdev = head->disk->part0; submit_bio_noacct(bio); } } diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index b7ce4f221d99..f5ef3edeb2fd 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -1468,7 +1468,7 @@ static int nvme_rdma_map_sg_pi(struct nvme_rdma_queue *queue, if (unlikely(nr)) goto mr_put; - nvme_rdma_set_sig_attrs(blk_get_integrity(bio->bi_disk), c, + nvme_rdma_set_sig_attrs(blk_get_integrity(bio->bi_bdev->bd_disk), c, req->mr->sig_attrs, ns->pi_type); nvme_rdma_set_prot_checks(c, &req->mr->sig_attrs->check_mask); diff --git a/drivers/s390/block/dcssblk.c b/drivers/s390/block/dcssblk.c index 5c5cff3f2374..da33cb4cba28 100644 --- a/drivers/s390/block/dcssblk.c +++ b/drivers/s390/block/dcssblk.c @@ -879,7 +879,7 @@ dcssblk_submit_bio(struct bio *bio) blk_queue_split(&bio); bytes_done = 0; - dev_info = bio->bi_disk->private_data; + dev_info = bio->bi_bdev->bd_disk->private_data; if (dev_info == NULL) goto fail; if ((bio->bi_iter.bi_sector & 7) != 0 || diff --git a/drivers/s390/block/xpram.c b/drivers/s390/block/xpram.c index c2536f7767b3..d1ed39162943 100644 --- a/drivers/s390/block/xpram.c +++ b/drivers/s390/block/xpram.c @@ -184,7 +184,7 @@ static unsigned long xpram_highest_page_index(void) */ static blk_qc_t xpram_submit_bio(struct bio *bio) { - xpram_device_t *xdev = bio->bi_disk->private_data; + xpram_device_t *xdev = bio->bi_bdev->bd_disk->private_data; struct bio_vec bvec; struct bvec_iter iter; unsigned int index; diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c index 6ff44e53814c..113cb85c1fd4 100644 --- a/fs/btrfs/check-integrity.c +++ b/fs/btrfs/check-integrity.c @@ -2674,7 +2674,7 @@ static void __btrfsic_submit_bio(struct bio *bio) mutex_lock(&btrfsic_mutex); /* since btrfsic_submit_bio() is also called before * btrfsic_mount(), this might return NULL */ - dev_state = btrfsic_dev_state_lookup(bio_dev(bio) + bio->bi_partno); + dev_state = btrfsic_dev_state_lookup(bio->bi_bdev->bd_dev); if (NULL != dev_state && (bio_op(bio) == REQ_OP_WRITE) && bio_has_data(bio)) { unsigned int i = 0; @@ -2690,9 +2690,9 @@ static void __btrfsic_submit_bio(struct bio *bio) bio_is_patched = 0; if (dev_state->state->print_mask & BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH) - pr_info("submit_bio(rw=%d,0x%x, bi_vcnt=%u, bi_sector=%llu (bytenr %llu), bi_disk=%p)\n", + pr_info("submit_bio(rw=%d,0x%x, bi_vcnt=%u, bi_sector=%llu (bytenr %llu), bi_bdev=%p)\n", bio_op(bio), bio->bi_opf, segs, - bio->bi_iter.bi_sector, dev_bytenr, bio->bi_disk); + bio->bi_iter.bi_sector, dev_bytenr, bio->bi_bdev); mapped_datav = kmalloc_array(segs, sizeof(*mapped_datav), GFP_NOFS); @@ -2721,8 +2721,8 @@ static void __btrfsic_submit_bio(struct bio *bio) } else if (NULL != dev_state && (bio->bi_opf & REQ_PREFLUSH)) { if (dev_state->state->print_mask & BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH) - pr_info("submit_bio(rw=%d,0x%x FLUSH, disk=%p)\n", - bio_op(bio), bio->bi_opf, bio->bi_disk); + pr_info("submit_bio(rw=%d,0x%x FLUSH, bdev=%p)\n", + bio_op(bio), bio->bi_opf, bio->bi_bdev); if (!dev_state->dummy_block_for_bio_bh_flush.is_iodone) { if ((dev_state->state->print_mask & (BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH | diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 93fbf87bdc8d..b2204a2942cb 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -1105,8 +1105,7 @@ static int rbio_add_io_page(struct btrfs_raid_bio *rbio, * devices or if they are not contiguous */ if (last_end == disk_start && !last->bi_status && - last->bi_disk == 
stripe->dev->bdev->bd_disk && - last->bi_partno == stripe->dev->bdev->bd_partno) { + last->bi_bdev == stripe->dev->bdev) { ret = bio_add_page(last, page, PAGE_SIZE, 0); if (ret == PAGE_SIZE) return 0; @@ -1357,9 +1356,7 @@ static int find_bio_stripe(struct btrfs_raid_bio *rbio, for (i = 0; i < rbio->bbio->num_stripes; i++) { stripe = &rbio->bbio->stripes[i]; if (in_range(physical, stripe->physical, rbio->stripe_len) && - stripe->dev->bdev && - bio->bi_disk == stripe->dev->bdev->bd_disk && - bio->bi_partno == stripe->dev->bdev->bd_partno) { + stripe->dev->bdev && bio->bi_bdev == stripe->dev->bdev) { return i; } } diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 5f4f88a4d2c8..33f8f0f108bf 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -1695,7 +1695,7 @@ static void scrub_wr_submit(struct scrub_ctx *sctx) sbio = sctx->wr_curr_bio; sctx->wr_curr_bio = NULL; - WARN_ON(!sbio->bio->bi_disk); + WARN_ON(!sbio->bio->bi_bdev); scrub_pending_bio_inc(sctx); /* process all writes in a single worker thread. Then the block layer * orders the requests before sending them to the driver which diff --git a/fs/direct-io.c b/fs/direct-io.c index d53fa92a1ab6..2660e744da2d 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -434,7 +434,7 @@ static inline void dio_bio_submit(struct dio *dio, struct dio_submit *sdio) if (dio->is_async && dio->op == REQ_OP_READ && dio->should_dirty) bio_set_pages_dirty(bio); - dio->bio_disk = bio->bi_disk; + dio->bio_disk = bio->bi_bdev->bd_disk; if (sdio->submit_io) { sdio->submit_io(bio, dio->inode, sdio->logical_offset_in_bio); diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index aa34d620bec9..8cbf03159752 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -427,16 +427,6 @@ int f2fs_target_device_index(struct f2fs_sb_info *sbi, block_t blkaddr) return 0; } -/* - * Return true, if pre_bio's bdev is same as its target device. 
- */ -static bool __same_bdev(struct f2fs_sb_info *sbi, - block_t blk_addr, struct bio *bio) -{ - struct block_device *b = f2fs_target_device(sbi, blk_addr, NULL); - return bio->bi_disk == b->bd_disk && bio->bi_partno == b->bd_partno; -} - static struct bio *__bio_alloc(struct f2fs_io_info *fio, int npages) { struct f2fs_sb_info *sbi = fio->sbi; @@ -741,7 +731,7 @@ static bool page_is_mergeable(struct f2fs_sb_info *sbi, struct bio *bio, return false; if (last_blkaddr + 1 != cur_blkaddr) return false; - return __same_bdev(sbi, cur_blkaddr, bio); + return bio->bi_bdev == f2fs_target_device(sbi, cur_blkaddr, NULL); } static bool io_type_is_mergeable(struct f2fs_bio_info *io, diff --git a/include/linux/bio.h b/include/linux/bio.h index 1edda614f7ce..12af7aa5db37 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -483,24 +483,22 @@ extern void bvec_free(mempool_t *, struct bio_vec *, unsigned int); extern unsigned int bvec_nr_vecs(unsigned short idx); extern const char *bio_devname(struct bio *bio, char *buffer); -#define bio_set_dev(bio, bdev) \ -do { \ - if ((bio)->bi_disk != (bdev)->bd_disk) \ - bio_clear_flag(bio, BIO_THROTTLED);\ - (bio)->bi_disk = (bdev)->bd_disk; \ - (bio)->bi_partno = (bdev)->bd_partno; \ - bio_associate_blkg(bio); \ +#define bio_set_dev(bio, bdev) \ +do { \ + if ((bio)->bi_bdev != (bdev)) \ + bio_clear_flag(bio, BIO_THROTTLED); \ + (bio)->bi_bdev = (bdev); \ + bio_associate_blkg(bio); \ } while (0) #define bio_copy_dev(dst, src) \ do { \ - (dst)->bi_disk = (src)->bi_disk; \ - (dst)->bi_partno = (src)->bi_partno; \ + (dst)->bi_bdev = (src)->bi_bdev; \ bio_clone_blkg_association(dst, src); \ } while (0) #define bio_dev(bio) \ - disk_devt((bio)->bi_disk) + disk_devt((bio)->bi_bdev->bd_disk) #ifdef CONFIG_BLK_CGROUP void bio_associate_blkg(struct bio *bio); diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index d705b174d346..6b410dab48ee 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -602,8 +602,8 @@ static inline void blk_rq_bio_prep(struct request *rq, struct bio *bio, rq->bio = rq->biotail = bio; rq->ioprio = bio_prio(bio); - if (bio->bi_disk) - rq->rq_disk = bio->bi_disk; + if (bio->bi_bdev) + rq->rq_disk = bio->bi_bdev->bd_disk; } blk_qc_t blk_mq_submit_bio(struct bio *bio); diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 866f74261b3b..8ebd8be3e050 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -222,7 +222,7 @@ static inline void bio_issue_init(struct bio_issue *issue, */ struct bio { struct bio *bi_next; /* request queue link */ - struct gendisk *bi_disk; + struct block_device *bi_bdev; unsigned int bi_opf; /* bottom bits req flags, * top bits REQ_OP. Use * accessors. 
@@ -231,7 +231,6 @@ struct bio { unsigned short bi_ioprio; unsigned short bi_write_hint; blk_status_t bi_status; - u8 bi_partno; atomic_t __bi_remaining; struct bvec_iter bi_iter; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index f94ee3089e01..b55bd534b2e1 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1967,7 +1967,8 @@ void part_end_io_acct(struct block_device *part, struct bio *bio, */ static inline unsigned long bio_start_io_acct(struct bio *bio) { - return disk_start_io_acct(bio->bi_disk, bio_sectors(bio), bio_op(bio)); + return disk_start_io_acct(bio->bi_bdev->bd_disk, bio_sectors(bio), + bio_op(bio)); } /** @@ -1977,7 +1978,7 @@ static inline unsigned long bio_start_io_acct(struct bio *bio) */ static inline void bio_end_io_acct(struct bio *bio, unsigned long start_time) { - return disk_end_io_acct(bio->bi_disk, bio_op(bio), start_time); + return disk_end_io_acct(bio->bi_bdev->bd_disk, bio_op(bio), start_time); } int bdev_read_only(struct block_device *bdev); diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index fb0fe4c66b84..9e9ee4945043 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -903,7 +903,7 @@ static void blk_add_trace_bio(struct request_queue *q, struct bio *bio, static void blk_add_trace_bio_bounce(void *ignore, struct bio *bio) { - blk_add_trace_bio(bio->bi_disk->queue, bio, BLK_TA_BOUNCE, 0); + blk_add_trace_bio(bio->bi_bdev->bd_disk->queue, bio, BLK_TA_BOUNCE, 0); } static void blk_add_trace_bio_complete(void *ignore, @@ -915,22 +915,24 @@ static void blk_add_trace_bio_complete(void *ignore, static void blk_add_trace_bio_backmerge(void *ignore, struct bio *bio) { - blk_add_trace_bio(bio->bi_disk->queue, bio, BLK_TA_BACKMERGE, 0); + blk_add_trace_bio(bio->bi_bdev->bd_disk->queue, bio, BLK_TA_BACKMERGE, + 0); } static void blk_add_trace_bio_frontmerge(void *ignore, struct bio *bio) { - blk_add_trace_bio(bio->bi_disk->queue, bio, BLK_TA_FRONTMERGE, 0); + blk_add_trace_bio(bio->bi_bdev->bd_disk->queue, bio, BLK_TA_FRONTMERGE, + 0); } static void blk_add_trace_bio_queue(void *ignore, struct bio *bio) { - blk_add_trace_bio(bio->bi_disk->queue, bio, BLK_TA_QUEUE, 0); + blk_add_trace_bio(bio->bi_bdev->bd_disk->queue, bio, BLK_TA_QUEUE, 0); } static void blk_add_trace_getrq(void *ignore, struct bio *bio) { - blk_add_trace_bio(bio->bi_disk->queue, bio, BLK_TA_GETRQ, 0); + blk_add_trace_bio(bio->bi_bdev->bd_disk->queue, bio, BLK_TA_GETRQ, 0); } static void blk_add_trace_plug(void *ignore, struct request_queue *q) @@ -967,7 +969,7 @@ static void blk_add_trace_unplug(void *ignore, struct request_queue *q, static void blk_add_trace_split(void *ignore, struct bio *bio, unsigned int pdu) { - struct request_queue *q = bio->bi_disk->queue; + struct request_queue *q = bio->bi_bdev->bd_disk->queue; struct blk_trace *bt; rcu_read_lock(); @@ -997,7 +999,7 @@ static void blk_add_trace_split(void *ignore, struct bio *bio, unsigned int pdu) static void blk_add_trace_bio_remap(void *ignore, struct bio *bio, dev_t dev, sector_t from) { - struct request_queue *q = bio->bi_disk->queue; + struct request_queue *q = bio->bi_bdev->bd_disk->queue; struct blk_trace *bt; struct blk_io_trace_remap r; diff --git a/mm/page_io.c b/mm/page_io.c index 9bca17ecc4df..a75f35464a4e 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -433,7 +433,7 @@ int swap_readpage(struct page *page, bool synchronous) ret = -ENOMEM; goto out; } - disk = bio->bi_disk; + disk = bio->bi_bdev->bd_disk; /* * Keep this task valid during swap readpage because the oom 
killer may * attempt to access it in the page fault retry time check. -- cgit v1.2.3 From 08d60e5999540110576e7c1346d486220751b7f9 Mon Sep 17 00:00:00 2001 From: John Ogness Date: Sun, 24 Jan 2021 21:33:28 +0106 Subject: printk: fix string termination for record_print_text() Commit f0e386ee0c0b ("printk: fix buffer overflow potential for print_text()") added string termination in record_print_text(). However it used the wrong base pointer for adding the terminator. This led to a 0-byte being written somewhere beyond the buffer. Use the correct base pointer when adding the terminator. Fixes: f0e386ee0c0b ("printk: fix buffer overflow potential for print_text()") Reported-by: Sven Schnelle Signed-off-by: John Ogness Signed-off-by: Petr Mladek Link: https://lore.kernel.org/r/20210124202728.4718-1-john.ogness@linutronix.de --- kernel/printk/printk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 17fa6dc77053..c55cd1820689 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -1399,7 +1399,7 @@ static size_t record_print_text(struct printk_record *r, bool syslog, * not counted in the return value. */ if (buf_size > 0) - text[len] = 0; + r->text_buf[len] = 0; return len; } -- cgit v1.2.3 From 56c91a18432b631ca18438841fd1831ef756cabf Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Fri, 22 Jan 2021 15:42:14 +0800 Subject: kernel: kexec: remove the lock operation of system_transition_mutex Function kernel_kexec() is called with lock system_transition_mutex held in reboot system call. While inside kernel_kexec(), it will acquire system_transition_mutex agin. This will lead to dead lock. The dead lock should be easily triggered, it hasn't caused any failure report just because the feature 'kexec jump' is almost not used by anyone as far as I know. An inquiry can be made about who is using 'kexec jump' and where it's used. Before that, let's simply remove the lock operation inside CONFIG_KEXEC_JUMP ifdeffery scope. Fixes: 55f2503c3b69 ("PM / reboot: Eliminate race between reboot and suspend") Signed-off-by: Baoquan He Reported-by: Dan Carpenter Reviewed-by: Pingfan Liu Cc: 4.19+ # 4.19+ Signed-off-by: Rafael J. Wysocki --- kernel/kexec_core.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index 4f8efc278aa7..aa919585c24b 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -1134,7 +1134,6 @@ int kernel_kexec(void) #ifdef CONFIG_KEXEC_JUMP if (kexec_image->preserve_context) { - lock_system_sleep(); pm_prepare_console(); error = freeze_processes(); if (error) { @@ -1197,7 +1196,6 @@ int kernel_kexec(void) thaw_processes(); Restore_console: pm_restore_console(); - unlock_system_sleep(); } #endif -- cgit v1.2.3 From fef9c8d28e28a808274a18fbd8cc2685817fd62a Mon Sep 17 00:00:00 2001 From: Laurent Badel Date: Fri, 22 Jan 2021 17:19:41 +0100 Subject: PM: hibernate: flush swap writer after marking MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Flush the swap writer after, not before, marking the files, to ensure the signature is properly written. Fixes: 6f612af57821 ("PM / Hibernate: Group swap ops") Signed-off-by: Laurent Badel Cc: All applicable Signed-off-by: Rafael J. 
Wysocki --- kernel/power/swap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/swap.c b/kernel/power/swap.c index c73f2e295167..72e33054a2e1 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -497,10 +497,10 @@ static int swap_writer_finish(struct swap_map_handle *handle, unsigned int flags, int error) { if (!error) { - flush_swap_writer(handle); pr_info("S"); error = mark_swapfiles(handle, flags); pr_cont("|\n"); + flush_swap_writer(handle); } if (error) -- cgit v1.2.3 From 12bb3f7f1b03d5913b3f9d4236a488aa7774dfe9 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 20 Jan 2021 16:00:24 +0100 Subject: futex: Ensure the correct return value from futex_lock_pi() In case that futex_lock_pi() was aborted by a signal or a timeout and the task returned without acquiring the rtmutex, but is the designated owner of the futex due to a concurrent futex_unlock_pi() fixup_owner() is invoked to establish consistent state. In that case it invokes fixup_pi_state_owner() which in turn tries to acquire the rtmutex again. If that succeeds then it does not propagate this success to fixup_owner() and futex_lock_pi() returns -EINTR or -ETIMEOUT despite having the futex locked. Return success from fixup_pi_state_owner() in all cases where the current task owns the rtmutex and therefore the futex and propagate it correctly through fixup_owner(). Fixup the other callsite which does not expect a positive return value. Fixes: c1e2f0eaf015 ("futex: Avoid violating the 10th rule of futex") Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Cc: stable@vger.kernel.org --- kernel/futex.c | 31 ++++++++++++++++--------------- 1 file changed, 16 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index c47d1015d759..d5e61c2e865e 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -2373,8 +2373,8 @@ retry: } if (__rt_mutex_futex_trylock(&pi_state->pi_mutex)) { - /* We got the lock after all, nothing to fix. */ - ret = 0; + /* We got the lock. pi_state is correct. Tell caller. */ + ret = 1; goto out_unlock; } @@ -2402,7 +2402,7 @@ retry: * We raced against a concurrent self; things are * already fixed up. Nothing to do. */ - ret = 0; + ret = 1; goto out_unlock; } newowner = argowner; @@ -2448,7 +2448,7 @@ retry: raw_spin_unlock(&newowner->pi_lock); raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); - return 0; + return argowner == current; /* * In order to reschedule or handle a page fault, we need to drop the @@ -2490,7 +2490,7 @@ handle_err: * Check if someone else fixed it for us: */ if (pi_state->owner != oldowner) { - ret = 0; + ret = argowner == current; goto out_unlock; } @@ -2523,8 +2523,6 @@ static long futex_wait_restart(struct restart_block *restart); */ static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked) { - int ret = 0; - if (locked) { /* * Got the lock. We might not be the anticipated owner if we @@ -2535,8 +2533,8 @@ static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked) * stable state, anything else needs more attention. */ if (q->pi_state->owner != current) - ret = fixup_pi_state_owner(uaddr, q, current); - return ret ? ret : locked; + return fixup_pi_state_owner(uaddr, q, current); + return 1; } /* @@ -2547,10 +2545,8 @@ static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked) * Another speculative read; pi_state->owner == current is unstable * but needs our attention. 
*/ - if (q->pi_state->owner == current) { - ret = fixup_pi_state_owner(uaddr, q, NULL); - return ret; - } + if (q->pi_state->owner == current) + return fixup_pi_state_owner(uaddr, q, NULL); /* * Paranoia check. If we did not take the lock, then we should not be @@ -2563,7 +2559,7 @@ static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked) q->pi_state->owner); } - return ret; + return 0; } /** @@ -3261,7 +3257,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, if (q.pi_state && (q.pi_state->owner != current)) { spin_lock(q.lock_ptr); ret = fixup_pi_state_owner(uaddr2, &q, current); - if (ret && rt_mutex_owner(&q.pi_state->pi_mutex) == current) { + if (ret < 0 && rt_mutex_owner(&q.pi_state->pi_mutex) == current) { pi_state = q.pi_state; get_pi_state(pi_state); } @@ -3271,6 +3267,11 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, */ put_pi_state(q.pi_state); spin_unlock(q.lock_ptr); + /* + * Adjust the return value. It's either -EFAULT or + * success (1) but the caller expects 0 for success. + */ + ret = ret < 0 ? ret : 0; } } else { struct rt_mutex *pi_mutex; -- cgit v1.2.3 From 04b79c55201f02ffd675e1231d731365e335c307 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 19 Jan 2021 16:06:10 +0100 Subject: futex: Replace pointless printk in fixup_owner() If that unexpected case of inconsistent arguments ever happens then the futex state is left completely inconsistent and the printk is not really helpful. Replace it with a warning and make the state consistent. Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Cc: stable@vger.kernel.org --- kernel/futex.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index d5e61c2e865e..5dc8f893d523 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -2550,14 +2550,10 @@ static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked) /* * Paranoia check. If we did not take the lock, then we should not be - * the owner of the rt_mutex. + * the owner of the rt_mutex. Warn and establish consistent state. */ - if (rt_mutex_owner(&q->pi_state->pi_mutex) == current) { - printk(KERN_ERR "fixup_owner: ret = %d pi-mutex: %p " - "pi-state %p\n", ret, - q->pi_state->pi_mutex.owner, - q->pi_state->owner); - } + if (WARN_ON_ONCE(rt_mutex_owner(&q->pi_state->pi_mutex) == current)) + return fixup_pi_state_owner(uaddr, q, current); return 0; } -- cgit v1.2.3 From c5cade200ab9a2a3be9e7f32a752c8d86b502ec7 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 19 Jan 2021 15:21:35 +0100 Subject: futex: Provide and use pi_state_update_owner() Updating pi_state::owner is done at several places with the same code. Provide a function for it and use that at the obvious places. This is also a preparation for a bug fix to avoid yet another copy of the same code or alternatively introducing a completely unpenetratable mess of gotos. 
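As a rough illustration only (stand-in userspace types, not the kernel's pi_lock/wait_lock machinery, and with simplified NULL-owner handling compared to the helper in the diff below), the consolidation amounts to putting the "detach from the old owner's list, attach to the new owner's list, update the back-pointer" sequence behind one function:

#include <assert.h>
#include <stddef.h>
#include <stdio.h>

struct owner;

struct state {			/* stand-in for futex_pi_state */
	struct owner *owner;	/* back-pointer, like pi_state->owner */
	struct state *next;	/* simplistic list linkage */
};

struct owner {			/* stand-in for task_struct */
	const char *name;
	struct state *states;	/* head of its per-owner state list */
};

static void owner_list_del(struct owner *o, struct state *s)
{
	struct state **pp = &o->states;

	while (*pp && *pp != s)
		pp = &(*pp)->next;
	assert(*pp);		/* analogous to the WARN_ON() checks */
	*pp = s->next;
	s->next = NULL;
}

static void owner_list_add(struct owner *o, struct state *s)
{
	s->next = o->states;
	o->states = s;
}

/* One helper instead of several open-coded copies; both halves optional. */
static void state_update_owner(struct state *s, struct owner *new_owner)
{
	if (s->owner)
		owner_list_del(s->owner, s);
	if (new_owner)
		owner_list_add(new_owner, s);
	s->owner = new_owner;
}

int main(void)
{
	struct owner a = { "a", NULL }, b = { "b", NULL };
	struct state st = { NULL, NULL };

	state_update_owner(&st, &a);	/* first owner */
	state_update_owner(&st, &b);	/* hand-over */
	state_update_owner(&st, NULL);	/* orphan the state */
	printf("final owner: %s\n", st.owner ? st.owner->name : "(none)");
	return 0;
}

Every call site then shrinks to a single call of the helper, which is what the patch below does with pi_state_update_owner(); the real helper additionally asserts that pi_mutex.wait_lock is held.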
Originally-by: Peter Zijlstra Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Cc: stable@vger.kernel.org --- kernel/futex.c | 66 +++++++++++++++++++++++++++++----------------------------- 1 file changed, 33 insertions(+), 33 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 5dc8f893d523..7837f9e561fa 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -763,6 +763,29 @@ static struct futex_pi_state *alloc_pi_state(void) return pi_state; } +static void pi_state_update_owner(struct futex_pi_state *pi_state, + struct task_struct *new_owner) +{ + struct task_struct *old_owner = pi_state->owner; + + lockdep_assert_held(&pi_state->pi_mutex.wait_lock); + + if (old_owner) { + raw_spin_lock(&old_owner->pi_lock); + WARN_ON(list_empty(&pi_state->list)); + list_del_init(&pi_state->list); + raw_spin_unlock(&old_owner->pi_lock); + } + + if (new_owner) { + raw_spin_lock(&new_owner->pi_lock); + WARN_ON(!list_empty(&pi_state->list)); + list_add(&pi_state->list, &new_owner->pi_state_list); + pi_state->owner = new_owner; + raw_spin_unlock(&new_owner->pi_lock); + } +} + static void get_pi_state(struct futex_pi_state *pi_state) { WARN_ON_ONCE(!refcount_inc_not_zero(&pi_state->refcount)); @@ -1521,26 +1544,15 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_pi_state *pi_ ret = -EINVAL; } - if (ret) - goto out_unlock; - - /* - * This is a point of no return; once we modify the uval there is no - * going back and subsequent operations must not fail. - */ - - raw_spin_lock(&pi_state->owner->pi_lock); - WARN_ON(list_empty(&pi_state->list)); - list_del_init(&pi_state->list); - raw_spin_unlock(&pi_state->owner->pi_lock); - - raw_spin_lock(&new_owner->pi_lock); - WARN_ON(!list_empty(&pi_state->list)); - list_add(&pi_state->list, &new_owner->pi_state_list); - pi_state->owner = new_owner; - raw_spin_unlock(&new_owner->pi_lock); - - postunlock = __rt_mutex_futex_unlock(&pi_state->pi_mutex, &wake_q); + if (!ret) { + /* + * This is a point of no return; once we modified the uval + * there is no going back and subsequent operations must + * not fail. + */ + pi_state_update_owner(pi_state, new_owner); + postunlock = __rt_mutex_futex_unlock(&pi_state->pi_mutex, &wake_q); + } out_unlock: raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); @@ -2433,19 +2445,7 @@ retry: * We fixed up user space. Now we need to fix the pi_state * itself. */ - if (pi_state->owner != NULL) { - raw_spin_lock(&pi_state->owner->pi_lock); - WARN_ON(list_empty(&pi_state->list)); - list_del_init(&pi_state->list); - raw_spin_unlock(&pi_state->owner->pi_lock); - } - - pi_state->owner = newowner; - - raw_spin_lock(&newowner->pi_lock); - WARN_ON(!list_empty(&pi_state->list)); - list_add(&pi_state->list, &newowner->pi_state_list); - raw_spin_unlock(&newowner->pi_lock); + pi_state_update_owner(pi_state, newowner); raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); return argowner == current; -- cgit v1.2.3 From 2156ac1934166d6deb6cd0f6ffc4c1076ec63697 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 20 Jan 2021 11:32:07 +0100 Subject: rtmutex: Remove unused argument from rt_mutex_proxy_unlock() Nothing uses the argument. Remove it as preparation to use pi_state_update_owner(). 
Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Cc: stable@vger.kernel.org --- kernel/futex.c | 2 +- kernel/locking/rtmutex.c | 3 +-- kernel/locking/rtmutex_common.h | 3 +-- 3 files changed, 3 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 7837f9e561fa..cfca221c1bc5 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -818,7 +818,7 @@ static void put_pi_state(struct futex_pi_state *pi_state) list_del_init(&pi_state->list); raw_spin_unlock(&owner->pi_lock); } - rt_mutex_proxy_unlock(&pi_state->pi_mutex, owner); + rt_mutex_proxy_unlock(&pi_state->pi_mutex); raw_spin_unlock_irqrestore(&pi_state->pi_mutex.wait_lock, flags); } diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index cfdd5b93264d..2f8cd616d3b2 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -1716,8 +1716,7 @@ void rt_mutex_init_proxy_locked(struct rt_mutex *lock, * possible because it belongs to the pi_state which is about to be freed * and it is not longer visible to other tasks. */ -void rt_mutex_proxy_unlock(struct rt_mutex *lock, - struct task_struct *proxy_owner) +void rt_mutex_proxy_unlock(struct rt_mutex *lock) { debug_rt_mutex_proxy_unlock(lock); rt_mutex_set_owner(lock, NULL); diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h index d1d62f942be2..ca6fb489007b 100644 --- a/kernel/locking/rtmutex_common.h +++ b/kernel/locking/rtmutex_common.h @@ -133,8 +133,7 @@ enum rtmutex_chainwalk { extern struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock); extern void rt_mutex_init_proxy_locked(struct rt_mutex *lock, struct task_struct *proxy_owner); -extern void rt_mutex_proxy_unlock(struct rt_mutex *lock, - struct task_struct *proxy_owner); +extern void rt_mutex_proxy_unlock(struct rt_mutex *lock); extern void rt_mutex_init_waiter(struct rt_mutex_waiter *waiter); extern int __rt_mutex_start_proxy_lock(struct rt_mutex *lock, struct rt_mutex_waiter *waiter, -- cgit v1.2.3 From 6ccc84f917d33312eb2846bd7b567639f585ad6d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 20 Jan 2021 11:35:19 +0100 Subject: futex: Use pi_state_update_owner() in put_pi_state() No point in open coding it. This way it gains the extra sanity checks. Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Cc: stable@vger.kernel.org --- kernel/futex.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index cfca221c1bc5..a0fe63c34248 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -808,16 +808,10 @@ static void put_pi_state(struct futex_pi_state *pi_state) * and has cleaned up the pi_state already */ if (pi_state->owner) { - struct task_struct *owner; unsigned long flags; raw_spin_lock_irqsave(&pi_state->pi_mutex.wait_lock, flags); - owner = pi_state->owner; - if (owner) { - raw_spin_lock(&owner->pi_lock); - list_del_init(&pi_state->list); - raw_spin_unlock(&owner->pi_lock); - } + pi_state_update_owner(pi_state, NULL); rt_mutex_proxy_unlock(&pi_state->pi_mutex); raw_spin_unlock_irqrestore(&pi_state->pi_mutex.wait_lock, flags); } -- cgit v1.2.3 From f2dac39d93987f7de1e20b3988c8685523247ae2 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 19 Jan 2021 16:26:38 +0100 Subject: futex: Simplify fixup_pi_state_owner() Too many gotos already and an upcoming fix would make it even more unreadable. 
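The reshaping below moves the body into a lock-held __fixup_pi_state_owner() and leaves fixup_pi_state_owner() as a thin wrapper that takes and drops pi_mutex.wait_lock, so every exit path can simply return instead of branching to an out_unlock label. A minimal userspace sketch of that wrapper-around-locked-helper shape (the pthread mutex and the do_fixup()/__do_fixup() names are stand-ins, not kernel code):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t wait_lock = PTHREAD_MUTEX_INITIALIZER;

/* Lock-held body: may return from any branch, no goto/out_unlock needed. */
static int __do_fixup(int state)
{
	if (state == 0)
		return 0;	/* nothing to do */
	if (state == 1)
		return 1;	/* already fixed up */
	return -1;		/* report an error */
}

/* Thin wrapper owns the lock/unlock pairing in exactly one place. */
static int do_fixup(int state)
{
	int ret;

	pthread_mutex_lock(&wait_lock);
	ret = __do_fixup(state);
	pthread_mutex_unlock(&wait_lock);
	return ret;
}

int main(void)
{
	for (int s = 0; s <= 2; s++)
		printf("do_fixup(%d) = %d\n", s, do_fixup(s));
	return 0;
}

Keeping the lock/unlock pairing in the wrapper is what lets the helper return early from any branch without the goto chains the commit message complains about.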
Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Cc: stable@vger.kernel.org --- kernel/futex.c | 53 ++++++++++++++++++++++++++--------------------------- 1 file changed, 26 insertions(+), 27 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index a0fe63c34248..7a38ead96cae 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -2329,18 +2329,13 @@ static void unqueue_me_pi(struct futex_q *q) spin_unlock(q->lock_ptr); } -static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, - struct task_struct *argowner) +static int __fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, + struct task_struct *argowner) { struct futex_pi_state *pi_state = q->pi_state; - u32 uval, curval, newval; struct task_struct *oldowner, *newowner; - u32 newtid; - int ret, err = 0; - - lockdep_assert_held(q->lock_ptr); - - raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); + u32 uval, curval, newval, newtid; + int err = 0; oldowner = pi_state->owner; @@ -2374,14 +2369,12 @@ retry: * We raced against a concurrent self; things are * already fixed up. Nothing to do. */ - ret = 0; - goto out_unlock; + return 0; } if (__rt_mutex_futex_trylock(&pi_state->pi_mutex)) { /* We got the lock. pi_state is correct. Tell caller. */ - ret = 1; - goto out_unlock; + return 1; } /* @@ -2408,8 +2401,7 @@ retry: * We raced against a concurrent self; things are * already fixed up. Nothing to do. */ - ret = 1; - goto out_unlock; + return 1; } newowner = argowner; } @@ -2440,7 +2432,6 @@ retry: * itself. */ pi_state_update_owner(pi_state, newowner); - raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); return argowner == current; @@ -2463,17 +2454,16 @@ handle_err: switch (err) { case -EFAULT: - ret = fault_in_user_writeable(uaddr); + err = fault_in_user_writeable(uaddr); break; case -EAGAIN: cond_resched(); - ret = 0; + err = 0; break; default: WARN_ON_ONCE(1); - ret = err; break; } @@ -2483,17 +2473,26 @@ handle_err: /* * Check if someone else fixed it for us: */ - if (pi_state->owner != oldowner) { - ret = argowner == current; - goto out_unlock; - } + if (pi_state->owner != oldowner) + return argowner == current; - if (ret) - goto out_unlock; + /* Retry if err was -EAGAIN or the fault in succeeded */ + if (!err) + goto retry; - goto retry; + return err; +} -out_unlock: +static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, + struct task_struct *argowner) +{ + struct futex_pi_state *pi_state = q->pi_state; + int ret; + + lockdep_assert_held(q->lock_ptr); + + raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); + ret = __fixup_pi_state_owner(uaddr, q, argowner); raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); return ret; } -- cgit v1.2.3 From 34b1a1ce1458f50ef27c54e28eb9b1947012907a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 18 Jan 2021 19:01:21 +0100 Subject: futex: Handle faults correctly for PI futexes fixup_pi_state_owner() tries to ensure that the state of the rtmutex, pi_state and the user space value related to the PI futex are consistent before returning to user space. In case that the user space value update faults and the fault cannot be resolved by faulting the page in via fault_in_user_writeable() the function returns with -EFAULT and leaves the rtmutex and pi_state owner state inconsistent. A subsequent futex_unlock_pi() operates on the inconsistent pi_state and releases the rtmutex despite not owning it which can corrupt the RB tree of the rtmutex and cause a subsequent kernel stack use after free. 
It was suggested to loop forever in fixup_pi_state_owner() if the fault cannot be resolved, but that results in runaway tasks which is especially undesired when the problem happens due to a programming error and not due to malice. As the user space value cannot be fixed up, the proper solution is to make the rtmutex and the pi_state consistent so both have the same owner. This leaves the user space value out of sync. Any subsequent operation on the futex will fail because the 10th rule of PI futexes (pi_state owner and user space value are consistent) has been violated. As a consequence this removes the inept attempts of 'fixing' the situation in case that the current task owns the rtmutex when returning with an unresolvable fault by unlocking the rtmutex which left pi_state::owner and rtmutex::owner out of sync in a different and only slightly less dangerous way. Fixes: 1b7558e457ed ("futexes: fix fault handling in futex_lock_pi") Reported-by: gzobqq@gmail.com Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Cc: stable@vger.kernel.org --- kernel/futex.c | 57 ++++++++++++++++++++------------------------------------- 1 file changed, 20 insertions(+), 37 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 7a38ead96cae..45a13eb8894e 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -958,7 +958,8 @@ static inline void exit_pi_state_list(struct task_struct *curr) { } * FUTEX_OWNER_DIED bit. See [4] * * [10] There is no transient state which leaves owner and user space - * TID out of sync. + * TID out of sync. Except one error case where the kernel is denied + * write access to the user address, see fixup_pi_state_owner(). * * * Serialization and lifetime rules: @@ -2480,6 +2481,24 @@ handle_err: if (!err) goto retry; + /* + * fault_in_user_writeable() failed so user state is immutable. At + * best we can make the kernel state consistent but user state will + * be most likely hosed and any subsequent unlock operation will be + * rejected due to PI futex rule [10]. + * + * Ensure that the rtmutex owner is also the pi_state owner despite + * the user space value claiming something different. There is no + * point in unlocking the rtmutex if current is the owner as it + * would need to wait until the next waiter has taken the rtmutex + * to guarantee consistent state. Keep it simple. Userspace asked + * for this wreckaged state. + * + * The rtmutex has an owner - either current or some other + * task. See the EAGAIN loop above. + */ + pi_state_update_owner(pi_state, rt_mutex_owner(&pi_state->pi_mutex)); + return err; } @@ -2756,7 +2775,6 @@ static int futex_lock_pi(u32 __user *uaddr, unsigned int flags, ktime_t *time, int trylock) { struct hrtimer_sleeper timeout, *to; - struct futex_pi_state *pi_state = NULL; struct task_struct *exiting = NULL; struct rt_mutex_waiter rt_waiter; struct futex_hash_bucket *hb; @@ -2892,23 +2910,8 @@ no_block: if (res) ret = (res < 0) ? res : 0; - /* - * If fixup_owner() faulted and was unable to handle the fault, unlock - * it and return the fault to userspace. 
- */ - if (ret && (rt_mutex_owner(&q.pi_state->pi_mutex) == current)) { - pi_state = q.pi_state; - get_pi_state(pi_state); - } - /* Unqueue and drop the lock */ unqueue_me_pi(&q); - - if (pi_state) { - rt_mutex_futex_unlock(&pi_state->pi_mutex); - put_pi_state(pi_state); - } - goto out; out_unlock_put_key: @@ -3168,7 +3171,6 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, u32 __user *uaddr2) { struct hrtimer_sleeper timeout, *to; - struct futex_pi_state *pi_state = NULL; struct rt_mutex_waiter rt_waiter; struct futex_hash_bucket *hb; union futex_key key2 = FUTEX_KEY_INIT; @@ -3246,10 +3248,6 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, if (q.pi_state && (q.pi_state->owner != current)) { spin_lock(q.lock_ptr); ret = fixup_pi_state_owner(uaddr2, &q, current); - if (ret < 0 && rt_mutex_owner(&q.pi_state->pi_mutex) == current) { - pi_state = q.pi_state; - get_pi_state(pi_state); - } /* * Drop the reference to the pi state which * the requeue_pi() code acquired for us. @@ -3291,25 +3289,10 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, if (res) ret = (res < 0) ? res : 0; - /* - * If fixup_pi_state_owner() faulted and was unable to handle - * the fault, unlock the rt_mutex and return the fault to - * userspace. - */ - if (ret && rt_mutex_owner(&q.pi_state->pi_mutex) == current) { - pi_state = q.pi_state; - get_pi_state(pi_state); - } - /* Unqueue and drop the lock. */ unqueue_me_pi(&q); } - if (pi_state) { - rt_mutex_futex_unlock(&pi_state->pi_mutex); - put_pi_state(pi_state); - } - if (ret == -EINTR) { /* * We've already been requeued, but cannot restart by calling -- cgit v1.2.3 From 78031381ae9c88f4f914d66154f4745122149c58 Mon Sep 17 00:00:00 2001 From: Mikko Ylinen Date: Mon, 25 Jan 2021 08:39:36 +0200 Subject: bpf: Drop disabled LSM hooks from the sleepable set Some networking and keys LSM hooks are conditionally enabled and when building the new sleepable BPF LSM hooks with those LSM hooks disabled, the following build error occurs: BTFIDS vmlinux FAILED unresolved symbol bpf_lsm_socket_socketpair To fix the error, conditionally add the relevant networking/keys LSM hooks to the sleepable set. 
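The underlying rule is generic: an entry in a table must be compiled under the same conditions as the symbol it names, or the reference cannot be resolved at build time. A small userspace analogy (a plain function-pointer table with CONFIG_FOO and hook_foo()/hook_bar() as stand-ins; the kernel's BTF_ID() machinery works differently but fails for the same reason):

#include <stdio.h>

#define CONFIG_FOO 1	/* stand-in for CONFIG_SECURITY_NETWORK or CONFIG_KEYS */

#ifdef CONFIG_FOO
static int hook_foo(void) { return 42; }	/* only built when enabled */
#endif

static int hook_bar(void) { return 7; }		/* always built */

/*
 * The table entry for hook_foo must sit under the same #ifdef as the
 * function itself: an unguarded entry with CONFIG_FOO disabled would
 * reference a symbol that was never built and the build would fail.
 */
static int (*const hooks[])(void) = {
#ifdef CONFIG_FOO
	hook_foo,
#endif
	hook_bar,
};

int main(void)
{
	for (unsigned int i = 0; i < sizeof(hooks) / sizeof(hooks[0]); i++)
		printf("hook[%u] -> %d\n", i, hooks[i]());
	return 0;
}

The fix below applies the same idea by wrapping the affected BTF_ID() entries in the identical #ifdefs that guard the LSM hooks themselves.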
Fixes: 423f16108c9d8 ("bpf: Augment the set of sleepable LSM hooks") Signed-off-by: Mikko Ylinen Signed-off-by: Daniel Borkmann Acked-by: KP Singh Link: https://lore.kernel.org/bpf/20210125063936.89365-1-mikko.ylinen@linux.intel.com --- kernel/bpf/bpf_lsm.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c index 70e5e0b6d69d..1622a44d1617 100644 --- a/kernel/bpf/bpf_lsm.c +++ b/kernel/bpf/bpf_lsm.c @@ -149,7 +149,11 @@ BTF_ID(func, bpf_lsm_file_ioctl) BTF_ID(func, bpf_lsm_file_lock) BTF_ID(func, bpf_lsm_file_open) BTF_ID(func, bpf_lsm_file_receive) + +#ifdef CONFIG_SECURITY_NETWORK BTF_ID(func, bpf_lsm_inet_conn_established) +#endif /* CONFIG_SECURITY_NETWORK */ + BTF_ID(func, bpf_lsm_inode_create) BTF_ID(func, bpf_lsm_inode_free_security) BTF_ID(func, bpf_lsm_inode_getattr) @@ -166,7 +170,11 @@ BTF_ID(func, bpf_lsm_inode_symlink) BTF_ID(func, bpf_lsm_inode_unlink) BTF_ID(func, bpf_lsm_kernel_module_request) BTF_ID(func, bpf_lsm_kernfs_init_security) + +#ifdef CONFIG_KEYS BTF_ID(func, bpf_lsm_key_free) +#endif /* CONFIG_KEYS */ + BTF_ID(func, bpf_lsm_mmap_file) BTF_ID(func, bpf_lsm_netlink_send) BTF_ID(func, bpf_lsm_path_notify) @@ -181,6 +189,8 @@ BTF_ID(func, bpf_lsm_sb_show_options) BTF_ID(func, bpf_lsm_sb_statfs) BTF_ID(func, bpf_lsm_sb_umount) BTF_ID(func, bpf_lsm_settime) + +#ifdef CONFIG_SECURITY_NETWORK BTF_ID(func, bpf_lsm_socket_accept) BTF_ID(func, bpf_lsm_socket_bind) BTF_ID(func, bpf_lsm_socket_connect) @@ -195,6 +205,8 @@ BTF_ID(func, bpf_lsm_socket_recvmsg) BTF_ID(func, bpf_lsm_socket_sendmsg) BTF_ID(func, bpf_lsm_socket_shutdown) BTF_ID(func, bpf_lsm_socket_socketpair) +#endif /* CONFIG_SECURITY_NETWORK */ + BTF_ID(func, bpf_lsm_syslog) BTF_ID(func, bpf_lsm_task_alloc) BTF_ID(func, bpf_lsm_task_getsecid) -- cgit v1.2.3 From 150a27328b681425c8cab239894a48f2aeb870e9 Mon Sep 17 00:00:00 2001 From: Quentin Monnet Date: Tue, 26 Jan 2021 16:13:20 +0000 Subject: bpf, preload: Fix build when $(O) points to a relative path Building the kernel with CONFIG_BPF_PRELOAD, and by providing a relative path for the output directory, may fail with the following error: $ make O=build bindeb-pkg ... /.../linux/tools/scripts/Makefile.include:5: *** O=build does not exist. Stop. make[7]: *** [/.../linux/kernel/bpf/preload/Makefile:9: kernel/bpf/preload/libbpf.a] Error 2 make[6]: *** [/.../linux/scripts/Makefile.build:500: kernel/bpf/preload] Error 2 make[5]: *** [/.../linux/scripts/Makefile.build:500: kernel/bpf] Error 2 make[4]: *** [/.../linux/Makefile:1799: kernel] Error 2 make[4]: *** Waiting for unfinished jobs.... In the case above, for the "bindeb-pkg" target, the error is produced by the "dummy" check in Makefile.include, called from libbpf's Makefile. This check changes directory to $(PWD) before checking for the existence of $(O). But at this step we have $(PWD) pointing to "/.../linux/build", and $(O) pointing to "build". So the Makefile.include tries in fact to assert the existence of a directory named "/.../linux/build/build", which does not exist. Note that the error does not occur for all make targets and architectures combinations. This was observed on x86 for "bindeb-pkg", or for a regular build for UML [0]. Here are some details. The root Makefile recursively calls itself once, after changing directory to $(O). The content for the variable $(PWD) is preserved across recursive calls to make, so it is unchanged at this step. 
For "bindeb-pkg", $(PWD) is eventually updated because the target writes a new Makefile (as debian/rules) and calls it indirectly through dpkg-buildpackage. This script does not preserve $(PWD), which is reset to the current working directory when the target in debian/rules is called. Although not investigated, it seems likely that something similar causes UML to change its value for $(PWD). Non-trivial fixes could be to remove the use of $(PWD) from the "dummy" check, or to make sure that $(PWD) and $(O) are preserved or updated to always play well and form a valid $(PWD)/$(O) path across the different targets and architectures. Instead, we take a simpler approach and just update $(O) when calling libbpf's Makefile, so it points to an absolute path which should always resolve for the "dummy" check run (through includes) by that Makefile. David Gow previously posted a slightly different version of this patch as a RFC [0], two months ago or so. [0] https://lore.kernel.org/bpf/20201119085022.3606135-1-davidgow@google.com/t/#u Fixes: d71fa5c9763c ("bpf: Add kernel module with user mode driver that populates bpffs.") Reported-by: David Gow Signed-off-by: Quentin Monnet Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Cc: Brendan Higgins Cc: Masahiro Yamada Link: https://lore.kernel.org/bpf/20210126161320.24561-1-quentin@isovalent.com --- kernel/bpf/preload/Makefile | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/preload/Makefile b/kernel/bpf/preload/Makefile index 23ee310b6eb4..1951332dd15f 100644 --- a/kernel/bpf/preload/Makefile +++ b/kernel/bpf/preload/Makefile @@ -4,8 +4,11 @@ LIBBPF_SRCS = $(srctree)/tools/lib/bpf/ LIBBPF_A = $(obj)/libbpf.a LIBBPF_OUT = $(abspath $(obj)) +# Although not in use by libbpf's Makefile, set $(O) so that the "dummy" test +# in tools/scripts/Makefile.include always succeeds when building the kernel +# with $(O) pointing to a relative path, as in "make O=build bindeb-pkg". 
$(LIBBPF_A): - $(Q)$(MAKE) -C $(LIBBPF_SRCS) OUTPUT=$(LIBBPF_OUT)/ $(LIBBPF_OUT)/libbpf.a + $(Q)$(MAKE) -C $(LIBBPF_SRCS) O=$(LIBBPF_OUT)/ OUTPUT=$(LIBBPF_OUT)/ $(LIBBPF_OUT)/libbpf.a userccflags += -I $(srctree)/tools/include/ -I $(srctree)/tools/include/uapi \ -I $(srctree)/tools/lib/ -Wno-unused-result -- cgit v1.2.3 From d17405d52bacd14fe7fdbb10c0434934ea496914 Mon Sep 17 00:00:00 2001 From: Barry Song Date: Mon, 25 Jan 2021 14:13:06 +1300 Subject: dma-mapping: benchmark: fix kernel crash when dma_map_single fails if dma_map_single() fails, kernel will give the below oops since task_struct has been destroyed and we are running into the memory corruption due to use-after-free in kthread_stop(): [ 48.095310] Unable to handle kernel paging request at virtual address 000000c473548040 [ 48.095736] Mem abort info: [ 48.095864] ESR = 0x96000004 [ 48.096025] EC = 0x25: DABT (current EL), IL = 32 bits [ 48.096268] SET = 0, FnV = 0 [ 48.096401] EA = 0, S1PTW = 0 [ 48.096538] Data abort info: [ 48.096659] ISV = 0, ISS = 0x00000004 [ 48.096820] CM = 0, WnR = 0 [ 48.097079] user pgtable: 4k pages, 48-bit VAs, pgdp=0000000104639000 [ 48.098099] [000000c473548040] pgd=0000000000000000, p4d=0000000000000000 [ 48.098832] Internal error: Oops: 96000004 [#1] PREEMPT SMP [ 48.099232] Modules linked in: [ 48.099387] CPU: 0 PID: 2 Comm: kthreadd Tainted: G W [ 48.099887] Hardware name: linux,dummy-virt (DT) [ 48.100078] pstate: 60000005 (nZCv daif -PAN -UAO -TCO BTYPE=--) [ 48.100516] pc : __kmalloc_node+0x214/0x368 [ 48.100944] lr : __kmalloc_node+0x1f4/0x368 [ 48.101458] sp : ffff800011f0bb80 [ 48.101843] x29: ffff800011f0bb80 x28: ffff0000c0098ec0 [ 48.102330] x27: 0000000000000000 x26: 00000000001d4600 [ 48.102648] x25: ffff0000c0098ec0 x24: ffff800011b6a000 [ 48.102988] x23: 00000000ffffffff x22: ffff0000c0098ec0 [ 48.103333] x21: ffff8000101d7a54 x20: 0000000000000dc0 [ 48.103657] x19: ffff0000c0001e00 x18: 0000000000000000 [ 48.104069] x17: 0000000000000000 x16: 0000000000000000 [ 48.105449] x15: 000001aa0304e7b9 x14: 00000000000003b1 [ 48.106401] x13: ffff8000122d5000 x12: ffff80001228d000 [ 48.107296] x11: ffff0000c0154340 x10: 0000000000000000 [ 48.107862] x9 : ffff80000fffffff x8 : ffff0000c473527f [ 48.108326] x7 : ffff800011e62f58 x6 : ffff0000c01c8ed8 [ 48.108778] x5 : ffff0000c0098ec0 x4 : 0000000000000000 [ 48.109223] x3 : 00000000001d4600 x2 : 0000000000000040 [ 48.109656] x1 : 0000000000000001 x0 : ff0000c473548000 [ 48.110104] Call trace: [ 48.110287] __kmalloc_node+0x214/0x368 [ 48.110493] __vmalloc_node_range+0xc4/0x298 [ 48.110805] copy_process+0x2c8/0x15c8 [ 48.111133] kernel_clone+0x5c/0x3c0 [ 48.111373] kernel_thread+0x64/0x90 [ 48.111604] kthreadd+0x158/0x368 [ 48.111810] ret_from_fork+0x10/0x30 [ 48.112336] Code: 17ffffe9 b9402a62 b94008a1 11000421 (f8626802) [ 48.112884] ---[ end trace d4890e21e75419d5 ]--- Signed-off-by: Barry Song Signed-off-by: Christoph Hellwig --- kernel/dma/map_benchmark.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/dma/map_benchmark.c b/kernel/dma/map_benchmark.c index b1496e744c68..1b1b8ff875cb 100644 --- a/kernel/dma/map_benchmark.c +++ b/kernel/dma/map_benchmark.c @@ -147,8 +147,10 @@ static int do_map_benchmark(struct map_benchmark_data *map) atomic64_set(&map->sum_sq_unmap, 0); atomic64_set(&map->loops, 0); - for (i = 0; i < threads; i++) + for (i = 0; i < threads; i++) { + get_task_struct(tsk[i]); wake_up_process(tsk[i]); + } msleep_interruptible(map->bparam.seconds * 1000); @@ -183,6 +185,8 @@ 
static int do_map_benchmark(struct map_benchmark_data *map) } out: + for (i = 0; i < threads; i++) + put_task_struct(tsk[i]); put_device(map->dev); kfree(tsk); return ret; -- cgit v1.2.3 From 0ae78eec8aa64e645866e75005162603a77a0f49 Mon Sep 17 00:00:00 2001 From: Qais Yousef Date: Tue, 19 Jan 2021 12:07:55 +0000 Subject: sched/eas: Don't update misfit status if the task is pinned If the task is pinned to a cpu, setting the misfit status means that we'll unnecessarily continuously attempt to migrate the task but fail. This continuous failure will cause the balance_interval to increase to a high value, and eventually cause unnecessary significant delays in balancing the system when real imbalance happens. Caught while testing uclamp where rt-app calibration loop was pinned to cpu 0, shortly after which we spawn another task with high util_clamp value. The task was failing to migrate after over 40ms of runtime due to balance_interval unnecessary expanded to a very high value from the calibration loop. Not done here, but it could be useful to extend the check for pinning to verify that the affinity of the task has a cpu that fits. We could end up in a similar situation otherwise. Fixes: 3b1baa6496e6 ("sched/fair: Add 'group_misfit_task' load-balance type") Signed-off-by: Qais Yousef Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Quentin Perret Acked-by: Valentin Schneider Link: https://lkml.kernel.org/r/20210119120755.2425264-1-qais.yousef@arm.com --- kernel/sched/fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 197a51473e0c..9379a481dd8c 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4060,7 +4060,7 @@ static inline void update_misfit_status(struct task_struct *p, struct rq *rq) if (!static_branch_unlikely(&sched_asym_cpucapacity)) return; - if (!p) { + if (!p || p->nr_cpus_allowed == 1) { rq->misfit_task_load = 0; return; } -- cgit v1.2.3 From 620a6dc40754dc218f5b6389b5d335e9a107fd29 Mon Sep 17 00:00:00 2001 From: Valentin Schneider Date: Fri, 22 Jan 2021 12:39:43 +0000 Subject: sched/topology: Make sched_init_numa() use a set for the deduplicating sort MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The deduplicating sort in sched_init_numa() assumes that the first line in the distance table contains all unique values in the entire table. I've been trying to pen what this exactly means for the topology, but it's not straightforward. For instance, topology.c uses this example: node 0 1 2 3 0: 10 20 20 30 1: 20 10 20 20 2: 20 20 10 20 3: 30 20 20 10 0 ----- 1 | / | | / | | / | 2 ----- 3 Which works out just fine. However, if we swap nodes 0 and 1: 1 ----- 0 | / | | / | | / | 2 ----- 3 we get this distance table: node 0 1 2 3 0: 10 20 20 20 1: 20 10 20 30 2: 20 20 10 20 3: 20 30 20 10 Which breaks the deduplicating sort (non-representative first line). In this case this would just be a renumbering exercise, but it so happens that we can have a deduplicating sort that goes through the whole table in O(n²) at the extra cost of a temporary memory allocation (i.e. any form of set). The ACPI spec (SLIT) mentions distances are encoded on 8 bits. Following this, implement the set as a 256-bits bitmap. Should this not be satisfactory (i.e. we want to support 32-bit values), then we'll have to go for some other sparse set implementation. 
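To make the set-based pass concrete, here is a minimal userspace sketch (illustration only, not the kernel code): it walks the swapped example table above with a plain 256-entry membership array, standing in for the 2^8-bit bitmap, and counts the unique distances. node_distance() and the hard-coded table are stand-ins for the kernel helpers.

#include <stdbool.h>
#include <stdio.h>

#define NR_NODES            4
#define NR_DISTANCE_VALUES  256	/* SLIT distances are 8-bit values */

/* Stand-in for the kernel's node_distance(): the "swapped" table above. */
static const int distance_table[NR_NODES][NR_NODES] = {
	{ 10, 20, 20, 20 },
	{ 20, 10, 20, 30 },
	{ 20, 20, 10, 20 },
	{ 20, 30, 20, 10 },
};

static int node_distance(int from, int to)
{
	return distance_table[from][to];
}

int main(void)
{
	bool seen[NR_DISTANCE_VALUES] = { false };
	int nr_levels = 0;

	/* O(n^2) pass over the whole table; no assumption about row 0. */
	for (int i = 0; i < NR_NODES; i++)
		for (int j = 0; j < NR_NODES; j++)
			seen[node_distance(i, j)] = true;

	for (int d = 0; d < NR_DISTANCE_VALUES; d++)
		if (seen[d])
			nr_levels++;

	printf("unique distances (NUMA levels): %d\n", nr_levels); /* prints 3 */
	return 0;
}

The kernel version in the diff below does the same with a bitmap_alloc()/bitmap_weight() pair, which is also what lets sched_domains_numa_distance[] be sized exactly.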
This has the added benefit of letting us allocate just the right amount of memory for sched_domains_numa_distance[], rather than an arbitrary (nr_node_ids + 1). Note: DT binding equivalent (distance-map) decodes distances as 32-bit values. Signed-off-by: Valentin Schneider Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20210122123943.1217-2-valentin.schneider@arm.com --- include/linux/topology.h | 1 + kernel/sched/topology.c | 99 +++++++++++++++++++++++------------------------- 2 files changed, 49 insertions(+), 51 deletions(-) (limited to 'kernel') diff --git a/include/linux/topology.h b/include/linux/topology.h index ad03df1cc266..7634cd737061 100644 --- a/include/linux/topology.h +++ b/include/linux/topology.h @@ -48,6 +48,7 @@ int arch_update_cpu_topology(void); /* Conform to ACPI 2.0 SLIT distance definitions */ #define LOCAL_DISTANCE 10 #define REMOTE_DISTANCE 20 +#define DISTANCE_BITS 8 #ifndef node_distance #define node_distance(from,to) ((from) == (to) ? LOCAL_DISTANCE : REMOTE_DISTANCE) #endif diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 5d3675c7a76b..bf5c9bd10bfe 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -1596,66 +1596,58 @@ static void init_numa_topology_type(void) } } + +#define NR_DISTANCE_VALUES (1 << DISTANCE_BITS) + void sched_init_numa(void) { - int next_distance, curr_distance = node_distance(0, 0); struct sched_domain_topology_level *tl; - int level = 0; - int i, j, k; - - sched_domains_numa_distance = kzalloc(sizeof(int) * (nr_node_ids + 1), GFP_KERNEL); - if (!sched_domains_numa_distance) - return; - - /* Includes NUMA identity node at level 0. */ - sched_domains_numa_distance[level++] = curr_distance; - sched_domains_numa_levels = level; + unsigned long *distance_map; + int nr_levels = 0; + int i, j; /* * O(nr_nodes^2) deduplicating selection sort -- in order to find the * unique distances in the node_distance() table. - * - * Assumes node_distance(0,j) includes all distances in - * node_distance(i,j) in order to avoid cubic time. */ - next_distance = curr_distance; + distance_map = bitmap_alloc(NR_DISTANCE_VALUES, GFP_KERNEL); + if (!distance_map) + return; + + bitmap_zero(distance_map, NR_DISTANCE_VALUES); for (i = 0; i < nr_node_ids; i++) { for (j = 0; j < nr_node_ids; j++) { - for (k = 0; k < nr_node_ids; k++) { - int distance = node_distance(i, k); - - if (distance > curr_distance && - (distance < next_distance || - next_distance == curr_distance)) - next_distance = distance; - - /* - * While not a strong assumption it would be nice to know - * about cases where if node A is connected to B, B is not - * equally connected to A. - */ - if (sched_debug() && node_distance(k, i) != distance) - sched_numa_warn("Node-distance not symmetric"); + int distance = node_distance(i, j); - if (sched_debug() && i && !find_numa_distance(distance)) - sched_numa_warn("Node-0 not representative"); + if (distance < LOCAL_DISTANCE || distance >= NR_DISTANCE_VALUES) { + sched_numa_warn("Invalid distance value range"); + return; } - if (next_distance != curr_distance) { - sched_domains_numa_distance[level++] = next_distance; - sched_domains_numa_levels = level; - curr_distance = next_distance; - } else break; + + bitmap_set(distance_map, distance, 1); } + } + /* + * We can now figure out how many unique distance values there are and + * allocate memory accordingly. + */ + nr_levels = bitmap_weight(distance_map, NR_DISTANCE_VALUES); - /* - * In case of sched_debug() we verify the above assumption. 
- */ - if (!sched_debug()) - break; + sched_domains_numa_distance = kcalloc(nr_levels, sizeof(int), GFP_KERNEL); + if (!sched_domains_numa_distance) { + bitmap_free(distance_map); + return; + } + + for (i = 0, j = 0; i < nr_levels; i++, j++) { + j = find_next_bit(distance_map, NR_DISTANCE_VALUES, j); + sched_domains_numa_distance[i] = j; } + bitmap_free(distance_map); + /* - * 'level' contains the number of unique distances + * 'nr_levels' contains the number of unique distances * * The sched_domains_numa_distance[] array includes the actual distance * numbers. @@ -1664,15 +1656,15 @@ void sched_init_numa(void) /* * Here, we should temporarily reset sched_domains_numa_levels to 0. * If it fails to allocate memory for array sched_domains_numa_masks[][], - * the array will contain less then 'level' members. This could be + * the array will contain less then 'nr_levels' members. This could be * dangerous when we use it to iterate array sched_domains_numa_masks[][] * in other functions. * - * We reset it to 'level' at the end of this function. + * We reset it to 'nr_levels' at the end of this function. */ sched_domains_numa_levels = 0; - sched_domains_numa_masks = kzalloc(sizeof(void *) * level, GFP_KERNEL); + sched_domains_numa_masks = kzalloc(sizeof(void *) * nr_levels, GFP_KERNEL); if (!sched_domains_numa_masks) return; @@ -1680,7 +1672,7 @@ void sched_init_numa(void) * Now for each level, construct a mask per node which contains all * CPUs of nodes that are that many hops away from us. */ - for (i = 0; i < level; i++) { + for (i = 0; i < nr_levels; i++) { sched_domains_numa_masks[i] = kzalloc(nr_node_ids * sizeof(void *), GFP_KERNEL); if (!sched_domains_numa_masks[i]) @@ -1688,12 +1680,17 @@ void sched_init_numa(void) for (j = 0; j < nr_node_ids; j++) { struct cpumask *mask = kzalloc(cpumask_size(), GFP_KERNEL); + int k; + if (!mask) return; sched_domains_numa_masks[i][j] = mask; for_each_node(k) { + if (sched_debug() && (node_distance(j, k) != node_distance(k, j))) + sched_numa_warn("Node-distance not symmetric"); + if (node_distance(j, k) > sched_domains_numa_distance[i]) continue; @@ -1705,7 +1702,7 @@ void sched_init_numa(void) /* Compute default topology size */ for (i = 0; sched_domain_topology[i].mask; i++); - tl = kzalloc((i + level + 1) * + tl = kzalloc((i + nr_levels) * sizeof(struct sched_domain_topology_level), GFP_KERNEL); if (!tl) return; @@ -1728,7 +1725,7 @@ void sched_init_numa(void) /* * .. and append 'j' levels of NUMA goodness. */ - for (j = 1; j < level; i++, j++) { + for (j = 1; j < nr_levels; i++, j++) { tl[i] = (struct sched_domain_topology_level){ .mask = sd_numa_mask, .sd_flags = cpu_numa_flags, @@ -1740,8 +1737,8 @@ void sched_init_numa(void) sched_domain_topology = tl; - sched_domains_numa_levels = level; - sched_max_numa_distance = sched_domains_numa_distance[level - 1]; + sched_domains_numa_levels = nr_levels; + sched_max_numa_distance = sched_domains_numa_distance[nr_levels - 1]; init_numa_topology_type(); } -- cgit v1.2.3 From e6e0dc2d5497f7f3ed970052917e2923c6f453f4 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 25 Jan 2021 08:59:06 +0000 Subject: sched/fair: Remove SIS_AVG_CPU SIS_AVG_CPU was introduced as a means of avoiding a search when the average search cost indicated that the search would likely fail. 
It was a blunt instrument and disabled by commit 4c77b18cf8b7 ("sched/fair: Make select_idle_cpu() more aggressive") and later replaced with a proportional search depth by commit 1ad3aaf3fcd2 ("sched/core: Implement new approach to scale select_idle_cpu()"). While there are corner cases where SIS_AVG_CPU is better, it has now been disabled for almost three years. As the intent of SIS_PROP is to reduce the time complexity of select_idle_cpu(), lets drop SIS_AVG_CPU and focus on SIS_PROP as a throttling mechanism. Signed-off-by: Mel Gorman Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Vincent Guittot Link: https://lkml.kernel.org/r/20210125085909.4600-2-mgorman@techsingularity.net --- kernel/sched/fair.c | 20 +++++++++----------- kernel/sched/features.h | 1 - 2 files changed, 9 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 9379a481dd8c..124cb6ec6b7b 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6158,7 +6158,6 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t { struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask); struct sched_domain *this_sd; - u64 avg_cost, avg_idle; u64 time; int this = smp_processor_id(); int cpu, nr = INT_MAX; @@ -6167,18 +6166,17 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t if (!this_sd) return -1; - /* - * Due to large variance we need a large fuzz factor; hackbench in - * particularly is sensitive here. - */ - avg_idle = this_rq()->avg_idle / 512; - avg_cost = this_sd->avg_scan_cost + 1; + if (sched_feat(SIS_PROP)) { + u64 avg_cost, avg_idle, span_avg; - if (sched_feat(SIS_AVG_CPU) && avg_idle < avg_cost) - return -1; + /* + * Due to large variance we need a large fuzz factor; + * hackbench in particularly is sensitive here. + */ + avg_idle = this_rq()->avg_idle / 512; + avg_cost = this_sd->avg_scan_cost + 1; - if (sched_feat(SIS_PROP)) { - u64 span_avg = sd->span_weight * avg_idle; + span_avg = sd->span_weight * avg_idle; if (span_avg > 4*avg_cost) nr = div_u64(span_avg, avg_cost); else diff --git a/kernel/sched/features.h b/kernel/sched/features.h index 68d369cba9e4..e875eabb6600 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -54,7 +54,6 @@ SCHED_FEAT(TTWU_QUEUE, true) /* * When doing wakeups, attempt to limit superfluous scans of the LLC domain. */ -SCHED_FEAT(SIS_AVG_CPU, false) SCHED_FEAT(SIS_PROP, true) /* -- cgit v1.2.3 From bae4ec13640b0915e7dd86da7e65c5d085160571 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 25 Jan 2021 08:59:07 +0000 Subject: sched/fair: Move avg_scan_cost calculations under SIS_PROP As noted by Vincent Guittot, avg_scan_costs are calculated for SIS_PROP even if SIS_PROP is disabled. Move the time calculations under a SIS_PROP check and while we are at it, exclude the cost of initialising the CPU mask from the average scan cost. 
Signed-off-by: Mel Gorman Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Vincent Guittot Link: https://lkml.kernel.org/r/20210125085909.4600-3-mgorman@techsingularity.net --- kernel/sched/fair.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 124cb6ec6b7b..4c18ef6c1542 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6166,6 +6166,8 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t if (!this_sd) return -1; + cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr); + if (sched_feat(SIS_PROP)) { u64 avg_cost, avg_idle, span_avg; @@ -6181,11 +6183,9 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t nr = div_u64(span_avg, avg_cost); else nr = 4; - } - - time = cpu_clock(this); - cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr); + time = cpu_clock(this); + } for_each_cpu_wrap(cpu, cpus, target) { if (!--nr) @@ -6194,8 +6194,10 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t break; } - time = cpu_clock(this) - time; - update_avg(&this_sd->avg_scan_cost, time); + if (sched_feat(SIS_PROP)) { + time = cpu_clock(this) - time; + update_avg(&this_sd->avg_scan_cost, time); + } return cpu; } -- cgit v1.2.3 From ccf7ce46ab91515a7146c00300e168efa9dc777e Mon Sep 17 00:00:00 2001 From: Zqiang Date: Mon, 25 Jan 2021 12:18:28 +0800 Subject: PM: sleep: No need to check PF_WQ_WORKER in thaw_kernel_threads() Because PF_KTHREAD is set for all wq worker threads, it is not necessary to check PF_WQ_WORKER in addition to it in thaw_kernel_threads(), so stop doing that. Signed-off-by: Zqiang [ rjw: Subject and changelog rewrite ] Signed-off-by: Rafael J. Wysocki --- kernel/power/process.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/process.c b/kernel/power/process.c index 45b054b7b5ec..50cc63534486 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -235,7 +235,7 @@ void thaw_kernel_threads(void) read_lock(&tasklist_lock); for_each_process_thread(g, p) { - if (p->flags & (PF_KTHREAD | PF_WQ_WORKER)) + if (p->flags & PF_KTHREAD) __thaw_task(p); } read_unlock(&tasklist_lock); -- cgit v1.2.3 From 60e578e82b7d73fbd9a0966e8fc70a95d8e12e13 Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Tue, 26 Jan 2021 18:25:07 -0800 Subject: bpf: Change 'BPF_ADD' to 'BPF_AND' in print_bpf_insn() This 'BPF_ADD' is duplicated, and I belive it should be 'BPF_AND'. 
Fixes: 981f94c3e921 ("bpf: Add bitwise atomic instructions") Signed-off-by: Menglong Dong Signed-off-by: Daniel Borkmann Acked-by: Brendan Jackman Link: https://lore.kernel.org/bpf/20210127022507.23674-1-dong.menglong@zte.com.cn --- kernel/bpf/disasm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/disasm.c b/kernel/bpf/disasm.c index 19ff8fed7f4b..3acc7e0b6916 100644 --- a/kernel/bpf/disasm.c +++ b/kernel/bpf/disasm.c @@ -161,7 +161,7 @@ void print_bpf_insn(const struct bpf_insn_cbs *cbs, insn->dst_reg, insn->off, insn->src_reg); else if (BPF_MODE(insn->code) == BPF_ATOMIC && - (insn->imm == BPF_ADD || insn->imm == BPF_ADD || + (insn->imm == BPF_ADD || insn->imm == BPF_AND || insn->imm == BPF_OR || insn->imm == BPF_XOR)) { verbose(cbs->private_data, "(%02x) lock *(%s *)(r%d %+d) %s r%d\n", insn->code, -- cgit v1.2.3 From 772412176fb98493158929b220fe250127f611af Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Wed, 27 Jan 2021 11:31:39 -0800 Subject: bpf: Allow rewriting to ports under ip_unprivileged_port_start At the moment, BPF_CGROUP_INET{4,6}_BIND hooks can rewrite user_port to the privileged ones (< ip_unprivileged_port_start), but it will be rejected later on in the __inet_bind or __inet6_bind. Let's add another return value to indicate that CAP_NET_BIND_SERVICE check should be ignored. Use the same idea as we currently use in cgroup/egress where bit #1 indicates CN. Instead, for cgroup/bind{4,6}, bit #1 indicates that CAP_NET_BIND_SERVICE should be bypassed. v5: - rename flags to be less confusing (Andrey Ignatov) - rework BPF_PROG_CGROUP_INET_EGRESS_RUN_ARRAY to work on flags and accept BPF_RET_SET_CN (no behavioral changes) v4: - Add missing IPv6 support (Martin KaFai Lau) v3: - Update description (Martin KaFai Lau) - Fix capability restore in selftest (Martin KaFai Lau) v2: - Switch to explicit return code (Martin KaFai Lau) Signed-off-by: Stanislav Fomichev Signed-off-by: Alexei Starovoitov Reviewed-by: Martin KaFai Lau Acked-by: Andrey Ignatov Link: https://lore.kernel.org/bpf/20210127193140.3170382-1-sdf@google.com --- include/linux/bpf-cgroup.h | 38 ++++++++++++++++++++++++--------- include/linux/bpf.h | 52 +++++++++++++++++++++++++++++----------------- include/net/inet_common.h | 2 ++ kernel/bpf/cgroup.c | 8 +++++-- kernel/bpf/verifier.c | 3 +++ net/ipv4/af_inet.c | 9 +++++--- net/ipv6/af_inet6.c | 9 +++++--- 7 files changed, 84 insertions(+), 37 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index 0748fd87969e..c42e02b4d84b 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -125,7 +125,8 @@ int __cgroup_bpf_run_filter_sk(struct sock *sk, int __cgroup_bpf_run_filter_sock_addr(struct sock *sk, struct sockaddr *uaddr, enum bpf_attach_type type, - void *t_ctx); + void *t_ctx, + u32 *flags); int __cgroup_bpf_run_filter_sock_ops(struct sock *sk, struct bpf_sock_ops_kern *sock_ops, @@ -231,30 +232,48 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, #define BPF_CGROUP_RUN_SA_PROG(sk, uaddr, type) \ ({ \ + u32 __unused_flags; \ int __ret = 0; \ if (cgroup_bpf_enabled(type)) \ __ret = __cgroup_bpf_run_filter_sock_addr(sk, uaddr, type, \ - NULL); \ + NULL, \ + &__unused_flags); \ __ret; \ }) #define BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, type, t_ctx) \ ({ \ + u32 __unused_flags; \ int __ret = 0; \ if (cgroup_bpf_enabled(type)) { \ lock_sock(sk); \ __ret = __cgroup_bpf_run_filter_sock_addr(sk, uaddr, type, \ - t_ctx); \ + 
t_ctx, \ + &__unused_flags); \ release_sock(sk); \ } \ __ret; \ }) -#define BPF_CGROUP_RUN_PROG_INET4_BIND_LOCK(sk, uaddr) \ - BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, BPF_CGROUP_INET4_BIND, NULL) - -#define BPF_CGROUP_RUN_PROG_INET6_BIND_LOCK(sk, uaddr) \ - BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, BPF_CGROUP_INET6_BIND, NULL) +/* BPF_CGROUP_INET4_BIND and BPF_CGROUP_INET6_BIND can return extra flags + * via upper bits of return code. The only flag that is supported + * (at bit position 0) is to indicate CAP_NET_BIND_SERVICE capability check + * should be bypassed (BPF_RET_BIND_NO_CAP_NET_BIND_SERVICE). + */ +#define BPF_CGROUP_RUN_PROG_INET_BIND_LOCK(sk, uaddr, type, bind_flags) \ +({ \ + u32 __flags = 0; \ + int __ret = 0; \ + if (cgroup_bpf_enabled(type)) { \ + lock_sock(sk); \ + __ret = __cgroup_bpf_run_filter_sock_addr(sk, uaddr, type, \ + NULL, &__flags); \ + release_sock(sk); \ + if (__flags & BPF_RET_BIND_NO_CAP_NET_BIND_SERVICE) \ + *bind_flags |= BIND_NO_CAP_NET_BIND_SERVICE; \ + } \ + __ret; \ +}) #define BPF_CGROUP_PRE_CONNECT_ENABLED(sk) \ ((cgroup_bpf_enabled(BPF_CGROUP_INET4_CONNECT) || \ @@ -453,8 +472,7 @@ static inline int bpf_percpu_cgroup_storage_update(struct bpf_map *map, #define BPF_CGROUP_RUN_PROG_INET_EGRESS(sk,skb) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET_SOCK(sk) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET_SOCK_RELEASE(sk) ({ 0; }) -#define BPF_CGROUP_RUN_PROG_INET4_BIND_LOCK(sk, uaddr) ({ 0; }) -#define BPF_CGROUP_RUN_PROG_INET6_BIND_LOCK(sk, uaddr) ({ 0; }) +#define BPF_CGROUP_RUN_PROG_INET_BIND_LOCK(sk, uaddr, type, flags) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET4_POST_BIND(sk) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET6_POST_BIND(sk) ({ 0; }) #define BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr) ({ 0; }) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 1aac2af12fed..321966fc35db 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1073,6 +1073,34 @@ int bpf_prog_array_copy(struct bpf_prog_array *old_array, struct bpf_prog *include_prog, struct bpf_prog_array **new_array); +/* BPF program asks to bypass CAP_NET_BIND_SERVICE in bind. */ +#define BPF_RET_BIND_NO_CAP_NET_BIND_SERVICE (1 << 0) +/* BPF program asks to set CN on the packet. 
*/ +#define BPF_RET_SET_CN (1 << 0) + +#define BPF_PROG_RUN_ARRAY_FLAGS(array, ctx, func, ret_flags) \ + ({ \ + struct bpf_prog_array_item *_item; \ + struct bpf_prog *_prog; \ + struct bpf_prog_array *_array; \ + u32 _ret = 1; \ + u32 func_ret; \ + migrate_disable(); \ + rcu_read_lock(); \ + _array = rcu_dereference(array); \ + _item = &_array->items[0]; \ + while ((_prog = READ_ONCE(_item->prog))) { \ + bpf_cgroup_storage_set(_item->cgroup_storage); \ + func_ret = func(_prog, ctx); \ + _ret &= (func_ret & 1); \ + *(ret_flags) |= (func_ret >> 1); \ + _item++; \ + } \ + rcu_read_unlock(); \ + migrate_enable(); \ + _ret; \ + }) + #define __BPF_PROG_RUN_ARRAY(array, ctx, func, check_non_null) \ ({ \ struct bpf_prog_array_item *_item; \ @@ -1120,25 +1148,11 @@ _out: \ */ #define BPF_PROG_CGROUP_INET_EGRESS_RUN_ARRAY(array, ctx, func) \ ({ \ - struct bpf_prog_array_item *_item; \ - struct bpf_prog *_prog; \ - struct bpf_prog_array *_array; \ - u32 ret; \ - u32 _ret = 1; \ - u32 _cn = 0; \ - migrate_disable(); \ - rcu_read_lock(); \ - _array = rcu_dereference(array); \ - _item = &_array->items[0]; \ - while ((_prog = READ_ONCE(_item->prog))) { \ - bpf_cgroup_storage_set(_item->cgroup_storage); \ - ret = func(_prog, ctx); \ - _ret &= (ret & 1); \ - _cn |= (ret & 2); \ - _item++; \ - } \ - rcu_read_unlock(); \ - migrate_enable(); \ + u32 _flags = 0; \ + bool _cn; \ + u32 _ret; \ + _ret = BPF_PROG_RUN_ARRAY_FLAGS(array, ctx, func, &_flags); \ + _cn = _flags & BPF_RET_SET_CN; \ if (_ret) \ _ret = (_cn ? NET_XMIT_CN : NET_XMIT_SUCCESS); \ else \ diff --git a/include/net/inet_common.h b/include/net/inet_common.h index cb2818862919..cad2a611efde 100644 --- a/include/net/inet_common.h +++ b/include/net/inet_common.h @@ -41,6 +41,8 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len); #define BIND_WITH_LOCK (1 << 1) /* Called from BPF program. */ #define BIND_FROM_BPF (1 << 2) +/* Skip CAP_NET_BIND_SERVICE check. */ +#define BIND_NO_CAP_NET_BIND_SERVICE (1 << 3) int __inet_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len, u32 flags); int inet_getname(struct socket *sock, struct sockaddr *uaddr, diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index da649f20d6b2..cdf3c7e611d9 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -1055,6 +1055,8 @@ EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk); * @uaddr: sockaddr struct provided by user * @type: The type of program to be exectuted * @t_ctx: Pointer to attach type specific context + * @flags: Pointer to u32 which contains higher bits of BPF program + * return value (OR'ed together). * * socket is expected to be of type INET or INET6. * @@ -1064,7 +1066,8 @@ EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk); int __cgroup_bpf_run_filter_sock_addr(struct sock *sk, struct sockaddr *uaddr, enum bpf_attach_type type, - void *t_ctx) + void *t_ctx, + u32 *flags) { struct bpf_sock_addr_kern ctx = { .sk = sk, @@ -1087,7 +1090,8 @@ int __cgroup_bpf_run_filter_sock_addr(struct sock *sk, } cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); - ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], &ctx, BPF_PROG_RUN); + ret = BPF_PROG_RUN_ARRAY_FLAGS(cgrp->bpf.effective[type], &ctx, + BPF_PROG_RUN, flags); return ret == 1 ? 
0 : -EPERM; } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index d0eae51b31e4..972fc38eb62d 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -7986,6 +7986,9 @@ static int check_return_code(struct bpf_verifier_env *env) env->prog->expected_attach_type == BPF_CGROUP_INET4_GETSOCKNAME || env->prog->expected_attach_type == BPF_CGROUP_INET6_GETSOCKNAME) range = tnum_range(1, 1); + if (env->prog->expected_attach_type == BPF_CGROUP_INET4_BIND || + env->prog->expected_attach_type == BPF_CGROUP_INET6_BIND) + range = tnum_range(0, 3); break; case BPF_PROG_TYPE_CGROUP_SKB: if (env->prog->expected_attach_type == BPF_CGROUP_INET_EGRESS) { diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 6ba2930ff49b..aaa94bea19c3 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -438,6 +438,7 @@ EXPORT_SYMBOL(inet_release); int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) { struct sock *sk = sock->sk; + u32 flags = BIND_WITH_LOCK; int err; /* If the socket has its own bind function then use it. (RAW) */ @@ -450,11 +451,12 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) /* BPF prog is run before any checks are done so that if the prog * changes context in a wrong way it will be caught. */ - err = BPF_CGROUP_RUN_PROG_INET4_BIND_LOCK(sk, uaddr); + err = BPF_CGROUP_RUN_PROG_INET_BIND_LOCK(sk, uaddr, + BPF_CGROUP_INET4_BIND, &flags); if (err) return err; - return __inet_bind(sk, uaddr, addr_len, BIND_WITH_LOCK); + return __inet_bind(sk, uaddr, addr_len, flags); } EXPORT_SYMBOL(inet_bind); @@ -499,7 +501,8 @@ int __inet_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len, snum = ntohs(addr->sin_port); err = -EACCES; - if (snum && inet_port_requires_bind_service(net, snum) && + if (!(flags & BIND_NO_CAP_NET_BIND_SERVICE) && + snum && inet_port_requires_bind_service(net, snum) && !ns_capable(net->user_ns, CAP_NET_BIND_SERVICE)) goto out; diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index b9c654836b72..f091fe9b4da5 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -295,7 +295,8 @@ static int __inet6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len, return -EINVAL; snum = ntohs(addr->sin6_port); - if (snum && inet_port_requires_bind_service(net, snum) && + if (!(flags & BIND_NO_CAP_NET_BIND_SERVICE) && + snum && inet_port_requires_bind_service(net, snum) && !ns_capable(net->user_ns, CAP_NET_BIND_SERVICE)) return -EACCES; @@ -439,6 +440,7 @@ out_unlock: int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) { struct sock *sk = sock->sk; + u32 flags = BIND_WITH_LOCK; int err = 0; /* If the socket has its own bind function then use it. */ @@ -451,11 +453,12 @@ int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) /* BPF prog is run before any checks are done so that if the prog * changes context in a wrong way it will be caught. 
*/ - err = BPF_CGROUP_RUN_PROG_INET6_BIND_LOCK(sk, uaddr); + err = BPF_CGROUP_RUN_PROG_INET_BIND_LOCK(sk, uaddr, + BPF_CGROUP_INET6_BIND, &flags); if (err) return err; - return __inet6_bind(sk, uaddr, addr_len, BIND_WITH_LOCK); + return __inet6_bind(sk, uaddr, addr_len, flags); } EXPORT_SYMBOL(inet6_bind); -- cgit v1.2.3 From c260954177c4f1926b423823bca5728f19b40d67 Mon Sep 17 00:00:00 2001 From: Emil Renner Berthing Date: Sat, 23 Jan 2021 19:24:56 +0100 Subject: genirq: Use new tasklet API for resend_tasklet This converts the resend_tasklet to use the new API in commit 12cc923f1ccc ("tasklet: Introduce new initialization API") The new API changes the argument passed to the callback function, but fortunately the argument isn't used so it is straight forward to use DECLARE_TASKLET() rather than DECLARE_TASKLET_OLD(). Signed-off-by: Emil Renner Berthing Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20210123182456.6521-1-esmil@mailme.dk --- kernel/irq/resend.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c index 8ccd32a0cc80..bd1d85c610aa 100644 --- a/kernel/irq/resend.c +++ b/kernel/irq/resend.c @@ -27,7 +27,7 @@ static DECLARE_BITMAP(irqs_resend, IRQ_BITMAP_BITS); /* * Run software resends of IRQ's */ -static void resend_irqs(unsigned long arg) +static void resend_irqs(struct tasklet_struct *unused) { struct irq_desc *desc; int irq; @@ -45,7 +45,7 @@ static void resend_irqs(unsigned long arg) } /* Tasklet to handle resend: */ -static DECLARE_TASKLET_OLD(resend_tasklet, resend_irqs); +static DECLARE_TASKLET(resend_tasklet, resend_irqs); static int irq_sw_resend(struct irq_desc *desc) { -- cgit v1.2.3 From 1ce53e2c2ac069e7b3c400a427002a70deb4a916 Mon Sep 17 00:00:00 2001 From: Alejandro Colomar Date: Sat, 28 Nov 2020 13:39:46 +0100 Subject: futex: Change utime parameter to be 'const ... *' futex(2) says that 'utime' is a pointer to 'const'. The implementation doesn't use 'const'; however, it _never_ modifies the contents of utime. - futex() either uses 'utime' as a pointer to struct or as a 'u32'. - In case it's used as a 'u32', it makes a copy of it, and of course it is not dereferenced. - In case it's used as a 'struct __kernel_timespec __user *', the pointer is not dereferenced inside the futex() definition, and it is only passed to a function: get_timespec64(), which accepts a 'const struct __kernel_timespec __user *'. [ tglx: Make the same change to the compat syscall and fixup the prototypes. 
] Signed-off-by: Alejandro Colomar Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20201128123945.4592-1-alx.manpages@gmail.com --- include/linux/syscalls.h | 8 ++++---- kernel/futex.c | 6 +++--- 2 files changed, 7 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index f3929aff39cf..5cb74edd9a4f 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -583,11 +583,11 @@ asmlinkage long sys_unshare(unsigned long unshare_flags); /* kernel/futex.c */ asmlinkage long sys_futex(u32 __user *uaddr, int op, u32 val, - struct __kernel_timespec __user *utime, u32 __user *uaddr2, - u32 val3); + const struct __kernel_timespec __user *utime, + u32 __user *uaddr2, u32 val3); asmlinkage long sys_futex_time32(u32 __user *uaddr, int op, u32 val, - struct old_timespec32 __user *utime, u32 __user *uaddr2, - u32 val3); + const struct old_timespec32 __user *utime, + u32 __user *uaddr2, u32 val3); asmlinkage long sys_get_robust_list(int pid, struct robust_list_head __user * __user *head_ptr, size_t __user *len_ptr); diff --git a/kernel/futex.c b/kernel/futex.c index c47d1015d759..d0775aab8da9 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -3790,8 +3790,8 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val, - struct __kernel_timespec __user *, utime, u32 __user *, uaddr2, - u32, val3) + const struct __kernel_timespec __user *, utime, + u32 __user *, uaddr2, u32, val3) { struct timespec64 ts; ktime_t t, *tp = NULL; @@ -3986,7 +3986,7 @@ err_unlock: #ifdef CONFIG_COMPAT_32BIT_TIME SYSCALL_DEFINE6(futex_time32, u32 __user *, uaddr, int, op, u32, val, - struct old_timespec32 __user *, utime, u32 __user *, uaddr2, + const struct old_timespec32 __user *, utime, u32 __user *, uaddr2, u32, val3) { struct timespec64 ts; -- cgit v1.2.3 From 0f9438503ea1312ef49be4d9762e0f0006546364 Mon Sep 17 00:00:00 2001 From: Jangwoong Kim <6812skiii@gmail.com> Date: Wed, 30 Dec 2020 21:29:53 +0900 Subject: futex: Remove unneeded gotos Get rid of gotos that do not contain any cleanup. These were not removed in commit 9180bd467f9a ("futex: Remove put_futex_key()"). Signed-off-by: Jangwoong Kim <6812skiii@gmail.com> Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20201230122953.10473-1-6812skiii@gmail.com --- kernel/futex.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index d0775aab8da9..f3570a276990 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -3024,7 +3024,7 @@ retry: * Success, we're done! No tricky corner cases. */ if (!ret) - goto out_putkey; + return ret; /* * The atomic access to the futex value generated a * pagefault, so retry the user-access and the wakeup: @@ -3041,7 +3041,7 @@ retry: * wake_futex_pi has detected invalid state. Tell user * space. 
*/ - goto out_putkey; + return ret; } /* @@ -3062,7 +3062,7 @@ retry: default: WARN_ON_ONCE(1); - goto out_putkey; + return ret; } } @@ -3073,7 +3073,6 @@ retry: out_unlock: spin_unlock(&hb->lock); -out_putkey: return ret; pi_retry: -- cgit v1.2.3 From bf594bf400016a1ac58c753bcc0393a39c36f669 Mon Sep 17 00:00:00 2001 From: Alex Shi Date: Fri, 13 Nov 2020 16:58:11 +0800 Subject: locking/rtmutex: Add missing kernel-doc markup To fix the following issues: kernel/locking/rtmutex.c:1612: warning: Function parameter or member 'lock' not described in '__rt_mutex_futex_unlock' kernel/locking/rtmutex.c:1612: warning: Function parameter or member 'wake_q' not described in '__rt_mutex_futex_unlock' kernel/locking/rtmutex.c:1675: warning: Function parameter or member 'name' not described in '__rt_mutex_init' kernel/locking/rtmutex.c:1675: warning: Function parameter or member 'key' not described in '__rt_mutex_init' [ tglx: Change rt lock to rt_mutex for consistency sake ] Signed-off-by: Alex Shi Signed-off-by: Thomas Gleixner Acked-by: Will Deacon Link: https://lore.kernel.org/r/1605257895-5536-2-git-send-email-alex.shi@linux.alibaba.com --- kernel/locking/rtmutex.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index cfdd5b93264d..a201e5ef9374 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -1604,8 +1604,11 @@ void __sched rt_mutex_unlock(struct rt_mutex *lock) EXPORT_SYMBOL_GPL(rt_mutex_unlock); /** - * Futex variant, that since futex variants do not use the fast-path, can be - * simple and will not need to retry. + * __rt_mutex_futex_unlock - Futex variant, that since futex variants + * do not use the fast-path, can be simple and will not need to retry. + * + * @lock: The rt_mutex to be unlocked + * @wake_q: The wake queue head from which to get the next lock waiter */ bool __sched __rt_mutex_futex_unlock(struct rt_mutex *lock, struct wake_q_head *wake_q) @@ -1662,13 +1665,15 @@ void rt_mutex_destroy(struct rt_mutex *lock) EXPORT_SYMBOL_GPL(rt_mutex_destroy); /** - * __rt_mutex_init - initialize the rt lock + * __rt_mutex_init - initialize the rt_mutex * - * @lock: the rt lock to be initialized + * @lock: The rt_mutex to be initialized + * @name: The lock name used for debugging + * @key: The lock class key used for debugging * - * Initialize the rt lock to unlocked state. + * Initialize the rt_mutex to unlocked state. * - * Initializing of a locked rt lock is not allowed + * Initializing of a locked rt_mutex is not allowed */ void __rt_mutex_init(struct rt_mutex *lock, const char *name, struct lock_class_key *key) -- cgit v1.2.3 From 41c1a06d1d1544bed9692ba72a5692454eee1945 Mon Sep 17 00:00:00 2001 From: Yuxuan Shui Date: Sat, 23 Jan 2021 03:21:32 -0800 Subject: entry: Unbreak single step reporting behaviour The move of TIF_SYSCALL_EMU to SYSCALL_WORK_SYSCALL_EMU broke single step reporting. The original code reported the single step when TIF_SINGLESTEP was set and TIF_SYSCALL_EMU was not set. The SYSCALL_WORK conversion got the logic wrong and now the reporting only happens when both bits are set. Restore the original behaviour. 
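As a concrete illustration, the self-contained userspace sketch below (the bit values are made up; SYSCALL_WORK_SYSCALL_EMU and TIF_SINGLESTEP merely stand in for the real flags) prints how the regressed and the restored predicates differ across the four flag combinations: only the SINGLESTEP-without-EMU case should report, and that is exactly the case the regressed check missed.

#include <stdbool.h>
#include <stdio.h>

#define SYSCALL_WORK_SYSCALL_EMU (1UL << 0)	/* illustrative bit values */
#define TIF_SINGLESTEP           (1UL << 1)

/* Regressed logic: only reported when both EMU and SINGLESTEP are set. */
static bool report_single_step_broken(unsigned long work, unsigned long tif)
{
	if (!(work & SYSCALL_WORK_SYSCALL_EMU))
		return false;
	return tif & TIF_SINGLESTEP;
}

/* Restored logic: reported when SINGLESTEP is set and EMU is not. */
static bool report_single_step_fixed(unsigned long work, unsigned long tif)
{
	if (work & SYSCALL_WORK_SYSCALL_EMU)
		return false;
	return tif & TIF_SINGLESTEP;
}

int main(void)
{
	for (int emu = 0; emu <= 1; emu++) {
		for (int step = 0; step <= 1; step++) {
			unsigned long work = emu ? SYSCALL_WORK_SYSCALL_EMU : 0;
			unsigned long tif = step ? TIF_SINGLESTEP : 0;

			printf("EMU=%d SINGLESTEP=%d -> broken=%d fixed=%d\n",
			       emu, step,
			       report_single_step_broken(work, tif),
			       report_single_step_fixed(work, tif));
		}
	}
	return 0;
}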
[ tglx: Massaged changelog and dropped the pointless double negation ] Fixes: 64eb35f701f0 ("ptrace: Migrate TIF_SYSCALL_EMU to use SYSCALL_WORK flag") Signed-off-by: Yuxuan Shui Signed-off-by: Thomas Gleixner Reviewed-by: Gabriel Krisman Bertazi Link: https://lore.kernel.org/r/877do3gaq9.fsf@m5Zedd9JOGzJrf0 --- kernel/entry/common.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/entry/common.c b/kernel/entry/common.c index 378341642f94..6dd82be60df8 100644 --- a/kernel/entry/common.c +++ b/kernel/entry/common.c @@ -222,7 +222,7 @@ static inline bool report_single_step(unsigned long work) */ static inline bool report_single_step(unsigned long work) { - if (!(work & SYSCALL_WORK_SYSCALL_EMU)) + if (work & SYSCALL_WORK_SYSCALL_EMU) return false; return !!(current_thread_info()->flags & _TIF_SINGLESTEP); -- cgit v1.2.3 From 61ca36c8c4eb3bae35a285b1ae18c514cde65439 Mon Sep 17 00:00:00 2001 From: Tobias Klauser Date: Wed, 27 Jan 2021 18:46:15 +0100 Subject: bpf: Simplify cases in bpf_base_func_proto !perfmon_capable() is checked before the last switch(func_id) in bpf_base_func_proto. Thus, the cases BPF_FUNC_trace_printk and BPF_FUNC_snprintf_btf can be moved to that last switch(func_id) to omit the inline !perfmon_capable() checks. Signed-off-by: Tobias Klauser Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20210127174615.3038-1-tklauser@distanz.ch --- kernel/bpf/helpers.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 41ca280b1dc1..308427fe03a3 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -720,14 +720,6 @@ bpf_base_func_proto(enum bpf_func_id func_id) return &bpf_spin_lock_proto; case BPF_FUNC_spin_unlock: return &bpf_spin_unlock_proto; - case BPF_FUNC_trace_printk: - if (!perfmon_capable()) - return NULL; - return bpf_get_trace_printk_proto(); - case BPF_FUNC_snprintf_btf: - if (!perfmon_capable()) - return NULL; - return &bpf_snprintf_btf_proto; case BPF_FUNC_jiffies64: return &bpf_jiffies64_proto; case BPF_FUNC_per_cpu_ptr: @@ -742,6 +734,8 @@ bpf_base_func_proto(enum bpf_func_id func_id) return NULL; switch (func_id) { + case BPF_FUNC_trace_printk: + return bpf_get_trace_printk_proto(); case BPF_FUNC_get_current_task: return &bpf_get_current_task_proto; case BPF_FUNC_probe_read_user: @@ -752,6 +746,8 @@ bpf_base_func_proto(enum bpf_func_id func_id) return &bpf_probe_read_user_str_proto; case BPF_FUNC_probe_read_kernel_str: return &bpf_probe_read_kernel_str_proto; + case BPF_FUNC_snprintf_btf: + return &bpf_snprintf_btf_proto; default: return NULL; } -- cgit v1.2.3 From be65de6b03aa638c46ea51e9d11a92e4914d8103 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Thu, 14 Jan 2021 17:05:31 +0530 Subject: fs: Remove dcookies support The dcookies stuff was only used by the kernel's old oprofile code. Now that oprofile's support is removed from the kernel, there is no need for dcookies as well. Remove it. 
Suggested-by: Christoph Hellwig Suggested-by: Linus Torvalds Signed-off-by: Viresh Kumar Acked-by: Robert Richter Acked-by: William Cohen Acked-by: Al Viro Acked-by: Thomas Gleixner --- fs/Makefile | 1 - fs/dcookies.c | 356 ----------------------------------------------- include/linux/dcookies.h | 69 --------- kernel/sys.c | 1 - 4 files changed, 427 deletions(-) delete mode 100644 fs/dcookies.c delete mode 100644 include/linux/dcookies.h (limited to 'kernel') diff --git a/fs/Makefile b/fs/Makefile index 999d1a23f036..3215fe205256 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -64,7 +64,6 @@ obj-$(CONFIG_SYSFS) += sysfs/ obj-$(CONFIG_CONFIGFS_FS) += configfs/ obj-y += devpts/ -obj-$(CONFIG_PROFILING) += dcookies.o obj-$(CONFIG_DLM) += dlm/ # Do not add any filesystems before this line diff --git a/fs/dcookies.c b/fs/dcookies.c deleted file mode 100644 index 6eeb61100a09..000000000000 --- a/fs/dcookies.c +++ /dev/null @@ -1,356 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * dcookies.c - * - * Copyright 2002 John Levon - * - * Persistent cookie-path mappings. These are used by - * profilers to convert a per-task EIP value into something - * non-transitory that can be processed at a later date. - * This is done by locking the dentry/vfsmnt pair in the - * kernel until released by the tasks needing the persistent - * objects. The tag is simply an unsigned long that refers - * to the pair and can be looked up from userspace. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -/* The dcookies are allocated from a kmem_cache and - * hashed onto a small number of lists. None of the - * code here is particularly performance critical - */ -struct dcookie_struct { - struct path path; - struct list_head hash_list; -}; - -static LIST_HEAD(dcookie_users); -static DEFINE_MUTEX(dcookie_mutex); -static struct kmem_cache *dcookie_cache __read_mostly; -static struct list_head *dcookie_hashtable __read_mostly; -static size_t hash_size __read_mostly; - -static inline int is_live(void) -{ - return !(list_empty(&dcookie_users)); -} - - -/* The dentry is locked, its address will do for the cookie */ -static inline unsigned long dcookie_value(struct dcookie_struct * dcs) -{ - return (unsigned long)dcs->path.dentry; -} - - -static size_t dcookie_hash(unsigned long dcookie) -{ - return (dcookie >> L1_CACHE_SHIFT) & (hash_size - 1); -} - - -static struct dcookie_struct * find_dcookie(unsigned long dcookie) -{ - struct dcookie_struct *found = NULL; - struct dcookie_struct * dcs; - struct list_head * pos; - struct list_head * list; - - list = dcookie_hashtable + dcookie_hash(dcookie); - - list_for_each(pos, list) { - dcs = list_entry(pos, struct dcookie_struct, hash_list); - if (dcookie_value(dcs) == dcookie) { - found = dcs; - break; - } - } - - return found; -} - - -static void hash_dcookie(struct dcookie_struct * dcs) -{ - struct list_head * list = dcookie_hashtable + dcookie_hash(dcookie_value(dcs)); - list_add(&dcs->hash_list, list); -} - - -static struct dcookie_struct *alloc_dcookie(const struct path *path) -{ - struct dcookie_struct *dcs = kmem_cache_alloc(dcookie_cache, - GFP_KERNEL); - struct dentry *d; - if (!dcs) - return NULL; - - d = path->dentry; - spin_lock(&d->d_lock); - d->d_flags |= DCACHE_COOKIE; - spin_unlock(&d->d_lock); - - dcs->path = *path; - path_get(path); - hash_dcookie(dcs); - return dcs; -} - - -/* This is the main kernel-side routine that retrieves the cookie - * value for 
a dentry/vfsmnt pair. - */ -int get_dcookie(const struct path *path, unsigned long *cookie) -{ - int err = 0; - struct dcookie_struct * dcs; - - mutex_lock(&dcookie_mutex); - - if (!is_live()) { - err = -EINVAL; - goto out; - } - - if (path->dentry->d_flags & DCACHE_COOKIE) { - dcs = find_dcookie((unsigned long)path->dentry); - } else { - dcs = alloc_dcookie(path); - if (!dcs) { - err = -ENOMEM; - goto out; - } - } - - *cookie = dcookie_value(dcs); - -out: - mutex_unlock(&dcookie_mutex); - return err; -} - - -/* And here is where the userspace process can look up the cookie value - * to retrieve the path. - */ -static int do_lookup_dcookie(u64 cookie64, char __user *buf, size_t len) -{ - unsigned long cookie = (unsigned long)cookie64; - int err = -EINVAL; - char * kbuf; - char * path; - size_t pathlen; - struct dcookie_struct * dcs; - - /* we could leak path information to users - * without dir read permission without this - */ - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - mutex_lock(&dcookie_mutex); - - if (!is_live()) { - err = -EINVAL; - goto out; - } - - if (!(dcs = find_dcookie(cookie))) - goto out; - - err = -ENOMEM; - kbuf = kmalloc(PAGE_SIZE, GFP_KERNEL); - if (!kbuf) - goto out; - - /* FIXME: (deleted) ? */ - path = d_path(&dcs->path, kbuf, PAGE_SIZE); - - mutex_unlock(&dcookie_mutex); - - if (IS_ERR(path)) { - err = PTR_ERR(path); - goto out_free; - } - - err = -ERANGE; - - pathlen = kbuf + PAGE_SIZE - path; - if (pathlen <= len) { - err = pathlen; - if (copy_to_user(buf, path, pathlen)) - err = -EFAULT; - } - -out_free: - kfree(kbuf); - return err; -out: - mutex_unlock(&dcookie_mutex); - return err; -} - -SYSCALL_DEFINE3(lookup_dcookie, u64, cookie64, char __user *, buf, size_t, len) -{ - return do_lookup_dcookie(cookie64, buf, len); -} - -#ifdef CONFIG_COMPAT -COMPAT_SYSCALL_DEFINE4(lookup_dcookie, u32, w0, u32, w1, char __user *, buf, compat_size_t, len) -{ -#ifdef __BIG_ENDIAN - return do_lookup_dcookie(((u64)w0 << 32) | w1, buf, len); -#else - return do_lookup_dcookie(((u64)w1 << 32) | w0, buf, len); -#endif -} -#endif - -static int dcookie_init(void) -{ - struct list_head * d; - unsigned int i, hash_bits; - int err = -ENOMEM; - - dcookie_cache = kmem_cache_create("dcookie_cache", - sizeof(struct dcookie_struct), - 0, 0, NULL); - - if (!dcookie_cache) - goto out; - - dcookie_hashtable = kmalloc(PAGE_SIZE, GFP_KERNEL); - if (!dcookie_hashtable) - goto out_kmem; - - err = 0; - - /* - * Find the power-of-two list-heads that can fit into the allocation.. - * We don't guarantee that "sizeof(struct list_head)" is necessarily - * a power-of-two. - */ - hash_size = PAGE_SIZE / sizeof(struct list_head); - hash_bits = 0; - do { - hash_bits++; - } while ((hash_size >> hash_bits) != 0); - hash_bits--; - - /* - * Re-calculate the actual number of entries and the mask - * from the number of bits we can fit. 
- */ - hash_size = 1UL << hash_bits; - - /* And initialize the newly allocated array */ - d = dcookie_hashtable; - i = hash_size; - do { - INIT_LIST_HEAD(d); - d++; - i--; - } while (i); - -out: - return err; -out_kmem: - kmem_cache_destroy(dcookie_cache); - goto out; -} - - -static void free_dcookie(struct dcookie_struct * dcs) -{ - struct dentry *d = dcs->path.dentry; - - spin_lock(&d->d_lock); - d->d_flags &= ~DCACHE_COOKIE; - spin_unlock(&d->d_lock); - - path_put(&dcs->path); - kmem_cache_free(dcookie_cache, dcs); -} - - -static void dcookie_exit(void) -{ - struct list_head * list; - struct list_head * pos; - struct list_head * pos2; - struct dcookie_struct * dcs; - size_t i; - - for (i = 0; i < hash_size; ++i) { - list = dcookie_hashtable + i; - list_for_each_safe(pos, pos2, list) { - dcs = list_entry(pos, struct dcookie_struct, hash_list); - list_del(&dcs->hash_list); - free_dcookie(dcs); - } - } - - kfree(dcookie_hashtable); - kmem_cache_destroy(dcookie_cache); -} - - -struct dcookie_user { - struct list_head next; -}; - -struct dcookie_user * dcookie_register(void) -{ - struct dcookie_user * user; - - mutex_lock(&dcookie_mutex); - - user = kmalloc(sizeof(struct dcookie_user), GFP_KERNEL); - if (!user) - goto out; - - if (!is_live() && dcookie_init()) - goto out_free; - - list_add(&user->next, &dcookie_users); - -out: - mutex_unlock(&dcookie_mutex); - return user; -out_free: - kfree(user); - user = NULL; - goto out; -} - - -void dcookie_unregister(struct dcookie_user * user) -{ - mutex_lock(&dcookie_mutex); - - list_del(&user->next); - kfree(user); - - if (!is_live()) - dcookie_exit(); - - mutex_unlock(&dcookie_mutex); -} - -EXPORT_SYMBOL_GPL(dcookie_register); -EXPORT_SYMBOL_GPL(dcookie_unregister); -EXPORT_SYMBOL_GPL(get_dcookie); diff --git a/include/linux/dcookies.h b/include/linux/dcookies.h deleted file mode 100644 index ddfdac20cad0..000000000000 --- a/include/linux/dcookies.h +++ /dev/null @@ -1,69 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * dcookies.h - * - * Persistent cookie-path mappings - * - * Copyright 2002 John Levon - */ - -#ifndef DCOOKIES_H -#define DCOOKIES_H - - -#ifdef CONFIG_PROFILING - -#include -#include - -struct dcookie_user; -struct path; - -/** - * dcookie_register - register a user of dcookies - * - * Register as a dcookie user. Returns %NULL on failure. - */ -struct dcookie_user * dcookie_register(void); - -/** - * dcookie_unregister - unregister a user of dcookies - * - * Unregister as a dcookie user. This may invalidate - * any dcookie values returned from get_dcookie(). - */ -void dcookie_unregister(struct dcookie_user * user); - -/** - * get_dcookie - acquire a dcookie - * - * Convert the given dentry/vfsmount pair into - * a cookie value. - * - * Returns -EINVAL if no living task has registered as a - * dcookie user. 
- * - * Returns 0 on success, with *cookie filled in - */ -int get_dcookie(const struct path *path, unsigned long *cookie); - -#else - -static inline struct dcookie_user * dcookie_register(void) -{ - return NULL; -} - -static inline void dcookie_unregister(struct dcookie_user * user) -{ - return; -} - -static inline int get_dcookie(const struct path *path, unsigned long *cookie) -{ - return -ENOSYS; -} - -#endif /* CONFIG_PROFILING */ - -#endif /* DCOOKIES_H */ diff --git a/kernel/sys.c b/kernel/sys.c index 51f00fe20e4d..6928d23c46ea 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -24,7 +24,6 @@ #include #include #include -#include #include #include #include -- cgit v1.2.3 From 442187f3c2de40bab13b8f9751b37925bede73b0 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Tue, 26 Jan 2021 12:17:21 +0200 Subject: locking/rwsem: Remove empty rwsem.h This is a leftover from 7f26482a872c ("locking/percpu-rwsem: Remove the embedded rwsem") Signed-off-by: Nikolay Borisov Signed-off-by: Peter Zijlstra (Intel) Acked-by: Will Deacon Link: https://lkml.kernel.org/r/20210126101721.976027-1-nborisov@suse.com --- kernel/locking/rwsem.h | 0 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 kernel/locking/rwsem.h (limited to 'kernel') diff --git a/kernel/locking/rwsem.h b/kernel/locking/rwsem.h deleted file mode 100644 index e69de29bb2d1..000000000000 -- cgit v1.2.3 From 7e0a9220467dbcfdc5bc62825724f3e52e50ab31 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Fri, 29 Jan 2021 10:13:53 -0500 Subject: fgraph: Initialize tracing_graph_pause at task creation On some archs, the idle task can call into cpu_suspend(). The cpu_suspend() will disable or pause function graph tracing, as there's some paths in bringing down the CPU that can have issues with its return address being modified. The task_struct structure has a "tracing_graph_pause" atomic counter, that when set to something other than zero, the function graph tracer will not modify the return address. The problem is that the tracing_graph_pause counter is initialized when the function graph tracer is enabled. This can corrupt the counter for the idle task if it is suspended in these architectures. CPU 1 CPU 2 ----- ----- do_idle() cpu_suspend() pause_graph_tracing() task_struct->tracing_graph_pause++ (0 -> 1) start_graph_tracing() for_each_online_cpu(cpu) { ftrace_graph_init_idle_task(cpu) task-struct->tracing_graph_pause = 0 (1 -> 0) unpause_graph_tracing() task_struct->tracing_graph_pause-- (0 -> -1) The above should have gone from 1 to zero, and enabled function graph tracing again. But instead, it is set to -1, which keeps it disabled. There's no reason that the field tracing_graph_pause on the task_struct can not be initialized at boot up. 
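As a reminder of why the 0 -> -1 transition above disables tracing for good: the pause/unpause helpers are nothing more than an atomic increment and decrement of the per-task counter. The sketch below is a simplified rendition of the helpers in include/linux/ftrace.h and is only included for illustration, it is not part of this patch.

    static inline void pause_graph_tracing(void)
    {
            atomic_inc(&current->tracing_graph_pause);
    }

    static inline void unpause_graph_tracing(void)
    {
            atomic_dec(&current->tracing_graph_pause);
    }

    /*
     * If ftrace_graph_init_idle_task() resets the counter to 0 between the
     * two calls above, the later decrement lands on -1. Any non-zero value
     * keeps the function graph tracer from touching return addresses, so
     * tracing stays disabled for that task.
     */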
Cc: stable@vger.kernel.org Fixes: 380c4b1411ccd ("tracing/function-graph-tracer: append the tracing_graph_flag") Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=211339 Reported-by: pierre.gondois@arm.com Signed-off-by: Steven Rostedt (VMware) --- init/init_task.c | 3 ++- kernel/trace/fgraph.c | 2 -- 2 files changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/init/init_task.c b/init/init_task.c index 8a992d73e6fb..3711cdaafed2 100644 --- a/init/init_task.c +++ b/init/init_task.c @@ -198,7 +198,8 @@ struct task_struct init_task .lockdep_recursion = 0, #endif #ifdef CONFIG_FUNCTION_GRAPH_TRACER - .ret_stack = NULL, + .ret_stack = NULL, + .tracing_graph_pause = ATOMIC_INIT(0), #endif #if defined(CONFIG_TRACING) && defined(CONFIG_PREEMPTION) .trace_recursion = 0, diff --git a/kernel/trace/fgraph.c b/kernel/trace/fgraph.c index 73edb9e4f354..29a6ebeebc9e 100644 --- a/kernel/trace/fgraph.c +++ b/kernel/trace/fgraph.c @@ -394,7 +394,6 @@ static int alloc_retstack_tasklist(struct ftrace_ret_stack **ret_stack_list) } if (t->ret_stack == NULL) { - atomic_set(&t->tracing_graph_pause, 0); atomic_set(&t->trace_overrun, 0); t->curr_ret_stack = -1; t->curr_ret_depth = -1; @@ -489,7 +488,6 @@ static DEFINE_PER_CPU(struct ftrace_ret_stack *, idle_ret_stack); static void graph_init_task(struct task_struct *t, struct ftrace_ret_stack *ret_stack) { - atomic_set(&t->tracing_graph_pause, 0); atomic_set(&t->trace_overrun, 0); t->ftrace_timestamp = 0; /* make curr_ret_stack visible before we add the ret_stack */ -- cgit v1.2.3 From da7f84cdf02fd5f66864041f45018b328911b722 Mon Sep 17 00:00:00 2001 From: Viktor Rosendahl Date: Tue, 19 Jan 2021 17:43:43 +0100 Subject: tracing: Use pause-on-trace with the latency tracers Earlier, tracing was disabled when reading the trace file. This behavior was changed by commit 06e0a548bad0 ("tracing: Do not disable tracing when reading the trace file"). This doesn't seem to work with the latency tracers. The above-mentioned commit not only changed the behavior but also added an option to emulate the old behavior. The idea with this patch is to enable this pause-on-trace option when the latency tracers are used. 
Link: https://lkml.kernel.org/r/20210119164344.37500-2-Viktor.Rosendahl@bmw.de Cc: stable@vger.kernel.org Fixes: 06e0a548bad0 ("tracing: Do not disable tracing when reading the trace file") Signed-off-by: Viktor Rosendahl Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_irqsoff.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index d06aab4dcbb8..6756379b661f 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -562,6 +562,8 @@ static int __irqsoff_tracer_init(struct trace_array *tr) /* non overwrite screws up the latency tracers */ set_tracer_flag(tr, TRACE_ITER_OVERWRITE, 1); set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, 1); + /* without pause, we will produce garbage if another latency occurs */ + set_tracer_flag(tr, TRACE_ITER_PAUSE_ON_TRACE, 1); tr->max_latency = 0; irqsoff_trace = tr; @@ -583,11 +585,13 @@ static void __irqsoff_tracer_reset(struct trace_array *tr) { int lat_flag = save_flags & TRACE_ITER_LATENCY_FMT; int overwrite_flag = save_flags & TRACE_ITER_OVERWRITE; + int pause_flag = save_flags & TRACE_ITER_PAUSE_ON_TRACE; stop_irqsoff_tracer(tr, is_graph(tr)); set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, lat_flag); set_tracer_flag(tr, TRACE_ITER_OVERWRITE, overwrite_flag); + set_tracer_flag(tr, TRACE_ITER_PAUSE_ON_TRACE, pause_flag); ftrace_reset_array_ops(tr); irqsoff_busy = false; -- cgit v1.2.3 From 97c753e62e6c31a404183898d950d8c08d752dbd Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 28 Jan 2021 00:37:51 +0900 Subject: tracing/kprobe: Fix to support kretprobe events on unloaded modules Fix kprobe_on_func_entry() returns error code instead of false so that register_kretprobe() can return an appropriate error code. append_trace_kprobe() expects the kprobe registration returns -ENOENT when the target symbol is not found, and it checks whether the target module is unloaded or not. If the target module doesn't exist, it defers to probe the target symbol until the module is loaded. However, since register_kretprobe() returns -EINVAL instead of -ENOENT in that case, it always fail on putting the kretprobe event on unloaded modules. e.g. Kprobe event: /sys/kernel/debug/tracing # echo p xfs:xfs_end_io >> kprobe_events [ 16.515574] trace_kprobe: This probe might be able to register after target module is loaded. Continue. Kretprobe event: (p -> r) /sys/kernel/debug/tracing # echo r xfs:xfs_end_io >> kprobe_events sh: write error: Invalid argument /sys/kernel/debug/tracing # cat error_log [ 41.122514] trace_kprobe: error: Failed to register probe event Command: r xfs:xfs_end_io ^ To fix this bug, change kprobe_on_func_entry() to detect symbol lookup failure and return -ENOENT in that case. Otherwise it returns -EINVAL or 0 (succeeded, given address is on the entry). 
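Put differently, callers are expected to consume the new return values roughly as in the sketch below. This condenses the trace_kprobe_create() hunk further down; the label name is illustrative only.

    ret = kprobe_on_func_entry(NULL, symbol, offset);
    if (ret == 0)
            flags |= TPARG_FL_FENTRY;       /* symbol+offset is a function entry */
    else if (ret == -EINVAL && is_return)
            goto parse_error;               /* definitely not an entry: bad retprobe */
    /* ret == -ENOENT: symbol not resolvable yet, defer until the module loads */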
Link: https://lkml.kernel.org/r/161176187132.1067016.8118042342894378981.stgit@devnote2 Cc: stable@vger.kernel.org Fixes: 59158ec4aef7 ("tracing/kprobes: Check the probe on unloaded module correctly") Reported-by: Jianlin Lv Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- include/linux/kprobes.h | 2 +- kernel/kprobes.c | 34 +++++++++++++++++++++++++--------- kernel/trace/trace_kprobe.c | 10 ++++++---- 3 files changed, 32 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h index b3a36b0cfc81..1883a4a9f16a 100644 --- a/include/linux/kprobes.h +++ b/include/linux/kprobes.h @@ -266,7 +266,7 @@ extern void kprobes_inc_nmissed_count(struct kprobe *p); extern bool arch_within_kprobe_blacklist(unsigned long addr); extern int arch_populate_kprobe_blacklist(void); extern bool arch_kprobe_on_func_entry(unsigned long offset); -extern bool kprobe_on_func_entry(kprobe_opcode_t *addr, const char *sym, unsigned long offset); +extern int kprobe_on_func_entry(kprobe_opcode_t *addr, const char *sym, unsigned long offset); extern bool within_kprobe_blacklist(unsigned long addr); extern int kprobe_add_ksym_blacklist(unsigned long entry); diff --git a/kernel/kprobes.c b/kernel/kprobes.c index f7fb5d135930..1a5bc321e0a5 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -1954,29 +1954,45 @@ bool __weak arch_kprobe_on_func_entry(unsigned long offset) return !offset; } -bool kprobe_on_func_entry(kprobe_opcode_t *addr, const char *sym, unsigned long offset) +/** + * kprobe_on_func_entry() -- check whether given address is function entry + * @addr: Target address + * @sym: Target symbol name + * @offset: The offset from the symbol or the address + * + * This checks whether the given @addr+@offset or @sym+@offset is on the + * function entry address or not. + * This returns 0 if it is the function entry, or -EINVAL if it is not. + * And also it returns -ENOENT if it fails the symbol or address lookup. + * Caller must pass @addr or @sym (either one must be NULL), or this + * returns -EINVAL. + */ +int kprobe_on_func_entry(kprobe_opcode_t *addr, const char *sym, unsigned long offset) { kprobe_opcode_t *kp_addr = _kprobe_addr(addr, sym, offset); if (IS_ERR(kp_addr)) - return false; + return PTR_ERR(kp_addr); - if (!kallsyms_lookup_size_offset((unsigned long)kp_addr, NULL, &offset) || - !arch_kprobe_on_func_entry(offset)) - return false; + if (!kallsyms_lookup_size_offset((unsigned long)kp_addr, NULL, &offset)) + return -ENOENT; - return true; + if (!arch_kprobe_on_func_entry(offset)) + return -EINVAL; + + return 0; } int register_kretprobe(struct kretprobe *rp) { - int ret = 0; + int ret; struct kretprobe_instance *inst; int i; void *addr; - if (!kprobe_on_func_entry(rp->kp.addr, rp->kp.symbol_name, rp->kp.offset)) - return -EINVAL; + ret = kprobe_on_func_entry(rp->kp.addr, rp->kp.symbol_name, rp->kp.offset); + if (ret) + return ret; if (kretprobe_blacklist_size) { addr = kprobe_addr(&rp->kp); diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index e6fba1798771..56c7fbff7bd7 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -221,9 +221,9 @@ bool trace_kprobe_on_func_entry(struct trace_event_call *call) { struct trace_kprobe *tk = trace_kprobe_primary_from_call(call); - return tk ? kprobe_on_func_entry(tk->rp.kp.addr, + return tk ? (kprobe_on_func_entry(tk->rp.kp.addr, tk->rp.kp.addr ? NULL : tk->rp.kp.symbol_name, - tk->rp.kp.addr ? 
0 : tk->rp.kp.offset) : false; + tk->rp.kp.addr ? 0 : tk->rp.kp.offset) == 0) : false; } bool trace_kprobe_error_injectable(struct trace_event_call *call) @@ -828,9 +828,11 @@ static int trace_kprobe_create(int argc, const char *argv[]) } if (is_return) flags |= TPARG_FL_RETURN; - if (kprobe_on_func_entry(NULL, symbol, offset)) + ret = kprobe_on_func_entry(NULL, symbol, offset); + if (ret == 0) flags |= TPARG_FL_FENTRY; - if (offset && is_return && !(flags & TPARG_FL_FENTRY)) { + /* Defer the ENOENT case until register kprobe */ + if (ret == -EINVAL && is_return) { trace_probe_log_err(0, BAD_RETPROBE); goto parse_error; } -- cgit v1.2.3 From 0188b87899ffc4a1d36a0badbe77d56c92fd91dc Mon Sep 17 00:00:00 2001 From: Wang ShaoBo Date: Thu, 28 Jan 2021 20:44:27 +0800 Subject: kretprobe: Avoid re-registration of the same kretprobe earlier Our system encountered a re-init error when re-registering same kretprobe, where the kretprobe_instance in rp->free_instances is illegally accessed after re-init. Implementation to avoid re-registration has been introduced for kprobe before, but lags for register_kretprobe(). We must check if kprobe has been re-registered before re-initializing kretprobe, otherwise it will destroy the data struct of kretprobe registered, which can lead to memory leak, system crash, also some unexpected behaviors. We use check_kprobe_rereg() to check if kprobe has been re-registered before running register_kretprobe()'s body, for giving a warning message and terminate registration process. Link: https://lkml.kernel.org/r/20210128124427.2031088-1-bobo.shaobowang@huawei.com Cc: stable@vger.kernel.org Fixes: 1f0ab40976460 ("kprobes: Prevent re-registration of the same kprobe") [ The above commit should have been done for kretprobes too ] Acked-by: Naveen N. Rao Acked-by: Ananth N Mavinakayanahalli Acked-by: Masami Hiramatsu Signed-off-by: Wang ShaoBo Signed-off-by: Cheng Jian Signed-off-by: Steven Rostedt (VMware) --- kernel/kprobes.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 1a5bc321e0a5..d5a3eb74a657 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -1994,6 +1994,10 @@ int register_kretprobe(struct kretprobe *rp) if (ret) return ret; + /* If only rp->kp.addr is specified, check reregistering kprobes */ + if (rp->kp.addr && check_kprobe_rereg(&rp->kp)) + return -EINVAL; + if (kretprobe_blacklist_size) { addr = kprobe_addr(&rp->kp); if (IS_ERR(addr)) -- cgit v1.2.3 From 4c457e8cb75eda91906a4f89fc39bde3f9a43922 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Sat, 23 Jan 2021 12:27:59 +0000 Subject: genirq/msi: Activate Multi-MSI early when MSI_FLAG_ACTIVATE_EARLY is set When MSI_FLAG_ACTIVATE_EARLY is set (which is the case for PCI), __msi_domain_alloc_irqs() performs the activation of the interrupt (which in the case of PCI results in the endpoint being programmed) as soon as the interrupt is allocated. But it appears that this is only done for the first vector, introducing an inconsistent behaviour for PCI Multi-MSI. Fix it by iterating over the number of vectors allocated to each MSI descriptor. This is easily achieved by introducing a new "for_each_msi_vector" iterator, together with a tiny bit of refactoring. 
Fixes: f3b0946d629c ("genirq/msi: Make sure PCI MSIs are activated early") Reported-by: Shameer Kolothum Signed-off-by: Marc Zyngier Signed-off-by: Thomas Gleixner Tested-by: Shameer Kolothum Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20210123122759.1781359-1-maz@kernel.org --- include/linux/msi.h | 6 ++++++ kernel/irq/msi.c | 44 ++++++++++++++++++++------------------------ 2 files changed, 26 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/include/linux/msi.h b/include/linux/msi.h index 360a0a7e7341..aef35fd1cf11 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -178,6 +178,12 @@ struct msi_desc { list_for_each_entry((desc), dev_to_msi_list((dev)), list) #define for_each_msi_entry_safe(desc, tmp, dev) \ list_for_each_entry_safe((desc), (tmp), dev_to_msi_list((dev)), list) +#define for_each_msi_vector(desc, __irq, dev) \ + for_each_msi_entry((desc), (dev)) \ + if ((desc)->irq) \ + for (__irq = (desc)->irq; \ + __irq < ((desc)->irq + (desc)->nvec_used); \ + __irq++) #ifdef CONFIG_IRQ_MSI_IOMMU static inline const void *msi_desc_get_iommu_cookie(struct msi_desc *desc) diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index dc0e2d7fbdfd..b338d622f26e 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -436,22 +436,22 @@ int __msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev, can_reserve = msi_check_reservation_mode(domain, info, dev); - for_each_msi_entry(desc, dev) { - virq = desc->irq; - if (desc->nvec_used == 1) - dev_dbg(dev, "irq %d for MSI\n", virq); - else + /* + * This flag is set by the PCI layer as we need to activate + * the MSI entries before the PCI layer enables MSI in the + * card. Otherwise the card latches a random msi message. + */ + if (!(info->flags & MSI_FLAG_ACTIVATE_EARLY)) + goto skip_activate; + + for_each_msi_vector(desc, i, dev) { + if (desc->irq == i) { + virq = desc->irq; dev_dbg(dev, "irq [%d-%d] for MSI\n", virq, virq + desc->nvec_used - 1); - /* - * This flag is set by the PCI layer as we need to activate - * the MSI entries before the PCI layer enables MSI in the - * card. Otherwise the card latches a random msi message. - */ - if (!(info->flags & MSI_FLAG_ACTIVATE_EARLY)) - continue; + } - irq_data = irq_domain_get_irq_data(domain, desc->irq); + irq_data = irq_domain_get_irq_data(domain, i); if (!can_reserve) { irqd_clr_can_reserve(irq_data); if (domain->flags & IRQ_DOMAIN_MSI_NOMASK_QUIRK) @@ -462,28 +462,24 @@ int __msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev, goto cleanup; } +skip_activate: /* * If these interrupts use reservation mode, clear the activated bit * so request_irq() will assign the final vector. 
*/ if (can_reserve) { - for_each_msi_entry(desc, dev) { - irq_data = irq_domain_get_irq_data(domain, desc->irq); + for_each_msi_vector(desc, i, dev) { + irq_data = irq_domain_get_irq_data(domain, i); irqd_clr_activated(irq_data); } } return 0; cleanup: - for_each_msi_entry(desc, dev) { - struct irq_data *irqd; - - if (desc->irq == virq) - break; - - irqd = irq_domain_get_irq_data(domain, desc->irq); - if (irqd_is_activated(irqd)) - irq_domain_deactivate_irq(irqd); + for_each_msi_vector(desc, i, dev) { + irq_data = irq_domain_get_irq_data(domain, i); + if (irqd_is_activated(irq_data)) + irq_domain_deactivate_irq(irq_data); } msi_domain_free_irqs(domain, dev); return ret; -- cgit v1.2.3 From 2a6c6b7d7ad346f0679d0963cb19b3f0ea7ef32c Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Thu, 28 Jan 2021 14:40:07 -0800 Subject: perf/core: Add PERF_SAMPLE_WEIGHT_STRUCT Current PERF_SAMPLE_WEIGHT sample type is very useful to expresses the cost of an action represented by the sample. This allows the profiler to scale the samples to be more informative to the programmer. It could also help to locate a hotspot, e.g., when profiling by memory latencies, the expensive load appear higher up in the histograms. But current PERF_SAMPLE_WEIGHT sample type is solely determined by one factor. This could be a problem, if users want two or more factors to contribute to the weight. For example, Golden Cove core PMU can provide both the instruction latency and the cache Latency information as factors for the memory profiling. For current X86 platforms, although meminfo::latency is defined as a u64, only the lower 32 bits include the valid data in practice (No memory access could last than 4G cycles). The higher 32 bits can be used to store new factors. Add a new sample type, PERF_SAMPLE_WEIGHT_STRUCT, to indicate the new sample weight structure. It shares the same space as the PERF_SAMPLE_WEIGHT sample type. Users can apply either the PERF_SAMPLE_WEIGHT sample type or the PERF_SAMPLE_WEIGHT_STRUCT sample type to retrieve the sample weight, but they cannot apply both sample types simultaneously. Currently, only X86 and PowerPC use the PERF_SAMPLE_WEIGHT sample type. - For PowerPC, there is nothing changed for the PERF_SAMPLE_WEIGHT sample type. There is no effect for the new PERF_SAMPLE_WEIGHT_STRUCT sample type. PowerPC can re-struct the weight field similarly later. - For X86, the same value will be dumped for the PERF_SAMPLE_WEIGHT sample type or the PERF_SAMPLE_WEIGHT_STRUCT sample type for now. The following patches will apply the new factors for the PERF_SAMPLE_WEIGHT_STRUCT sample type. The field in the union perf_sample_weight should be shared among different architectures. A generic name is required, but it's hard to abstract a name that applies to all architectures. For example, on X86, the fields are to store all kinds of latency. While on PowerPC, it stores MMCRA[TECX/TECM], which should not be latency. So a general name prefix 'var$NUM' is used here. 
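To illustrate how the shared space is meant to be consumed, a user-space tool reading samples would decode the weight roughly as below, depending on which of the two mutually exclusive sample types it requested. This is only a sketch and assumes the uapi definitions added by the hunks that follow.

    /* Sketch of a perf sample consumer; needs <stdio.h> and a uapi
     * <linux/perf_event.h> that already carries union perf_sample_weight.
     */
    static void decode_weight(__u64 sample_type, union perf_sample_weight w)
    {
            if (sample_type & PERF_SAMPLE_WEIGHT) {
                    /* a single factor, e.g. memory access latency */
                    printf("weight: %llu\n", (unsigned long long)w.full);
            } else if (sample_type & PERF_SAMPLE_WEIGHT_STRUCT) {
                    /* up to three factors share the same 64 bits */
                    printf("var1: %u var2: %u var3: %u\n",
                           w.var1_dw, w.var2_w, w.var3_w);
            }
    }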
Suggested-by: Peter Zijlstra (Intel) Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/1611873611-156687-2-git-send-email-kan.liang@linux.intel.com --- arch/powerpc/perf/core-book3s.c | 2 +- arch/x86/events/intel/ds.c | 17 +++++++++-------- include/linux/perf_event.h | 4 ++-- include/uapi/linux/perf_event.h | 42 +++++++++++++++++++++++++++++++++++++++-- kernel/events/core.c | 11 +++++++---- 5 files changed, 59 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c index 28206b1fe172..869d999a836e 100644 --- a/arch/powerpc/perf/core-book3s.c +++ b/arch/powerpc/perf/core-book3s.c @@ -2195,7 +2195,7 @@ static void record_and_restart(struct perf_event *event, unsigned long val, if (event->attr.sample_type & PERF_SAMPLE_WEIGHT && ppmu->get_mem_weight) - ppmu->get_mem_weight(&data.weight); + ppmu->get_mem_weight(&data.weight.full); if (perf_event_overflow(event, &data, regs)) power_pmu_stop(event, 0); diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index 67dbc91bccfe..2f54b1fbb895 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -960,7 +960,8 @@ static void adaptive_pebs_record_size_update(void) } #define PERF_PEBS_MEMINFO_TYPE (PERF_SAMPLE_ADDR | PERF_SAMPLE_DATA_SRC | \ - PERF_SAMPLE_PHYS_ADDR | PERF_SAMPLE_WEIGHT | \ + PERF_SAMPLE_PHYS_ADDR | \ + PERF_SAMPLE_WEIGHT_TYPE | \ PERF_SAMPLE_TRANSACTION | \ PERF_SAMPLE_DATA_PAGE_SIZE) @@ -987,7 +988,7 @@ static u64 pebs_update_adaptive_cfg(struct perf_event *event) gprs = (sample_type & PERF_SAMPLE_REGS_INTR) && (attr->sample_regs_intr & PEBS_GP_REGS); - tsx_weight = (sample_type & PERF_SAMPLE_WEIGHT) && + tsx_weight = (sample_type & PERF_SAMPLE_WEIGHT_TYPE) && ((attr->config & INTEL_ARCH_EVENT_MASK) == x86_pmu.rtm_abort_event); @@ -1369,8 +1370,8 @@ static void setup_pebs_fixed_sample_data(struct perf_event *event, /* * Use latency for weight (only avail with PEBS-LL) */ - if (fll && (sample_type & PERF_SAMPLE_WEIGHT)) - data->weight = pebs->lat; + if (fll && (sample_type & PERF_SAMPLE_WEIGHT_TYPE)) + data->weight.full = pebs->lat; /* * data.data_src encodes the data source @@ -1462,8 +1463,8 @@ static void setup_pebs_fixed_sample_data(struct perf_event *event, if (x86_pmu.intel_cap.pebs_format >= 2) { /* Only set the TSX weight when no memory weight. 
*/ - if ((sample_type & PERF_SAMPLE_WEIGHT) && !fll) - data->weight = intel_get_tsx_weight(pebs->tsx_tuning); + if ((sample_type & PERF_SAMPLE_WEIGHT_TYPE) && !fll) + data->weight.full = intel_get_tsx_weight(pebs->tsx_tuning); if (sample_type & PERF_SAMPLE_TRANSACTION) data->txn = intel_get_tsx_transaction(pebs->tsx_tuning, @@ -1577,8 +1578,8 @@ static void setup_pebs_adaptive_sample_data(struct perf_event *event, } if (format_size & PEBS_DATACFG_MEMINFO) { - if (sample_type & PERF_SAMPLE_WEIGHT) - data->weight = meminfo->latency ?: + if (sample_type & PERF_SAMPLE_WEIGHT_TYPE) + data->weight.full = meminfo->latency ?: intel_get_tsx_weight(meminfo->tsx_tuning); if (sample_type & PERF_SAMPLE_DATA_SRC) diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 9a38f579bc76..fab42cfbd350 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -998,7 +998,7 @@ struct perf_sample_data { struct perf_raw_record *raw; struct perf_branch_stack *br_stack; u64 period; - u64 weight; + union perf_sample_weight weight; u64 txn; union perf_mem_data_src data_src; @@ -1047,7 +1047,7 @@ static inline void perf_sample_data_init(struct perf_sample_data *data, data->raw = NULL; data->br_stack = NULL; data->period = period; - data->weight = 0; + data->weight.full = 0; data->data_src.val = PERF_MEM_NA; data->txn = 0; } diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index b15e3447cd9f..b2cc246ec119 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -145,12 +145,14 @@ enum perf_event_sample_format { PERF_SAMPLE_CGROUP = 1U << 21, PERF_SAMPLE_DATA_PAGE_SIZE = 1U << 22, PERF_SAMPLE_CODE_PAGE_SIZE = 1U << 23, + PERF_SAMPLE_WEIGHT_STRUCT = 1U << 24, - PERF_SAMPLE_MAX = 1U << 24, /* non-ABI */ + PERF_SAMPLE_MAX = 1U << 25, /* non-ABI */ __PERF_SAMPLE_CALLCHAIN_EARLY = 1ULL << 63, /* non-ABI; internal use */ }; +#define PERF_SAMPLE_WEIGHT_TYPE (PERF_SAMPLE_WEIGHT | PERF_SAMPLE_WEIGHT_STRUCT) /* * values to program into branch_sample_type when PERF_SAMPLE_BRANCH is set * @@ -890,7 +892,24 @@ enum perf_event_type { * char data[size]; * u64 dyn_size; } && PERF_SAMPLE_STACK_USER * - * { u64 weight; } && PERF_SAMPLE_WEIGHT + * { union perf_sample_weight + * { + * u64 full; && PERF_SAMPLE_WEIGHT + * #if defined(__LITTLE_ENDIAN_BITFIELD) + * struct { + * u32 var1_dw; + * u16 var2_w; + * u16 var3_w; + * } && PERF_SAMPLE_WEIGHT_STRUCT + * #elif defined(__BIG_ENDIAN_BITFIELD) + * struct { + * u16 var3_w; + * u16 var2_w; + * u32 var1_dw; + * } && PERF_SAMPLE_WEIGHT_STRUCT + * #endif + * } + * } * { u64 data_src; } && PERF_SAMPLE_DATA_SRC * { u64 transaction; } && PERF_SAMPLE_TRANSACTION * { u64 abi; # enum perf_sample_regs_abi @@ -1248,4 +1267,23 @@ struct perf_branch_entry { reserved:40; }; +union perf_sample_weight { + __u64 full; +#if defined(__LITTLE_ENDIAN_BITFIELD) + struct { + __u32 var1_dw; + __u16 var2_w; + __u16 var3_w; + }; +#elif defined(__BIG_ENDIAN_BITFIELD) + struct { + __u16 var3_w; + __u16 var2_w; + __u32 var1_dw; + }; +#else +#error "Unknown endianness" +#endif +}; + #endif /* _UAPI_LINUX_PERF_EVENT_H */ diff --git a/kernel/events/core.c b/kernel/events/core.c index 55d18791a72d..5206097d4d3d 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1879,8 +1879,8 @@ static void __perf_event_header_size(struct perf_event *event, u64 sample_type) if (sample_type & PERF_SAMPLE_PERIOD) size += sizeof(data->period); - if (sample_type & PERF_SAMPLE_WEIGHT) - size += sizeof(data->weight); + if (sample_type & 
PERF_SAMPLE_WEIGHT_TYPE) + size += sizeof(data->weight.full); if (sample_type & PERF_SAMPLE_READ) size += event->read_size; @@ -6907,8 +6907,8 @@ void perf_output_sample(struct perf_output_handle *handle, data->regs_user.regs); } - if (sample_type & PERF_SAMPLE_WEIGHT) - perf_output_put(handle, data->weight); + if (sample_type & PERF_SAMPLE_WEIGHT_TYPE) + perf_output_put(handle, data->weight.full); if (sample_type & PERF_SAMPLE_DATA_SRC) perf_output_put(handle, data->data_src.val); @@ -11564,6 +11564,9 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr, if (attr->sample_type & PERF_SAMPLE_CGROUP) return -EINVAL; #endif + if ((attr->sample_type & PERF_SAMPLE_WEIGHT) && + (attr->sample_type & PERF_SAMPLE_WEIGHT_STRUCT)) + return -EINVAL; out: return ret; -- cgit v1.2.3 From 37086bfdc737ea6f66bf68dcf16757004d68e1e1 Mon Sep 17 00:00:00 2001 From: Brendan Jackman Date: Tue, 2 Feb 2021 13:50:02 +0000 Subject: bpf: Propagate stack bounds to registers in atomics w/ BPF_FETCH When BPF_FETCH is set, atomic instructions load a value from memory into a register. The current verifier code first checks via check_mem_access whether we can access the memory, and then checks via check_reg_arg whether we can write into the register. For loads, check_reg_arg has the side-effect of marking the register's value as unkonwn, and check_mem_access has the side effect of propagating bounds from memory to the register. This currently only takes effect for stack memory. Therefore with the current order, bounds information is thrown away, but by simply reversing the order of check_reg_arg vs. check_mem_access, we can instead propagate bounds smartly. A simple test is added with an infinite loop that can only be proved unreachable if this propagation is present. This is implemented both with C and directly in test_verifier using assembly. Suggested-by: John Fastabend Signed-off-by: Brendan Jackman Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20210202135002.4024825-1-jackmanb@google.com --- kernel/bpf/verifier.c | 32 ++++++++++++---------- .../selftests/bpf/prog_tests/atomic_bounds.c | 15 ++++++++++ tools/testing/selftests/bpf/progs/atomic_bounds.c | 24 ++++++++++++++++ .../testing/selftests/bpf/verifier/atomic_bounds.c | 27 ++++++++++++++++++ 4 files changed, 84 insertions(+), 14 deletions(-) create mode 100644 tools/testing/selftests/bpf/prog_tests/atomic_bounds.c create mode 100644 tools/testing/selftests/bpf/progs/atomic_bounds.c create mode 100644 tools/testing/selftests/bpf/verifier/atomic_bounds.c (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 972fc38eb62d..5e09632efddb 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3665,9 +3665,26 @@ static int check_atomic(struct bpf_verifier_env *env, int insn_idx, struct bpf_i return -EACCES; } + if (insn->imm & BPF_FETCH) { + if (insn->imm == BPF_CMPXCHG) + load_reg = BPF_REG_0; + else + load_reg = insn->src_reg; + + /* check and record load of old value */ + err = check_reg_arg(env, load_reg, DST_OP); + if (err) + return err; + } else { + /* This instruction accesses a memory location but doesn't + * actually load it into a register. 
+ */ + load_reg = -1; + } + /* check whether we can read the memory */ err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off, - BPF_SIZE(insn->code), BPF_READ, -1, true); + BPF_SIZE(insn->code), BPF_READ, load_reg, true); if (err) return err; @@ -3677,19 +3694,6 @@ static int check_atomic(struct bpf_verifier_env *env, int insn_idx, struct bpf_i if (err) return err; - if (!(insn->imm & BPF_FETCH)) - return 0; - - if (insn->imm == BPF_CMPXCHG) - load_reg = BPF_REG_0; - else - load_reg = insn->src_reg; - - /* check and record load of old value */ - err = check_reg_arg(env, load_reg, DST_OP); - if (err) - return err; - return 0; } diff --git a/tools/testing/selftests/bpf/prog_tests/atomic_bounds.c b/tools/testing/selftests/bpf/prog_tests/atomic_bounds.c new file mode 100644 index 000000000000..addf127068e4 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/atomic_bounds.c @@ -0,0 +1,15 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include + +#include "atomic_bounds.skel.h" + +void test_atomic_bounds(void) +{ + struct atomic_bounds *skel; + __u32 duration = 0; + + skel = atomic_bounds__open_and_load(); + if (CHECK(!skel, "skel_load", "couldn't load program\n")) + return; +} diff --git a/tools/testing/selftests/bpf/progs/atomic_bounds.c b/tools/testing/selftests/bpf/progs/atomic_bounds.c new file mode 100644 index 000000000000..e5fff7fc7f8f --- /dev/null +++ b/tools/testing/selftests/bpf/progs/atomic_bounds.c @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include + +#ifdef ENABLE_ATOMICS_TESTS +bool skip_tests __attribute((__section__(".data"))) = false; +#else +bool skip_tests = true; +#endif + +SEC("fentry/bpf_fentry_test1") +int BPF_PROG(sub, int x) +{ +#ifdef ENABLE_ATOMICS_TESTS + int a = 0; + int b = __sync_fetch_and_add(&a, 1); + /* b is certainly 0 here. Can the verifier tell? */ + while (b) + continue; +#endif + return 0; +} diff --git a/tools/testing/selftests/bpf/verifier/atomic_bounds.c b/tools/testing/selftests/bpf/verifier/atomic_bounds.c new file mode 100644 index 000000000000..e82183e4914f --- /dev/null +++ b/tools/testing/selftests/bpf/verifier/atomic_bounds.c @@ -0,0 +1,27 @@ +{ + "BPF_ATOMIC bounds propagation, mem->reg", + .insns = { + /* a = 0; */ + /* + * Note this is implemented with two separate instructions, + * where you might think one would suffice: + * + * BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + * + * This is because BPF_ST_MEM doesn't seem to set the stack slot + * type to 0 when storing an immediate. + */ + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -8), + /* b = atomic_fetch_add(&a, 1); */ + BPF_MOV64_IMM(BPF_REG_1, 1), + BPF_ATOMIC_OP(BPF_DW, BPF_ADD | BPF_FETCH, BPF_REG_10, BPF_REG_1, -8), + /* Verifier should be able to tell that this infinite loop isn't reachable. */ + /* if (b) while (true) continue; */ + BPF_JMP_IMM(BPF_JNE, BPF_REG_1, 0, -1), + BPF_EXIT_INSN(), + }, + .result = ACCEPT, + .result_unpriv = REJECT, + .errstr_unpriv = "back-edge", +}, -- cgit v1.2.3 From 548f1191d86ccb9bde2a5305988877b7584c01eb Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 2 Feb 2021 23:06:36 -0800 Subject: bpf: Unbreak BPF_PROG_TYPE_KPROBE when kprobe is called via do_int3 The commit 0d00449c7a28 ("x86: Replace ist_enter() with nmi_enter()") converted do_int3 handler to be "NMI-like". That made old if (in_nmi()) check abort execution of bpf programs attached to kprobe when kprobe is firing via int3 (For example when kprobe is placed in the middle of the function). 
Remove the check to restore user visible behavior. Fixes: 0d00449c7a28 ("x86: Replace ist_enter() with nmi_enter()") Reported-by: Nikolay Borisov Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Tested-by: Nikolay Borisov Reviewed-by: Masami Hiramatsu Link: https://lore.kernel.org/bpf/20210203070636.70926-1-alexei.starovoitov@gmail.com --- kernel/trace/bpf_trace.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 6c0018abe68a..764400260eb6 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -96,9 +96,6 @@ unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx) { unsigned int ret; - if (in_nmi()) /* not supported yet */ - return 1; - cant_sleep(); if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1)) { -- cgit v1.2.3 From 6183f4d3a0a2ad230511987c6c362ca43ec0055f Mon Sep 17 00:00:00 2001 From: Bui Quang Minh Date: Wed, 27 Jan 2021 06:36:53 +0000 Subject: bpf: Check for integer overflow when using roundup_pow_of_two() On 32-bit architecture, roundup_pow_of_two() can return 0 when the argument has upper most bit set due to resulting 1UL << 32. Add a check for this case. Fixes: d5a3b1f69186 ("bpf: introduce BPF_MAP_TYPE_STACK_TRACE") Signed-off-by: Bui Quang Minh Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20210127063653.3576-1-minhquangbui99@gmail.com --- kernel/bpf/stackmap.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index aea96b638473..bfafbf115bf3 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -115,6 +115,8 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr) /* hash table size must be power of 2 */ n_buckets = roundup_pow_of_two(attr->max_entries); + if (!n_buckets) + return ERR_PTR(-E2BIG); cost = n_buckets * sizeof(struct stack_map_bucket *) + sizeof(*smap); cost += n_buckets * (value_size + sizeof(struct stack_map_bucket)); -- cgit v1.2.3 From ba90c2cc0231124d6de63576e8bdf371e92c8fd3 Mon Sep 17 00:00:00 2001 From: KP Singh Date: Thu, 4 Feb 2021 19:36:21 +0000 Subject: bpf: Allow usage of BPF ringbuffer in sleepable programs The BPF ringbuffer map is pre-allocated and the implementation logic does not rely on disabling preemption or per-cpu data structures. Using the BPF ringbuffer sleepable LSM and tracing programs does not trigger any warnings with DEBUG_ATOMIC_SLEEP, DEBUG_PREEMPT, PROVE_RCU and PROVE_LOCKING and LOCKDEP enabled. This allows helpers like bpf_copy_from_user and bpf_ima_inode_hash to write to the BPF ring buffer from sleepable BPF programs. 
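For illustration, a sleepable LSM program combining the ring buffer with one of these helpers could look roughly like the sketch below. It assumes a libbpf-style build (vmlinux.h, bpf_helpers.h); the hook, map size and names are made up for the example and are not part of this patch.

    #include "vmlinux.h"
    #include <bpf/bpf_helpers.h>
    #include <bpf/bpf_tracing.h>

    struct {
            __uint(type, BPF_MAP_TYPE_RINGBUF);
            __uint(max_entries, 4096);
    } events SEC(".maps");

    SEC("lsm.s/bprm_committed_creds")
    int BPF_PROG(record_exec, struct linux_binprm *bprm)
    {
            void *hash;

            hash = bpf_ringbuf_reserve(&events, 32, 0);
            if (!hash)
                    return 0;
            /* bpf_ima_inode_hash() may sleep, hence the sleepable ("lsm.s") hook */
            bpf_ima_inode_hash(bprm->file->f_inode, hash, 32);
            bpf_ringbuf_submit(hash, 0);
            return 0;
    }

    char LICENSE[] SEC("license") = "GPL";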
Signed-off-by: KP Singh Signed-off-by: Andrii Nakryiko Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210204193622.3367275-2-kpsingh@kernel.org --- kernel/bpf/verifier.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 5e09632efddb..9749081bd26d 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -10024,9 +10024,11 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env, return -EINVAL; } break; + case BPF_MAP_TYPE_RINGBUF: + break; default: verbose(env, - "Sleepable programs can only use array and hash maps\n"); + "Sleepable programs can only use array, hash, and ringbuf maps\n"); return -EINVAL; } -- cgit v1.2.3 From 23a2d70c7a2f28eb1a8f6bc19d68d23968cad0ce Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Thu, 4 Feb 2021 15:48:27 -0800 Subject: bpf: Refactor BPF_PSEUDO_CALL checking as a helper function There is no functionality change. This refactoring intends to facilitate next patch change with BPF_PSEUDO_FUNC. Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20210204234827.1628953-1-yhs@fb.com --- kernel/bpf/verifier.c | 29 +++++++++++++---------------- 1 file changed, 13 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 9749081bd26d..15694246f854 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -228,6 +228,12 @@ static void bpf_map_key_store(struct bpf_insn_aux_data *aux, u64 state) (poisoned ? BPF_MAP_KEY_POISON : 0ULL); } +static bool bpf_pseudo_call(const struct bpf_insn *insn) +{ + return insn->code == (BPF_JMP | BPF_CALL) && + insn->src_reg == BPF_PSEUDO_CALL; +} + struct bpf_call_arg_meta { struct bpf_map *map_ptr; bool raw_mode; @@ -1486,9 +1492,7 @@ static int check_subprogs(struct bpf_verifier_env *env) /* determine subprog starts. The end is one before the next starts */ for (i = 0; i < insn_cnt; i++) { - if (insn[i].code != (BPF_JMP | BPF_CALL)) - continue; - if (insn[i].src_reg != BPF_PSEUDO_CALL) + if (!bpf_pseudo_call(insn + i)) continue; if (!env->bpf_capable) { verbose(env, @@ -3074,9 +3078,7 @@ process_func: continue_func: subprog_end = subprog[idx + 1].start; for (; i < subprog_end; i++) { - if (insn[i].code != (BPF_JMP | BPF_CALL)) - continue; - if (insn[i].src_reg != BPF_PSEUDO_CALL) + if (!bpf_pseudo_call(insn + i)) continue; /* remember insn and function to return to */ ret_insn[frame] = i + 1; @@ -10846,8 +10848,7 @@ static int jit_subprogs(struct bpf_verifier_env *env) return 0; for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) { - if (insn->code != (BPF_JMP | BPF_CALL) || - insn->src_reg != BPF_PSEUDO_CALL) + if (!bpf_pseudo_call(insn)) continue; /* Upon error here we cannot fall back to interpreter but * need a hard reject of the program. Thus -EFAULT is @@ -10976,8 +10977,7 @@ static int jit_subprogs(struct bpf_verifier_env *env) for (i = 0; i < env->subprog_cnt; i++) { insn = func[i]->insnsi; for (j = 0; j < func[i]->len; j++, insn++) { - if (insn->code != (BPF_JMP | BPF_CALL) || - insn->src_reg != BPF_PSEUDO_CALL) + if (!bpf_pseudo_call(insn)) continue; subprog = insn->off; insn->imm = BPF_CAST_CALL(func[subprog]->bpf_func) - @@ -11022,8 +11022,7 @@ static int jit_subprogs(struct bpf_verifier_env *env) * later look the same as if they were interpreted only. 
*/ for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) { - if (insn->code != (BPF_JMP | BPF_CALL) || - insn->src_reg != BPF_PSEUDO_CALL) + if (!bpf_pseudo_call(insn)) continue; insn->off = env->insn_aux_data[i].call_imm; subprog = find_subprog(env, i + insn->off + 1); @@ -11052,8 +11051,7 @@ out_undo_insn: /* cleanup main prog to be interpreted */ prog->jit_requested = 0; for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) { - if (insn->code != (BPF_JMP | BPF_CALL) || - insn->src_reg != BPF_PSEUDO_CALL) + if (!bpf_pseudo_call(insn)) continue; insn->off = 0; insn->imm = env->insn_aux_data[i].call_imm; @@ -11088,8 +11086,7 @@ static int fixup_call_args(struct bpf_verifier_env *env) return -EINVAL; } for (i = 0; i < prog->len; i++, insn++) { - if (insn->code != (BPF_JMP | BPF_CALL) || - insn->src_reg != BPF_PSEUDO_CALL) + if (!bpf_pseudo_call(insn)) continue; depth = get_callee_stack_depth(env, insn, i); if (depth < 0) -- cgit v1.2.3 From 9f5f8ec50165630cfc49897410b30997d4d677b5 Mon Sep 17 00:00:00 2001 From: Barry Song Date: Sat, 6 Feb 2021 00:33:24 +1300 Subject: dma-mapping: benchmark: use u8 for reserved field in uAPI structure The original code put five u32 before a u64 expansion[10] array. Five is odd, this will cause trouble in the extension of the structure by adding new features. This patch moves to use u8 for reserved field to avoid future alignment risk. Meanwhile, it also clears the memory of struct map_benchmark in tools, otherwise, if users use old version to run on newer kernel, the random expansion value will cause side effect on newer kernel. Signed-off-by: Barry Song Signed-off-by: Christoph Hellwig --- kernel/dma/map_benchmark.c | 2 +- tools/testing/selftests/dma/dma_map_benchmark.c | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/map_benchmark.c b/kernel/dma/map_benchmark.c index 1b1b8ff875cb..da95df381483 100644 --- a/kernel/dma/map_benchmark.c +++ b/kernel/dma/map_benchmark.c @@ -36,7 +36,7 @@ struct map_benchmark { __s32 node; /* which numa node this benchmark will run on */ __u32 dma_bits; /* DMA addressing capability */ __u32 dma_dir; /* DMA data direction */ - __u64 expansion[10]; /* For future use */ + __u8 expansion[84]; /* For future use */ }; struct map_benchmark_data { diff --git a/tools/testing/selftests/dma/dma_map_benchmark.c b/tools/testing/selftests/dma/dma_map_benchmark.c index 7065163a8388..537d65968c48 100644 --- a/tools/testing/selftests/dma/dma_map_benchmark.c +++ b/tools/testing/selftests/dma/dma_map_benchmark.c @@ -6,6 +6,7 @@ #include #include #include +#include #include #include #include @@ -35,7 +36,7 @@ struct map_benchmark { __s32 node; /* which numa node this benchmark will run on */ __u32 dma_bits; /* DMA addressing capability */ __u32 dma_dir; /* DMA data direction */ - __u64 expansion[10]; /* For future use */ + __u8 expansion[84]; /* For future use */ }; int main(int argc, char **argv) @@ -102,6 +103,7 @@ int main(int argc, char **argv) exit(1); } + memset(&map, 0, sizeof(map)); map.seconds = seconds; map.threads = threads; map.node = node; -- cgit v1.2.3 From 7f82e631d236cafd28518b998c6d4d8dc2ef68f6 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 1 Feb 2021 11:55:38 +0100 Subject: locking/lockdep: Avoid unmatched unlock Commit f6f48e180404 ("lockdep: Teach lockdep about "USED" <- "IN-NMI" inversions") overlooked that print_usage_bug() releases the graph_lock and called it without the graph lock held. 
Fixes: f6f48e180404 ("lockdep: Teach lockdep about "USED" <- "IN-NMI" inversions") Reported-by: Dmitry Vyukov Signed-off-by: Peter Zijlstra (Intel) Acked-by: Waiman Long Link: https://lkml.kernel.org/r/YBfkuyIfB1+VRxXP@hirez.programming.kicks-ass.net --- kernel/locking/lockdep.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index ad9afd8c7eb9..5104db0e23e4 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -3773,7 +3773,7 @@ static void print_usage_bug(struct task_struct *curr, struct held_lock *this, enum lock_usage_bit prev_bit, enum lock_usage_bit new_bit) { - if (!debug_locks_off_graph_unlock() || debug_locks_silent) + if (!debug_locks_off() || debug_locks_silent) return; pr_warn("\n"); @@ -3814,6 +3814,7 @@ valid_state(struct task_struct *curr, struct held_lock *this, enum lock_usage_bit new_bit, enum lock_usage_bit bad_bit) { if (unlikely(hlock_class(this)->usage_mask & (1 << bad_bit))) { + graph_unlock(); print_usage_bug(curr, this, bad_bit, new_bit); return 0; } -- cgit v1.2.3 From 24c242ec7abb3d21fa0b1da6bb251521dc1717b5 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Mon, 25 Jan 2021 15:30:39 +0100 Subject: ntp: Use freezable workqueue for RTC synchronization The bug fixed by commit e3fab2f3de081e98 ("ntp: Fix RTC synchronization on 32-bit platforms") revealed an underlying issue: RTC synchronization may happen anytime, even while the system is partially suspended. On systems where the RTC is connected to an I2C bus, the I2C bus controller may already or still be suspended, triggering a WARNING during suspend or resume from s2ram: WARNING: CPU: 0 PID: 124 at drivers/i2c/i2c-core.h:54 __i2c_transfer+0x634/0x680 i2c i2c-6: Transfer while suspended [...] Workqueue: events_power_efficient sync_hw_clock [...] (__i2c_transfer) (i2c_transfer) (regmap_i2c_read) ... (da9063_rtc_set_time) (rtc_set_time) (sync_hw_clock) (process_one_work) Fix this race condition by using the freezable instead of the normal power-efficient workqueue. Signed-off-by: Geert Uytterhoeven Signed-off-by: Thomas Gleixner Acked-by: Rafael J. Wysocki Link: https://lore.kernel.org/r/20210125143039.1051912-1-geert+renesas@glider.be --- kernel/time/ntp.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 87389b9e21ab..5247afd7f345 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -502,7 +502,7 @@ static struct hrtimer sync_hrtimer; static enum hrtimer_restart sync_timer_callback(struct hrtimer *timer) { - queue_work(system_power_efficient_wq, &sync_work); + queue_work(system_freezable_power_efficient_wq, &sync_work); return HRTIMER_NORESTART; } @@ -668,7 +668,7 @@ void ntp_notify_cmos_timer(void) * just a pointless work scheduled. */ if (ntp_synced() && !hrtimer_is_queued(&sync_hrtimer)) - queue_work(system_power_efficient_wq, &sync_work); + queue_work(system_freezable_power_efficient_wq, &sync_work); } static void __init ntp_init_cmos_sync(void) -- cgit v1.2.3 From b5c28ea601b801d0ecd5ec703b8d54f77bfe5365 Mon Sep 17 00:00:00 2001 From: Alexandre Belloni Date: Tue, 2 Feb 2021 02:34:57 +0100 Subject: alarmtimer: Update kerneldoc Update kerneldoc comments to reflect the actual arguments and return values of the documented functions. 
Signed-off-by: Alexandre Belloni Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20210202013457.3482388-1-alexandre.belloni@bootlin.com --- kernel/time/alarmtimer.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index f4ace1bf8382..98d7a15e8cf6 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -527,8 +527,11 @@ static enum alarmtimer_type clock2alarm(clockid_t clockid) /** * alarm_handle_timer - Callback for posix timers * @alarm: alarm that fired + * @now: time at the timer expiration * * Posix timer callback for expired alarm timers. + * + * Return: whether the timer is to be restarted */ static enum alarmtimer_restart alarm_handle_timer(struct alarm *alarm, ktime_t now) @@ -715,8 +718,11 @@ static int alarm_timer_create(struct k_itimer *new_timer) /** * alarmtimer_nsleep_wakeup - Wakeup function for alarm_timer_nsleep * @alarm: ptr to alarm that fired + * @now: time at the timer expiration * * Wakes up the task that set the alarmtimer + * + * Return: ALARMTIMER_NORESTART */ static enum alarmtimer_restart alarmtimer_nsleep_wakeup(struct alarm *alarm, ktime_t now) @@ -733,6 +739,7 @@ static enum alarmtimer_restart alarmtimer_nsleep_wakeup(struct alarm *alarm, * alarmtimer_do_nsleep - Internal alarmtimer nsleep implementation * @alarm: ptr to alarmtimer * @absexp: absolute expiration time + * @type: alarm type (BOOTTIME/REALTIME). * * Sets the alarm timer and sleeps until it is fired or interrupted. */ @@ -806,7 +813,6 @@ static long __sched alarm_timer_nsleep_restart(struct restart_block *restart) * @which_clock: clockid * @flags: determins abstime or relative * @tsreq: requested sleep time (abs or rel) - * @rmtp: remaining sleep time saved * * Handles clock_nanosleep calls against _ALARM clockids */ -- cgit v1.2.3 From 174bcc691f44fdd05046c694fc650933819f72c7 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Tue, 29 Dec 2020 00:54:02 +0300 Subject: timens: Delete no-op time_ns_init() Signed-off-by: Alexey Dobriyan Signed-off-by: Thomas Gleixner Acked-by: Andrei Vagin Link: https://lore.kernel.org/r/20201228215402.GA572900@localhost.localdomain --- kernel/time/namespace.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'kernel') diff --git a/kernel/time/namespace.c b/kernel/time/namespace.c index 6ca625f5e554..12eab0d2ae28 100644 --- a/kernel/time/namespace.c +++ b/kernel/time/namespace.c @@ -465,9 +465,3 @@ struct time_namespace init_time_ns = { .ns.ops = &timens_operations, .frozen_offsets = true, }; - -static int __init time_ns_init(void) -{ - return 0; -} -subsys_initcall(time_ns_init); -- cgit v1.2.3 From 55b6f763d8bcb5546997933105d66d3e6b080e6a Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 4 Feb 2021 18:32:28 -0800 Subject: init/gcov: allow CONFIG_CONSTRUCTORS on UML to fix module gcov On ARCH=um, loading a module doesn't result in its constructors getting called, which breaks module gcov since the debugfs files are never registered. On the other hand, in-kernel constructors have already been called by the dynamic linker, so we can't call them again. Get out of this conundrum by allowing CONFIG_CONSTRUCTORS to be selected, but avoiding the in-kernel constructor calls. Also remove the "if !UML" from GCOV selecting CONSTRUCTORS now, since we really do want CONSTRUCTORS, just not kernel binary ones. 
Link: https://lkml.kernel.org/r/20210120172041.c246a2cac2fb.I1358f584b76f1898373adfed77f4462c8705b736@changeid Signed-off-by: Johannes Berg Reviewed-by: Peter Oberparleiter Cc: Arnd Bergmann Cc: Jessica Yu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- init/Kconfig | 1 - init/main.c | 8 +++++++- kernel/gcov/Kconfig | 2 +- 3 files changed, 8 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/init/Kconfig b/init/Kconfig index b77c60f8b963..29ad68325028 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -76,7 +76,6 @@ config CC_HAS_ASM_INLINE config CONSTRUCTORS bool - depends on !UML config IRQ_WORK bool diff --git a/init/main.c b/init/main.c index c68d784376ca..a626e78dbf06 100644 --- a/init/main.c +++ b/init/main.c @@ -1066,7 +1066,13 @@ asmlinkage __visible void __init __no_sanitize_address start_kernel(void) /* Call all constructor functions linked into the kernel. */ static void __init do_ctors(void) { -#ifdef CONFIG_CONSTRUCTORS +/* + * For UML, the constructors have already been called by the + * normal setup code as it's just a normal ELF binary, so we + * cannot do it again - but we do need CONFIG_CONSTRUCTORS + * even on UML for modules. + */ +#if defined(CONFIG_CONSTRUCTORS) && !defined(CONFIG_UML) ctor_fn_t *fn = (ctor_fn_t *) __ctors_start; for (; fn < (ctor_fn_t *) __ctors_end; fn++) diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig index 3110c77230c7..f62de2dea8a3 100644 --- a/kernel/gcov/Kconfig +++ b/kernel/gcov/Kconfig @@ -4,7 +4,7 @@ menu "GCOV-based kernel profiling" config GCOV_KERNEL bool "Enable gcov-based kernel profiling" depends on DEBUG_FS - select CONSTRUCTORS if !UML + select CONSTRUCTORS default n help This option enables gcov-based code profiling (e.g. for code coverage -- cgit v1.2.3 From 256cfdd6fdf70c6fcf0f7c8ddb0ebd73ce8f3bc9 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Fri, 5 Feb 2021 15:40:04 -0500 Subject: tracing: Do not count ftrace events in top level enable output The file /sys/kernel/tracing/events/enable is used to enable all events by echoing in "1", or to disable all events by echoing in "0". To know if all events are enabled, disabled, or some are enabled but not all of them, reading the file should show either "1" (all enabled), "0" (all disabled), or "X" (some enabled but not all of them). This works the same as the "enable" files in the individual system directories (like tracing/events/sched/enable). But when all events are enabled, the top level "enable" file shows "X". The reason is that it is checking the "ftrace" events, which are special events that only exist for their format files. These include the format for the function tracer events, which are enabled when the function tracer is enabled, but not by the "enable" file. The check includes these events, which will always be disabled, and even though all true events are enabled, the top level "enable" file will show "X" instead of "1". To fix this, have the check test the event's flags to see if it has the "IGNORE_ENABLE" flag set, and if so, not test it. 
Cc: stable@vger.kernel.org Fixes: 553552ce1796c ("tracing: Combine event filter_active and enable into single flags field") Reported-by: "Yordan Karadzhov (VMware)" Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_events.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index e9d28eeccb7e..d387b774ceeb 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -1212,7 +1212,8 @@ system_enable_read(struct file *filp, char __user *ubuf, size_t cnt, mutex_lock(&event_mutex); list_for_each_entry(file, &tr->events, list) { call = file->event_call; - if (!trace_event_name(call) || !call->class || !call->class->reg) + if ((call->flags & TRACE_EVENT_FL_IGNORE_ENABLE) || + !trace_event_name(call) || !call->class || !call->class->reg) continue; if (system && strcmp(call->class->system, system->name) != 0) -- cgit v1.2.3 From 6342adcaa683c2b705c24ed201dc11b35854c88d Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Wed, 3 Feb 2021 13:00:48 -0500 Subject: entry: Ensure trap after single-step on system call return Commit 299155244770 ("entry: Drop usage of TIF flags in the generic syscall code") introduced a bug on architectures using the generic syscall entry code, in which processes stopped by PTRACE_SYSCALL do not trap on syscall return after receiving a TIF_SINGLESTEP. The reason is that the meaning of TIF_SINGLESTEP flag is overloaded to cause the trap after a system call is executed, but since the above commit, the syscall call handler only checks for the SYSCALL_WORK flags on the exit work. Split the meaning of TIF_SINGLESTEP such that it only means single-step mode, and create a new type of SYSCALL_WORK to request a trap immediately after a syscall in single-step mode. In the current implementation, the SYSCALL_WORK flag shadows the TIF_SINGLESTEP flag for simplicity. Update x86 to flip this bit when a tracer enables single stepping. Fixes: 299155244770 ("entry: Drop usage of TIF flags in the generic syscall code") Suggested-by: Linus Torvalds Signed-off-by: Gabriel Krisman Bertazi Signed-off-by: Thomas Gleixner Tested-by: Kyle Huey Link: https://lore.kernel.org/r/87h7mtc9pr.fsf_-_@collabora.com --- arch/x86/include/asm/entry-common.h | 2 -- arch/x86/kernel/step.c | 10 ++++++++-- include/linux/entry-common.h | 1 + include/linux/thread_info.h | 2 ++ kernel/entry/common.c | 12 ++---------- 5 files changed, 13 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/arch/x86/include/asm/entry-common.h b/arch/x86/include/asm/entry-common.h index 6fe54b2813c1..2b87b191b3b8 100644 --- a/arch/x86/include/asm/entry-common.h +++ b/arch/x86/include/asm/entry-common.h @@ -43,8 +43,6 @@ static __always_inline void arch_check_user_regs(struct pt_regs *regs) } #define arch_check_user_regs arch_check_user_regs -#define ARCH_SYSCALL_EXIT_WORK (_TIF_SINGLESTEP) - static inline void arch_exit_to_user_mode_prepare(struct pt_regs *regs, unsigned long ti_work) { diff --git a/arch/x86/kernel/step.c b/arch/x86/kernel/step.c index 60d2c3798ba2..0f3c307b37b3 100644 --- a/arch/x86/kernel/step.c +++ b/arch/x86/kernel/step.c @@ -127,12 +127,17 @@ static int enable_single_step(struct task_struct *child) regs->flags |= X86_EFLAGS_TF; /* - * Always set TIF_SINGLESTEP - this guarantees that - * we single-step system calls etc.. This will also + * Always set TIF_SINGLESTEP. This will also * cause us to set TF when returning to user mode. 
*/ set_tsk_thread_flag(child, TIF_SINGLESTEP); + /* + * Ensure that a trap is triggered once stepping out of a system + * call prior to executing any user instruction. + */ + set_task_syscall_work(child, SYSCALL_EXIT_TRAP); + oflags = regs->flags; /* Set TF on the kernel stack.. */ @@ -230,6 +235,7 @@ void user_disable_single_step(struct task_struct *child) /* Always clear TIF_SINGLESTEP... */ clear_tsk_thread_flag(child, TIF_SINGLESTEP); + clear_task_syscall_work(child, SYSCALL_EXIT_TRAP); /* But touch TF only if it was set by us.. */ if (test_and_clear_tsk_thread_flag(child, TIF_FORCED_TF)) diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h index ca86a00abe86..a104b298019a 100644 --- a/include/linux/entry-common.h +++ b/include/linux/entry-common.h @@ -46,6 +46,7 @@ SYSCALL_WORK_SYSCALL_TRACE | \ SYSCALL_WORK_SYSCALL_AUDIT | \ SYSCALL_WORK_SYSCALL_USER_DISPATCH | \ + SYSCALL_WORK_SYSCALL_EXIT_TRAP | \ ARCH_SYSCALL_WORK_EXIT) /* diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h index c8a974cead73..9b2158c69275 100644 --- a/include/linux/thread_info.h +++ b/include/linux/thread_info.h @@ -43,6 +43,7 @@ enum syscall_work_bit { SYSCALL_WORK_BIT_SYSCALL_EMU, SYSCALL_WORK_BIT_SYSCALL_AUDIT, SYSCALL_WORK_BIT_SYSCALL_USER_DISPATCH, + SYSCALL_WORK_BIT_SYSCALL_EXIT_TRAP, }; #define SYSCALL_WORK_SECCOMP BIT(SYSCALL_WORK_BIT_SECCOMP) @@ -51,6 +52,7 @@ enum syscall_work_bit { #define SYSCALL_WORK_SYSCALL_EMU BIT(SYSCALL_WORK_BIT_SYSCALL_EMU) #define SYSCALL_WORK_SYSCALL_AUDIT BIT(SYSCALL_WORK_BIT_SYSCALL_AUDIT) #define SYSCALL_WORK_SYSCALL_USER_DISPATCH BIT(SYSCALL_WORK_BIT_SYSCALL_USER_DISPATCH) +#define SYSCALL_WORK_SYSCALL_EXIT_TRAP BIT(SYSCALL_WORK_BIT_SYSCALL_EXIT_TRAP) #endif #include diff --git a/kernel/entry/common.c b/kernel/entry/common.c index 6dd82be60df8..f9d491b17b78 100644 --- a/kernel/entry/common.c +++ b/kernel/entry/common.c @@ -209,15 +209,9 @@ static void exit_to_user_mode_prepare(struct pt_regs *regs) lockdep_sys_exit(); } -#ifndef _TIF_SINGLESTEP -static inline bool report_single_step(unsigned long work) -{ - return false; -} -#else /* * If SYSCALL_EMU is set, then the only reason to report is when - * TIF_SINGLESTEP is set (i.e. PTRACE_SYSEMU_SINGLESTEP). This syscall + * SINGLESTEP is set (i.e. PTRACE_SYSEMU_SINGLESTEP). This syscall * instruction has been already reported in syscall_enter_from_user_mode(). */ static inline bool report_single_step(unsigned long work) @@ -225,10 +219,8 @@ static inline bool report_single_step(unsigned long work) if (work & SYSCALL_WORK_SYSCALL_EMU) return false; - return !!(current_thread_info()->flags & _TIF_SINGLESTEP); + return work & SYSCALL_WORK_SYSCALL_EXIT_TRAP; } -#endif - static void syscall_exit_work(struct pt_regs *regs, unsigned long work) { -- cgit v1.2.3 From 36a6c843fd0d8e02506681577e96dabd203dd8e8 Mon Sep 17 00:00:00 2001 From: Gabriel Krisman Bertazi Date: Fri, 5 Feb 2021 13:43:21 -0500 Subject: entry: Use different define for selector variable in SUD Michael Kerrisk suggested that, from an API perspective, it is a bad idea to share the PR_SYS_DISPATCH_ defines between the prctl operation and the selector variable. Therefore, define two new constants to be used by SUD's selector variable and update the corresponding documentation and test cases. While this changes the API syscall user dispatch has never been part of a Linux release, it will show up for the first time in 5.11. 
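For context, user space consumes the renamed constants roughly as in the minimal sketch below; a real dispatcher would install a SIGSYS handler and pass a proper [offset, offset+len) region for its own trampoline code instead of 0/0.

    /* Sketch: toggling syscall user dispatch from the selector variable. */
    #include <sys/prctl.h>

    #ifndef PR_SET_SYSCALL_USER_DISPATCH
    # define PR_SET_SYSCALL_USER_DISPATCH   59
    # define PR_SYS_DISPATCH_ON             1
    # define SYSCALL_DISPATCH_FILTER_ALLOW  0
    # define SYSCALL_DISPATCH_FILTER_BLOCK  1
    #endif

    static volatile char selector = SYSCALL_DISPATCH_FILTER_ALLOW;

    int main(void)
    {
            if (prctl(PR_SET_SYSCALL_USER_DISPATCH, PR_SYS_DISPATCH_ON,
                      0, 0, &selector))
                    return 1;

            selector = SYSCALL_DISPATCH_FILTER_BLOCK;  /* syscalls now raise SIGSYS */
            /* ... emulated work happens here, without issuing native syscalls ... */
            selector = SYSCALL_DISPATCH_FILTER_ALLOW;  /* back to native syscalls */

            return 0;
    }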
Suggested-by: Michael Kerrisk (man-pages) Signed-off-by: Gabriel Krisman Bertazi Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20210205184321.2062251-1-krisman@collabora.com --- Documentation/admin-guide/syscall-user-dispatch.rst | 4 ++-- include/uapi/linux/prctl.h | 3 +++ kernel/entry/syscall_user_dispatch.c | 4 ++-- .../selftests/syscall_user_dispatch/sud_benchmark.c | 8 +++++--- tools/testing/selftests/syscall_user_dispatch/sud_test.c | 14 ++++++++------ 5 files changed, 20 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/Documentation/admin-guide/syscall-user-dispatch.rst b/Documentation/admin-guide/syscall-user-dispatch.rst index a380d6515774..60314953c728 100644 --- a/Documentation/admin-guide/syscall-user-dispatch.rst +++ b/Documentation/admin-guide/syscall-user-dispatch.rst @@ -70,8 +70,8 @@ trampoline code on the vDSO, that trampoline is never intercepted. [selector] is a pointer to a char-sized region in the process memory region, that provides a quick way to enable disable syscall redirection thread-wide, without the need to invoke the kernel directly. selector -can be set to PR_SYS_DISPATCH_ON or PR_SYS_DISPATCH_OFF. Any other -value should terminate the program with a SIGSYS. +can be set to SYSCALL_DISPATCH_FILTER_ALLOW or SYSCALL_DISPATCH_FILTER_BLOCK. +Any other value should terminate the program with a SIGSYS. Security Notes -------------- diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h index 90deb41c8a34..667f1aed091c 100644 --- a/include/uapi/linux/prctl.h +++ b/include/uapi/linux/prctl.h @@ -251,5 +251,8 @@ struct prctl_mm_map { #define PR_SET_SYSCALL_USER_DISPATCH 59 # define PR_SYS_DISPATCH_OFF 0 # define PR_SYS_DISPATCH_ON 1 +/* The control values for the user space selector when dispatch is enabled */ +# define SYSCALL_DISPATCH_FILTER_ALLOW 0 +# define SYSCALL_DISPATCH_FILTER_BLOCK 1 #endif /* _LINUX_PRCTL_H */ diff --git a/kernel/entry/syscall_user_dispatch.c b/kernel/entry/syscall_user_dispatch.c index b0338a5625d9..c240302f56e2 100644 --- a/kernel/entry/syscall_user_dispatch.c +++ b/kernel/entry/syscall_user_dispatch.c @@ -50,10 +50,10 @@ bool syscall_user_dispatch(struct pt_regs *regs) if (unlikely(__get_user(state, sd->selector))) do_exit(SIGSEGV); - if (likely(state == PR_SYS_DISPATCH_OFF)) + if (likely(state == SYSCALL_DISPATCH_FILTER_ALLOW)) return false; - if (state != PR_SYS_DISPATCH_ON) + if (state != SYSCALL_DISPATCH_FILTER_BLOCK) do_exit(SIGSYS); } diff --git a/tools/testing/selftests/syscall_user_dispatch/sud_benchmark.c b/tools/testing/selftests/syscall_user_dispatch/sud_benchmark.c index 6689f1183dbf..073a03702ff5 100644 --- a/tools/testing/selftests/syscall_user_dispatch/sud_benchmark.c +++ b/tools/testing/selftests/syscall_user_dispatch/sud_benchmark.c @@ -22,6 +22,8 @@ # define PR_SET_SYSCALL_USER_DISPATCH 59 # define PR_SYS_DISPATCH_OFF 0 # define PR_SYS_DISPATCH_ON 1 +# define SYSCALL_DISPATCH_FILTER_ALLOW 0 +# define SYSCALL_DISPATCH_FILTER_BLOCK 1 #endif #ifdef __NR_syscalls @@ -55,8 +57,8 @@ unsigned long trapped_call_count = 0; unsigned long native_call_count = 0; char selector; -#define SYSCALL_BLOCK (selector = PR_SYS_DISPATCH_ON) -#define SYSCALL_UNBLOCK (selector = PR_SYS_DISPATCH_OFF) +#define SYSCALL_BLOCK (selector = SYSCALL_DISPATCH_FILTER_BLOCK) +#define SYSCALL_UNBLOCK (selector = SYSCALL_DISPATCH_FILTER_ALLOW) #define CALIBRATION_STEP 100000 #define CALIBRATE_TO_SECS 5 @@ -170,7 +172,7 @@ int main(void) syscall(MAGIC_SYSCALL_1); #ifdef TEST_BLOCKED_RETURN - if (selector == 
PR_SYS_DISPATCH_OFF) { + if (selector == SYSCALL_DISPATCH_FILTER_ALLOW) { fprintf(stderr, "Failed to return with selector blocked.\n"); exit(-1); } diff --git a/tools/testing/selftests/syscall_user_dispatch/sud_test.c b/tools/testing/selftests/syscall_user_dispatch/sud_test.c index 6498b050ef89..b5d592d4099e 100644 --- a/tools/testing/selftests/syscall_user_dispatch/sud_test.c +++ b/tools/testing/selftests/syscall_user_dispatch/sud_test.c @@ -18,6 +18,8 @@ # define PR_SET_SYSCALL_USER_DISPATCH 59 # define PR_SYS_DISPATCH_OFF 0 # define PR_SYS_DISPATCH_ON 1 +# define SYSCALL_DISPATCH_FILTER_ALLOW 0 +# define SYSCALL_DISPATCH_FILTER_BLOCK 1 #endif #ifndef SYS_USER_DISPATCH @@ -30,8 +32,8 @@ # define MAGIC_SYSCALL_1 (0xff00) /* Bad Linux syscall number */ #endif -#define SYSCALL_DISPATCH_ON(x) ((x) = 1) -#define SYSCALL_DISPATCH_OFF(x) ((x) = 0) +#define SYSCALL_DISPATCH_ON(x) ((x) = SYSCALL_DISPATCH_FILTER_BLOCK) +#define SYSCALL_DISPATCH_OFF(x) ((x) = SYSCALL_DISPATCH_FILTER_ALLOW) /* Test Summary: * @@ -56,7 +58,7 @@ TEST_SIGNAL(dispatch_trigger_sigsys, SIGSYS) { - char sel = 0; + char sel = SYSCALL_DISPATCH_FILTER_ALLOW; struct sysinfo info; int ret; @@ -79,7 +81,7 @@ TEST_SIGNAL(dispatch_trigger_sigsys, SIGSYS) TEST(bad_prctl_param) { - char sel = 0; + char sel = SYSCALL_DISPATCH_FILTER_ALLOW; int op; /* Invalid op */ @@ -220,7 +222,7 @@ TEST_SIGNAL(bad_selector, SIGSYS) sigset_t mask; struct sysinfo info; - glob_sel = 0; + glob_sel = SYSCALL_DISPATCH_FILTER_ALLOW; nr_syscalls_emulated = 0; si_code = 0; si_errno = 0; @@ -288,7 +290,7 @@ TEST(direct_dispatch_range) { int ret = 0; struct sysinfo info; - char sel = 0; + char sel = SYSCALL_DISPATCH_FILTER_ALLOW; /* * Instead of calculating libc addresses; allow the entire -- cgit v1.2.3 From ee114dd64c0071500345439fc79dd5e0f9d106ed Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 5 Feb 2021 17:20:14 +0100 Subject: bpf: Fix verifier jsgt branch analysis on max bound Fix incorrect is_branch{32,64}_taken() analysis for the jsgt case. The return code for both will tell the caller whether a given conditional jump is taken or not, e.g. 1 means branch will be taken [for the involved registers] and the goto target will be executed, 0 means branch will not be taken and instead we fall-through to the next insn, and last but not least a -1 denotes that it is not known at verification time whether a branch will be taken or not. Now while the jsgt has the branch-taken case correct with reg->s32_min_value > sval, the branch-not-taken case is off-by-one when testing for reg->s32_max_value < sval since the branch will also be taken for reg->s32_max_value == sval. The jgt branch analysis, for example, gets this right. 
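To make the off-by-one concrete, here is a stand-alone sketch of the corrected decision for "reg s> sval" (the helper name and the example bounds are illustrative; in the kernel this logic lives in is_branch32_taken()/is_branch64_taken()):

#include <stdint.h>

/* Decide a JSGT branch ("reg s> sval") from a signed 32-bit range.
 * Return 1 if the branch is always taken, 0 if it is never taken and
 * -1 if it cannot be decided at verification time.
 */
static int jsgt32_branch_taken(int32_t s32_min_value, int32_t s32_max_value,
			       int32_t sval)
{
	if (s32_min_value > sval)
		return 1;		/* every possible value is > sval */
	if (s32_max_value <= sval)	/* pre-fix code tested '<' and missed == */
		return 0;		/* no possible value is > sval */
	return -1;
}

/* Example: range [0, 5] against sval == 5. "reg s> 5" can never be true, so
 * 0 (never taken) is the decisive answer; the old '<' test fell through to
 * -1 (unknown) for exactly this boundary case.
 */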
Fixes: 3f50f132d840 ("bpf: Verifier, do explicit ALU32 bounds tracking") Fixes: 4f7b3e82589e ("bpf: improve verifier branch analysis") Signed-off-by: Daniel Borkmann Reviewed-by: John Fastabend Acked-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index e7368c5eacb7..67ea0ac3333c 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -6877,7 +6877,7 @@ static int is_branch32_taken(struct bpf_reg_state *reg, u32 val, u8 opcode) case BPF_JSGT: if (reg->s32_min_value > sval) return 1; - else if (reg->s32_max_value < sval) + else if (reg->s32_max_value <= sval) return 0; break; case BPF_JLT: @@ -6950,7 +6950,7 @@ static int is_branch64_taken(struct bpf_reg_state *reg, u64 val, u8 opcode) case BPF_JSGT: if (reg->smin_value > sval) return 1; - else if (reg->smax_value < sval) + else if (reg->smax_value <= sval) return 0; break; case BPF_JLT: -- cgit v1.2.3 From fd675184fc7abfd1e1c52d23e8e900676b5a1c1a Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 5 Feb 2021 20:48:21 +0100 Subject: bpf: Fix verifier jmp32 pruning decision logic Anatoly has been fuzzing with kBdysch harness and reported a hang in one of the outcomes: func#0 @0 0: R1=ctx(id=0,off=0,imm=0) R10=fp0 0: (b7) r0 = 808464450 1: R0_w=invP808464450 R1=ctx(id=0,off=0,imm=0) R10=fp0 1: (b4) w4 = 808464432 2: R0_w=invP808464450 R1=ctx(id=0,off=0,imm=0) R4_w=invP808464432 R10=fp0 2: (9c) w4 %= w0 3: R0_w=invP808464450 R1=ctx(id=0,off=0,imm=0) R4_w=invP(id=0,umax_value=4294967295,var_off=(0x0; 0xffffffff)) R10=fp0 3: (66) if w4 s> 0x30303030 goto pc+0 R0_w=invP808464450 R1=ctx(id=0,off=0,imm=0) R4_w=invP(id=0,umax_value=4294967295,var_off=(0x0; 0xffffffff),s32_max_value=808464432) R10=fp0 4: R0_w=invP808464450 R1=ctx(id=0,off=0,imm=0) R4_w=invP(id=0,umax_value=4294967295,var_off=(0x0; 0xffffffff),s32_max_value=808464432) R10=fp0 4: (7f) r0 >>= r0 5: R0_w=invP(id=0) R1=ctx(id=0,off=0,imm=0) R4_w=invP(id=0,umax_value=4294967295,var_off=(0x0; 0xffffffff),s32_max_value=808464432) R10=fp0 5: (9c) w4 %= w0 6: R0_w=invP(id=0) R1=ctx(id=0,off=0,imm=0) R4_w=invP(id=0) R10=fp0 6: (66) if w0 s> 0x3030 goto pc+0 R0_w=invP(id=0,s32_max_value=12336) R1=ctx(id=0,off=0,imm=0) R4_w=invP(id=0) R10=fp0 7: R0=invP(id=0,s32_max_value=12336) R1=ctx(id=0,off=0,imm=0) R4=invP(id=0) R10=fp0 7: (d6) if w0 s<= 0x303030 goto pc+1 9: R0=invP(id=0,s32_max_value=12336) R1=ctx(id=0,off=0,imm=0) R4=invP(id=0) R10=fp0 9: (95) exit propagating r0 from 6 to 7: safe 4: R0_w=invP808464450 R1=ctx(id=0,off=0,imm=0) R4_w=invP(id=0,umin_value=808464433,umax_value=2147483647,var_off=(0x0; 0x7fffffff)) R10=fp0 4: (7f) r0 >>= r0 5: R0_w=invP(id=0) R1=ctx(id=0,off=0,imm=0) R4_w=invP(id=0,umin_value=808464433,umax_value=2147483647,var_off=(0x0; 0x7fffffff)) R10=fp0 5: (9c) w4 %= w0 6: R0_w=invP(id=0) R1=ctx(id=0,off=0,imm=0) R4_w=invP(id=0) R10=fp0 6: (66) if w0 s> 0x3030 goto pc+0 R0_w=invP(id=0,s32_max_value=12336) R1=ctx(id=0,off=0,imm=0) R4_w=invP(id=0) R10=fp0 propagating r0 7: safe propagating r0 from 6 to 7: safe processed 15 insns (limit 1000000) max_states_per_insn 0 total_states 1 peak_states 1 mark_read 1 The underlying program was xlated as follows: # bpftool p d x i 10 0: (b7) r0 = 808464450 1: (b4) w4 = 808464432 2: (bc) w0 = w0 3: (15) if r0 == 0x0 goto pc+1 4: (9c) w4 %= w0 5: (66) if w4 s> 0x30303030 goto pc+0 6: (7f) r0 >>= r0 7: (bc) w0 = w0 8: (15) if r0 == 0x0 goto pc+1 9: (9c) w4 %= w0 10: (66) if w0 s> 0x3030 
goto pc+0 11: (d6) if w0 s<= 0x303030 goto pc+1 12: (05) goto pc-1 13: (95) exit The verifier rewrote original instructions it recognized as dead code with 'goto pc-1', but reality differs from verifier simulation in that we are actually able to trigger a hang due to hitting the 'goto pc-1' instructions. Taking a closer look at the verifier analysis, the reason is that it misjudges its pruning decision at the first 'from 6 to 7: safe' occasion. What happens is that while both old/cur registers are marked as precise, they get misjudged for the jmp32 case as range_within() yields true, meaning that the prior verification path with a wider register bound could be verified successfully and therefore the current path with a narrower register bound is deemed safe as well whereas in reality it's not. R0 old/cur path's bounds compare as follows: old: smin_value=0x8000000000000000,smax_value=0x7fffffffffffffff,umin_value=0x0,umax_value=0xffffffffffffffff,var_off=(0x0; 0xffffffffffffffff) cur: smin_value=0x8000000000000000,smax_value=0x7fffffff7fffffff,umin_value=0x0,umax_value=0xffffffff7fffffff,var_off=(0x0; 0xffffffff7fffffff) old: s32_min_value=0x80000000,s32_max_value=0x00003030,u32_min_value=0x00000000,u32_max_value=0xffffffff cur: s32_min_value=0x00003031,s32_max_value=0x7fffffff,u32_min_value=0x00003031,u32_max_value=0x7fffffff The 64 bit bounds generally look okay and while the information that got propagated from 32 to 64 bit looks correct as well, it's not precise enough for judging a conditional jmp32. Given the latter only operates on subregisters we also need to take these into account as well for a range_within() probe in order to be able to prune paths. Extending the range_within() constraint to both bounds will be able to tell us that the old signed 32 bit bounds are not wider than the cur signed 32 bit bounds. With the fix in place, the program will now verify the 'goto' branch case as it should have been: [...] 
6: R0_w=invP(id=0) R1=ctx(id=0,off=0,imm=0) R4_w=invP(id=0) R10=fp0 6: (66) if w0 s> 0x3030 goto pc+0 R0_w=invP(id=0,s32_max_value=12336) R1=ctx(id=0,off=0,imm=0) R4_w=invP(id=0) R10=fp0 7: R0=invP(id=0,s32_max_value=12336) R1=ctx(id=0,off=0,imm=0) R4=invP(id=0) R10=fp0 7: (d6) if w0 s<= 0x303030 goto pc+1 9: R0=invP(id=0,s32_max_value=12336) R1=ctx(id=0,off=0,imm=0) R4=invP(id=0) R10=fp0 9: (95) exit 7: R0_w=invP(id=0,smax_value=9223372034707292159,umax_value=18446744071562067967,var_off=(0x0; 0xffffffff7fffffff),s32_min_value=12337,u32_min_value=12337,u32_max_value=2147483647) R1=ctx(id=0,off=0,imm=0) R4_w=invP(id=0) R10=fp0 7: (d6) if w0 s<= 0x303030 goto pc+1 R0_w=invP(id=0,smax_value=9223372034707292159,umax_value=18446744071562067967,var_off=(0x0; 0xffffffff7fffffff),s32_min_value=3158065,u32_min_value=3158065,u32_max_value=2147483647) R1=ctx(id=0,off=0,imm=0) R4_w=invP(id=0) R10=fp0 8: R0_w=invP(id=0,smax_value=9223372034707292159,umax_value=18446744071562067967,var_off=(0x0; 0xffffffff7fffffff),s32_min_value=3158065,u32_min_value=3158065,u32_max_value=2147483647) R1=ctx(id=0,off=0,imm=0) R4_w=invP(id=0) R10=fp0 8: (30) r0 = *(u8 *)skb[808464432] BPF_LD_[ABS|IND] uses reserved fields processed 11 insns (limit 1000000) max_states_per_insn 1 total_states 1 peak_states 1 mark_read 1 The bug is quite subtle in the sense that when verifier would determine that a given branch is dead code, it would (here: wrongly) remove these instructions from the program and hard-wire the taken branch for privileged programs instead of the 'goto pc-1' rewrites which will cause hard to debug problems. Fixes: 3f50f132d840 ("bpf: Verifier, do explicit ALU32 bounds tracking") Reported-by: Anatoly Trosinenko Signed-off-by: Daniel Borkmann Reviewed-by: John Fastabend Acked-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 67ea0ac3333c..24c0e9224f3c 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -8590,7 +8590,11 @@ static bool range_within(struct bpf_reg_state *old, return old->umin_value <= cur->umin_value && old->umax_value >= cur->umax_value && old->smin_value <= cur->smin_value && - old->smax_value >= cur->smax_value; + old->smax_value >= cur->smax_value && + old->u32_min_value <= cur->u32_min_value && + old->u32_max_value >= cur->u32_max_value && + old->s32_min_value <= cur->s32_min_value && + old->s32_max_value >= cur->s32_max_value; } /* Maximum number of register states that can exist at once */ -- cgit v1.2.3 From e88b2c6e5a4d9ce30d75391e4d950da74bb2bd90 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 9 Feb 2021 18:46:10 +0000 Subject: bpf: Fix 32 bit src register truncation on div/mod While reviewing a different fix, John and I noticed an oddity in one of the BPF program dumps that stood out, for example: # bpftool p d x i 13 0: (b7) r0 = 808464450 1: (b4) w4 = 808464432 2: (bc) w0 = w0 3: (15) if r0 == 0x0 goto pc+1 4: (9c) w4 %= w0 [...] In line 2 we noticed that the mov32 would 32 bit truncate the original src register for the div/mod operation. While for the two operations the dst register is typically marked unknown e.g. 
from adjust_scalar_min_max_vals() the src register is not, and thus verifier keeps tracking original bounds, simplified: 0: R1=ctx(id=0,off=0,imm=0) R10=fp0 0: (b7) r0 = -1 1: R0_w=invP-1 R1=ctx(id=0,off=0,imm=0) R10=fp0 1: (b7) r1 = -1 2: R0_w=invP-1 R1_w=invP-1 R10=fp0 2: (3c) w0 /= w1 3: R0_w=invP(id=0,umax_value=4294967295,var_off=(0x0; 0xffffffff)) R1_w=invP-1 R10=fp0 3: (77) r1 >>= 32 4: R0_w=invP(id=0,umax_value=4294967295,var_off=(0x0; 0xffffffff)) R1_w=invP4294967295 R10=fp0 4: (bf) r0 = r1 5: R0_w=invP4294967295 R1_w=invP4294967295 R10=fp0 5: (95) exit processed 6 insns (limit 1000000) max_states_per_insn 0 total_states 0 peak_states 0 mark_read 0 Runtime result of r0 at exit is 0 instead of expected -1. Remove the verifier mov32 src rewrite in div/mod and replace it with a jmp32 test instead. After the fix, we result in the following code generation when having dividend r1 and divisor r6: div, 64 bit: div, 32 bit: 0: (b7) r6 = 8 0: (b7) r6 = 8 1: (b7) r1 = 8 1: (b7) r1 = 8 2: (55) if r6 != 0x0 goto pc+2 2: (56) if w6 != 0x0 goto pc+2 3: (ac) w1 ^= w1 3: (ac) w1 ^= w1 4: (05) goto pc+1 4: (05) goto pc+1 5: (3f) r1 /= r6 5: (3c) w1 /= w6 6: (b7) r0 = 0 6: (b7) r0 = 0 7: (95) exit 7: (95) exit mod, 64 bit: mod, 32 bit: 0: (b7) r6 = 8 0: (b7) r6 = 8 1: (b7) r1 = 8 1: (b7) r1 = 8 2: (15) if r6 == 0x0 goto pc+1 2: (16) if w6 == 0x0 goto pc+1 3: (9f) r1 %= r6 3: (9c) w1 %= w6 4: (b7) r0 = 0 4: (b7) r0 = 0 5: (95) exit 5: (95) exit x86 in particular can throw a 'divide error' exception for div instruction not only for divisor being zero, but also for the case when the quotient is too large for the designated register. For the edx:eax and rdx:rax dividend pair it is not an issue in x86 BPF JIT since we always zero edx (rdx). Hence really the only protection needed is against divisor being zero. Fixes: 68fda450a7df ("bpf: fix 32-bit divide by zero") Co-developed-by: John Fastabend Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 28 +++++++++++++--------------- 1 file changed, 13 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 24c0e9224f3c..37581919e050 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -11003,30 +11003,28 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) insn->code == (BPF_ALU | BPF_MOD | BPF_X) || insn->code == (BPF_ALU | BPF_DIV | BPF_X)) { bool is64 = BPF_CLASS(insn->code) == BPF_ALU64; - struct bpf_insn mask_and_div[] = { - BPF_MOV32_REG(insn->src_reg, insn->src_reg), + bool isdiv = BPF_OP(insn->code) == BPF_DIV; + struct bpf_insn *patchlet; + struct bpf_insn chk_and_div[] = { /* Rx div 0 -> 0 */ - BPF_JMP_IMM(BPF_JNE, insn->src_reg, 0, 2), + BPF_RAW_INSN((is64 ? BPF_JMP : BPF_JMP32) | + BPF_JNE | BPF_K, insn->src_reg, + 0, 2, 0), BPF_ALU32_REG(BPF_XOR, insn->dst_reg, insn->dst_reg), BPF_JMP_IMM(BPF_JA, 0, 0, 1), *insn, }; - struct bpf_insn mask_and_mod[] = { - BPF_MOV32_REG(insn->src_reg, insn->src_reg), + struct bpf_insn chk_and_mod[] = { /* Rx mod 0 -> Rx */ - BPF_JMP_IMM(BPF_JEQ, insn->src_reg, 0, 1), + BPF_RAW_INSN((is64 ? BPF_JMP : BPF_JMP32) | + BPF_JEQ | BPF_K, insn->src_reg, + 0, 1, 0), *insn, }; - struct bpf_insn *patchlet; - if (insn->code == (BPF_ALU64 | BPF_DIV | BPF_X) || - insn->code == (BPF_ALU | BPF_DIV | BPF_X)) { - patchlet = mask_and_div + (is64 ? 1 : 0); - cnt = ARRAY_SIZE(mask_and_div) - (is64 ? 1 : 0); - } else { - patchlet = mask_and_mod + (is64 ? 
1 : 0); - cnt = ARRAY_SIZE(mask_and_mod) - (is64 ? 1 : 0); - } + patchlet = isdiv ? chk_and_div : chk_and_mod; + cnt = isdiv ? ARRAY_SIZE(chk_and_div) : + ARRAY_SIZE(chk_and_mod); new_prog = bpf_patch_insn_data(env, i + delta, patchlet, cnt); if (!new_prog) -- cgit v1.2.3 From c8cc7e853192d520ab6a5957f5081034103587ae Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 9 Feb 2021 09:30:03 +0100 Subject: lockdep: Noinstr annotate warn_bogus_irq_restore() vmlinux.o: warning: objtool: lock_is_held_type()+0x107: call to warn_bogus_irq_restore() leaves .noinstr.text section As per the general rule that WARNs are allowed to violate noinstr to get out, annotate it away. Fixes: 997acaf6b4b5 ("lockdep: report broken irq restoration") Reported-by: Randy Dunlap Signed-off-by: Peter Zijlstra (Intel) Acked-by: Mark Rutland Acked-by: Randy Dunlap # build-tested Link: https://lkml.kernel.org/r/YCKyYg53mMp4E7YI@hirez.programming.kicks-ass.net --- kernel/locking/irqflag-debug.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/locking/irqflag-debug.c b/kernel/locking/irqflag-debug.c index 9603d207a4ca..810b50344d35 100644 --- a/kernel/locking/irqflag-debug.c +++ b/kernel/locking/irqflag-debug.c @@ -4,8 +4,10 @@ #include #include -void warn_bogus_irq_restore(void) +noinstr void warn_bogus_irq_restore(void) { + instrumentation_begin(); WARN_ONCE(1, "raw_local_irq_restore() called with IRQs enabled\n"); + instrumentation_end(); } EXPORT_SYMBOL(warn_bogus_irq_restore); -- cgit v1.2.3 From 0f319d49a4167e402b01b2b56639386f0b6846ba Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Wed, 10 Feb 2021 09:52:47 +0100 Subject: locking/mutex: Kill mutex_trylock_recursive() There are not users of mutex_trylock_recursive() in tree as of v5.11-rc7. Remove it. Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20210210085248.219210-2-bigeasy@linutronix.de --- include/linux/mutex.h | 25 ------------------------- kernel/locking/mutex.c | 10 ---------- 2 files changed, 35 deletions(-) (limited to 'kernel') diff --git a/include/linux/mutex.h b/include/linux/mutex.h index dcd185cbfe79..0cd631a19727 100644 --- a/include/linux/mutex.h +++ b/include/linux/mutex.h @@ -199,29 +199,4 @@ extern void mutex_unlock(struct mutex *lock); extern int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock); -/* - * These values are chosen such that FAIL and SUCCESS match the - * values of the regular mutex_trylock(). - */ -enum mutex_trylock_recursive_enum { - MUTEX_TRYLOCK_FAILED = 0, - MUTEX_TRYLOCK_SUCCESS = 1, - MUTEX_TRYLOCK_RECURSIVE, -}; - -/** - * mutex_trylock_recursive - trylock variant that allows recursive locking - * @lock: mutex to be locked - * - * This function should not be used, _ever_. It is purely for hysterical GEM - * raisins, and once those are gone this will be removed. - * - * Returns: - * - MUTEX_TRYLOCK_FAILED - trylock failed, - * - MUTEX_TRYLOCK_SUCCESS - lock acquired, - * - MUTEX_TRYLOCK_RECURSIVE - we already owned the lock. 
- */ -extern /* __deprecated */ __must_check enum mutex_trylock_recursive_enum -mutex_trylock_recursive(struct mutex *lock); - #endif /* __LINUX_MUTEX_H */ diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index 5352ce50a97e..adb935090768 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -86,16 +86,6 @@ bool mutex_is_locked(struct mutex *lock) } EXPORT_SYMBOL(mutex_is_locked); -__must_check enum mutex_trylock_recursive_enum -mutex_trylock_recursive(struct mutex *lock) -{ - if (unlikely(__mutex_owner(lock) == current)) - return MUTEX_TRYLOCK_RECURSIVE; - - return mutex_trylock(lock); -} -EXPORT_SYMBOL(mutex_trylock_recursive); - static inline unsigned long __owner_flags(unsigned long owner) { return owner & MUTEX_FLAGS; -- cgit v1.2.3 From 01f810ace9ed37255f27608a0864abebccf0aab3 Mon Sep 17 00:00:00 2001 From: Andrei Matei Date: Sat, 6 Feb 2021 20:10:24 -0500 Subject: bpf: Allow variable-offset stack access Before this patch, variable offset access to the stack was disallowed for regular instructions, but was allowed for "indirect" accesses (i.e. helpers). This patch removes the restriction, allowing reading and writing to the stack through stack pointers with variable offsets. This makes stack-allocated buffers more usable in programs, and brings stack pointers closer to other types of pointers. The motivation is being able to use stack-allocated buffers for data manipulation. When the stack size limit is sufficient, allocating buffers on the stack is simpler than per-cpu arrays, or other alternatives. In unprivileged programs, variable-offset reads and writes are disallowed (they were already disallowed for the indirect access case) because the speculative execution checking code doesn't support them. Additionally, when writing through a variable-offset stack pointer, if any pointers are in the accessible range, there is a possibility of later leaking pointers because the write cannot be tracked precisely. Writes with variable offset mark the whole range as initialized, even though we don't know which stack slots are actually written. This is in order to not reject future reads to these slots. Note that this doesn't affect writes done through helpers; like before, helpers need the whole stack range to be initialized to begin with. All the stack slots in range are considered scalars after the write; variable-offset register spills are not tracked. For reads, all the stack slots in the variable range need to be initialized (but see above about what writes do), otherwise the read is rejected. All registers spilled in stack slots that might be read are marked as having been read; however, reads through such pointers don't do register filling; the target register will always be either a scalar or a constant zero.
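As an illustration of the pattern this enables, a minimal BPF-C sketch (the section name, program shape and helper choice are assumptions made for the example; the loader still has to be privileged, since unprivileged variable-offset stack access remains rejected):

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

char _license[] SEC("license") = "GPL";

SEC("tp/syscalls/sys_enter_write")
int pick_from_stack_buf(void *ctx)
{
	/* Fully initialized stack-allocated buffer. */
	char buf[16] = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 };
	__u32 idx;

	/* The index is not a compile-time constant, but it is provably
	 * bounded to [0, 15].
	 */
	idx = bpf_get_prandom_u32() & 0xf;

	/* This load goes through a stack pointer with a variable offset,
	 * which the verifier rejected for regular instructions before
	 * this change.
	 */
	return buf[idx];
}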
Signed-off-by: Andrei Matei Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20210207011027.676572-2-andreimatei1@gmail.com --- include/linux/bpf.h | 5 + include/linux/bpf_verifier.h | 3 +- kernel/bpf/verifier.c | 657 +++++++++++++++++++++++++++++++++---------- 3 files changed, 518 insertions(+), 147 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 321966fc35db..079162bbd387 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1290,6 +1290,11 @@ static inline bool bpf_allow_ptr_leaks(void) return perfmon_capable(); } +static inline bool bpf_allow_uninit_stack(void) +{ + return perfmon_capable(); +} + static inline bool bpf_allow_ptr_to_map_access(void) { return perfmon_capable(); diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index dfe6f85d97dd..532c97836d0d 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -195,7 +195,7 @@ struct bpf_func_state { * 0 = main function, 1 = first callee. */ u32 frameno; - /* subprog number == index within subprog_stack_depth + /* subprog number == index within subprog_info * zero == main subprog */ u32 subprogno; @@ -404,6 +404,7 @@ struct bpf_verifier_env { u32 used_btf_cnt; /* number of used BTF objects */ u32 id_gen; /* used to generate unique reg IDs */ bool allow_ptr_leaks; + bool allow_uninit_stack; bool allow_ptr_to_map_access; bool bpf_capable; bool bypass_spec_v1; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 15694246f854..400d79e99fc8 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2275,12 +2275,14 @@ static void save_register_state(struct bpf_func_state *state, state->stack[spi].slot_type[i] = STACK_SPILL; } -/* check_stack_read/write functions track spill/fill of registers, +/* check_stack_{read,write}_fixed_off functions track spill/fill of registers, * stack boundary and alignment are checked in check_mem_access() */ -static int check_stack_write(struct bpf_verifier_env *env, - struct bpf_func_state *state, /* func where register points to */ - int off, int size, int value_regno, int insn_idx) +static int check_stack_write_fixed_off(struct bpf_verifier_env *env, + /* stack frame we're writing to */ + struct bpf_func_state *state, + int off, int size, int value_regno, + int insn_idx) { struct bpf_func_state *cur; /* state of the current function */ int i, slot = -off - 1, spi = slot / BPF_REG_SIZE, err; @@ -2406,9 +2408,175 @@ static int check_stack_write(struct bpf_verifier_env *env, return 0; } -static int check_stack_read(struct bpf_verifier_env *env, - struct bpf_func_state *reg_state /* func where register points to */, - int off, int size, int value_regno) +/* Write the stack: 'stack[ptr_regno + off] = value_regno'. 'ptr_regno' is + * known to contain a variable offset. + * This function checks whether the write is permitted and conservatively + * tracks the effects of the write, considering that each stack slot in the + * dynamic range is potentially written to. + * + * 'off' includes 'regno->off'. + * 'value_regno' can be -1, meaning that an unknown value is being written to + * the stack. + * + * Spilled pointers in range are not marked as written because we don't know + * what's going to be actually written. This means that read propagation for + * future reads cannot be terminated by this write. 
+ * + * For privileged programs, uninitialized stack slots are considered + * initialized by this write (even though we don't know exactly what offsets + * are going to be written to). The idea is that we don't want the verifier to + * reject future reads that access slots written to through variable offsets. + */ +static int check_stack_write_var_off(struct bpf_verifier_env *env, + /* func where register points to */ + struct bpf_func_state *state, + int ptr_regno, int off, int size, + int value_regno, int insn_idx) +{ + struct bpf_func_state *cur; /* state of the current function */ + int min_off, max_off; + int i, err; + struct bpf_reg_state *ptr_reg = NULL, *value_reg = NULL; + bool writing_zero = false; + /* set if the fact that we're writing a zero is used to let any + * stack slots remain STACK_ZERO + */ + bool zero_used = false; + + cur = env->cur_state->frame[env->cur_state->curframe]; + ptr_reg = &cur->regs[ptr_regno]; + min_off = ptr_reg->smin_value + off; + max_off = ptr_reg->smax_value + off + size; + if (value_regno >= 0) + value_reg = &cur->regs[value_regno]; + if (value_reg && register_is_null(value_reg)) + writing_zero = true; + + err = realloc_func_state(state, round_up(-min_off, BPF_REG_SIZE), + state->acquired_refs, true); + if (err) + return err; + + + /* Variable offset writes destroy any spilled pointers in range. */ + for (i = min_off; i < max_off; i++) { + u8 new_type, *stype; + int slot, spi; + + slot = -i - 1; + spi = slot / BPF_REG_SIZE; + stype = &state->stack[spi].slot_type[slot % BPF_REG_SIZE]; + + if (!env->allow_ptr_leaks + && *stype != NOT_INIT + && *stype != SCALAR_VALUE) { + /* Reject the write if there's are spilled pointers in + * range. If we didn't reject here, the ptr status + * would be erased below (even though not all slots are + * actually overwritten), possibly opening the door to + * leaks. + */ + verbose(env, "spilled ptr in range of var-offset stack write; insn %d, ptr off: %d", + insn_idx, i); + return -EINVAL; + } + + /* Erase all spilled pointers. */ + state->stack[spi].spilled_ptr.type = NOT_INIT; + + /* Update the slot type. */ + new_type = STACK_MISC; + if (writing_zero && *stype == STACK_ZERO) { + new_type = STACK_ZERO; + zero_used = true; + } + /* If the slot is STACK_INVALID, we check whether it's OK to + * pretend that it will be initialized by this write. The slot + * might not actually be written to, and so if we mark it as + * initialized future reads might leak uninitialized memory. + * For privileged programs, we will accept such reads to slots + * that may or may not be written because, if we're reject + * them, the error would be too confusing. + */ + if (*stype == STACK_INVALID && !env->allow_uninit_stack) { + verbose(env, "uninit stack in range of var-offset write prohibited for !root; insn %d, off: %d", + insn_idx, i); + return -EINVAL; + } + *stype = new_type; + } + if (zero_used) { + /* backtracking doesn't work for STACK_ZERO yet. */ + err = mark_chain_precision(env, value_regno); + if (err) + return err; + } + return 0; +} + +/* When register 'dst_regno' is assigned some values from stack[min_off, + * max_off), we set the register's type according to the types of the + * respective stack slots. If all the stack values are known to be zeros, then + * so is the destination reg. Otherwise, the register is considered to be + * SCALAR. This function does not deal with register filling; the caller must + * ensure that all spilled registers in the stack range have been marked as + * read. 
+ */ +static void mark_reg_stack_read(struct bpf_verifier_env *env, + /* func where src register points to */ + struct bpf_func_state *ptr_state, + int min_off, int max_off, int dst_regno) +{ + struct bpf_verifier_state *vstate = env->cur_state; + struct bpf_func_state *state = vstate->frame[vstate->curframe]; + int i, slot, spi; + u8 *stype; + int zeros = 0; + + for (i = min_off; i < max_off; i++) { + slot = -i - 1; + spi = slot / BPF_REG_SIZE; + stype = ptr_state->stack[spi].slot_type; + if (stype[slot % BPF_REG_SIZE] != STACK_ZERO) + break; + zeros++; + } + if (zeros == max_off - min_off) { + /* any access_size read into register is zero extended, + * so the whole register == const_zero + */ + __mark_reg_const_zero(&state->regs[dst_regno]); + /* backtracking doesn't support STACK_ZERO yet, + * so mark it precise here, so that later + * backtracking can stop here. + * Backtracking may not need this if this register + * doesn't participate in pointer adjustment. + * Forward propagation of precise flag is not + * necessary either. This mark is only to stop + * backtracking. Any register that contributed + * to const 0 was marked precise before spill. + */ + state->regs[dst_regno].precise = true; + } else { + /* have read misc data from the stack */ + mark_reg_unknown(env, state->regs, dst_regno); + } + state->regs[dst_regno].live |= REG_LIVE_WRITTEN; +} + +/* Read the stack at 'off' and put the results into the register indicated by + * 'dst_regno'. It handles reg filling if the addressed stack slot is a + * spilled reg. + * + * 'dst_regno' can be -1, meaning that the read value is not going to a + * register. + * + * The access is assumed to be within the current stack bounds. + */ +static int check_stack_read_fixed_off(struct bpf_verifier_env *env, + /* func where src register points to */ + struct bpf_func_state *reg_state, + int off, int size, int dst_regno) { struct bpf_verifier_state *vstate = env->cur_state; struct bpf_func_state *state = vstate->frame[vstate->curframe]; @@ -2416,11 +2584,6 @@ static int check_stack_read(struct bpf_verifier_env *env, struct bpf_reg_state *reg; u8 *stype; - if (reg_state->allocated_stack <= slot) { - verbose(env, "invalid read from stack off %d+0 size %d\n", - off, size); - return -EACCES; - } stype = reg_state->stack[spi].slot_type; reg = ®_state->stack[spi].spilled_ptr; @@ -2431,9 +2594,9 @@ static int check_stack_read(struct bpf_verifier_env *env, verbose(env, "invalid size of register fill\n"); return -EACCES; } - if (value_regno >= 0) { - mark_reg_unknown(env, state->regs, value_regno); - state->regs[value_regno].live |= REG_LIVE_WRITTEN; + if (dst_regno >= 0) { + mark_reg_unknown(env, state->regs, dst_regno); + state->regs[dst_regno].live |= REG_LIVE_WRITTEN; } mark_reg_read(env, reg, reg->parent, REG_LIVE_READ64); return 0; @@ -2445,16 +2608,16 @@ static int check_stack_read(struct bpf_verifier_env *env, } } - if (value_regno >= 0) { + if (dst_regno >= 0) { /* restore register state from stack */ - state->regs[value_regno] = *reg; + state->regs[dst_regno] = *reg; /* mark reg as written since spilled pointer state likely * has its liveness marks cleared by is_state_visited() * which resets stack/reg liveness for state transitions */ - state->regs[value_regno].live |= REG_LIVE_WRITTEN; + state->regs[dst_regno].live |= REG_LIVE_WRITTEN; } else if (__is_pointer_value(env->allow_ptr_leaks, reg)) { - /* If value_regno==-1, the caller is asking us whether + /* If dst_regno==-1, the caller is asking us whether * it is acceptable to use this value as a 
SCALAR_VALUE * (e.g. for XADD). * We must not allow unprivileged callers to do that @@ -2466,70 +2629,167 @@ static int check_stack_read(struct bpf_verifier_env *env, } mark_reg_read(env, reg, reg->parent, REG_LIVE_READ64); } else { - int zeros = 0; + u8 type; for (i = 0; i < size; i++) { - if (stype[(slot - i) % BPF_REG_SIZE] == STACK_MISC) + type = stype[(slot - i) % BPF_REG_SIZE]; + if (type == STACK_MISC) continue; - if (stype[(slot - i) % BPF_REG_SIZE] == STACK_ZERO) { - zeros++; + if (type == STACK_ZERO) continue; - } verbose(env, "invalid read from stack off %d+%d size %d\n", off, i, size); return -EACCES; } mark_reg_read(env, reg, reg->parent, REG_LIVE_READ64); - if (value_regno >= 0) { - if (zeros == size) { - /* any size read into register is zero extended, - * so the whole register == const_zero - */ - __mark_reg_const_zero(&state->regs[value_regno]); - /* backtracking doesn't support STACK_ZERO yet, - * so mark it precise here, so that later - * backtracking can stop here. - * Backtracking may not need this if this register - * doesn't participate in pointer adjustment. - * Forward propagation of precise flag is not - * necessary either. This mark is only to stop - * backtracking. Any register that contributed - * to const 0 was marked precise before spill. - */ - state->regs[value_regno].precise = true; - } else { - /* have read misc data from the stack */ - mark_reg_unknown(env, state->regs, value_regno); - } - state->regs[value_regno].live |= REG_LIVE_WRITTEN; - } + if (dst_regno >= 0) + mark_reg_stack_read(env, reg_state, off, off + size, dst_regno); } return 0; } -static int check_stack_access(struct bpf_verifier_env *env, - const struct bpf_reg_state *reg, - int off, int size) +enum stack_access_src { + ACCESS_DIRECT = 1, /* the access is performed by an instruction */ + ACCESS_HELPER = 2, /* the access is performed by a helper */ +}; + +static int check_stack_range_initialized(struct bpf_verifier_env *env, + int regno, int off, int access_size, + bool zero_size_allowed, + enum stack_access_src type, + struct bpf_call_arg_meta *meta); + +static struct bpf_reg_state *reg_state(struct bpf_verifier_env *env, int regno) +{ + return cur_regs(env) + regno; +} + +/* Read the stack at 'ptr_regno + off' and put the result into the register + * 'dst_regno'. + * 'off' includes the pointer register's fixed offset(i.e. 'ptr_regno.off'), + * but not its variable offset. + * 'size' is assumed to be <= reg size and the access is assumed to be aligned. + * + * As opposed to check_stack_read_fixed_off, this function doesn't deal with + * filling registers (i.e. reads of spilled register cannot be detected when + * the offset is not fixed). We conservatively mark 'dst_regno' as containing + * SCALAR_VALUE. That's why we assert that the 'ptr_regno' has a variable + * offset; for a fixed offset check_stack_read_fixed_off should be used + * instead. + */ +static int check_stack_read_var_off(struct bpf_verifier_env *env, + int ptr_regno, int off, int size, int dst_regno) { - /* Stack accesses must be at a fixed offset, so that we - * can determine what type of data were returned. See - * check_stack_read(). + /* The state of the source register. */ + struct bpf_reg_state *reg = reg_state(env, ptr_regno); + struct bpf_func_state *ptr_state = func(env, reg); + int err; + int min_off, max_off; + + /* Note that we pass a NULL meta, so raw access will not be permitted. 
*/ - if (!tnum_is_const(reg->var_off)) { + err = check_stack_range_initialized(env, ptr_regno, off, size, + false, ACCESS_DIRECT, NULL); + if (err) + return err; + + min_off = reg->smin_value + off; + max_off = reg->smax_value + off; + mark_reg_stack_read(env, ptr_state, min_off, max_off + size, dst_regno); + return 0; +} + +/* check_stack_read dispatches to check_stack_read_fixed_off or + * check_stack_read_var_off. + * + * The caller must ensure that the offset falls within the allocated stack + * bounds. + * + * 'dst_regno' is a register which will receive the value from the stack. It + * can be -1, meaning that the read value is not going to a register. + */ +static int check_stack_read(struct bpf_verifier_env *env, + int ptr_regno, int off, int size, + int dst_regno) +{ + struct bpf_reg_state *reg = reg_state(env, ptr_regno); + struct bpf_func_state *state = func(env, reg); + int err; + /* Some accesses are only permitted with a static offset. */ + bool var_off = !tnum_is_const(reg->var_off); + + /* The offset is required to be static when reads don't go to a + * register, in order to not leak pointers (see + * check_stack_read_fixed_off). + */ + if (dst_regno < 0 && var_off) { char tn_buf[48]; tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); - verbose(env, "variable stack access var_off=%s off=%d size=%d\n", + verbose(env, "variable offset stack pointer cannot be passed into helper function; var_off=%s off=%d size=%d\n", tn_buf, off, size); return -EACCES; } + /* Variable offset is prohibited for unprivileged mode for simplicity + * since it requires corresponding support in Spectre masking for stack + * ALU. See also retrieve_ptr_limit(). + */ + if (!env->bypass_spec_v1 && var_off) { + char tn_buf[48]; - if (off >= 0 || off < -MAX_BPF_STACK) { - verbose(env, "invalid stack off=%d size=%d\n", off, size); + tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); + verbose(env, "R%d variable offset stack access prohibited for !root, var_off=%s\n", + ptr_regno, tn_buf); return -EACCES; } - return 0; + if (!var_off) { + off += reg->var_off.value; + err = check_stack_read_fixed_off(env, state, off, size, + dst_regno); + } else { + /* Variable offset stack reads need more conservative handling + * than fixed offset ones. Note that dst_regno >= 0 on this + * branch. + */ + err = check_stack_read_var_off(env, ptr_regno, off, size, + dst_regno); + } + return err; +} + + +/* check_stack_write dispatches to check_stack_write_fixed_off or + * check_stack_write_var_off. + * + * 'ptr_regno' is the register used as a pointer into the stack. + * 'off' includes 'ptr_regno->off', but not its variable offset (if any). + * 'value_regno' is the register whose value we're writing to the stack. It can + * be -1, meaning that we're not writing from a register. + * + * The caller must ensure that the offset falls within the maximum stack size. + */ +static int check_stack_write(struct bpf_verifier_env *env, + int ptr_regno, int off, int size, + int value_regno, int insn_idx) +{ + struct bpf_reg_state *reg = reg_state(env, ptr_regno); + struct bpf_func_state *state = func(env, reg); + int err; + + if (tnum_is_const(reg->var_off)) { + off += reg->var_off.value; + err = check_stack_write_fixed_off(env, state, off, size, + value_regno, insn_idx); + } else { + /* Variable offset stack reads need more conservative handling + * than fixed offset ones. 
+ */ + err = check_stack_write_var_off(env, state, + ptr_regno, off, size, + value_regno, insn_idx); + } + return err; } static int check_map_access_type(struct bpf_verifier_env *env, u32 regno, @@ -2862,11 +3122,6 @@ static int check_sock_access(struct bpf_verifier_env *env, int insn_idx, return -EACCES; } -static struct bpf_reg_state *reg_state(struct bpf_verifier_env *env, int regno) -{ - return cur_regs(env) + regno; -} - static bool is_pointer_value(struct bpf_verifier_env *env, int regno) { return __is_pointer_value(env->allow_ptr_leaks, reg_state(env, regno)); @@ -2985,8 +3240,8 @@ static int check_ptr_alignment(struct bpf_verifier_env *env, break; case PTR_TO_STACK: pointer_desc = "stack "; - /* The stack spill tracking logic in check_stack_write() - * and check_stack_read() relies on stack accesses being + /* The stack spill tracking logic in check_stack_write_fixed_off() + * and check_stack_read_fixed_off() relies on stack accesses being * aligned. */ strict = true; @@ -3402,6 +3657,91 @@ static int check_ptr_to_map_access(struct bpf_verifier_env *env, return 0; } +/* Check that the stack access at the given offset is within bounds. The + * maximum valid offset is -1. + * + * The minimum valid offset is -MAX_BPF_STACK for writes, and + * -state->allocated_stack for reads. + */ +static int check_stack_slot_within_bounds(int off, + struct bpf_func_state *state, + enum bpf_access_type t) +{ + int min_valid_off; + + if (t == BPF_WRITE) + min_valid_off = -MAX_BPF_STACK; + else + min_valid_off = -state->allocated_stack; + + if (off < min_valid_off || off > -1) + return -EACCES; + return 0; +} + +/* Check that the stack access at 'regno + off' falls within the maximum stack + * bounds. + * + * 'off' includes `regno->offset`, but not its dynamic part (if any). + */ +static int check_stack_access_within_bounds( + struct bpf_verifier_env *env, + int regno, int off, int access_size, + enum stack_access_src src, enum bpf_access_type type) +{ + struct bpf_reg_state *regs = cur_regs(env); + struct bpf_reg_state *reg = regs + regno; + struct bpf_func_state *state = func(env, reg); + int min_off, max_off; + int err; + char *err_extra; + + if (src == ACCESS_HELPER) + /* We don't know if helpers are reading or writing (or both). 
*/ + err_extra = " indirect access to"; + else if (type == BPF_READ) + err_extra = " read from"; + else + err_extra = " write to"; + + if (tnum_is_const(reg->var_off)) { + min_off = reg->var_off.value + off; + if (access_size > 0) + max_off = min_off + access_size - 1; + else + max_off = min_off; + } else { + if (reg->smax_value >= BPF_MAX_VAR_OFF || + reg->smin_value <= -BPF_MAX_VAR_OFF) { + verbose(env, "invalid unbounded variable-offset%s stack R%d\n", + err_extra, regno); + return -EACCES; + } + min_off = reg->smin_value + off; + if (access_size > 0) + max_off = reg->smax_value + off + access_size - 1; + else + max_off = min_off; + } + + err = check_stack_slot_within_bounds(min_off, state, type); + if (!err) + err = check_stack_slot_within_bounds(max_off, state, type); + + if (err) { + if (tnum_is_const(reg->var_off)) { + verbose(env, "invalid%s stack R%d off=%d size=%d\n", + err_extra, regno, off, access_size); + } else { + char tn_buf[48]; + + tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); + verbose(env, "invalid variable-offset%s stack R%d var_off=%s size=%d\n", + err_extra, regno, tn_buf, access_size); + } + } + return err; +} /* check whether memory at (regno + off) is accessible for t = (read | write) * if t==write, value_regno is a register which value is stored into memory @@ -3517,8 +3857,8 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn } } else if (reg->type == PTR_TO_STACK) { - off += reg->var_off.value; - err = check_stack_access(env, reg, off, size); + /* Basic bounds checks. */ + err = check_stack_access_within_bounds(env, regno, off, size, ACCESS_DIRECT, t); if (err) return err; @@ -3527,12 +3867,12 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn if (err) return err; - if (t == BPF_WRITE) - err = check_stack_write(env, state, off, size, - value_regno, insn_idx); - else - err = check_stack_read(env, state, off, size, + if (t == BPF_READ) + err = check_stack_read(env, regno, off, size, value_regno); + else + err = check_stack_write(env, regno, off, size, + value_regno, insn_idx); } else if (reg_is_pkt_pointer(reg)) { if (t == BPF_WRITE && !may_access_direct_pkt_data(env, NULL, t)) { verbose(env, "cannot write into packet\n"); @@ -3699,49 +4039,53 @@ static int check_atomic(struct bpf_verifier_env *env, int insn_idx, struct bpf_i return 0; } -static int __check_stack_boundary(struct bpf_verifier_env *env, u32 regno, - int off, int access_size, - bool zero_size_allowed) +/* When register 'regno' is used to read the stack (either directly or through + * a helper function) make sure that it's within stack boundary and, depending + * on the access type, that all elements of the stack are initialized. + * + * 'off' includes 'regno->off', but not its dynamic part (if any). + * + * All registers that have been spilled on the stack in the slots within the + * read offsets are marked as read. + */ +static int check_stack_range_initialized( + struct bpf_verifier_env *env, int regno, int off, + int access_size, bool zero_size_allowed, + enum stack_access_src type, struct bpf_call_arg_meta *meta) { struct bpf_reg_state *reg = reg_state(env, regno); + struct bpf_func_state *state = func(env, reg); + int err, min_off, max_off, i, j, slot, spi; + char *err_extra = type == ACCESS_HELPER ? " indirect" : ""; + enum bpf_access_type bounds_check_type; + /* Some accesses can write anything into the stack, others are + * read-only. 
+ */ + bool clobber = false; - if (off >= 0 || off < -MAX_BPF_STACK || off + access_size > 0 || - access_size < 0 || (access_size == 0 && !zero_size_allowed)) { - if (tnum_is_const(reg->var_off)) { - verbose(env, "invalid stack type R%d off=%d access_size=%d\n", - regno, off, access_size); - } else { - char tn_buf[48]; - - tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); - verbose(env, "invalid stack type R%d var_off=%s access_size=%d\n", - regno, tn_buf, access_size); - } + if (access_size == 0 && !zero_size_allowed) { + verbose(env, "invalid zero-sized read\n"); return -EACCES; } - return 0; -} -/* when register 'regno' is passed into function that will read 'access_size' - * bytes from that pointer, make sure that it's within stack boundary - * and all elements of stack are initialized. - * Unlike most pointer bounds-checking functions, this one doesn't take an - * 'off' argument, so it has to add in reg->off itself. - */ -static int check_stack_boundary(struct bpf_verifier_env *env, int regno, - int access_size, bool zero_size_allowed, - struct bpf_call_arg_meta *meta) -{ - struct bpf_reg_state *reg = reg_state(env, regno); - struct bpf_func_state *state = func(env, reg); - int err, min_off, max_off, i, j, slot, spi; + if (type == ACCESS_HELPER) { + /* The bounds checks for writes are more permissive than for + * reads. However, if raw_mode is not set, we'll do extra + * checks below. + */ + bounds_check_type = BPF_WRITE; + clobber = true; + } else { + bounds_check_type = BPF_READ; + } + err = check_stack_access_within_bounds(env, regno, off, access_size, + type, bounds_check_type); + if (err) + return err; + if (tnum_is_const(reg->var_off)) { - min_off = max_off = reg->var_off.value + reg->off; - err = __check_stack_boundary(env, regno, min_off, access_size, - zero_size_allowed); - if (err) - return err; + min_off = max_off = reg->var_off.value + off; } else { /* Variable offset is prohibited for unprivileged mode for * simplicity since it requires corresponding support in @@ -3752,8 +4096,8 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno, char tn_buf[48]; tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); - verbose(env, "R%d indirect variable offset stack access prohibited for !root, var_off=%s\n", - regno, tn_buf); + verbose(env, "R%d%s variable offset stack access prohibited for !root, var_off=%s\n", + regno, err_extra, tn_buf); return -EACCES; } /* Only initialized buffer on stack is allowed to be accessed @@ -3765,28 +4109,8 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno, if (meta && meta->raw_mode) meta = NULL; - if (reg->smax_value >= BPF_MAX_VAR_OFF || - reg->smax_value <= -BPF_MAX_VAR_OFF) { - verbose(env, "R%d unbounded indirect variable offset stack access\n", - regno); - return -EACCES; - } - min_off = reg->smin_value + reg->off; - max_off = reg->smax_value + reg->off; - err = __check_stack_boundary(env, regno, min_off, access_size, - zero_size_allowed); - if (err) { - verbose(env, "R%d min value is outside of stack bound\n", - regno); - return err; - } - err = __check_stack_boundary(env, regno, max_off, access_size, - zero_size_allowed); - if (err) { - verbose(env, "R%d max value is outside of stack bound\n", - regno); - return err; - } + min_off = reg->smin_value + off; + max_off = reg->smax_value + off; } if (meta && meta->raw_mode) { @@ -3806,8 +4130,10 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno, if (*stype == STACK_MISC) goto mark; if (*stype == STACK_ZERO) { - /* helper can write 
anything into the stack */ - *stype = STACK_MISC; + if (clobber) { + /* helper can write anything into the stack */ + *stype = STACK_MISC; + } goto mark; } @@ -3818,22 +4144,24 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno, if (state->stack[spi].slot_type[0] == STACK_SPILL && (state->stack[spi].spilled_ptr.type == SCALAR_VALUE || env->allow_ptr_leaks)) { - __mark_reg_unknown(env, &state->stack[spi].spilled_ptr); - for (j = 0; j < BPF_REG_SIZE; j++) - state->stack[spi].slot_type[j] = STACK_MISC; + if (clobber) { + __mark_reg_unknown(env, &state->stack[spi].spilled_ptr); + for (j = 0; j < BPF_REG_SIZE; j++) + state->stack[spi].slot_type[j] = STACK_MISC; + } goto mark; } err: if (tnum_is_const(reg->var_off)) { - verbose(env, "invalid indirect read from stack off %d+%d size %d\n", - min_off, i - min_off, access_size); + verbose(env, "invalid%s read from stack R%d off %d+%d size %d\n", + err_extra, regno, min_off, i - min_off, access_size); } else { char tn_buf[48]; tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); - verbose(env, "invalid indirect read from stack var_off %s+%d size %d\n", - tn_buf, i - min_off, access_size); + verbose(env, "invalid%s read from stack R%d var_off %s+%d size %d\n", + err_extra, regno, tn_buf, i - min_off, access_size); } return -EACCES; mark: @@ -3882,8 +4210,10 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, "rdwr", &env->prog->aux->max_rdwr_access); case PTR_TO_STACK: - return check_stack_boundary(env, regno, access_size, - zero_size_allowed, meta); + return check_stack_range_initialized( + env, + regno, reg->off, access_size, + zero_size_allowed, ACCESS_HELPER, meta); default: /* scalar_value or invalid ptr */ /* Allow zero-byte read from NULL, regardless of pointer type */ if (zero_size_allowed && access_size == 0 && @@ -5547,6 +5877,41 @@ do_sim: return !ret ? -EFAULT : 0; } +/* check that stack access falls within stack limits and that 'reg' doesn't + * have a variable offset. + * + * Variable offset is prohibited for unprivileged mode for simplicity since it + * requires corresponding support in Spectre masking for stack ALU. See also + * retrieve_ptr_limit(). + * + * + * 'off' includes 'reg->off'. + */ +static int check_stack_access_for_ptr_arithmetic( + struct bpf_verifier_env *env, + int regno, + const struct bpf_reg_state *reg, + int off) +{ + if (!tnum_is_const(reg->var_off)) { + char tn_buf[48]; + + tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); + verbose(env, "R%d variable stack access prohibited for !root, var_off=%s off=%d\n", + regno, tn_buf, off); + return -EACCES; + } + + if (off >= 0 || off < -MAX_BPF_STACK) { + verbose(env, "R%d stack pointer arithmetic goes out of range, " + "prohibited for !root; off=%d\n", regno, off); + return -EACCES; + } + + return 0; +} + + /* Handles arithmetic on a pointer and a scalar: computes new min/max and var_off. * Caller should also handle BPF_MOV case separately. 
* If we return -EACCES, caller may want to try again treating pointer as a @@ -5790,10 +6155,9 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, "prohibited for !root\n", dst); return -EACCES; } else if (dst_reg->type == PTR_TO_STACK && - check_stack_access(env, dst_reg, dst_reg->off + - dst_reg->var_off.value, 1)) { - verbose(env, "R%d stack pointer arithmetic goes out of range, " - "prohibited for !root\n", dst); + check_stack_access_for_ptr_arithmetic( + env, dst, dst_reg, dst_reg->off + + dst_reg->var_off.value)) { return -EACCES; } } @@ -12129,6 +12493,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, env->strict_alignment = false; env->allow_ptr_leaks = bpf_allow_ptr_leaks(); + env->allow_uninit_stack = bpf_allow_uninit_stack(); env->allow_ptr_to_map_access = bpf_allow_ptr_to_map_access(); env->bypass_spec_v1 = bpf_bypass_spec_v1(); env->bypass_spec_v4 = bpf_bypass_spec_v4(); -- cgit v1.2.3 From 6df8fb83301d68ea0a0c0e1cbcc790fcc333ed12 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Tue, 9 Feb 2021 12:27:01 +0100 Subject: bpf_lru_list: Read double-checked variable once without lock For double-checked locking in bpf_common_lru_push_free(), node->type is read outside the critical section and then re-checked under the lock. However, concurrent writes to node->type result in data races. For example, the following concurrent access was observed by KCSAN: write to 0xffff88801521bc22 of 1 bytes by task 10038 on cpu 1: __bpf_lru_node_move_in kernel/bpf/bpf_lru_list.c:91 __local_list_flush kernel/bpf/bpf_lru_list.c:298 ... read to 0xffff88801521bc22 of 1 bytes by task 10043 on cpu 0: bpf_common_lru_push_free kernel/bpf/bpf_lru_list.c:507 bpf_lru_push_free kernel/bpf/bpf_lru_list.c:555 ... Fix the data races where node->type is read outside the critical section (for double-checked locking) by marking the access with READ_ONCE() as well as ensuring the variable is only accessed once. Fixes: 3a08c2fd7634 ("bpf: LRU List") Reported-by: syzbot+3536db46dfa58c573458@syzkaller.appspotmail.com Reported-by: syzbot+516acdb03d3e27d91bcd@syzkaller.appspotmail.com Signed-off-by: Marco Elver Signed-off-by: Andrii Nakryiko Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20210209112701.3341724-1-elver@google.com --- kernel/bpf/bpf_lru_list.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/bpf_lru_list.c b/kernel/bpf/bpf_lru_list.c index 1b6b9349cb85..d99e89f113c4 100644 --- a/kernel/bpf/bpf_lru_list.c +++ b/kernel/bpf/bpf_lru_list.c @@ -502,13 +502,14 @@ struct bpf_lru_node *bpf_lru_pop_free(struct bpf_lru *lru, u32 hash) static void bpf_common_lru_push_free(struct bpf_lru *lru, struct bpf_lru_node *node) { + u8 node_type = READ_ONCE(node->type); unsigned long flags; - if (WARN_ON_ONCE(node->type == BPF_LRU_LIST_T_FREE) || - WARN_ON_ONCE(node->type == BPF_LRU_LOCAL_LIST_T_FREE)) + if (WARN_ON_ONCE(node_type == BPF_LRU_LIST_T_FREE) || + WARN_ON_ONCE(node_type == BPF_LRU_LOCAL_LIST_T_FREE)) return; - if (node->type == BPF_LRU_LOCAL_LIST_T_PENDING) { + if (node_type == BPF_LRU_LOCAL_LIST_T_PENDING) { struct bpf_lru_locallist *loc_l; loc_l = per_cpu_ptr(lru->common_lru.local_list, node->cpu); -- cgit v1.2.3 From 700d4796ef59f5faf240d307839bd419e2b6bdff Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 9 Feb 2021 19:36:26 -0800 Subject: bpf: Optimize program stats Move bpf_prog_stats from prog->aux into prog to avoid one extra load in critical path of program execution. 
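The "one extra load" is the dependent pointer chase on the hot path; a toy model of the layout change (struct and field names loosely mirror the kernel ones but are simplified stand-ins, not the kernel definitions):

struct stats { unsigned long long cnt, nsecs; };

/* Before: the per-cpu stats pointer hangs off the auxiliary struct. */
struct prog_aux_old { struct stats *stats; };
struct prog_old { struct prog_aux_old *aux; };

/* After: the pointer lives in struct bpf_prog itself. */
struct prog_new { struct stats *stats; };

static unsigned long long run_cnt_old(const struct prog_old *prog)
{
	return prog->aux->stats->cnt;	/* load prog->aux, then aux->stats, then the data */
}

static unsigned long long run_cnt_new(const struct prog_new *prog)
{
	return prog->stats->cnt;	/* load prog->stats, then the data */
}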
Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210210033634.62081-2-alexei.starovoitov@gmail.com --- include/linux/bpf.h | 8 -------- include/linux/filter.h | 14 +++++++++++--- kernel/bpf/core.c | 8 ++++---- kernel/bpf/syscall.c | 2 +- kernel/bpf/trampoline.c | 2 +- kernel/bpf/verifier.c | 2 +- 6 files changed, 18 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 079162bbd387..5a388955e6b6 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -14,7 +14,6 @@ #include #include #include -#include #include #include #include @@ -507,12 +506,6 @@ enum bpf_cgroup_storage_type { */ #define MAX_BPF_FUNC_ARGS 12 -struct bpf_prog_stats { - u64 cnt; - u64 nsecs; - struct u64_stats_sync syncp; -} __aligned(2 * sizeof(u64)); - struct btf_func_model { u8 ret_size; u8 nr_args; @@ -845,7 +838,6 @@ struct bpf_prog_aux { u32 linfo_idx; u32 num_exentries; struct exception_table_entry *extable; - struct bpf_prog_stats __percpu *stats; union { struct work_struct work; struct rcu_head rcu; diff --git a/include/linux/filter.h b/include/linux/filter.h index 5b3137d7b690..cecb03c9d251 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -22,6 +22,7 @@ #include #include #include +#include #include @@ -539,6 +540,12 @@ struct bpf_binary_header { u8 image[] __aligned(BPF_IMAGE_ALIGNMENT); }; +struct bpf_prog_stats { + u64 cnt; + u64 nsecs; + struct u64_stats_sync syncp; +} __aligned(2 * sizeof(u64)); + struct bpf_prog { u16 pages; /* Number of allocated pages */ u16 jited:1, /* Is our filter JIT'ed? */ @@ -557,10 +564,11 @@ struct bpf_prog { u32 len; /* Number of filter blocks */ u32 jited_len; /* Size of jited insns in bytes */ u8 tag[BPF_TAG_SIZE]; - struct bpf_prog_aux *aux; /* Auxiliary fields */ - struct sock_fprog_kern *orig_prog; /* Original BPF program */ + struct bpf_prog_stats __percpu *stats; unsigned int (*bpf_func)(const void *ctx, const struct bpf_insn *insn); + struct bpf_prog_aux *aux; /* Auxiliary fields */ + struct sock_fprog_kern *orig_prog; /* Original BPF program */ /* Instructions for interpreter */ struct sock_filter insns[0]; struct bpf_insn insnsi[]; @@ -581,7 +589,7 @@ DECLARE_STATIC_KEY_FALSE(bpf_stats_enabled_key); struct bpf_prog_stats *__stats; \ u64 __start = sched_clock(); \ __ret = dfunc(ctx, (prog)->insnsi, (prog)->bpf_func); \ - __stats = this_cpu_ptr(prog->aux->stats); \ + __stats = this_cpu_ptr(prog->stats); \ u64_stats_update_begin(&__stats->syncp); \ __stats->cnt++; \ __stats->nsecs += sched_clock() - __start; \ diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 5bbd4884ff7a..2cf71fd39c22 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -114,8 +114,8 @@ struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags) if (!prog) return NULL; - prog->aux->stats = alloc_percpu_gfp(struct bpf_prog_stats, gfp_flags); - if (!prog->aux->stats) { + prog->stats = alloc_percpu_gfp(struct bpf_prog_stats, gfp_flags); + if (!prog->stats) { kfree(prog->aux); vfree(prog); return NULL; @@ -124,7 +124,7 @@ struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags) for_each_possible_cpu(cpu) { struct bpf_prog_stats *pstats; - pstats = per_cpu_ptr(prog->aux->stats, cpu); + pstats = per_cpu_ptr(prog->stats, cpu); u64_stats_init(&pstats->syncp); } return prog; @@ -249,10 +249,10 @@ void __bpf_prog_free(struct bpf_prog *fp) if (fp->aux) { mutex_destroy(&fp->aux->used_maps_mutex); 
mutex_destroy(&fp->aux->dst_mutex); - free_percpu(fp->aux->stats); kfree(fp->aux->poke_tab); kfree(fp->aux); } + free_percpu(fp->stats); vfree(fp); } diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index e5999d86c76e..f7df56a704de 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1739,7 +1739,7 @@ static void bpf_prog_get_stats(const struct bpf_prog *prog, unsigned int start; u64 tnsecs, tcnt; - st = per_cpu_ptr(prog->aux->stats, cpu); + st = per_cpu_ptr(prog->stats, cpu); do { start = u64_stats_fetch_begin_irq(&st->syncp); tnsecs = st->nsecs; diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index 35c5887d82ff..5be3beeedd74 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -412,7 +412,7 @@ void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start) * Hence check that 'start' is not zero. */ start) { - stats = this_cpu_ptr(prog->aux->stats); + stats = this_cpu_ptr(prog->stats); u64_stats_update_begin(&stats->syncp); stats->cnt++; stats->nsecs += sched_clock() - start; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 400d79e99fc8..424c1ba0f52f 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -11253,7 +11253,7 @@ static int jit_subprogs(struct bpf_verifier_env *env) /* BPF_PROG_RUN doesn't call subprogs directly, * hence main prog stats include the runtime of subprogs. * subprogs don't have IDs and not reachable via prog_get_next_id - * func[i]->aux->stats will never be accessed and stays NULL + * func[i]->stats will never be accessed and stays NULL */ func[i] = bpf_prog_alloc_no_stats(bpf_prog_size(len), GFP_USER); if (!func[i]) -- cgit v1.2.3 From 031d6e02ddbb8dea747c1abb697d556901f07dd4 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 9 Feb 2021 19:36:27 -0800 Subject: bpf: Run sleepable programs with migration disabled In older non-RT kernels migrate_disable() was the same as preempt_disable(). Since commit 74d862b682f5 ("sched: Make migrate_disable/enable() independent of RT") migrate_disable() is real and doesn't prevent sleeping. Running sleepable programs with migration disabled allows to add support for program stats and per-cpu maps later. Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: KP Singh Link: https://lore.kernel.org/bpf/20210210033634.62081-3-alexei.starovoitov@gmail.com --- kernel/bpf/trampoline.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index 5be3beeedd74..89fc849ba271 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -425,11 +425,13 @@ void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start) void notrace __bpf_prog_enter_sleepable(void) { rcu_read_lock_trace(); + migrate_disable(); might_fault(); } void notrace __bpf_prog_exit_sleepable(void) { + migrate_enable(); rcu_read_unlock_trace(); } -- cgit v1.2.3 From f2dd3b39467411c53703125a111f45b3672c1771 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 9 Feb 2021 19:36:28 -0800 Subject: bpf: Compute program stats for sleepable programs Since sleepable programs don't migrate from the cpu the excution stats can be computed for them as well. Reuse the same infrastructure for both sleepable and non-sleepable programs. run_cnt -> the number of times the program was executed. run_time_ns -> the program execution time in nanoseconds including the off-cpu time when the program was sleeping. 
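[Editorial note] As a hedged userspace sketch, not part of this patch, the counters fed by these hooks can be read back through the existing bpf_obj_get_info_by_fd() libbpf call; for a sleepable program run_time_ns now includes its off-cpu time. The prog_fd is assumed to come from elsewhere, e.g. bpf_prog_get_fd_by_id().

    #include <stdio.h>
    #include <string.h>
    #include <bpf/bpf.h>

    static int print_prog_stats(int prog_fd)
    {
            struct bpf_prog_info info;
            __u32 len = sizeof(info);

            memset(&info, 0, sizeof(info));
            if (bpf_obj_get_info_by_fd(prog_fd, &info, &len))
                    return -1;

            /* These counters only accumulate while the
             * kernel.bpf_stats_enabled sysctl (static key) is on.
             */
            printf("run_cnt=%llu run_time_ns=%llu\n",
                   (unsigned long long)info.run_cnt,
                   (unsigned long long)info.run_time_ns);
            return 0;
    }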
Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Acked-by: KP Singh Link: https://lore.kernel.org/bpf/20210210033634.62081-4-alexei.starovoitov@gmail.com --- arch/x86/net/bpf_jit_comp.c | 31 ++++++++++++------------------- include/linux/bpf.h | 4 ++-- kernel/bpf/trampoline.c | 42 ++++++++++++++++++++++++++++-------------- 3 files changed, 42 insertions(+), 35 deletions(-) (limited to 'kernel') diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index a3dc3bd154ac..d11b9bcebbea 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -1742,15 +1742,12 @@ static int invoke_bpf_prog(const struct btf_func_model *m, u8 **pprog, u8 *prog = *pprog; int cnt = 0; - if (p->aux->sleepable) { - if (emit_call(&prog, __bpf_prog_enter_sleepable, prog)) + if (emit_call(&prog, + p->aux->sleepable ? __bpf_prog_enter_sleepable : + __bpf_prog_enter, prog)) return -EINVAL; - } else { - if (emit_call(&prog, __bpf_prog_enter, prog)) - return -EINVAL; - /* remember prog start time returned by __bpf_prog_enter */ - emit_mov_reg(&prog, true, BPF_REG_6, BPF_REG_0); - } + /* remember prog start time returned by __bpf_prog_enter */ + emit_mov_reg(&prog, true, BPF_REG_6, BPF_REG_0); /* arg1: lea rdi, [rbp - stack_size] */ EMIT4(0x48, 0x8D, 0x7D, -stack_size); @@ -1770,18 +1767,14 @@ static int invoke_bpf_prog(const struct btf_func_model *m, u8 **pprog, if (mod_ret) emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_0, -8); - if (p->aux->sleepable) { - if (emit_call(&prog, __bpf_prog_exit_sleepable, prog)) + /* arg1: mov rdi, progs[i] */ + emit_mov_imm64(&prog, BPF_REG_1, (long) p >> 32, (u32) (long) p); + /* arg2: mov rsi, rbx <- start time in nsec */ + emit_mov_reg(&prog, true, BPF_REG_2, BPF_REG_6); + if (emit_call(&prog, + p->aux->sleepable ? __bpf_prog_exit_sleepable : + __bpf_prog_exit, prog)) return -EINVAL; - } else { - /* arg1: mov rdi, progs[i] */ - emit_mov_imm64(&prog, BPF_REG_1, (long) p >> 32, - (u32) (long) p); - /* arg2: mov rsi, rbx <- start time in nsec */ - emit_mov_reg(&prog, true, BPF_REG_2, BPF_REG_6); - if (emit_call(&prog, __bpf_prog_exit, prog)) - return -EINVAL; - } *pprog = prog; return 0; diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 5a388955e6b6..e1840ceaf55f 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -563,8 +563,8 @@ int arch_prepare_bpf_trampoline(void *image, void *image_end, /* these two functions are called from generated trampoline */ u64 notrace __bpf_prog_enter(void); void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start); -void notrace __bpf_prog_enter_sleepable(void); -void notrace __bpf_prog_exit_sleepable(void); +u64 notrace __bpf_prog_enter_sleepable(void); +void notrace __bpf_prog_exit_sleepable(struct bpf_prog *prog, u64 start); struct bpf_ksym { unsigned long start; diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index 89fc849ba271..48eb021e1421 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -381,56 +381,70 @@ out: mutex_unlock(&trampoline_mutex); } +#define NO_START_TIME 0 +static u64 notrace bpf_prog_start_time(void) +{ + u64 start = NO_START_TIME; + + if (static_branch_unlikely(&bpf_stats_enabled_key)) + start = sched_clock(); + return start; +} + /* The logic is similar to BPF_PROG_RUN, but with an explicit * rcu_read_lock() and migrate_disable() which are required * for the trampoline. 
The macro is split into - * call _bpf_prog_enter + * call __bpf_prog_enter * call prog->bpf_func * call __bpf_prog_exit */ u64 notrace __bpf_prog_enter(void) __acquires(RCU) { - u64 start = 0; - rcu_read_lock(); migrate_disable(); - if (static_branch_unlikely(&bpf_stats_enabled_key)) - start = sched_clock(); - return start; + return bpf_prog_start_time(); } -void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start) - __releases(RCU) +static void notrace update_prog_stats(struct bpf_prog *prog, + u64 start) { struct bpf_prog_stats *stats; if (static_branch_unlikely(&bpf_stats_enabled_key) && - /* static_key could be enabled in __bpf_prog_enter - * and disabled in __bpf_prog_exit. + /* static_key could be enabled in __bpf_prog_enter* + * and disabled in __bpf_prog_exit*. * And vice versa. - * Hence check that 'start' is not zero. + * Hence check that 'start' is valid. */ - start) { + start > NO_START_TIME) { stats = this_cpu_ptr(prog->stats); u64_stats_update_begin(&stats->syncp); stats->cnt++; stats->nsecs += sched_clock() - start; u64_stats_update_end(&stats->syncp); } +} + +void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start) + __releases(RCU) +{ + update_prog_stats(prog, start); migrate_enable(); rcu_read_unlock(); } -void notrace __bpf_prog_enter_sleepable(void) +u64 notrace __bpf_prog_enter_sleepable(void) { rcu_read_lock_trace(); migrate_disable(); might_fault(); + return bpf_prog_start_time(); } -void notrace __bpf_prog_exit_sleepable(void) +void notrace __bpf_prog_exit_sleepable(struct bpf_prog *prog, u64 start) { + update_prog_stats(prog, start); migrate_enable(); rcu_read_unlock_trace(); } -- cgit v1.2.3 From ca06f55b90020cd97f4cc6d52db95436162e7dcf Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 9 Feb 2021 19:36:29 -0800 Subject: bpf: Add per-program recursion prevention mechanism Since both sleepable and non-sleepable programs execute under migrate_disable add recursion prevention mechanism to both types of programs when they're executed via bpf trampoline. Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210210033634.62081-5-alexei.starovoitov@gmail.com --- arch/x86/net/bpf_jit_comp.c | 15 ++++++++++++++ include/linux/bpf.h | 6 +++--- include/linux/filter.h | 1 + kernel/bpf/core.c | 8 ++++++++ kernel/bpf/trampoline.c | 23 ++++++++++++++++++---- .../selftests/bpf/prog_tests/fexit_stress.c | 4 ++-- .../selftests/bpf/prog_tests/trampoline_count.c | 4 ++-- 7 files changed, 50 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index d11b9bcebbea..79e7a0ec1da5 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -1740,8 +1740,11 @@ static int invoke_bpf_prog(const struct btf_func_model *m, u8 **pprog, struct bpf_prog *p, int stack_size, bool mod_ret) { u8 *prog = *pprog; + u8 *jmp_insn; int cnt = 0; + /* arg1: mov rdi, progs[i] */ + emit_mov_imm64(&prog, BPF_REG_1, (long) p >> 32, (u32) (long) p); if (emit_call(&prog, p->aux->sleepable ? 
__bpf_prog_enter_sleepable : __bpf_prog_enter, prog)) @@ -1749,6 +1752,14 @@ static int invoke_bpf_prog(const struct btf_func_model *m, u8 **pprog, /* remember prog start time returned by __bpf_prog_enter */ emit_mov_reg(&prog, true, BPF_REG_6, BPF_REG_0); + /* if (__bpf_prog_enter*(prog) == 0) + * goto skip_exec_of_prog; + */ + EMIT3(0x48, 0x85, 0xC0); /* test rax,rax */ + /* emit 2 nops that will be replaced with JE insn */ + jmp_insn = prog; + emit_nops(&prog, 2); + /* arg1: lea rdi, [rbp - stack_size] */ EMIT4(0x48, 0x8D, 0x7D, -stack_size); /* arg2: progs[i]->insnsi for interpreter */ @@ -1767,6 +1778,10 @@ static int invoke_bpf_prog(const struct btf_func_model *m, u8 **pprog, if (mod_ret) emit_stx(&prog, BPF_DW, BPF_REG_FP, BPF_REG_0, -8); + /* replace 2 nops with JE insn, since jmp target is known */ + jmp_insn[0] = X86_JE; + jmp_insn[1] = prog - jmp_insn - 2; + /* arg1: mov rdi, progs[i] */ emit_mov_imm64(&prog, BPF_REG_1, (long) p >> 32, (u32) (long) p); /* arg2: mov rsi, rbx <- start time in nsec */ diff --git a/include/linux/bpf.h b/include/linux/bpf.h index e1840ceaf55f..1c8ea682c007 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -529,7 +529,7 @@ struct btf_func_model { /* Each call __bpf_prog_enter + call bpf_func + call __bpf_prog_exit is ~50 * bytes on x86. Pick a number to fit into BPF_IMAGE_SIZE / 2 */ -#define BPF_MAX_TRAMP_PROGS 40 +#define BPF_MAX_TRAMP_PROGS 38 struct bpf_tramp_progs { struct bpf_prog *progs[BPF_MAX_TRAMP_PROGS]; @@ -561,9 +561,9 @@ int arch_prepare_bpf_trampoline(void *image, void *image_end, struct bpf_tramp_progs *tprogs, void *orig_call); /* these two functions are called from generated trampoline */ -u64 notrace __bpf_prog_enter(void); +u64 notrace __bpf_prog_enter(struct bpf_prog *prog); void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start); -u64 notrace __bpf_prog_enter_sleepable(void); +u64 notrace __bpf_prog_enter_sleepable(struct bpf_prog *prog); void notrace __bpf_prog_exit_sleepable(struct bpf_prog *prog, u64 start); struct bpf_ksym { diff --git a/include/linux/filter.h b/include/linux/filter.h index cecb03c9d251..6a06f3c69f4e 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -565,6 +565,7 @@ struct bpf_prog { u32 jited_len; /* Size of jited insns in bytes */ u8 tag[BPF_TAG_SIZE]; struct bpf_prog_stats __percpu *stats; + int __percpu *active; unsigned int (*bpf_func)(const void *ctx, const struct bpf_insn *insn); struct bpf_prog_aux *aux; /* Auxiliary fields */ diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 2cf71fd39c22..334070c4b8a1 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -91,6 +91,12 @@ struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flag vfree(fp); return NULL; } + fp->active = alloc_percpu_gfp(int, GFP_KERNEL_ACCOUNT | gfp_extra_flags); + if (!fp->active) { + vfree(fp); + kfree(aux); + return NULL; + } fp->pages = size / PAGE_SIZE; fp->aux = aux; @@ -116,6 +122,7 @@ struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags) prog->stats = alloc_percpu_gfp(struct bpf_prog_stats, gfp_flags); if (!prog->stats) { + free_percpu(prog->active); kfree(prog->aux); vfree(prog); return NULL; @@ -253,6 +260,7 @@ void __bpf_prog_free(struct bpf_prog *fp) kfree(fp->aux); } free_percpu(fp->stats); + free_percpu(fp->active); vfree(fp); } diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index 48eb021e1421..89ef6320d19b 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -381,13 +381,16 @@ out: 
mutex_unlock(&trampoline_mutex); } -#define NO_START_TIME 0 +#define NO_START_TIME 1 static u64 notrace bpf_prog_start_time(void) { u64 start = NO_START_TIME; - if (static_branch_unlikely(&bpf_stats_enabled_key)) + if (static_branch_unlikely(&bpf_stats_enabled_key)) { start = sched_clock(); + if (unlikely(!start)) + start = NO_START_TIME; + } return start; } @@ -397,12 +400,20 @@ static u64 notrace bpf_prog_start_time(void) * call __bpf_prog_enter * call prog->bpf_func * call __bpf_prog_exit + * + * __bpf_prog_enter returns: + * 0 - skip execution of the bpf prog + * 1 - execute bpf prog + * [2..MAX_U64] - excute bpf prog and record execution time. + * This is start time. */ -u64 notrace __bpf_prog_enter(void) +u64 notrace __bpf_prog_enter(struct bpf_prog *prog) __acquires(RCU) { rcu_read_lock(); migrate_disable(); + if (unlikely(__this_cpu_inc_return(*(prog->active)) != 1)) + return 0; return bpf_prog_start_time(); } @@ -430,21 +441,25 @@ void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start) __releases(RCU) { update_prog_stats(prog, start); + __this_cpu_dec(*(prog->active)); migrate_enable(); rcu_read_unlock(); } -u64 notrace __bpf_prog_enter_sleepable(void) +u64 notrace __bpf_prog_enter_sleepable(struct bpf_prog *prog) { rcu_read_lock_trace(); migrate_disable(); might_fault(); + if (unlikely(__this_cpu_inc_return(*(prog->active)) != 1)) + return 0; return bpf_prog_start_time(); } void notrace __bpf_prog_exit_sleepable(struct bpf_prog *prog, u64 start) { update_prog_stats(prog, start); + __this_cpu_dec(*(prog->active)); migrate_enable(); rcu_read_unlock_trace(); } diff --git a/tools/testing/selftests/bpf/prog_tests/fexit_stress.c b/tools/testing/selftests/bpf/prog_tests/fexit_stress.c index 3b9dbf7433f0..7c9b62e971f1 100644 --- a/tools/testing/selftests/bpf/prog_tests/fexit_stress.c +++ b/tools/testing/selftests/bpf/prog_tests/fexit_stress.c @@ -2,8 +2,8 @@ /* Copyright (c) 2019 Facebook */ #include -/* x86-64 fits 55 JITed and 43 interpreted progs into half page */ -#define CNT 40 +/* that's kernel internal BPF_MAX_TRAMP_PROGS define */ +#define CNT 38 void test_fexit_stress(void) { diff --git a/tools/testing/selftests/bpf/prog_tests/trampoline_count.c b/tools/testing/selftests/bpf/prog_tests/trampoline_count.c index 781c8d11604b..f3022d934e2d 100644 --- a/tools/testing/selftests/bpf/prog_tests/trampoline_count.c +++ b/tools/testing/selftests/bpf/prog_tests/trampoline_count.c @@ -4,7 +4,7 @@ #include #include -#define MAX_TRAMP_PROGS 40 +#define MAX_TRAMP_PROGS 38 struct inst { struct bpf_object *obj; @@ -52,7 +52,7 @@ void test_trampoline_count(void) struct bpf_link *link; char comm[16] = {}; - /* attach 'allowed' 40 trampoline programs */ + /* attach 'allowed' trampoline programs */ for (i = 0; i < MAX_TRAMP_PROGS; i++) { obj = bpf_object__open_file(object, NULL); if (CHECK(IS_ERR(obj), "obj_open_file", "err %ld\n", PTR_ERR(obj))) { -- cgit v1.2.3 From 9ed9e9ba2337205311398a312796c213737bac35 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 9 Feb 2021 19:36:31 -0800 Subject: bpf: Count the number of times recursion was prevented Add per-program counter for number of times recursion prevention mechanism was triggered and expose it via show_fdinfo and bpf_prog_info. Teach bpftool to print it. 
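[Editorial note] Condensed, illustrative view of the mechanism this counter reports on; the real code is spread across the hunks of this and the previous patch, and the function names below mirror it without being the kernel source verbatim.

    u64 enter(struct bpf_prog *prog)                /* cf. __bpf_prog_enter() */
    {
            if (__this_cpu_inc_return(*(prog->active)) != 1) {
                    inc_misses_counter(prog);       /* exported as recursion_misses */
                    return 0;                       /* trampoline skips bpf_func */
            }
            return bpf_prog_start_time();
    }

    void leave(struct bpf_prog *prog, u64 start)    /* cf. __bpf_prog_exit() */
    {
            update_prog_stats(prog, start);
            __this_cpu_dec(*(prog->active));        /* re-arm this CPU */
    }

Userspace sees the per-cpu misses aggregated as "recursion_misses" in fdinfo, struct bpf_prog_info and "bpftool prog show".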
Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210210033634.62081-7-alexei.starovoitov@gmail.com --- include/linux/filter.h | 1 + include/uapi/linux/bpf.h | 1 + kernel/bpf/syscall.c | 14 ++++++++++---- kernel/bpf/trampoline.c | 18 ++++++++++++++++-- tools/bpf/bpftool/prog.c | 4 ++++ tools/include/uapi/linux/bpf.h | 1 + 6 files changed, 33 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/include/linux/filter.h b/include/linux/filter.h index 6a06f3c69f4e..3b00fc906ccd 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -543,6 +543,7 @@ struct bpf_binary_header { struct bpf_prog_stats { u64 cnt; u64 nsecs; + u64 misses; struct u64_stats_sync syncp; } __aligned(2 * sizeof(u64)); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index c001766adcbc..c547ad1ffe43 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -4501,6 +4501,7 @@ struct bpf_prog_info { __aligned_u64 prog_tags; __u64 run_time_ns; __u64 run_cnt; + __u64 recursion_misses; } __attribute__((aligned(8))); struct bpf_map_info { diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index f7df56a704de..c859bc46d06c 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1731,25 +1731,28 @@ static int bpf_prog_release(struct inode *inode, struct file *filp) static void bpf_prog_get_stats(const struct bpf_prog *prog, struct bpf_prog_stats *stats) { - u64 nsecs = 0, cnt = 0; + u64 nsecs = 0, cnt = 0, misses = 0; int cpu; for_each_possible_cpu(cpu) { const struct bpf_prog_stats *st; unsigned int start; - u64 tnsecs, tcnt; + u64 tnsecs, tcnt, tmisses; st = per_cpu_ptr(prog->stats, cpu); do { start = u64_stats_fetch_begin_irq(&st->syncp); tnsecs = st->nsecs; tcnt = st->cnt; + tmisses = st->misses; } while (u64_stats_fetch_retry_irq(&st->syncp, start)); nsecs += tnsecs; cnt += tcnt; + misses += tmisses; } stats->nsecs = nsecs; stats->cnt = cnt; + stats->misses = misses; } #ifdef CONFIG_PROC_FS @@ -1768,14 +1771,16 @@ static void bpf_prog_show_fdinfo(struct seq_file *m, struct file *filp) "memlock:\t%llu\n" "prog_id:\t%u\n" "run_time_ns:\t%llu\n" - "run_cnt:\t%llu\n", + "run_cnt:\t%llu\n" + "recursion_misses:\t%llu\n", prog->type, prog->jited, prog_tag, prog->pages * 1ULL << PAGE_SHIFT, prog->aux->id, stats.nsecs, - stats.cnt); + stats.cnt, + stats.misses); } #endif @@ -3438,6 +3443,7 @@ static int bpf_prog_get_info_by_fd(struct file *file, bpf_prog_get_stats(prog, &stats); info.run_time_ns = stats.nsecs; info.run_cnt = stats.cnt; + info.recursion_misses = stats.misses; if (!bpf_capable()) { info.jited_prog_len = 0; diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index 89ef6320d19b..7bc3b3209224 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -394,6 +394,16 @@ static u64 notrace bpf_prog_start_time(void) return start; } +static void notrace inc_misses_counter(struct bpf_prog *prog) +{ + struct bpf_prog_stats *stats; + + stats = this_cpu_ptr(prog->stats); + u64_stats_update_begin(&stats->syncp); + stats->misses++; + u64_stats_update_end(&stats->syncp); +} + /* The logic is similar to BPF_PROG_RUN, but with an explicit * rcu_read_lock() and migrate_disable() which are required * for the trampoline. 
The macro is split into @@ -412,8 +422,10 @@ u64 notrace __bpf_prog_enter(struct bpf_prog *prog) { rcu_read_lock(); migrate_disable(); - if (unlikely(__this_cpu_inc_return(*(prog->active)) != 1)) + if (unlikely(__this_cpu_inc_return(*(prog->active)) != 1)) { + inc_misses_counter(prog); return 0; + } return bpf_prog_start_time(); } @@ -451,8 +463,10 @@ u64 notrace __bpf_prog_enter_sleepable(struct bpf_prog *prog) rcu_read_lock_trace(); migrate_disable(); might_fault(); - if (unlikely(__this_cpu_inc_return(*(prog->active)) != 1)) + if (unlikely(__this_cpu_inc_return(*(prog->active)) != 1)) { + inc_misses_counter(prog); return 0; + } return bpf_prog_start_time(); } diff --git a/tools/bpf/bpftool/prog.c b/tools/bpf/bpftool/prog.c index 1fe3ba255bad..f2b915b20546 100644 --- a/tools/bpf/bpftool/prog.c +++ b/tools/bpf/bpftool/prog.c @@ -368,6 +368,8 @@ static void print_prog_header_json(struct bpf_prog_info *info) jsonw_uint_field(json_wtr, "run_time_ns", info->run_time_ns); jsonw_uint_field(json_wtr, "run_cnt", info->run_cnt); } + if (info->recursion_misses) + jsonw_uint_field(json_wtr, "recursion_misses", info->recursion_misses); } static void print_prog_json(struct bpf_prog_info *info, int fd) @@ -446,6 +448,8 @@ static void print_prog_header_plain(struct bpf_prog_info *info) if (info->run_time_ns) printf(" run_time_ns %lld run_cnt %lld", info->run_time_ns, info->run_cnt); + if (info->recursion_misses) + printf(" recursion_misses %lld", info->recursion_misses); printf("\n"); } diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index c001766adcbc..c547ad1ffe43 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -4501,6 +4501,7 @@ struct bpf_prog_info { __aligned_u64 prog_tags; __u64 run_time_ns; __u64 run_cnt; + __u64 recursion_misses; } __attribute__((aligned(8))); struct bpf_map_info { -- cgit v1.2.3 From 638e4b825d523bed7a55e776c153049fb7716466 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 9 Feb 2021 19:36:33 -0800 Subject: bpf: Allows per-cpu maps and map-in-map in sleepable programs Since sleepable programs are now executing under migrate_disable the per-cpu maps are safe to use. The map-in-map were ok to use in sleepable from the time sleepable progs were introduced. Note that non-preallocated maps are still not safe, since there is no rcu_read_lock yet in sleepable programs and dynamically allocated map elements are relying on rcu protection. The sleepable programs have rcu_read_lock_trace instead. That limitation will be addresses in the future. 
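[Editorial note] A hedged example of what this enables, not taken from the patch: a sleepable LSM program using a per-cpu array, which is preallocated by construction. The hook choice and names are illustrative; sleepable attachment is only permitted for hooks the kernel allows it for.

    #include "vmlinux.h"
    #include <bpf/bpf_helpers.h>
    #include <bpf/bpf_tracing.h>

    char _license[] SEC("license") = "GPL";

    struct {
            __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);  /* always preallocated */
            __uint(max_entries, 1);
            __type(key, u32);
            __type(value, u64);
    } open_cnt SEC(".maps");

    SEC("lsm.s/file_open")              /* ".s" marks the program sleepable */
    int BPF_PROG(count_file_open, struct file *file)
    {
            u32 key = 0;
            u64 *val = bpf_map_lookup_elem(&open_cnt, &key);

            if (val)
                    __sync_fetch_and_add(val, 1);
            return 0;
    }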
Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Acked-by: KP Singh Link: https://lore.kernel.org/bpf/20210210033634.62081-9-alexei.starovoitov@gmail.com --- kernel/bpf/hashtab.c | 4 ++-- kernel/bpf/verifier.c | 7 ++++++- 2 files changed, 8 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index c1ac7f964bc9..d63912e73ad9 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -1148,7 +1148,7 @@ static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key, /* unknown flags */ return -EINVAL; - WARN_ON_ONCE(!rcu_read_lock_held()); + WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held()); key_size = map->key_size; @@ -1202,7 +1202,7 @@ static int __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key, /* unknown flags */ return -EINVAL; - WARN_ON_ONCE(!rcu_read_lock_held()); + WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held()); key_size = map->key_size; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 424c1ba0f52f..15c15ea0abf5 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -10384,9 +10384,14 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env, case BPF_MAP_TYPE_HASH: case BPF_MAP_TYPE_LRU_HASH: case BPF_MAP_TYPE_ARRAY: + case BPF_MAP_TYPE_PERCPU_HASH: + case BPF_MAP_TYPE_PERCPU_ARRAY: + case BPF_MAP_TYPE_LRU_PERCPU_HASH: + case BPF_MAP_TYPE_ARRAY_OF_MAPS: + case BPF_MAP_TYPE_HASH_OF_MAPS: if (!is_preallocated_map(map)) { verbose(env, - "Sleepable programs can only use preallocated hash maps\n"); + "Sleepable programs can only use preallocated maps\n"); return -EINVAL; } break; -- cgit v1.2.3 From b220c049d5196dd94d992dd2dc8cba1a5e6123bf Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Wed, 10 Feb 2021 11:53:22 -0500 Subject: tracing: Check length before giving out the filter buffer When filters are used by trace events, a page is allocated on each CPU and used to copy the trace event fields to this page before writing to the ring buffer. The reason to use the filter and not write directly into the ring buffer is because a filter may discard the event and there's more overhead on discarding from the ring buffer than the extra copy. The problem here is that there is no check against the size being allocated when using this page. If an event asks for more than a page size while being filtered, it will get only a page, leading to the caller writing more that what was allocated. Check the length of the request, and if it is more than PAGE_SIZE minus the header default back to allocating from the ring buffer directly. The ring buffer may reject the event if its too big anyway, but it wont overflow. 
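[Editorial note] Restated for clarity with an illustrative helper, not the kernel function: the per-CPU buffer handed out here is a single page that must also hold the entry header, so the new test simply refuses anything that cannot fit and lets it take the regular ring-buffer reservation path.

    static bool fits_in_filter_page(unsigned long len, unsigned long hdr)
    {
            /* mirrors: len < PAGE_SIZE - sizeof(*entry) */
            return len < PAGE_SIZE - hdr;
    }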
Link: https://lore.kernel.org/ath10k/1612839593-2308-1-git-send-email-wgong@codeaurora.org/ Cc: stable@vger.kernel.org Fixes: 0fc1b09ff1ff4 ("tracing: Use temp buffer when filtering events") Reported-by: Wen Gong Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index b8a2d786b503..b5815a022ecc 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2745,7 +2745,7 @@ trace_event_buffer_lock_reserve(struct trace_buffer **current_rb, (entry = this_cpu_read(trace_buffered_event))) { /* Try to use the per cpu buffer first */ val = this_cpu_inc_return(trace_buffered_event_cnt); - if (val == 1) { + if ((len < (PAGE_SIZE - sizeof(*entry))) && val == 1) { trace_event_setup(entry, type, flags, pc); entry->array[0] = len; return entry; -- cgit v1.2.3 From c5dbb89fc2ac013afe67b9e4fcb3743c02b567cd Mon Sep 17 00:00:00 2001 From: Florent Revest Date: Wed, 10 Feb 2021 12:14:03 +0100 Subject: bpf: Expose bpf_get_socket_cookie to tracing programs This needs a new helper that: - can work in a sleepable context (using sock_gen_cookie) - takes a struct sock pointer and checks that it's not NULL Signed-off-by: Florent Revest Signed-off-by: Alexei Starovoitov Acked-by: KP Singh Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210210111406.785541-2-revest@chromium.org --- include/linux/bpf.h | 1 + include/uapi/linux/bpf.h | 8 ++++++++ kernel/trace/bpf_trace.c | 2 ++ net/core/filter.c | 12 ++++++++++++ tools/include/uapi/linux/bpf.h | 8 ++++++++ 5 files changed, 31 insertions(+) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 1c8ea682c007..cccaef1088ea 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1885,6 +1885,7 @@ extern const struct bpf_func_proto bpf_per_cpu_ptr_proto; extern const struct bpf_func_proto bpf_this_cpu_ptr_proto; extern const struct bpf_func_proto bpf_ktime_get_coarse_ns_proto; extern const struct bpf_func_proto bpf_sock_from_file_proto; +extern const struct bpf_func_proto bpf_get_socket_ptr_cookie_proto; const struct bpf_func_proto *bpf_tracing_func_proto( enum bpf_func_id func_id, const struct bpf_prog *prog); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index dbf10bf08582..07cc2e404291 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1673,6 +1673,14 @@ union bpf_attr { * Return * A 8-byte long unique number. * + * u64 bpf_get_socket_cookie(struct sock *sk) + * Description + * Equivalent to **bpf_get_socket_cookie**\ () helper that accepts + * *sk*, but gets socket from a BTF **struct sock**. This helper + * also works for sleepable programs. + * Return + * A 8-byte long unique number or 0 if *sk* is NULL. + * * u32 bpf_get_socket_uid(struct sk_buff *skb) * Return * The owner UID of the socket associated to *skb*. If the socket diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 6c0018abe68a..845b2168e006 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1760,6 +1760,8 @@ tracing_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_sk_storage_delete_tracing_proto; case BPF_FUNC_sock_from_file: return &bpf_sock_from_file_proto; + case BPF_FUNC_get_socket_cookie: + return &bpf_get_socket_ptr_cookie_proto; #endif case BPF_FUNC_seq_printf: return prog->expected_attach_type == BPF_TRACE_ITER ? 
diff --git a/net/core/filter.c b/net/core/filter.c index e15d4741719a..57aaed478362 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4631,6 +4631,18 @@ static const struct bpf_func_proto bpf_get_socket_cookie_sock_proto = { .arg1_type = ARG_PTR_TO_CTX, }; +BPF_CALL_1(bpf_get_socket_ptr_cookie, struct sock *, sk) +{ + return sk ? sock_gen_cookie(sk) : 0; +} + +const struct bpf_func_proto bpf_get_socket_ptr_cookie_proto = { + .func = bpf_get_socket_ptr_cookie, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_BTF_ID_SOCK_COMMON, +}; + BPF_CALL_1(bpf_get_socket_cookie_sock_ops, struct bpf_sock_ops_kern *, ctx) { return __sock_gen_cookie(ctx->sk); diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index dbf10bf08582..07cc2e404291 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -1673,6 +1673,14 @@ union bpf_attr { * Return * A 8-byte long unique number. * + * u64 bpf_get_socket_cookie(struct sock *sk) + * Description + * Equivalent to **bpf_get_socket_cookie**\ () helper that accepts + * *sk*, but gets socket from a BTF **struct sock**. This helper + * also works for sleepable programs. + * Return + * A 8-byte long unique number or 0 if *sk* is NULL. + * * u32 bpf_get_socket_uid(struct sk_buff *skb) * Return * The owner UID of the socket associated to *skb*. If the socket -- cgit v1.2.3 From 1336c662474edec3966c96c8de026f794d16b804 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 11 Feb 2021 19:35:00 -0800 Subject: bpf: Clear per_cpu pointers during bpf_prog_realloc bpf_prog_realloc copies contents of struct bpf_prog. The pointers have to be cleared before freeing old struct. Reported-by: Ilya Leoshkevich Fixes: 700d4796ef59 ("bpf: Optimize program stats") Fixes: ca06f55b9002 ("bpf: Add per-program recursion prevention mechanism") Signed-off-by: Alexei Starovoitov --- kernel/bpf/core.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 334070c4b8a1..0ae015ad1e05 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -245,6 +245,8 @@ struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size, * reallocated structure. */ fp_old->aux = NULL; + fp_old->stats = NULL; + fp_old->active = NULL; __bpf_prog_free(fp_old); } -- cgit v1.2.3 From b2e37a7114ef52b862b4421ed4cd40c4ed2a0642 Mon Sep 17 00:00:00 2001 From: Ilya Leoshkevich Date: Wed, 10 Feb 2021 21:45:02 +0100 Subject: bpf: Fix subreg optimization for BPF_FETCH All 32-bit variants of BPF_FETCH (add, and, or, xor, xchg, cmpxchg) define a 32-bit subreg and thus have zext_dst set. Their encoding, however, uses dst_reg field as a base register, which causes opt_subreg_zext_lo32_rnd_hi32() to zero-extend said base register instead of the one the insn really defines (r0 or src_reg). Fix by properly choosing a register being defined, similar to how check_atomic() already does that. 
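[Editorial note] The register-selection rule the fix implements can be condensed into the following standalone helper; this is illustrative, the in-tree logic lives inside opt_subreg_zext_lo32_rnd_hi32() and additionally warns on atomic instructions without BPF_FETCH.

    #include <linux/bpf.h>

    static __u8 zext_target_reg(const struct bpf_insn *insn)
    {
            if (BPF_CLASS(insn->code) == BPF_STX &&
                BPF_MODE(insn->code) == BPF_ATOMIC)
                    /* 32-bit BPF_FETCH loads into src_reg, except BPF_CMPXCHG
                     * which always loads into R0; dst_reg is only the address.
                     */
                    return insn->imm == BPF_CMPXCHG ? BPF_REG_0 : insn->src_reg;

            return insn->dst_reg;   /* ordinary insns define dst_reg */
    }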
Signed-off-by: Ilya Leoshkevich Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20210210204502.83429-1-iii@linux.ibm.com --- kernel/bpf/verifier.c | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 15c15ea0abf5..beae700bb56e 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -10957,6 +10957,7 @@ static int opt_subreg_zext_lo32_rnd_hi32(struct bpf_verifier_env *env, for (i = 0; i < len; i++) { int adj_idx = i + delta; struct bpf_insn insn; + u8 load_reg; insn = insns[adj_idx]; if (!aux[adj_idx].zext_dst) { @@ -10999,9 +11000,27 @@ static int opt_subreg_zext_lo32_rnd_hi32(struct bpf_verifier_env *env, if (!bpf_jit_needs_zext()) continue; + /* zext_dst means that we want to zero-extend whatever register + * the insn defines, which is dst_reg most of the time, with + * the notable exception of BPF_STX + BPF_ATOMIC + BPF_FETCH. + */ + if (BPF_CLASS(insn.code) == BPF_STX && + BPF_MODE(insn.code) == BPF_ATOMIC) { + /* BPF_STX + BPF_ATOMIC insns without BPF_FETCH do not + * define any registers, therefore zext_dst cannot be + * set. + */ + if (WARN_ON(!(insn.imm & BPF_FETCH))) + return -EINVAL; + load_reg = insn.imm == BPF_CMPXCHG ? BPF_REG_0 + : insn.src_reg; + } else { + load_reg = insn.dst_reg; + } + zext_patch[0] = insn; - zext_patch[1].dst_reg = insn.dst_reg; - zext_patch[1].src_reg = insn.dst_reg; + zext_patch[1].dst_reg = load_reg; + zext_patch[1].src_reg = load_reg; patch = zext_patch; patch_len = 2; apply_patch_buffer: -- cgit v1.2.3 From 3af2f0aa2ed04f07975ba1242002b66cd53e6290 Mon Sep 17 00:00:00 2001 From: Lukasz Luba Date: Tue, 9 Feb 2021 10:54:34 +0000 Subject: PM: EM: update Kconfig description and drop "default n" option Energy Model supports now other devices like GPUs, DSPs, not only CPUs. Thus, update the description in the config option. Remove also unneeded "default n". If the "default" line is removed, it defaults to 'n'. Signed-off-by: Lukasz Luba Signed-off-by: Rafael J. Wysocki --- kernel/power/Kconfig | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index a7320f07689d..56dbc2616d5c 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -322,15 +322,14 @@ config CPU_PM bool config ENERGY_MODEL - bool "Energy Model for CPUs" + bool "Energy Model for devices with DVFS (CPUs, GPUs, etc)" depends on SMP depends on CPU_FREQ - default n help Several subsystems (thermal and/or the task scheduler for example) - can leverage information about the energy consumed by CPUs to make - smarter decisions. This config option enables the framework from - which subsystems can access the energy models. + can leverage information about the energy consumed by devices to + make smarter decisions. This config option enables the framework + from which subsystems can access the energy models. The exact usage of the energy model is subsystem-dependent. -- cgit v1.2.3 From c4cc3141b6f8e0097a03f6885cafac957421df9e Mon Sep 17 00:00:00 2001 From: Lukasz Luba Date: Tue, 9 Feb 2021 10:54:35 +0000 Subject: PM: Kconfig: remove unneeded "default n" options Remove "default n" options. If the "default" line is removed, it defaults to 'n'. Signed-off-by: Lukasz Luba Signed-off-by: Rafael J. 
Wysocki --- kernel/power/Kconfig | 3 --- 1 file changed, 3 deletions(-) (limited to 'kernel') diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 56dbc2616d5c..6bfe3ead10ad 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -139,7 +139,6 @@ config PM_SLEEP_SMP_NONZERO_CPU config PM_AUTOSLEEP bool "Opportunistic sleep" depends on PM_SLEEP - default n help Allow the kernel to trigger a system transition into a global sleep state automatically whenever there are no active wakeup sources. @@ -147,7 +146,6 @@ config PM_AUTOSLEEP config PM_WAKELOCKS bool "User space wakeup sources interface" depends on PM_SLEEP - default n help Allow user space to create, activate and deactivate wakeup source objects with the help of a sysfs-based interface. @@ -293,7 +291,6 @@ config PM_GENERIC_DOMAINS config WQ_POWER_EFFICIENT_DEFAULT bool "Enable workqueue power-efficient mode by default" depends on PM - default n help Per-cpu workqueues are generally preferred because they show better performance thanks to cache locality; unfortunately, -- cgit v1.2.3 From 1556057413a304b3020180240d798ec135d90844 Mon Sep 17 00:00:00 2001 From: Rikard Falkeborn Date: Mon, 1 Feb 2021 23:57:35 +0100 Subject: PM: sleep: Constify static struct attribute_group The only usage of suspend_attr_group is to put its address in an array of pointers to const attribute_group structs. Make it const to allow the compiler to put it into read-only memory. Signed-off-by: Rikard Falkeborn Signed-off-by: Rafael J. Wysocki --- kernel/power/main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/main.c b/kernel/power/main.c index 0aefd6f57e0a..12c7e1bb442f 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -387,7 +387,7 @@ static struct attribute *suspend_attrs[] = { NULL, }; -static struct attribute_group suspend_attr_group = { +static const struct attribute_group suspend_attr_group = { .name = "suspend_stats", .attrs = suspend_attrs, }; -- cgit v1.2.3 From 3a7b35b899dedd29468301a3cbc4fa48a49e2131 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Fri, 12 Feb 2021 10:31:05 -0800 Subject: bpf: Introduce task_vma bpf_iter Introduce task_vma bpf_iter to print memory information of a process. It can be used to print customized information similar to /proc//maps. Current /proc//maps and /proc//smaps provide information of vma's of a process. However, these information are not flexible enough to cover all use cases. For example, if a vma cover mixed 2MB pages and 4kB pages (x86_64), there is no easy way to tell which address ranges are backed by 2MB pages. task_vma solves the problem by enabling the user to generate customize information based on the vma (and vma->vm_mm, vma->vm_file, etc.). To access the vma safely in the BPF program, task_vma iterator holds target mmap_lock while calling the BPF program. If the mmap_lock is contended, task_vma unlocks mmap_lock between iterations to unblock the writer(s). This lock contention avoidance mechanism is similar to the one used in show_smaps_rollup(). 
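[Editorial note] A hedged example of a consumer, not part of the patch: a minimal task_vma iterator program printing one line per vma, loosely mirroring /proc/<pid>/maps. It assumes a vmlinux.h generated from a kernel that already carries this change, so that struct bpf_iter__task_vma is available.

    #include "vmlinux.h"
    #include <bpf/bpf_helpers.h>

    char _license[] SEC("license") = "GPL";

    SEC("iter/task_vma")
    int dump_task_vma(struct bpf_iter__task_vma *ctx)
    {
            struct seq_file *seq = ctx->meta->seq;
            struct vm_area_struct *vma = ctx->vma;
            struct task_struct *task = ctx->task;
            static const char fmt[] = "%d %lx-%lx\n";
            __u64 args[3];

            if (!vma || !task)      /* both are PTR_TO_BTF_ID_OR_NULL */
                    return 0;

            args[0] = task->pid;
            args[1] = vma->vm_start;
            args[2] = vma->vm_end;
            bpf_seq_printf(seq, fmt, sizeof(fmt), args, sizeof(args));
            return 0;
    }

Userspace consumes it through the usual bpf_iter flow, attaching a link and reading the iterator fd, the same way the existing task and task_file iterators are used.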
Signed-off-by: Song Liu Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20210212183107.50963-2-songliubraving@fb.com --- kernel/bpf/task_iter.c | 267 ++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 266 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c index 175b7b42bfc4..b68cb5d6d6eb 100644 --- a/kernel/bpf/task_iter.c +++ b/kernel/bpf/task_iter.c @@ -286,9 +286,248 @@ static const struct seq_operations task_file_seq_ops = { .show = task_file_seq_show, }; +struct bpf_iter_seq_task_vma_info { + /* The first field must be struct bpf_iter_seq_task_common. + * this is assumed by {init, fini}_seq_pidns() callback functions. + */ + struct bpf_iter_seq_task_common common; + struct task_struct *task; + struct vm_area_struct *vma; + u32 tid; + unsigned long prev_vm_start; + unsigned long prev_vm_end; +}; + +enum bpf_task_vma_iter_find_op { + task_vma_iter_first_vma, /* use mm->mmap */ + task_vma_iter_next_vma, /* use curr_vma->vm_next */ + task_vma_iter_find_vma, /* use find_vma() to find next vma */ +}; + +static struct vm_area_struct * +task_vma_seq_get_next(struct bpf_iter_seq_task_vma_info *info) +{ + struct pid_namespace *ns = info->common.ns; + enum bpf_task_vma_iter_find_op op; + struct vm_area_struct *curr_vma; + struct task_struct *curr_task; + u32 curr_tid = info->tid; + + /* If this function returns a non-NULL vma, it holds a reference to + * the task_struct, and holds read lock on vma->mm->mmap_lock. + * If this function returns NULL, it does not hold any reference or + * lock. + */ + if (info->task) { + curr_task = info->task; + curr_vma = info->vma; + /* In case of lock contention, drop mmap_lock to unblock + * the writer. + * + * After relock, call find(mm, prev_vm_end - 1) to find + * new vma to process. + * + * +------+------+-----------+ + * | VMA1 | VMA2 | VMA3 | + * +------+------+-----------+ + * | | | | + * 4k 8k 16k 400k + * + * For example, curr_vma == VMA2. Before unlock, we set + * + * prev_vm_start = 8k + * prev_vm_end = 16k + * + * There are a few cases: + * + * 1) VMA2 is freed, but VMA3 exists. + * + * find_vma() will return VMA3, just process VMA3. + * + * 2) VMA2 still exists. + * + * find_vma() will return VMA2, process VMA2->next. + * + * 3) no more vma in this mm. + * + * Process the next task. + * + * 4) find_vma() returns a different vma, VMA2'. + * + * 4.1) If VMA2 covers same range as VMA2', skip VMA2', + * because we already covered the range; + * 4.2) VMA2 and VMA2' covers different ranges, process + * VMA2'. + */ + if (mmap_lock_is_contended(curr_task->mm)) { + info->prev_vm_start = curr_vma->vm_start; + info->prev_vm_end = curr_vma->vm_end; + op = task_vma_iter_find_vma; + mmap_read_unlock(curr_task->mm); + if (mmap_read_lock_killable(curr_task->mm)) + goto finish; + } else { + op = task_vma_iter_next_vma; + } + } else { +again: + curr_task = task_seq_get_next(ns, &curr_tid, true); + if (!curr_task) { + info->tid = curr_tid + 1; + goto finish; + } + + if (curr_tid != info->tid) { + info->tid = curr_tid; + /* new task, process the first vma */ + op = task_vma_iter_first_vma; + } else { + /* Found the same tid, which means the user space + * finished data in previous buffer and read more. + * We dropped mmap_lock before returning to user + * space, so it is necessary to use find_vma() to + * find the next vma to process. 
+ */ + op = task_vma_iter_find_vma; + } + + if (!curr_task->mm) + goto next_task; + + if (mmap_read_lock_killable(curr_task->mm)) + goto finish; + } + + switch (op) { + case task_vma_iter_first_vma: + curr_vma = curr_task->mm->mmap; + break; + case task_vma_iter_next_vma: + curr_vma = curr_vma->vm_next; + break; + case task_vma_iter_find_vma: + /* We dropped mmap_lock so it is necessary to use find_vma + * to find the next vma. This is similar to the mechanism + * in show_smaps_rollup(). + */ + curr_vma = find_vma(curr_task->mm, info->prev_vm_end - 1); + /* case 1) and 4.2) above just use curr_vma */ + + /* check for case 2) or case 4.1) above */ + if (curr_vma && + curr_vma->vm_start == info->prev_vm_start && + curr_vma->vm_end == info->prev_vm_end) + curr_vma = curr_vma->vm_next; + break; + } + if (!curr_vma) { + /* case 3) above, or case 2) 4.1) with vma->next == NULL */ + mmap_read_unlock(curr_task->mm); + goto next_task; + } + info->task = curr_task; + info->vma = curr_vma; + return curr_vma; + +next_task: + put_task_struct(curr_task); + info->task = NULL; + curr_tid++; + goto again; + +finish: + if (curr_task) + put_task_struct(curr_task); + info->task = NULL; + info->vma = NULL; + return NULL; +} + +static void *task_vma_seq_start(struct seq_file *seq, loff_t *pos) +{ + struct bpf_iter_seq_task_vma_info *info = seq->private; + struct vm_area_struct *vma; + + vma = task_vma_seq_get_next(info); + if (vma && *pos == 0) + ++*pos; + + return vma; +} + +static void *task_vma_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct bpf_iter_seq_task_vma_info *info = seq->private; + + ++*pos; + return task_vma_seq_get_next(info); +} + +struct bpf_iter__task_vma { + __bpf_md_ptr(struct bpf_iter_meta *, meta); + __bpf_md_ptr(struct task_struct *, task); + __bpf_md_ptr(struct vm_area_struct *, vma); +}; + +DEFINE_BPF_ITER_FUNC(task_vma, struct bpf_iter_meta *meta, + struct task_struct *task, struct vm_area_struct *vma) + +static int __task_vma_seq_show(struct seq_file *seq, bool in_stop) +{ + struct bpf_iter_seq_task_vma_info *info = seq->private; + struct bpf_iter__task_vma ctx; + struct bpf_iter_meta meta; + struct bpf_prog *prog; + + meta.seq = seq; + prog = bpf_iter_get_info(&meta, in_stop); + if (!prog) + return 0; + + ctx.meta = &meta; + ctx.task = info->task; + ctx.vma = info->vma; + return bpf_iter_run_prog(prog, &ctx); +} + +static int task_vma_seq_show(struct seq_file *seq, void *v) +{ + return __task_vma_seq_show(seq, false); +} + +static void task_vma_seq_stop(struct seq_file *seq, void *v) +{ + struct bpf_iter_seq_task_vma_info *info = seq->private; + + if (!v) { + (void)__task_vma_seq_show(seq, true); + } else { + /* info->vma has not been seen by the BPF program. If the + * user space reads more, task_vma_seq_get_next should + * return this vma again. Set prev_vm_start to ~0UL, + * so that we don't skip the vma returned by the next + * find_vma() (case task_vma_iter_find_vma in + * task_vma_seq_get_next()). 
+ */ + info->prev_vm_start = ~0UL; + info->prev_vm_end = info->vma->vm_end; + mmap_read_unlock(info->task->mm); + put_task_struct(info->task); + info->task = NULL; + } +} + +static const struct seq_operations task_vma_seq_ops = { + .start = task_vma_seq_start, + .next = task_vma_seq_next, + .stop = task_vma_seq_stop, + .show = task_vma_seq_show, +}; + BTF_ID_LIST(btf_task_file_ids) BTF_ID(struct, task_struct) BTF_ID(struct, file) +BTF_ID(struct, vm_area_struct) static const struct bpf_iter_seq_info task_seq_info = { .seq_ops = &task_seq_ops, @@ -328,6 +567,26 @@ static struct bpf_iter_reg task_file_reg_info = { .seq_info = &task_file_seq_info, }; +static const struct bpf_iter_seq_info task_vma_seq_info = { + .seq_ops = &task_vma_seq_ops, + .init_seq_private = init_seq_pidns, + .fini_seq_private = fini_seq_pidns, + .seq_priv_size = sizeof(struct bpf_iter_seq_task_vma_info), +}; + +static struct bpf_iter_reg task_vma_reg_info = { + .target = "task_vma", + .feature = BPF_ITER_RESCHED, + .ctx_arg_info_size = 2, + .ctx_arg_info = { + { offsetof(struct bpf_iter__task_vma, task), + PTR_TO_BTF_ID_OR_NULL }, + { offsetof(struct bpf_iter__task_vma, vma), + PTR_TO_BTF_ID_OR_NULL }, + }, + .seq_info = &task_vma_seq_info, +}; + static int __init task_iter_init(void) { int ret; @@ -339,6 +598,12 @@ static int __init task_iter_init(void) task_file_reg_info.ctx_arg_info[0].btf_id = btf_task_file_ids[0]; task_file_reg_info.ctx_arg_info[1].btf_id = btf_task_file_ids[1]; - return bpf_iter_reg_target(&task_file_reg_info); + ret = bpf_iter_reg_target(&task_file_reg_info); + if (ret) + return ret; + + task_vma_reg_info.ctx_arg_info[0].btf_id = btf_task_file_ids[0]; + task_vma_reg_info.ctx_arg_info[1].btf_id = btf_task_file_ids[2]; + return bpf_iter_reg_target(&task_vma_reg_info); } late_initcall(task_iter_init); -- cgit v1.2.3 From 3d06f34aa89698f74e743b9ec023eafc19827cba Mon Sep 17 00:00:00 2001 From: Song Liu Date: Fri, 12 Feb 2021 10:31:06 -0800 Subject: bpf: Allow bpf_d_path in bpf_iter program task_file and task_vma iter programs have access to file->f_path. Enable bpf_d_path to print paths of these file. Signed-off-by: Song Liu Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20210212183107.50963-3-songliubraving@fb.com --- kernel/trace/bpf_trace.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 845b2168e006..0b9e4fd9c61b 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1191,6 +1191,10 @@ BTF_SET_END(btf_allowlist_d_path) static bool bpf_d_path_allowed(const struct bpf_prog *prog) { + if (prog->type == BPF_PROG_TYPE_TRACING && + prog->expected_attach_type == BPF_TRACE_ITER) + return true; + if (prog->type == BPF_PROG_TYPE_LSM) return bpf_lsm_is_sleepable_hook(prog->aux->attach_btf_id); -- cgit v1.2.3 From 17d8beda277a36203585943e70c7909b60775fd5 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Thu, 11 Feb 2021 16:59:26 -0800 Subject: bpf: Fix an unitialized value in bpf_iter Commit 15d83c4d7cef ("bpf: Allow loading of a bpf_iter program") cached btf_id in struct bpf_iter_target_info so later on if it can be checked cheaply compared to checking registered names. syzbot found a bug that uninitialized value may occur to bpf_iter_target_info->btf_id. This is because we allocated bpf_iter_target_info structure with kmalloc and never initialized field btf_id afterwards. 
This uninitialized btf_id is typically compared to a u32 bpf program func proto btf_id, and the chance of being equal is extremely slim. This patch fixed the issue by using kzalloc which will also prevent future likely instances due to adding new fields. Fixes: 15d83c4d7cef ("bpf: Allow loading of a bpf_iter program") Reported-by: syzbot+580f4f2a272e452d55cb@syzkaller.appspotmail.com Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20210212005926.2875002-1-yhs@fb.com --- kernel/bpf/bpf_iter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c index 5454161407f1..a0d9eade9c80 100644 --- a/kernel/bpf/bpf_iter.c +++ b/kernel/bpf/bpf_iter.c @@ -287,7 +287,7 @@ int bpf_iter_reg_target(const struct bpf_iter_reg *reg_info) { struct bpf_iter_target_info *tinfo; - tinfo = kmalloc(sizeof(*tinfo), GFP_KERNEL); + tinfo = kzalloc(sizeof(*tinfo), GFP_KERNEL); if (!tinfo) return -ENOMEM; -- cgit v1.2.3 From 7d4553b69fb335496c597c31590e982485ebe071 Mon Sep 17 00:00:00 2001 From: Jun'ichi Nomura Date: Tue, 9 Feb 2021 08:24:52 +0000 Subject: bpf, devmap: Use GFP_KERNEL for xdp bulk queue allocation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The devmap bulk queue is allocated with GFP_ATOMIC and the allocation may fail if there is no available space in existing percpu pool. Since commit 75ccae62cb8d42 ("xdp: Move devmap bulk queue into struct net_device") moved the bulk queue allocation to NETDEV_REGISTER callback, whose context is allowed to sleep, use GFP_KERNEL instead of GFP_ATOMIC to let percpu allocator extend the pool when needed and avoid possible failure of netdev registration. As the required alignment is natural, we can simply use alloc_percpu(). 
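[Editorial note] The equivalence relied on by the last sentence, written out for illustration in kernel context:

    struct xdp_dev_bulk_queue __percpu *q;

    /* alloc_percpu() expands to a GFP_KERNEL per-cpu allocation at the
     * type's natural alignment ...
     */
    q = alloc_percpu(struct xdp_dev_bulk_queue);

    /* ... i.e. the spelled-out form with the sleeping allocation explicit: */
    q = __alloc_percpu_gfp(sizeof(struct xdp_dev_bulk_queue),
                           __alignof__(struct xdp_dev_bulk_queue), GFP_KERNEL);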
Fixes: 75ccae62cb8d42 ("xdp: Move devmap bulk queue into struct net_device") Signed-off-by: Jun'ichi Nomura Signed-off-by: Daniel Borkmann Cc: Toke Høiland-Jørgensen Link: https://lore.kernel.org/bpf/20210209082451.GA44021@jeru.linux.bs1.fc.nec.co.jp --- kernel/bpf/devmap.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index f6e9c68afdd4..85d9d1b72a33 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -802,9 +802,7 @@ static int dev_map_notification(struct notifier_block *notifier, break; /* will be freed in free_netdev() */ - netdev->xdp_bulkq = - __alloc_percpu_gfp(sizeof(struct xdp_dev_bulk_queue), - sizeof(void *), GFP_ATOMIC); + netdev->xdp_bulkq = alloc_percpu(struct xdp_dev_bulk_queue); if (!netdev->xdp_bulkq) return NOTIFY_BAD; -- cgit v1.2.3 From 9b00f1b78809309163dda2d044d9e94a3c0248a3 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 10 Feb 2021 14:14:42 +0100 Subject: bpf: Fix truncation handling for mod32 dst reg wrt zero Recently noticed that when mod32 with a known src reg of 0 is performed, then the dst register is 32-bit truncated in verifier: 0: R1=ctx(id=0,off=0,imm=0) R10=fp0 0: (b7) r0 = 0 1: R0_w=inv0 R1=ctx(id=0,off=0,imm=0) R10=fp0 1: (b7) r1 = -1 2: R0_w=inv0 R1_w=inv-1 R10=fp0 2: (b4) w2 = -1 3: R0_w=inv0 R1_w=inv-1 R2_w=inv4294967295 R10=fp0 3: (9c) w1 %= w0 4: R0_w=inv0 R1_w=inv(id=0,umax_value=4294967295,var_off=(0x0; 0xffffffff)) R2_w=inv4294967295 R10=fp0 4: (b7) r0 = 1 5: R0_w=inv1 R1_w=inv(id=0,umax_value=4294967295,var_off=(0x0; 0xffffffff)) R2_w=inv4294967295 R10=fp0 5: (1d) if r1 == r2 goto pc+1 R0_w=inv1 R1_w=inv(id=0,umax_value=4294967295,var_off=(0x0; 0xffffffff)) R2_w=inv4294967295 R10=fp0 6: R0_w=inv1 R1_w=inv(id=0,umax_value=4294967295,var_off=(0x0; 0xffffffff)) R2_w=inv4294967295 R10=fp0 6: (b7) r0 = 2 7: R0_w=inv2 R1_w=inv(id=0,umax_value=4294967295,var_off=(0x0; 0xffffffff)) R2_w=inv4294967295 R10=fp0 7: (95) exit 7: R0=inv1 R1=inv(id=0,umin_value=4294967295,umax_value=4294967295,var_off=(0x0; 0xffffffff)) R2=inv4294967295 R10=fp0 7: (95) exit However, as a runtime result, we get 2 instead of 1, meaning the dst register does not contain (u32)-1 in this case. The reason is fairly straight forward given the 0 test leaves the dst register as-is: # ./bpftool p d x i 23 0: (b7) r0 = 0 1: (b7) r1 = -1 2: (b4) w2 = -1 3: (16) if w0 == 0x0 goto pc+1 4: (9c) w1 %= w0 5: (b7) r0 = 1 6: (1d) if r1 == r2 goto pc+1 7: (b7) r0 = 2 8: (95) exit This was originally not an issue given the dst register was marked as completely unknown (aka 64 bit unknown). However, after 468f6eafa6c4 ("bpf: fix 32-bit ALU op verification") the verifier casts the register output to 32 bit, and hence it becomes 32 bit unknown. Note that for the case where the src register is unknown, the dst register is marked 64 bit unknown. After the fix, the register is truncated by the runtime and the test passes: # ./bpftool p d x i 23 0: (b7) r0 = 0 1: (b7) r1 = -1 2: (b4) w2 = -1 3: (16) if w0 == 0x0 goto pc+2 4: (9c) w1 %= w0 5: (05) goto pc+1 6: (bc) w1 = w1 7: (b7) r0 = 1 8: (1d) if r1 == r2 goto pc+1 9: (b7) r0 = 2 10: (95) exit Semantics also match with {R,W}x mod{64,32} 0 -> {R,W}x. Invalid div has always been {R,W}x div{64,32} 0 -> 0. 
Rewrites are as follows: mod32: mod64: (16) if w0 == 0x0 goto pc+2 (15) if r0 == 0x0 goto pc+1 (9c) w1 %= w0 (9f) r1 %= r0 (05) goto pc+1 (bc) w1 = w1 Fixes: 468f6eafa6c4 ("bpf: fix 32-bit ALU op verification") Signed-off-by: Daniel Borkmann Reviewed-by: John Fastabend Acked-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 37581919e050..20babdd06278 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -11006,7 +11006,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) bool isdiv = BPF_OP(insn->code) == BPF_DIV; struct bpf_insn *patchlet; struct bpf_insn chk_and_div[] = { - /* Rx div 0 -> 0 */ + /* [R,W]x div 0 -> 0 */ BPF_RAW_INSN((is64 ? BPF_JMP : BPF_JMP32) | BPF_JNE | BPF_K, insn->src_reg, 0, 2, 0), @@ -11015,16 +11015,18 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) *insn, }; struct bpf_insn chk_and_mod[] = { - /* Rx mod 0 -> Rx */ + /* [R,W]x mod 0 -> [R,W]x */ BPF_RAW_INSN((is64 ? BPF_JMP : BPF_JMP32) | BPF_JEQ | BPF_K, insn->src_reg, - 0, 1, 0), + 0, 1 + (is64 ? 0 : 1), 0), *insn, + BPF_JMP_IMM(BPF_JA, 0, 0, 1), + BPF_MOV32_REG(insn->dst_reg, insn->dst_reg), }; patchlet = isdiv ? chk_and_div : chk_and_mod; cnt = isdiv ? ARRAY_SIZE(chk_and_div) : - ARRAY_SIZE(chk_and_mod); + ARRAY_SIZE(chk_and_mod) - (is64 ? 2 : 0); new_prog = bpf_patch_insn_data(env, i + delta, patchlet, cnt); if (!new_prog) -- cgit v1.2.3 From feb4adfad575c1e27cbfaa3462f376c13da36942 Mon Sep 17 00:00:00 2001 From: Dmitrii Banshchikov Date: Sat, 13 Feb 2021 00:56:39 +0400 Subject: bpf: Rename bpf_reg_state variables Using "reg" for an array of bpf_reg_state and "reg[i + 1]" for an individual bpf_reg_state is error-prone and verbose. Use "regs" for the former and "reg" for the latter as other code nearby does. Signed-off-by: Dmitrii Banshchikov Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210212205642.620788-2-me@ubique.spb.ru --- kernel/bpf/btf.c | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 756a93f534b6..bd5d2c563693 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -5291,7 +5291,7 @@ int btf_check_type_match(struct bpf_verifier_log *log, const struct bpf_prog *pr * Only PTR_TO_CTX and SCALAR_VALUE states are recognized. */ int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog, - struct bpf_reg_state *reg) + struct bpf_reg_state *regs) { struct bpf_verifier_log *log = &env->log; struct bpf_prog *prog = env->prog; @@ -5337,17 +5337,19 @@ int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog, * verifier sees. */ for (i = 0; i < nargs; i++) { + struct bpf_reg_state *reg = ®s[i + 1]; + t = btf_type_by_id(btf, args[i].type); while (btf_type_is_modifier(t)) t = btf_type_by_id(btf, t->type); if (btf_type_is_int(t) || btf_type_is_enum(t)) { - if (reg[i + 1].type == SCALAR_VALUE) + if (reg->type == SCALAR_VALUE) continue; bpf_log(log, "R%d is not a scalar\n", i + 1); goto out; } if (btf_type_is_ptr(t)) { - if (reg[i + 1].type == SCALAR_VALUE) { + if (reg->type == SCALAR_VALUE) { bpf_log(log, "R%d is not a pointer\n", i + 1); goto out; } @@ -5355,13 +5357,13 @@ int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog, * is passing PTR_TO_CTX. 
*/ if (btf_get_prog_ctx_type(log, btf, t, prog->type, i)) { - if (reg[i + 1].type != PTR_TO_CTX) { + if (reg->type != PTR_TO_CTX) { bpf_log(log, "arg#%d expected pointer to ctx, but got %s\n", i, btf_kind_str[BTF_INFO_KIND(t->info)]); goto out; } - if (check_ctx_reg(env, ®[i + 1], i + 1)) + if (check_ctx_reg(env, reg, i + 1)) goto out; continue; } @@ -5388,7 +5390,7 @@ out: * (either PTR_TO_CTX or SCALAR_VALUE). */ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog, - struct bpf_reg_state *reg) + struct bpf_reg_state *regs) { struct bpf_verifier_log *log = &env->log; struct bpf_prog *prog = env->prog; @@ -5459,16 +5461,18 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog, * Only PTR_TO_CTX and SCALAR are supported atm. */ for (i = 0; i < nargs; i++) { + struct bpf_reg_state *reg = ®s[i + 1]; + t = btf_type_by_id(btf, args[i].type); while (btf_type_is_modifier(t)) t = btf_type_by_id(btf, t->type); if (btf_type_is_int(t) || btf_type_is_enum(t)) { - reg[i + 1].type = SCALAR_VALUE; + reg->type = SCALAR_VALUE; continue; } if (btf_type_is_ptr(t) && btf_get_prog_ctx_type(log, btf, t, prog_type, i)) { - reg[i + 1].type = PTR_TO_CTX; + reg->type = PTR_TO_CTX; continue; } bpf_log(log, "Arg#%d type %s in %s() is not supported yet.\n", -- cgit v1.2.3 From 4ddb74165ae580b6dcbb5ab1919d994fc8d03c3f Mon Sep 17 00:00:00 2001 From: Dmitrii Banshchikov Date: Sat, 13 Feb 2021 00:56:40 +0400 Subject: bpf: Extract nullable reg type conversion into a helper function Extract conversion from a register's nullable type to a type with a value. The helper will be used in mark_ptr_not_null_reg(). Signed-off-by: Dmitrii Banshchikov Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210212205642.620788-3-me@ubique.spb.ru --- kernel/bpf/verifier.c | 83 ++++++++++++++++++++++++++++++++------------------- 1 file changed, 52 insertions(+), 31 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index beae700bb56e..a9f75bd7f8d3 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1079,6 +1079,51 @@ static void mark_reg_known_zero(struct bpf_verifier_env *env, __mark_reg_known_zero(regs + regno); } +static void mark_ptr_not_null_reg(struct bpf_reg_state *reg) +{ + switch (reg->type) { + case PTR_TO_MAP_VALUE_OR_NULL: { + const struct bpf_map *map = reg->map_ptr; + + if (map->inner_map_meta) { + reg->type = CONST_PTR_TO_MAP; + reg->map_ptr = map->inner_map_meta; + } else if (map->map_type == BPF_MAP_TYPE_XSKMAP) { + reg->type = PTR_TO_XDP_SOCK; + } else if (map->map_type == BPF_MAP_TYPE_SOCKMAP || + map->map_type == BPF_MAP_TYPE_SOCKHASH) { + reg->type = PTR_TO_SOCKET; + } else { + reg->type = PTR_TO_MAP_VALUE; + } + break; + } + case PTR_TO_SOCKET_OR_NULL: + reg->type = PTR_TO_SOCKET; + break; + case PTR_TO_SOCK_COMMON_OR_NULL: + reg->type = PTR_TO_SOCK_COMMON; + break; + case PTR_TO_TCP_SOCK_OR_NULL: + reg->type = PTR_TO_TCP_SOCK; + break; + case PTR_TO_BTF_ID_OR_NULL: + reg->type = PTR_TO_BTF_ID; + break; + case PTR_TO_MEM_OR_NULL: + reg->type = PTR_TO_MEM; + break; + case PTR_TO_RDONLY_BUF_OR_NULL: + reg->type = PTR_TO_RDONLY_BUF; + break; + case PTR_TO_RDWR_BUF_OR_NULL: + reg->type = PTR_TO_RDWR_BUF; + break; + default: + WARN_ON("unknown nullable register type"); + } +} + static bool reg_is_pkt_pointer(const struct bpf_reg_state *reg) { return type_is_pkt_pointer(reg->type); @@ -7737,43 +7782,19 @@ static void mark_ptr_or_null_reg(struct bpf_func_state *state, } if (is_null) { 
reg->type = SCALAR_VALUE; - } else if (reg->type == PTR_TO_MAP_VALUE_OR_NULL) { - const struct bpf_map *map = reg->map_ptr; - - if (map->inner_map_meta) { - reg->type = CONST_PTR_TO_MAP; - reg->map_ptr = map->inner_map_meta; - } else if (map->map_type == BPF_MAP_TYPE_XSKMAP) { - reg->type = PTR_TO_XDP_SOCK; - } else if (map->map_type == BPF_MAP_TYPE_SOCKMAP || - map->map_type == BPF_MAP_TYPE_SOCKHASH) { - reg->type = PTR_TO_SOCKET; - } else { - reg->type = PTR_TO_MAP_VALUE; - } - } else if (reg->type == PTR_TO_SOCKET_OR_NULL) { - reg->type = PTR_TO_SOCKET; - } else if (reg->type == PTR_TO_SOCK_COMMON_OR_NULL) { - reg->type = PTR_TO_SOCK_COMMON; - } else if (reg->type == PTR_TO_TCP_SOCK_OR_NULL) { - reg->type = PTR_TO_TCP_SOCK; - } else if (reg->type == PTR_TO_BTF_ID_OR_NULL) { - reg->type = PTR_TO_BTF_ID; - } else if (reg->type == PTR_TO_MEM_OR_NULL) { - reg->type = PTR_TO_MEM; - } else if (reg->type == PTR_TO_RDONLY_BUF_OR_NULL) { - reg->type = PTR_TO_RDONLY_BUF; - } else if (reg->type == PTR_TO_RDWR_BUF_OR_NULL) { - reg->type = PTR_TO_RDWR_BUF; - } - if (is_null) { /* We don't need id and ref_obj_id from this point * onwards anymore, thus we should better reset it, * so that state pruning has chances to take effect. */ reg->id = 0; reg->ref_obj_id = 0; - } else if (!reg_may_point_to_spin_lock(reg)) { + + return; + } + + mark_ptr_not_null_reg(reg); + + if (!reg_may_point_to_spin_lock(reg)) { /* For not-NULL ptr, reg->ref_obj_id will be reset * in release_reg_references(). * -- cgit v1.2.3 From e5069b9c23b3857db986c58801bebe450cff3392 Mon Sep 17 00:00:00 2001 From: Dmitrii Banshchikov Date: Sat, 13 Feb 2021 00:56:41 +0400 Subject: bpf: Support pointers in global func args Add an ability to pass a pointer to a type with known size in arguments of a global function. Such pointers may be used to overcome the limit on the maximum number of arguments, avoid expensive and tricky workarounds and to have multiple output arguments. A referenced type may contain pointers but indirect access through them isn't supported. The implementation consists of two parts. If a global function has an argument that is a pointer to a type with known size then: 1) In btf_check_func_arg_match(): check that the corresponding register points to NULL or to a valid memory region that is large enough to contain the expected argument's type. 2) In btf_prepare_func_args(): set the corresponding register type to PTR_TO_MEM_OR_NULL and its size to the size of the expected type. Only global functions are supported because allowance of pointers for static functions might break validation. Consider the following scenario. A static function has a pointer argument. A caller passes pointer to its stack memory. Because the callee can change referenced memory verifier cannot longer assume any particular slot type of the caller's stack memory hence the slot type is changed to SLOT_MISC. If there is an operation that relies on slot type other than SLOT_MISC then verifier won't be able to infer safety of the operation. When verifier sees a static function that has a pointer argument different from PTR_TO_CTX then it skips arguments check and continues with "inline" validation with more information available. The operation that relies on the particular slot type now succeeds. Because global functions were not allowed to have pointer arguments different from PTR_TO_CTX it's not possible to break existing and valid code. 
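As a rough illustration of what this enables, here is a hypothetical BPF C sketch (not part of the patch; the struct, function and section names are made up, and the usual libbpf conventions such as SEC() and __noinline from bpf_helpers.h are assumed). A global function may now take a pointer to a fixed-size type and use it as an output argument; because the argument is PTR_TO_MEM_OR_NULL, the callee has to tolerate NULL:

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

struct stats {
        __u64 packets;
        __u64 bytes;
};

/* Global (non-static) function: verified separately from its callers. */
__noinline int fill_stats(struct stats *s, __u64 bytes)
{
        if (!s)                 /* the pointer argument may be NULL */
                return 0;
        s->packets += 1;
        s->bytes += bytes;
        return 1;
}

SEC("tc")
int count_packet(struct __sk_buff *skb)
{
        struct stats s = {};    /* initialized stack memory passed by pointer */

        return fill_stats(&s, skb->len);
}

char LICENSE[] SEC("license") = "GPL";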
Signed-off-by: Dmitrii Banshchikov Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210212205642.620788-4-me@ubique.spb.ru --- include/linux/bpf_verifier.h | 2 ++ kernel/bpf/btf.c | 55 ++++++++++++++++++++++++++++++++++++-------- kernel/bpf/verifier.c | 30 ++++++++++++++++++++++++ 3 files changed, 77 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 532c97836d0d..971b33aca13d 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -471,6 +471,8 @@ bpf_prog_offload_remove_insns(struct bpf_verifier_env *env, u32 off, u32 cnt); int check_ctx_reg(struct bpf_verifier_env *env, const struct bpf_reg_state *reg, int regno); +int check_mem_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg, + u32 regno, u32 mem_size); /* this lives here instead of in bpf.h because it needs to dereference tgt_prog */ static inline u64 bpf_trampoline_compute_key(const struct bpf_prog *tgt_prog, diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index bd5d2c563693..2efeb5f4b343 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -5297,9 +5297,10 @@ int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog, struct bpf_prog *prog = env->prog; struct btf *btf = prog->aux->btf; const struct btf_param *args; - const struct btf_type *t; - u32 i, nargs, btf_id; + const struct btf_type *t, *ref_t; + u32 i, nargs, btf_id, type_size; const char *tname; + bool is_global; if (!prog->aux->func_info) return -EINVAL; @@ -5333,6 +5334,8 @@ int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog, bpf_log(log, "Function %s has %d > 5 args\n", tname, nargs); goto out; } + + is_global = prog->aux->func_info_aux[subprog].linkage == BTF_FUNC_GLOBAL; /* check that BTF function arguments match actual types that the * verifier sees. */ @@ -5349,10 +5352,6 @@ int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog, goto out; } if (btf_type_is_ptr(t)) { - if (reg->type == SCALAR_VALUE) { - bpf_log(log, "R%d is not a pointer\n", i + 1); - goto out; - } /* If function expects ctx type in BTF check that caller * is passing PTR_TO_CTX. 
*/ @@ -5367,6 +5366,25 @@ int btf_check_func_arg_match(struct bpf_verifier_env *env, int subprog, goto out; continue; } + + if (!is_global) + goto out; + + t = btf_type_skip_modifiers(btf, t->type, NULL); + + ref_t = btf_resolve_size(btf, t, &type_size); + if (IS_ERR(ref_t)) { + bpf_log(log, + "arg#%d reference type('%s %s') size cannot be determined: %ld\n", + i, btf_type_str(t), btf_name_by_offset(btf, t->name_off), + PTR_ERR(ref_t)); + goto out; + } + + if (check_mem_reg(env, reg, i + 1, type_size)) + goto out; + + continue; } bpf_log(log, "Unrecognized arg#%d type %s\n", i, btf_kind_str[BTF_INFO_KIND(t->info)]); @@ -5397,7 +5415,7 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog, enum bpf_prog_type prog_type = prog->type; struct btf *btf = prog->aux->btf; const struct btf_param *args; - const struct btf_type *t; + const struct btf_type *t, *ref_t; u32 i, nargs, btf_id; const char *tname; @@ -5470,9 +5488,26 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog, reg->type = SCALAR_VALUE; continue; } - if (btf_type_is_ptr(t) && - btf_get_prog_ctx_type(log, btf, t, prog_type, i)) { - reg->type = PTR_TO_CTX; + if (btf_type_is_ptr(t)) { + if (btf_get_prog_ctx_type(log, btf, t, prog_type, i)) { + reg->type = PTR_TO_CTX; + continue; + } + + t = btf_type_skip_modifiers(btf, t->type, NULL); + + ref_t = btf_resolve_size(btf, t, ®->mem_size); + if (IS_ERR(ref_t)) { + bpf_log(log, + "arg#%d reference type('%s %s') size cannot be determined: %ld\n", + i, btf_type_str(t), btf_name_by_offset(btf, t->name_off), + PTR_ERR(ref_t)); + return -EINVAL; + } + + reg->type = PTR_TO_MEM_OR_NULL; + reg->id = ++env->id_gen; + continue; } bpf_log(log, "Arg#%d type %s in %s() is not supported yet.\n", diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index a9f75bd7f8d3..11a242932a2c 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -4272,6 +4272,29 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, } } +int check_mem_reg(struct bpf_verifier_env *env, struct bpf_reg_state *reg, + u32 regno, u32 mem_size) +{ + if (register_is_null(reg)) + return 0; + + if (reg_type_may_be_null(reg->type)) { + /* Assuming that the register contains a value check if the memory + * access is safe. Temporarily save and restore the register's state as + * the conversion shouldn't be visible to a caller. + */ + const struct bpf_reg_state saved_reg = *reg; + int rv; + + mark_ptr_not_null_reg(reg); + rv = check_helper_mem_access(env, regno, mem_size, true, NULL); + *reg = saved_reg; + return rv; + } + + return check_helper_mem_access(env, regno, mem_size, true, NULL); +} + /* Implementation details: * bpf_map_lookup returns PTR_TO_MAP_VALUE_OR_NULL * Two bpf_map_lookups (even with the same key) will have different reg->id. @@ -11960,6 +11983,13 @@ static int do_check_common(struct bpf_verifier_env *env, int subprog) mark_reg_known_zero(env, regs, i); else if (regs[i].type == SCALAR_VALUE) mark_reg_unknown(env, regs, i); + else if (regs[i].type == PTR_TO_MEM_OR_NULL) { + const u32 mem_size = regs[i].mem_size; + + mark_reg_known_zero(env, regs, i); + regs[i].mem_size = mem_size; + regs[i].id = ++env->id_gen; + } } } else { /* 1st arg to a function */ -- cgit v1.2.3 From 45159b27637b0fef6d5ddb86fc7c46b13c77960f Mon Sep 17 00:00:00 2001 From: Ilya Leoshkevich Date: Fri, 12 Feb 2021 05:04:08 +0100 Subject: bpf: Clear subreg_def for global function return values test_global_func4 fails on s390 as reported by Yauheni in [1]. 
The immediate problem is that the zext code includes the instruction, whose result needs to be zero-extended, into the zero-extension patchlet, and if this instruction happens to be a branch, then its delta is not adjusted. As a result, the verifier rejects the program later. However, according to [2], as far as the verifier's algorithm is concerned and as specified by the insn_no_def() function, branching insns do not define anything. This includes call insns, even though one might argue that they define %r0. This means that the real problem is that zero extension kicks in at all. This happens because clear_caller_saved_regs() sets BPF_REG_0's subreg_def after global function calls. This can be fixed in many ways; this patch mimics what helper function call handling already does. [1] https://lore.kernel.org/bpf/20200903140542.156624-1-yauheni.kaliuta@redhat.com/ [2] https://lore.kernel.org/bpf/CAADnVQ+2RPKcftZw8d+B1UwB35cpBhpF5u3OocNh90D9pETPwg@mail.gmail.com/ Fixes: 51c39bb1d5d1 ("bpf: Introduce function-by-function verification") Reported-by: Yauheni Kaliuta Signed-off-by: Ilya Leoshkevich Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20210212040408.90109-1-iii@linux.ibm.com --- kernel/bpf/verifier.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 11a242932a2c..16ba43352a5f 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5279,8 +5279,9 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn, subprog); clear_caller_saved_regs(env, caller->regs); - /* All global functions return SCALAR_VALUE */ + /* All global functions return a 64-bit SCALAR_VALUE */ mark_reg_unknown(env, caller->regs, BPF_REG_0); + caller->regs[BPF_REG_0].subreg_def = DEF_NOT_SUBREG; /* continue with next insn after call */ return 0; -- cgit v1.2.3 From 6cd56ef1df399a004f90ecb682427f9964969fc9 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 25 Jan 2021 08:59:08 +0000 Subject: sched/fair: Remove select_idle_smt() In order to make the next patch more readable, and to quantify the actual effectiveness of this pass, start by removing it. Signed-off-by: Mel Gorman Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Reviewed-by: Vincent Guittot Link: https://lkml.kernel.org/r/20210125085909.4600-4-mgorman@techsingularity.net --- kernel/sched/fair.c | 30 ------------------------------ 1 file changed, 30 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 4c18ef6c1542..6a0fc8a2dc67 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6114,27 +6114,6 @@ static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int return -1; } -/* - * Scan the local SMT mask for idle CPUs. 
- */ -static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target) -{ - int cpu; - - if (!static_branch_likely(&sched_smt_present)) - return -1; - - for_each_cpu(cpu, cpu_smt_mask(target)) { - if (!cpumask_test_cpu(cpu, p->cpus_ptr) || - !cpumask_test_cpu(cpu, sched_domain_span(sd))) - continue; - if (available_idle_cpu(cpu) || sched_idle_cpu(cpu)) - return cpu; - } - - return -1; -} - #else /* CONFIG_SCHED_SMT */ static inline int select_idle_core(struct task_struct *p, struct sched_domain *sd, int target) @@ -6142,11 +6121,6 @@ static inline int select_idle_core(struct task_struct *p, struct sched_domain *s return -1; } -static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target) -{ - return -1; -} - #endif /* CONFIG_SCHED_SMT */ /* @@ -6336,10 +6310,6 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) if ((unsigned)i < nr_cpumask_bits) return i; - i = select_idle_smt(p, sd, target); - if ((unsigned)i < nr_cpumask_bits) - return i; - return target; } -- cgit v1.2.3 From 9fe1f127b913318c631d0041ecf71486e38c2c2d Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 27 Jan 2021 13:52:03 +0000 Subject: sched/fair: Merge select_idle_core/cpu() Both select_idle_core() and select_idle_cpu() do a loop over the same cpumask. Observe that by clearing the already visited CPUs, we can fold the iteration and iterate a core at a time. All we need to do is remember any non-idle CPU we encountered while scanning for an idle core. This way we'll only iterate every CPU once. Signed-off-by: Mel Gorman Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Reviewed-by: Vincent Guittot Link: https://lkml.kernel.org/r/20210127135203.19633-5-mgorman@techsingularity.net --- kernel/sched/fair.c | 99 +++++++++++++++++++++++++++++++---------------------- 1 file changed, 59 insertions(+), 40 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 6a0fc8a2dc67..c73d5883b940 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6019,6 +6019,14 @@ static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p return new_cpu; } +static inline int __select_idle_cpu(int cpu) +{ + if (available_idle_cpu(cpu) || sched_idle_cpu(cpu)) + return cpu; + + return -1; +} + #ifdef CONFIG_SCHED_SMT DEFINE_STATIC_KEY_FALSE(sched_smt_present); EXPORT_SYMBOL_GPL(sched_smt_present); @@ -6077,48 +6085,51 @@ unlock: * there are no idle cores left in the system; tracked through * sd_llc->shared->has_idle_cores and enabled through update_idle_core() above. 
*/ -static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int target) +static int select_idle_core(struct task_struct *p, int core, struct cpumask *cpus, int *idle_cpu) { - struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask); - int core, cpu; + bool idle = true; + int cpu; if (!static_branch_likely(&sched_smt_present)) - return -1; - - if (!test_idle_cores(target, false)) - return -1; - - cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr); + return __select_idle_cpu(core); - for_each_cpu_wrap(core, cpus, target) { - bool idle = true; - - for_each_cpu(cpu, cpu_smt_mask(core)) { - if (!available_idle_cpu(cpu)) { - idle = false; - break; + for_each_cpu(cpu, cpu_smt_mask(core)) { + if (!available_idle_cpu(cpu)) { + idle = false; + if (*idle_cpu == -1) { + if (sched_idle_cpu(cpu) && cpumask_test_cpu(cpu, p->cpus_ptr)) { + *idle_cpu = cpu; + break; + } + continue; } + break; } - - if (idle) - return core; - - cpumask_andnot(cpus, cpus, cpu_smt_mask(core)); + if (*idle_cpu == -1 && cpumask_test_cpu(cpu, p->cpus_ptr)) + *idle_cpu = cpu; } - /* - * Failed to find an idle core; stop looking for one. - */ - set_idle_cores(target, 0); + if (idle) + return core; + cpumask_andnot(cpus, cpus, cpu_smt_mask(core)); return -1; } #else /* CONFIG_SCHED_SMT */ -static inline int select_idle_core(struct task_struct *p, struct sched_domain *sd, int target) +static inline void set_idle_cores(int cpu, int val) { - return -1; +} + +static inline bool test_idle_cores(int cpu, bool def) +{ + return def; +} + +static inline int select_idle_core(struct task_struct *p, int core, struct cpumask *cpus, int *idle_cpu) +{ + return __select_idle_cpu(core); } #endif /* CONFIG_SCHED_SMT */ @@ -6131,10 +6142,11 @@ static inline int select_idle_core(struct task_struct *p, struct sched_domain *s static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int target) { struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask); + int i, cpu, idle_cpu = -1, nr = INT_MAX; + bool smt = test_idle_cores(target, false); + int this = smp_processor_id(); struct sched_domain *this_sd; u64 time; - int this = smp_processor_id(); - int cpu, nr = INT_MAX; this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc)); if (!this_sd) @@ -6142,7 +6154,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr); - if (sched_feat(SIS_PROP)) { + if (sched_feat(SIS_PROP) && !smt) { u64 avg_cost, avg_idle, span_avg; /* @@ -6162,18 +6174,29 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t } for_each_cpu_wrap(cpu, cpus, target) { - if (!--nr) - return -1; - if (available_idle_cpu(cpu) || sched_idle_cpu(cpu)) - break; + if (smt) { + i = select_idle_core(p, cpu, cpus, &idle_cpu); + if ((unsigned int)i < nr_cpumask_bits) + return i; + + } else { + if (!--nr) + return -1; + idle_cpu = __select_idle_cpu(cpu); + if ((unsigned int)idle_cpu < nr_cpumask_bits) + break; + } } - if (sched_feat(SIS_PROP)) { + if (smt) + set_idle_cores(this, false); + + if (sched_feat(SIS_PROP) && !smt) { time = cpu_clock(this) - time; update_avg(&this_sd->avg_scan_cost, time); } - return cpu; + return idle_cpu; } /* @@ -6302,10 +6325,6 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) if (!sd) return target; - i = select_idle_core(p, sd, target); - if ((unsigned)i < nr_cpumask_bits) - return i; - i = select_idle_cpu(p, sd, target); if ((unsigned)i < nr_cpumask_bits) return i; -- cgit v1.2.3 
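The folded scan described above, reduced to a stand-alone model (a sketch, not the kernel code: the cpumask is a plain bool array and the SMT topology is hard-coded). Every candidate CPU is visited at most once, and while hunting for an idle core the first idle CPU seen is remembered as a fallback:

#include <stdbool.h>

#define NR_CPUS 8
#define SMT     2                       /* siblings per core in this model */

static bool idle[NR_CPUS];              /* idle[cpu]: is that CPU idle? */

static int scan_llc(bool candidate[NR_CPUS], bool want_idle_core)
{
        int fallback = -1;
        int cpu, s;

        for (cpu = 0; cpu < NR_CPUS; cpu++) {
                if (!candidate[cpu])
                        continue;

                if (!want_idle_core) {
                        if (idle[cpu])
                                return cpu;
                        continue;
                }

                /* Probe the whole core and drop it from the candidates. */
                int core = cpu - (cpu % SMT);
                bool core_idle = true;

                for (s = core; s < core + SMT; s++) {
                        candidate[s] = false;
                        if (!idle[s])
                                core_idle = false;
                        else if (fallback == -1)
                                fallback = s;
                }
                if (core_idle)
                        return core;
        }

        return fallback;                /* -1 if nothing idle was seen */
}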
From bf9be9a163b464aa90f60af13b336da2db8b2ea1 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 29 Apr 2020 17:04:12 +0200 Subject: rbtree, sched/fair: Use rb_add_cached() Reduce rbtree boiler plate by using the new helper function. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Acked-by: Davidlohr Bueso --- kernel/sched/fair.c | 46 ++++++++++++++-------------------------------- 1 file changed, 14 insertions(+), 32 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index c73d5883b940..59b645e3c4fd 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -531,12 +531,15 @@ static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime) return min_vruntime; } -static inline int entity_before(struct sched_entity *a, +static inline bool entity_before(struct sched_entity *a, struct sched_entity *b) { return (s64)(a->vruntime - b->vruntime) < 0; } +#define __node_2_se(node) \ + rb_entry((node), struct sched_entity, run_node) + static void update_min_vruntime(struct cfs_rq *cfs_rq) { struct sched_entity *curr = cfs_rq->curr; @@ -552,8 +555,7 @@ static void update_min_vruntime(struct cfs_rq *cfs_rq) } if (leftmost) { /* non-empty tree */ - struct sched_entity *se; - se = rb_entry(leftmost, struct sched_entity, run_node); + struct sched_entity *se = __node_2_se(leftmost); if (!curr) vruntime = se->vruntime; @@ -569,37 +571,17 @@ static void update_min_vruntime(struct cfs_rq *cfs_rq) #endif } +static inline bool __entity_less(struct rb_node *a, const struct rb_node *b) +{ + return entity_before(__node_2_se(a), __node_2_se(b)); +} + /* * Enqueue an entity into the rb-tree: */ static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) { - struct rb_node **link = &cfs_rq->tasks_timeline.rb_root.rb_node; - struct rb_node *parent = NULL; - struct sched_entity *entry; - bool leftmost = true; - - /* - * Find the right place in the rbtree: - */ - while (*link) { - parent = *link; - entry = rb_entry(parent, struct sched_entity, run_node); - /* - * We dont care about collisions. Nodes with - * the same key stay together. - */ - if (entity_before(se, entry)) { - link = &parent->rb_left; - } else { - link = &parent->rb_right; - leftmost = false; - } - } - - rb_link_node(&se->run_node, parent, link); - rb_insert_color_cached(&se->run_node, - &cfs_rq->tasks_timeline, leftmost); + rb_add_cached(&se->run_node, &cfs_rq->tasks_timeline, __entity_less); } static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) @@ -614,7 +596,7 @@ struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq) if (!left) return NULL; - return rb_entry(left, struct sched_entity, run_node); + return __node_2_se(left); } static struct sched_entity *__pick_next_entity(struct sched_entity *se) @@ -624,7 +606,7 @@ static struct sched_entity *__pick_next_entity(struct sched_entity *se) if (!next) return NULL; - return rb_entry(next, struct sched_entity, run_node); + return __node_2_se(next); } #ifdef CONFIG_SCHED_DEBUG @@ -635,7 +617,7 @@ struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) if (!last) return NULL; - return rb_entry(last, struct sched_entity, run_node); + return __node_2_se(last); } /************************************************************** -- cgit v1.2.3 From 8ecca39483ed4e4e97096d0d6f8e25fdd323b189 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 29 Apr 2020 17:04:41 +0200 Subject: rbtree, sched/deadline: Use rb_add_cached() Reduce rbtree boiler plate by using the new helpers. 
Make rb_add_cached() / rb_erase_cached() return a pointer to the leftmost node to aid in updating additional state. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Acked-by: Davidlohr Bueso --- include/linux/rbtree.h | 18 +++++++++--- kernel/sched/deadline.c | 77 ++++++++++++++++++------------------------------- 2 files changed, 42 insertions(+), 53 deletions(-) (limited to 'kernel') diff --git a/include/linux/rbtree.h b/include/linux/rbtree.h index e0b300de8f3f..d31ecaf4fdd3 100644 --- a/include/linux/rbtree.h +++ b/include/linux/rbtree.h @@ -141,12 +141,18 @@ static inline void rb_insert_color_cached(struct rb_node *node, rb_insert_color(node, &root->rb_root); } -static inline void rb_erase_cached(struct rb_node *node, - struct rb_root_cached *root) + +static inline struct rb_node * +rb_erase_cached(struct rb_node *node, struct rb_root_cached *root) { + struct rb_node *leftmost = NULL; + if (root->rb_leftmost == node) - root->rb_leftmost = rb_next(node); + leftmost = root->rb_leftmost = rb_next(node); + rb_erase(node, &root->rb_root); + + return leftmost; } static inline void rb_replace_node_cached(struct rb_node *victim, @@ -179,8 +185,10 @@ static inline void rb_replace_node_cached(struct rb_node *victim, * @node: node to insert * @tree: leftmost cached tree to insert @node into * @less: operator defining the (partial) node order + * + * Returns @node when it is the new leftmost, or NULL. */ -static __always_inline void +static __always_inline struct rb_node * rb_add_cached(struct rb_node *node, struct rb_root_cached *tree, bool (*less)(struct rb_node *, const struct rb_node *)) { @@ -200,6 +208,8 @@ rb_add_cached(struct rb_node *node, struct rb_root_cached *tree, rb_link_node(node, parent, link); rb_insert_color_cached(node, tree, leftmost); + + return leftmost ? node : NULL; } /** diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 5421782fe897..1508d126e88b 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -517,58 +517,44 @@ static void dec_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) update_dl_migration(dl_rq); } +#define __node_2_pdl(node) \ + rb_entry((node), struct task_struct, pushable_dl_tasks) + +static inline bool __pushable_less(struct rb_node *a, const struct rb_node *b) +{ + return dl_entity_preempt(&__node_2_pdl(a)->dl, &__node_2_pdl(b)->dl); +} + /* * The list of pushable -deadline task is not a plist, like in * sched_rt.c, it is an rb-tree with tasks ordered by deadline. 
*/ static void enqueue_pushable_dl_task(struct rq *rq, struct task_struct *p) { - struct dl_rq *dl_rq = &rq->dl; - struct rb_node **link = &dl_rq->pushable_dl_tasks_root.rb_root.rb_node; - struct rb_node *parent = NULL; - struct task_struct *entry; - bool leftmost = true; + struct rb_node *leftmost; BUG_ON(!RB_EMPTY_NODE(&p->pushable_dl_tasks)); - while (*link) { - parent = *link; - entry = rb_entry(parent, struct task_struct, - pushable_dl_tasks); - if (dl_entity_preempt(&p->dl, &entry->dl)) - link = &parent->rb_left; - else { - link = &parent->rb_right; - leftmost = false; - } - } - + leftmost = rb_add_cached(&p->pushable_dl_tasks, + &rq->dl.pushable_dl_tasks_root, + __pushable_less); if (leftmost) - dl_rq->earliest_dl.next = p->dl.deadline; - - rb_link_node(&p->pushable_dl_tasks, parent, link); - rb_insert_color_cached(&p->pushable_dl_tasks, - &dl_rq->pushable_dl_tasks_root, leftmost); + rq->dl.earliest_dl.next = p->dl.deadline; } static void dequeue_pushable_dl_task(struct rq *rq, struct task_struct *p) { struct dl_rq *dl_rq = &rq->dl; + struct rb_root_cached *root = &dl_rq->pushable_dl_tasks_root; + struct rb_node *leftmost; if (RB_EMPTY_NODE(&p->pushable_dl_tasks)) return; - if (dl_rq->pushable_dl_tasks_root.rb_leftmost == &p->pushable_dl_tasks) { - struct rb_node *next_node; - - next_node = rb_next(&p->pushable_dl_tasks); - if (next_node) { - dl_rq->earliest_dl.next = rb_entry(next_node, - struct task_struct, pushable_dl_tasks)->dl.deadline; - } - } + leftmost = rb_erase_cached(&p->pushable_dl_tasks, root); + if (leftmost) + dl_rq->earliest_dl.next = __node_2_pdl(leftmost)->dl.deadline; - rb_erase_cached(&p->pushable_dl_tasks, &dl_rq->pushable_dl_tasks_root); RB_CLEAR_NODE(&p->pushable_dl_tasks); } @@ -1478,29 +1464,21 @@ void dec_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) dec_dl_migration(dl_se, dl_rq); } +#define __node_2_dle(node) \ + rb_entry((node), struct sched_dl_entity, rb_node) + +static inline bool __dl_less(struct rb_node *a, const struct rb_node *b) +{ + return dl_time_before(__node_2_dle(a)->deadline, __node_2_dle(b)->deadline); +} + static void __enqueue_dl_entity(struct sched_dl_entity *dl_se) { struct dl_rq *dl_rq = dl_rq_of_se(dl_se); - struct rb_node **link = &dl_rq->root.rb_root.rb_node; - struct rb_node *parent = NULL; - struct sched_dl_entity *entry; - int leftmost = 1; BUG_ON(!RB_EMPTY_NODE(&dl_se->rb_node)); - while (*link) { - parent = *link; - entry = rb_entry(parent, struct sched_dl_entity, rb_node); - if (dl_time_before(dl_se->deadline, entry->deadline)) - link = &parent->rb_left; - else { - link = &parent->rb_right; - leftmost = 0; - } - } - - rb_link_node(&dl_se->rb_node, parent, link); - rb_insert_color_cached(&dl_se->rb_node, &dl_rq->root, leftmost); + rb_add_cached(&dl_se->rb_node, &dl_rq->root, __dl_less); inc_dl_tasks(dl_se, dl_rq); } @@ -1513,6 +1491,7 @@ static void __dequeue_dl_entity(struct sched_dl_entity *dl_se) return; rb_erase_cached(&dl_se->rb_node, &dl_rq->root); + RB_CLEAR_NODE(&dl_se->rb_node); dec_dl_tasks(dl_se, dl_rq); -- cgit v1.2.3 From a3b89864554bbce1594b7abdb5739fc708c1ca95 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 29 Apr 2020 17:05:15 +0200 Subject: rbtree, perf: Use new rbtree helpers Reduce rbtree boiler plate by using the new helpers. One noteworthy change is unification of the various (partial) compare functions. We construct a subtree match by forcing the sub-order to always match, see __group_cmp(). Due to 'const' we had to touch cgroup_id(). 
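To show the helper pattern these conversions rely on in isolation, here is a minimal sketch (not taken from the patch; "struct item" and its functions are made up): the same ordering is written once as a less() callback for insertion with rb_add() and once as a cmp() callback against a search key for rb_find():

#include <linux/rbtree.h>
#include <linux/types.h>

struct item {
        struct rb_node node;
        u64 key;
};

#define __node_2_item(n) rb_entry((n), struct item, node)

static inline bool item_less(struct rb_node *a, const struct rb_node *b)
{
        return __node_2_item(a)->key < __node_2_item(b)->key;
}

static inline int item_cmp(const void *key, const struct rb_node *n)
{
        const u64 *k = key;
        const struct item *i = __node_2_item(n);

        if (*k < i->key)
                return -1;
        if (*k > i->key)
                return 1;
        return 0;
}

static void item_insert(struct rb_root *tree, struct item *i)
{
        rb_add(&i->node, tree, item_less);
}

static struct item *item_lookup(struct rb_root *tree, u64 key)
{
        struct rb_node *n = rb_find(&key, tree, item_cmp);

        return n ? __node_2_item(n) : NULL;
}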
Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Acked-by: Tejun Heo Acked-by: Davidlohr Bueso --- include/linux/cgroup.h | 4 +- kernel/events/core.c | 195 +++++++++++++++++++++++-------------------------- 2 files changed, 92 insertions(+), 107 deletions(-) (limited to 'kernel') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 451c2d26a5db..4f2f79de083e 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -307,7 +307,7 @@ void css_task_iter_end(struct css_task_iter *it); * Inline functions. */ -static inline u64 cgroup_id(struct cgroup *cgrp) +static inline u64 cgroup_id(const struct cgroup *cgrp) { return cgrp->kn->id; } @@ -701,7 +701,7 @@ void cgroup_path_from_kernfs_id(u64 id, char *buf, size_t buflen); struct cgroup_subsys_state; struct cgroup; -static inline u64 cgroup_id(struct cgroup *cgrp) { return 1; } +static inline u64 cgroup_id(const struct cgroup *cgrp) { return 1; } static inline void css_get(struct cgroup_subsys_state *css) {} static inline void css_put(struct cgroup_subsys_state *css) {} static inline int cgroup_attach_task_all(struct task_struct *from, diff --git a/kernel/events/core.c b/kernel/events/core.c index 55d18791a72d..3d890961f6e5 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1595,50 +1595,91 @@ static void perf_event_groups_init(struct perf_event_groups *groups) groups->index = 0; } +static inline struct cgroup *event_cgroup(const struct perf_event *event) +{ + struct cgroup *cgroup = NULL; + +#ifdef CONFIG_CGROUP_PERF + if (event->cgrp) + cgroup = event->cgrp->css.cgroup; +#endif + + return cgroup; +} + /* * Compare function for event groups; * * Implements complex key that first sorts by CPU and then by virtual index * which provides ordering when rotating groups for the same CPU. */ -static bool -perf_event_groups_less(struct perf_event *left, struct perf_event *right) +static __always_inline int +perf_event_groups_cmp(const int left_cpu, const struct cgroup *left_cgroup, + const u64 left_group_index, const struct perf_event *right) { - if (left->cpu < right->cpu) - return true; - if (left->cpu > right->cpu) - return false; + if (left_cpu < right->cpu) + return -1; + if (left_cpu > right->cpu) + return 1; #ifdef CONFIG_CGROUP_PERF - if (left->cgrp != right->cgrp) { - if (!left->cgrp || !left->cgrp->css.cgroup) { - /* - * Left has no cgroup but right does, no cgroups come - * first. - */ - return true; - } - if (!right->cgrp || !right->cgrp->css.cgroup) { - /* - * Right has no cgroup but left does, no cgroups come - * first. - */ - return false; - } - /* Two dissimilar cgroups, order by id. */ - if (left->cgrp->css.cgroup->kn->id < right->cgrp->css.cgroup->kn->id) - return true; + { + const struct cgroup *right_cgroup = event_cgroup(right); - return false; + if (left_cgroup != right_cgroup) { + if (!left_cgroup) { + /* + * Left has no cgroup but right does, no + * cgroups come first. + */ + return -1; + } + if (!right_cgroup) { + /* + * Right has no cgroup but left does, no + * cgroups come first. + */ + return 1; + } + /* Two dissimilar cgroups, order by id. 
*/ + if (cgroup_id(left_cgroup) < cgroup_id(right_cgroup)) + return -1; + + return 1; + } } #endif - if (left->group_index < right->group_index) - return true; - if (left->group_index > right->group_index) - return false; + if (left_group_index < right->group_index) + return -1; + if (left_group_index > right->group_index) + return 1; - return false; + return 0; +} + +#define __node_2_pe(node) \ + rb_entry((node), struct perf_event, group_node) + +static inline bool __group_less(struct rb_node *a, const struct rb_node *b) +{ + struct perf_event *e = __node_2_pe(a); + return perf_event_groups_cmp(e->cpu, event_cgroup(e), e->group_index, + __node_2_pe(b)) < 0; +} + +struct __group_key { + int cpu; + struct cgroup *cgroup; +}; + +static inline int __group_cmp(const void *key, const struct rb_node *node) +{ + const struct __group_key *a = key; + const struct perf_event *b = __node_2_pe(node); + + /* partial/subtree match: @cpu, @cgroup; ignore: @group_index */ + return perf_event_groups_cmp(a->cpu, a->cgroup, b->group_index, b); } /* @@ -1650,27 +1691,9 @@ static void perf_event_groups_insert(struct perf_event_groups *groups, struct perf_event *event) { - struct perf_event *node_event; - struct rb_node *parent; - struct rb_node **node; - event->group_index = ++groups->index; - node = &groups->tree.rb_node; - parent = *node; - - while (*node) { - parent = *node; - node_event = container_of(*node, struct perf_event, group_node); - - if (perf_event_groups_less(event, node_event)) - node = &parent->rb_left; - else - node = &parent->rb_right; - } - - rb_link_node(&event->group_node, parent, node); - rb_insert_color(&event->group_node, &groups->tree); + rb_add(&event->group_node, &groups->tree, __group_less); } /* @@ -1718,45 +1741,17 @@ static struct perf_event * perf_event_groups_first(struct perf_event_groups *groups, int cpu, struct cgroup *cgrp) { - struct perf_event *node_event = NULL, *match = NULL; - struct rb_node *node = groups->tree.rb_node; -#ifdef CONFIG_CGROUP_PERF - u64 node_cgrp_id, cgrp_id = 0; - - if (cgrp) - cgrp_id = cgrp->kn->id; -#endif - - while (node) { - node_event = container_of(node, struct perf_event, group_node); - - if (cpu < node_event->cpu) { - node = node->rb_left; - continue; - } - if (cpu > node_event->cpu) { - node = node->rb_right; - continue; - } -#ifdef CONFIG_CGROUP_PERF - node_cgrp_id = 0; - if (node_event->cgrp && node_event->cgrp->css.cgroup) - node_cgrp_id = node_event->cgrp->css.cgroup->kn->id; + struct __group_key key = { + .cpu = cpu, + .cgroup = cgrp, + }; + struct rb_node *node; - if (cgrp_id < node_cgrp_id) { - node = node->rb_left; - continue; - } - if (cgrp_id > node_cgrp_id) { - node = node->rb_right; - continue; - } -#endif - match = node_event; - node = node->rb_left; - } + node = rb_find_first(&key, &groups->tree, __group_cmp); + if (node) + return __node_2_pe(node); - return match; + return NULL; } /* @@ -1765,27 +1760,17 @@ perf_event_groups_first(struct perf_event_groups *groups, int cpu, static struct perf_event * perf_event_groups_next(struct perf_event *event) { - struct perf_event *next; -#ifdef CONFIG_CGROUP_PERF - u64 curr_cgrp_id = 0; - u64 next_cgrp_id = 0; -#endif - - next = rb_entry_safe(rb_next(&event->group_node), typeof(*event), group_node); - if (next == NULL || next->cpu != event->cpu) - return NULL; - -#ifdef CONFIG_CGROUP_PERF - if (event->cgrp && event->cgrp->css.cgroup) - curr_cgrp_id = event->cgrp->css.cgroup->kn->id; + struct __group_key key = { + .cpu = event->cpu, + .cgroup = event_cgroup(event), + }; + struct rb_node 
*next; - if (next->cgrp && next->cgrp->css.cgroup) - next_cgrp_id = next->cgrp->css.cgroup->kn->id; + next = rb_next_match(&key, &event->group_node, __group_cmp); + if (next) + return __node_2_pe(next); - if (curr_cgrp_id != next_cgrp_id) - return NULL; -#endif - return next; + return NULL; } /* -- cgit v1.2.3 From a905e84e64083a0ee701f61810badee234050825 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 29 Apr 2020 17:06:27 +0200 Subject: rbtree, uprobes: Use rbtree helpers Reduce rbtree boilerplate by using the new helpers. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Acked-by: Davidlohr Bueso --- kernel/events/uprobes.c | 80 ++++++++++++++++++++++++------------------------- 1 file changed, 39 insertions(+), 41 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index bf9edd8d75be..fd5160dee2b4 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -613,41 +613,56 @@ static void put_uprobe(struct uprobe *uprobe) } } -static int match_uprobe(struct uprobe *l, struct uprobe *r) +static __always_inline +int uprobe_cmp(const struct inode *l_inode, const loff_t l_offset, + const struct uprobe *r) { - if (l->inode < r->inode) + if (l_inode < r->inode) return -1; - if (l->inode > r->inode) + if (l_inode > r->inode) return 1; - if (l->offset < r->offset) + if (l_offset < r->offset) return -1; - if (l->offset > r->offset) + if (l_offset > r->offset) return 1; return 0; } +#define __node_2_uprobe(node) \ + rb_entry((node), struct uprobe, rb_node) + +struct __uprobe_key { + struct inode *inode; + loff_t offset; +}; + +static inline int __uprobe_cmp_key(const void *key, const struct rb_node *b) +{ + const struct __uprobe_key *a = key; + return uprobe_cmp(a->inode, a->offset, __node_2_uprobe(b)); +} + +static inline int __uprobe_cmp(struct rb_node *a, const struct rb_node *b) +{ + struct uprobe *u = __node_2_uprobe(a); + return uprobe_cmp(u->inode, u->offset, __node_2_uprobe(b)); +} + static struct uprobe *__find_uprobe(struct inode *inode, loff_t offset) { - struct uprobe u = { .inode = inode, .offset = offset }; - struct rb_node *n = uprobes_tree.rb_node; - struct uprobe *uprobe; - int match; + struct __uprobe_key key = { + .inode = inode, + .offset = offset, + }; + struct rb_node *node = rb_find(&key, &uprobes_tree, __uprobe_cmp_key); - while (n) { - uprobe = rb_entry(n, struct uprobe, rb_node); - match = match_uprobe(&u, uprobe); - if (!match) - return get_uprobe(uprobe); + if (node) + return __node_2_uprobe(node); - if (match < 0) - n = n->rb_left; - else - n = n->rb_right; - } return NULL; } @@ -668,32 +683,15 @@ static struct uprobe *find_uprobe(struct inode *inode, loff_t offset) static struct uprobe *__insert_uprobe(struct uprobe *uprobe) { - struct rb_node **p = &uprobes_tree.rb_node; - struct rb_node *parent = NULL; - struct uprobe *u; - int match; + struct rb_node *node; - while (*p) { - parent = *p; - u = rb_entry(parent, struct uprobe, rb_node); - match = match_uprobe(uprobe, u); - if (!match) - return get_uprobe(u); + node = rb_find_add(&uprobe->rb_node, &uprobes_tree, __uprobe_cmp); + if (node) + return get_uprobe(__node_2_uprobe(node)); - if (match < 0) - p = &parent->rb_left; - else - p = &parent->rb_right; - - } - - u = NULL; - rb_link_node(&uprobe->rb_node, parent, p); - rb_insert_color(&uprobe->rb_node, &uprobes_tree); /* get access + creation ref */ refcount_set(&uprobe->ref, 2); - - return u; + return NULL; } /* -- cgit v1.2.3 From 5a7987253ef0909d94e176cd97e511013de0fe19 Mon Sep 17 00:00:00 
2001 From: Peter Zijlstra Date: Wed, 29 Apr 2020 17:29:58 +0200 Subject: rbtree, rtmutex: Use rb_add_cached() Reduce rbtree boiler plate by using the new helpers. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Acked-by: Davidlohr Bueso --- kernel/locking/rtmutex.c | 54 ++++++++++++++++-------------------------------- 1 file changed, 18 insertions(+), 36 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 2f8cd616d3b2..57e380453bf9 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -267,27 +267,18 @@ rt_mutex_waiter_equal(struct rt_mutex_waiter *left, return 1; } +#define __node_2_waiter(node) \ + rb_entry((node), struct rt_mutex_waiter, tree_entry) + +static inline bool __waiter_less(struct rb_node *a, const struct rb_node *b) +{ + return rt_mutex_waiter_less(__node_2_waiter(a), __node_2_waiter(b)); +} + static void rt_mutex_enqueue(struct rt_mutex *lock, struct rt_mutex_waiter *waiter) { - struct rb_node **link = &lock->waiters.rb_root.rb_node; - struct rb_node *parent = NULL; - struct rt_mutex_waiter *entry; - bool leftmost = true; - - while (*link) { - parent = *link; - entry = rb_entry(parent, struct rt_mutex_waiter, tree_entry); - if (rt_mutex_waiter_less(waiter, entry)) { - link = &parent->rb_left; - } else { - link = &parent->rb_right; - leftmost = false; - } - } - - rb_link_node(&waiter->tree_entry, parent, link); - rb_insert_color_cached(&waiter->tree_entry, &lock->waiters, leftmost); + rb_add_cached(&waiter->tree_entry, &lock->waiters, __waiter_less); } static void @@ -300,27 +291,18 @@ rt_mutex_dequeue(struct rt_mutex *lock, struct rt_mutex_waiter *waiter) RB_CLEAR_NODE(&waiter->tree_entry); } +#define __node_2_pi_waiter(node) \ + rb_entry((node), struct rt_mutex_waiter, pi_tree_entry) + +static inline bool __pi_waiter_less(struct rb_node *a, const struct rb_node *b) +{ + return rt_mutex_waiter_less(__node_2_pi_waiter(a), __node_2_pi_waiter(b)); +} + static void rt_mutex_enqueue_pi(struct task_struct *task, struct rt_mutex_waiter *waiter) { - struct rb_node **link = &task->pi_waiters.rb_root.rb_node; - struct rb_node *parent = NULL; - struct rt_mutex_waiter *entry; - bool leftmost = true; - - while (*link) { - parent = *link; - entry = rb_entry(parent, struct rt_mutex_waiter, pi_tree_entry); - if (rt_mutex_waiter_less(waiter, entry)) { - link = &parent->rb_left; - } else { - link = &parent->rb_right; - leftmost = false; - } - } - - rb_link_node(&waiter->pi_tree_entry, parent, link); - rb_insert_color_cached(&waiter->pi_tree_entry, &task->pi_waiters, leftmost); + rb_add_cached(&waiter->pi_tree_entry, &task->pi_waiters, __pi_waiter_less); } static void -- cgit v1.2.3 From 71e5f6644fb2f3304fcb310145ded234a37e7cc1 Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Mon, 1 Feb 2021 10:53:53 +0100 Subject: sched/topology: Fix sched_domain_topology_level alloc in sched_init_numa() Commit "sched/topology: Make sched_init_numa() use a set for the deduplicating sort" allocates 'i + nr_levels (level)' instead of 'i + nr_levels + 1' sched_domain_topology_level. This led to an Oops (on Arm64 juno with CONFIG_SCHED_DEBUG): sched_init_domains build_sched_domains() __free_domain_allocs() __sdt_free() { ... for_each_sd_topology(tl) ... sd = *per_cpu_ptr(sdd->sd, j); <-- ... 
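Reduced to a stand-alone sketch (illustrative only, not the kernel code; names are made up), the allocation pattern being fixed looks like this: the table is terminated by an empty sentinel entry, so copying i default levels and appending nr_levels new ones needs i + nr_levels + 1 slots:

#include <stdlib.h>
#include <string.h>

struct level { const char *name; };     /* stand-in for the real struct */

/* Copy n existing levels, leaving room for 'extra' new ones plus a sentinel. */
static struct level *alloc_levels(const struct level *src, int n, int extra)
{
        struct level *tl = calloc(n + extra + 1, sizeof(*tl)); /* +1: sentinel */

        if (tl)
                memcpy(tl, src, n * sizeof(*tl));

        return tl;
}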
} Signed-off-by: Dietmar Eggemann Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Tested-by: Vincent Guittot Tested-by: Barry Song Link: https://lkml.kernel.org/r/6000e39e-7d28-c360-9cd6-8798fd22a9bf@arm.com --- kernel/sched/topology.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index bf5c9bd10bfe..09d35044bd88 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -1702,7 +1702,7 @@ void sched_init_numa(void) /* Compute default topology size */ for (i = 0; sched_domain_topology[i].mask; i++); - tl = kzalloc((i + nr_levels) * + tl = kzalloc((i + nr_levels + 1) * sizeof(struct sched_domain_topology_level), GFP_KERNEL); if (!tl) return; -- cgit v1.2.3 From ae18ad281e825993d190073d0ae2ea35dee27ee1 Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Thu, 28 Jan 2021 14:10:38 +0100 Subject: sched: Remove MAX_USER_RT_PRIO Commit d46523ea32a7 ("[PATCH] fix MAX_USER_RT_PRIO and MAX_RT_PRIO") was introduced due to a a small time period in which the realtime patch set was using different values for MAX_USER_RT_PRIO and MAX_RT_PRIO. This is no longer true, i.e. now MAX_RT_PRIO == MAX_USER_RT_PRIO. Get rid of MAX_USER_RT_PRIO and make everything use MAX_RT_PRIO instead. Signed-off-by: Dietmar Eggemann Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lkml.kernel.org/r/20210128131040.296856-2-dietmar.eggemann@arm.com --- include/linux/sched/prio.h | 9 +-------- kernel/sched/core.c | 7 +++---- 2 files changed, 4 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched/prio.h b/include/linux/sched/prio.h index 7d64feafc408..d111f2fd77ea 100644 --- a/include/linux/sched/prio.h +++ b/include/linux/sched/prio.h @@ -11,16 +11,9 @@ * priority is 0..MAX_RT_PRIO-1, and SCHED_NORMAL/SCHED_BATCH * tasks are in the range MAX_RT_PRIO..MAX_PRIO-1. Priority * values are inverted: lower p->prio value means higher priority. - * - * The MAX_USER_RT_PRIO value allows the actual maximum - * RT priority to be separate from the value exported to - * user-space. This allows kernel threads to set their - * priority to a value higher than any user task. Note: - * MAX_RT_PRIO must not be smaller than MAX_USER_RT_PRIO. */ -#define MAX_USER_RT_PRIO 100 -#define MAX_RT_PRIO MAX_USER_RT_PRIO +#define MAX_RT_PRIO 100 #define MAX_PRIO (MAX_RT_PRIO + NICE_WIDTH) #define DEFAULT_PRIO (MAX_RT_PRIO + NICE_WIDTH / 2) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 6c789dcb8e5a..f0b0b67fcdee 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5911,11 +5911,10 @@ recheck: /* * Valid priorities for SCHED_FIFO and SCHED_RR are - * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL, + * 1..MAX_RT_PRIO-1, valid priority for SCHED_NORMAL, * SCHED_BATCH and SCHED_IDLE is 0. 
*/ - if ((p->mm && attr->sched_priority > MAX_USER_RT_PRIO-1) || - (!p->mm && attr->sched_priority > MAX_RT_PRIO-1)) + if (attr->sched_priority > MAX_RT_PRIO-1) return -EINVAL; if ((dl_policy(policy) && !__checkparam_dl(attr)) || (rt_policy(policy) != (attr->sched_priority != 0))) @@ -6983,7 +6982,7 @@ SYSCALL_DEFINE1(sched_get_priority_max, int, policy) switch (policy) { case SCHED_FIFO: case SCHED_RR: - ret = MAX_USER_RT_PRIO-1; + ret = MAX_RT_PRIO-1; break; case SCHED_DEADLINE: case SCHED_NORMAL: -- cgit v1.2.3 From 9d061ba6bc170045857f3efe0bba5def30188d4d Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Thu, 28 Jan 2021 14:10:39 +0100 Subject: sched: Remove USER_PRIO, TASK_USER_PRIO and MAX_USER_PRIO The only remaining use of MAX_USER_PRIO (and USER_PRIO) is the SCALE_PRIO() definition in the PowerPC Cell architecture's Synergistic Processor Unit (SPU) scheduler. TASK_USER_PRIO isn't used anymore. Commit fe443ef2ac42 ("[POWERPC] spusched: Dynamic timeslicing for SCHED_OTHER") copied SCALE_PRIO() from the task scheduler in v2.6.23. Commit a4ec24b48dde ("sched: tidy up SCHED_RR") removed it from the task scheduler in v2.6.24. Commit 3ee237dddcd8 ("sched/prio: Add 3 macros of MAX_NICE, MIN_NICE and NICE_WIDTH in prio.h") introduced NICE_WIDTH much later. With: MAX_USER_PRIO = USER_PRIO(MAX_PRIO) = MAX_PRIO - MAX_RT_PRIO MAX_PRIO = MAX_RT_PRIO + NICE_WIDTH MAX_USER_PRIO = MAX_RT_PRIO + NICE_WIDTH - MAX_RT_PRIO MAX_USER_PRIO = NICE_WIDTH MAX_USER_PRIO can be replaced by NICE_WIDTH to be able to remove all the {*_}USER_PRIO defines. Signed-off-by: Dietmar Eggemann Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lkml.kernel.org/r/20210128131040.296856-3-dietmar.eggemann@arm.com --- arch/powerpc/platforms/cell/spufs/sched.c | 2 +- include/linux/sched/prio.h | 9 --------- kernel/sched/sched.h | 2 +- 3 files changed, 2 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/arch/powerpc/platforms/cell/spufs/sched.c b/arch/powerpc/platforms/cell/spufs/sched.c index f18d5067cd0f..aeb7f3922106 100644 --- a/arch/powerpc/platforms/cell/spufs/sched.c +++ b/arch/powerpc/platforms/cell/spufs/sched.c @@ -72,7 +72,7 @@ static struct timer_list spuloadavg_timer; #define DEF_SPU_TIMESLICE (100 * HZ / (1000 * SPUSCHED_TICK)) #define SCALE_PRIO(x, prio) \ - max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_SPU_TIMESLICE) + max(x * (MAX_PRIO - prio) / (NICE_WIDTH / 2), MIN_SPU_TIMESLICE) /* * scale user-nice values [ -20 ... 0 ... 19 ] to time slice values: diff --git a/include/linux/sched/prio.h b/include/linux/sched/prio.h index d111f2fd77ea..ab83d85e1183 100644 --- a/include/linux/sched/prio.h +++ b/include/linux/sched/prio.h @@ -26,15 +26,6 @@ #define NICE_TO_PRIO(nice) ((nice) + DEFAULT_PRIO) #define PRIO_TO_NICE(prio) ((prio) - DEFAULT_PRIO) -/* - * 'User priority' is the nice value converted to something we - * can work with better when scaling various scheduler parameters, - * it's a [ 0 ... 39 ] range. - */ -#define USER_PRIO(p) ((p)-MAX_RT_PRIO) -#define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio) -#define MAX_USER_PRIO (USER_PRIO(MAX_PRIO)) - /* * Convert nice value [19,-20] to rlimit style value [1,40]. */ diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index f519aba2c542..2185b3b435a9 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -140,7 +140,7 @@ extern void call_trace_sched_update_nr_running(struct rq *rq, int count); * scale_load() and scale_load_down(w) to convert between them. 
The * following must be true: * - * scale_load(sched_prio_to_weight[USER_PRIO(NICE_TO_PRIO(0))]) == NICE_0_LOAD + * scale_load(sched_prio_to_weight[NICE_TO_PRIO(0)-MAX_RT_PRIO]) == NICE_0_LOAD * */ #define NICE_0_LOAD (1L << NICE_0_LOAD_SHIFT) -- cgit v1.2.3 From c541bb7835a306cdbbe8abbdf4e4df507e0ca27a Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Thu, 28 Jan 2021 14:10:40 +0100 Subject: sched/core: Update task_prio() function header The description of the RT offset and the values for 'normal' tasks needs update. Moreover there are DL tasks now. task_prio() has to stay like it is to guarantee compatibility with the /proc//stat priority field: # cat /proc//stat | awk '{ print $18; }' Signed-off-by: Dietmar Eggemann Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lkml.kernel.org/r/20210128131040.296856-4-dietmar.eggemann@arm.com --- kernel/sched/core.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index f0b0b67fcdee..4afbdd27f46d 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5616,8 +5616,12 @@ SYSCALL_DEFINE1(nice, int, increment) * @p: the task in question. * * Return: The priority value as seen by users in /proc. - * RT tasks are offset by -200. Normal tasks are centered - * around 0, value goes from -16 to +15. + * + * sched policy return value kernel prio user prio/nice + * + * normal, batch, idle [0 ... 39] [100 ... 139] 0/[-20 ... 19] + * fifo, rr [-2 ... -100] [98 ... 0] [1 ... 99] + * deadline -101 -1 0 */ int task_prio(const struct task_struct *p) { -- cgit v1.2.3 From 3f2a8fc4b15de18644e8a80a09edda168676e22c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 18 Jan 2021 15:12:16 +0100 Subject: static_call/x86: Add __static_call_return0() Provide a stub function that return 0 and wire up the static call site patching to replace the CALL with a single 5 byte instruction that clears %RAX, the return value register. The function can be cast to any function pointer type that has a single %RAX return (including pointers). Also provide a version that returns an int for convenience. We are clearing the entire %RAX register in any case, whether the return value is 32 or 64 bits, since %RAX is always a scratch register anyway. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Frederic Weisbecker Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lkml.kernel.org/r/20210118141223.123667-2-frederic@kernel.org --- arch/x86/kernel/static_call.c | 17 +++++++++++++++-- include/linux/static_call.h | 12 ++++++++++++ kernel/static_call.c | 5 +++++ 3 files changed, 32 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/arch/x86/kernel/static_call.c b/arch/x86/kernel/static_call.c index ca9a380d9c0b..9442c4136c38 100644 --- a/arch/x86/kernel/static_call.c +++ b/arch/x86/kernel/static_call.c @@ -11,14 +11,26 @@ enum insn_type { RET = 3, /* tramp / site cond-tail-call */ }; +/* + * data16 data16 xorq %rax, %rax - a single 5 byte instruction that clears %rax + * The REX.W cancels the effect of any data16. 
+ */ +static const u8 xor5rax[] = { 0x66, 0x66, 0x48, 0x31, 0xc0 }; + static void __ref __static_call_transform(void *insn, enum insn_type type, void *func) { + const void *emulate = NULL; int size = CALL_INSN_SIZE; const void *code; switch (type) { case CALL: code = text_gen_insn(CALL_INSN_OPCODE, insn, func); + if (func == &__static_call_return0) { + emulate = code; + code = &xor5rax; + } + break; case NOP: @@ -41,7 +53,7 @@ static void __ref __static_call_transform(void *insn, enum insn_type type, void if (unlikely(system_state == SYSTEM_BOOTING)) return text_poke_early(insn, code, size); - text_poke_bp(insn, code, size, NULL); + text_poke_bp(insn, code, size, emulate); } static void __static_call_validate(void *insn, bool tail) @@ -54,7 +66,8 @@ static void __static_call_validate(void *insn, bool tail) return; } else { if (opcode == CALL_INSN_OPCODE || - !memcmp(insn, ideal_nops[NOP_ATOMIC5], 5)) + !memcmp(insn, ideal_nops[NOP_ATOMIC5], 5) || + !memcmp(insn, xor5rax, 5)) return; } diff --git a/include/linux/static_call.h b/include/linux/static_call.h index a2c064585c03..bd6735d10e2e 100644 --- a/include/linux/static_call.h +++ b/include/linux/static_call.h @@ -142,6 +142,8 @@ extern void __static_call_update(struct static_call_key *key, void *tramp, void extern int static_call_mod_init(struct module *mod); extern int static_call_text_reserved(void *start, void *end); +extern long __static_call_return0(void); + #define DEFINE_STATIC_CALL(name, _func) \ DECLARE_STATIC_CALL(name, _func); \ struct static_call_key STATIC_CALL_KEY(name) = { \ @@ -206,6 +208,11 @@ static inline int static_call_text_reserved(void *start, void *end) return 0; } +static inline long __static_call_return0(void) +{ + return 0; +} + #define EXPORT_STATIC_CALL(name) \ EXPORT_SYMBOL(STATIC_CALL_KEY(name)); \ EXPORT_SYMBOL(STATIC_CALL_TRAMP(name)) @@ -222,6 +229,11 @@ struct static_call_key { void *func; }; +static inline long __static_call_return0(void) +{ + return 0; +} + #define DEFINE_STATIC_CALL(name, _func) \ DECLARE_STATIC_CALL(name, _func); \ struct static_call_key STATIC_CALL_KEY(name) = { \ diff --git a/kernel/static_call.c b/kernel/static_call.c index 84565c2a41b8..0bc11b5ce681 100644 --- a/kernel/static_call.c +++ b/kernel/static_call.c @@ -438,6 +438,11 @@ int __init static_call_init(void) } early_initcall(static_call_init); +long __static_call_return0(void) +{ + return 0; +} + #ifdef CONFIG_STATIC_CALL_SELFTEST static int func_a(int x) -- cgit v1.2.3 From 6ef869e0647439af0fc28dde162d33320d4e1dd7 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Mon, 18 Jan 2021 15:12:19 +0100 Subject: preempt: Introduce CONFIG_PREEMPT_DYNAMIC Preemption mode selection is currently hardcoded on Kconfig choices. Introduce a dedicated option to tune preemption flavour at boot time, This will be only available on architectures efficiently supporting static calls in order not to tempt with the feature against additional overhead that might be prohibitive or undesirable. CONFIG_PREEMPT_DYNAMIC is automatically selected by CONFIG_PREEMPT if the architecture provides the necessary support (CONFIG_STATIC_CALL_INLINE, CONFIG_GENERIC_ENTRY, and provide with __preempt_schedule_function() / __preempt_schedule_notrace_function()). 
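A generic sketch of the mechanism this option builds on (illustrative only; "my_mode" and its target functions are made-up names, not anything added by this patch): a static call with a compile-time default that early boot code retargets from a command-line parameter, so the selected flavour costs no more than a direct call at runtime:

#include <linux/static_call.h>
#include <linux/init.h>
#include <linux/string.h>

static int mode_full(void) { return 1; }
static int mode_none(void) { return 0; }

DEFINE_STATIC_CALL(my_mode, mode_full);         /* default flavour */

static int __init my_mode_setup(char *arg)
{
        if (!strcmp(arg, "none"))
                static_call_update(my_mode, mode_none);
        return 1;
}
__setup("my_mode=", my_mode_setup);

/* Callers simply do: static_call(my_mode)(); */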
Suggested-by: Peter Zijlstra Signed-off-by: Michal Hocko Signed-off-by: Frederic Weisbecker [peterz: relax requirement to HAVE_STATIC_CALL] Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lkml.kernel.org/r/20210118141223.123667-5-frederic@kernel.org --- Documentation/admin-guide/kernel-parameters.txt | 7 +++++++ arch/Kconfig | 9 +++++++++ arch/x86/Kconfig | 1 + kernel/Kconfig.preempt | 19 +++++++++++++++++++ 4 files changed, 36 insertions(+) (limited to 'kernel') diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index a10b545c2070..78ab29400dd3 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -3916,6 +3916,13 @@ Format: {"off"} Disable Hardware Transactional Memory + preempt= [KNL] + Select preemption mode if you have CONFIG_PREEMPT_DYNAMIC + none - Limited to cond_resched() calls + voluntary - Limited to cond_resched() and might_sleep() calls + full - Any section that isn't explicitly preempt disabled + can be preempted anytime. + print-fatal-signals= [KNL] debug: print fatal signals diff --git a/arch/Kconfig b/arch/Kconfig index 24862d15f3a3..124507907824 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -1090,6 +1090,15 @@ config HAVE_STATIC_CALL_INLINE bool depends on HAVE_STATIC_CALL +config HAVE_PREEMPT_DYNAMIC + bool + depends on HAVE_STATIC_CALL + depends on GENERIC_ENTRY + help + Select this if the architecture support boot time preempt setting + on top of static calls. It is strongly advised to support inline + static call to avoid any overhead. + config ARCH_WANT_LD_ORPHAN_WARN bool help diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 21f851179ff0..d3338a87761f 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -224,6 +224,7 @@ config X86 select HAVE_STACK_VALIDATION if X86_64 select HAVE_STATIC_CALL select HAVE_STATIC_CALL_INLINE if HAVE_STACK_VALIDATION + select HAVE_PREEMPT_DYNAMIC select HAVE_RSEQ select HAVE_SYSCALL_TRACEPOINTS select HAVE_UNSTABLE_SCHED_CLOCK diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt index bf82259cff96..416017301660 100644 --- a/kernel/Kconfig.preempt +++ b/kernel/Kconfig.preempt @@ -40,6 +40,7 @@ config PREEMPT depends on !ARCH_NO_PREEMPT select PREEMPTION select UNINLINE_SPIN_UNLOCK if !ARCH_INLINE_SPIN_UNLOCK + select PREEMPT_DYNAMIC if HAVE_PREEMPT_DYNAMIC help This option reduces the latency of the kernel by making all kernel code (that is not executing in a critical section) @@ -80,3 +81,21 @@ config PREEMPT_COUNT config PREEMPTION bool select PREEMPT_COUNT + +config PREEMPT_DYNAMIC + bool + help + This option allows to define the preemption model on the kernel + command line parameter and thus override the default preemption + model defined during compile time. + + The feature is primarily interesting for Linux distributions which + provide a pre-built kernel binary to reduce the number of kernel + flavors they offer while still offering different usecases. + + The runtime overhead is negligible with HAVE_STATIC_CALL_INLINE enabled + but if runtime patching is not available for the specific architecture + then the potential overhead should be considered. + + Interesting if you want the same pre-built kernel should be used for + both Server and Desktop workloads. 
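The preempt= machinery built up by the following patches is, at bottom, static-call retargeting from an early __setup() handler. A minimal, hypothetical sketch of that pattern; the names my_hook, my_fast, my_slow and the my_hook= parameter are invented for illustration and are not part of this series:

#include <linux/init.h>
#include <linux/static_call.h>
#include <linux/string.h>

static int my_fast(int x) { return x; }
static int my_slow(int x) { return 2 * x; }

/* Call sites dispatch to my_fast() until the key is retargeted. */
DEFINE_STATIC_CALL(my_hook, my_fast);

int my_hook_call(int x)
{
	/* Patched into a direct call (or inlined site) on HAVE_STATIC_CALL arches. */
	return static_call(my_hook)(x);
}

/* Boot-time selection; the preempt= handler added later has the same shape. */
static int __init my_hook_setup(char *str)
{
	if (!strcmp(str, "slow"))
		static_call_update(my_hook, my_slow);
	return 0;
}
__setup("my_hook=", my_hook_setup);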
-- cgit v1.2.3 From b965f1ddb47daa5b8b2e2bc9c921431236830367 Mon Sep 17 00:00:00 2001 From: "Peter Zijlstra (Intel)" Date: Mon, 18 Jan 2021 15:12:20 +0100 Subject: preempt/dynamic: Provide cond_resched() and might_resched() static calls Provide static calls to control cond_resched() (called in !CONFIG_PREEMPT) and might_resched() (called in CONFIG_PREEMPT_VOLUNTARY) to that we can override their behaviour when preempt= is overriden. Since the default behaviour is full preemption, both their calls are ignored when preempt= isn't passed. [fweisbec: branch might_resched() directly to __cond_resched(), only define static calls when PREEMPT_DYNAMIC] Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Frederic Weisbecker Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lkml.kernel.org/r/20210118141223.123667-6-frederic@kernel.org --- include/linux/kernel.h | 23 +++++++++++++++++++---- include/linux/sched.h | 27 ++++++++++++++++++++++++--- kernel/sched/core.c | 16 +++++++++++++--- 3 files changed, 56 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/include/linux/kernel.h b/include/linux/kernel.h index f7902d8c1048..cfd3d349f905 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -15,7 +15,7 @@ #include #include #include - +#include #include #include @@ -81,11 +81,26 @@ struct pt_regs; struct user; #ifdef CONFIG_PREEMPT_VOLUNTARY -extern int _cond_resched(void); -# define might_resched() _cond_resched() + +extern int __cond_resched(void); +# define might_resched() __cond_resched() + +#elif defined(CONFIG_PREEMPT_DYNAMIC) + +extern int __cond_resched(void); + +DECLARE_STATIC_CALL(might_resched, __cond_resched); + +static __always_inline void might_resched(void) +{ + static_call(might_resched)(); +} + #else + # define might_resched() do { } while (0) -#endif + +#endif /* CONFIG_PREEMPT_* */ #ifdef CONFIG_DEBUG_ATOMIC_SLEEP extern void ___might_sleep(const char *file, int line, int preempt_offset); diff --git a/include/linux/sched.h b/include/linux/sched.h index e1152227bb79..2f35594b8b53 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1871,11 +1871,32 @@ static inline int test_tsk_need_resched(struct task_struct *tsk) * value indicates whether a reschedule was done in fact. 
* cond_resched_lock() will drop the spinlock before scheduling, */ -#ifndef CONFIG_PREEMPTION -extern int _cond_resched(void); +#if !defined(CONFIG_PREEMPTION) || defined(CONFIG_PREEMPT_DYNAMIC) +extern int __cond_resched(void); + +#ifdef CONFIG_PREEMPT_DYNAMIC + +DECLARE_STATIC_CALL(cond_resched, __cond_resched); + +static __always_inline int _cond_resched(void) +{ + return static_call(cond_resched)(); +} + #else + +static inline int _cond_resched(void) +{ + return __cond_resched(); +} + +#endif /* CONFIG_PREEMPT_DYNAMIC */ + +#else + static inline int _cond_resched(void) { return 0; } -#endif + +#endif /* !defined(CONFIG_PREEMPTION) || defined(CONFIG_PREEMPT_DYNAMIC) */ #define cond_resched() ({ \ ___might_sleep(__FILE__, __LINE__, 0); \ diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 4afbdd27f46d..f7c8fd8fa177 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6785,17 +6785,27 @@ SYSCALL_DEFINE0(sched_yield) return 0; } -#ifndef CONFIG_PREEMPTION -int __sched _cond_resched(void) +#if !defined(CONFIG_PREEMPTION) || defined(CONFIG_PREEMPT_DYNAMIC) +int __sched __cond_resched(void) { if (should_resched(0)) { preempt_schedule_common(); return 1; } +#ifndef CONFIG_PREEMPT_RCU rcu_all_qs(); +#endif return 0; } -EXPORT_SYMBOL(_cond_resched); +EXPORT_SYMBOL(__cond_resched); +#endif + +#ifdef CONFIG_PREEMPT_DYNAMIC +DEFINE_STATIC_CALL_RET0(cond_resched, __cond_resched); +EXPORT_STATIC_CALL(cond_resched); + +DEFINE_STATIC_CALL_RET0(might_resched, __cond_resched); +EXPORT_STATIC_CALL(might_resched); #endif /* -- cgit v1.2.3 From 2c9a98d3bc808717ab63ad928a2b568967775388 Mon Sep 17 00:00:00 2001 From: "Peter Zijlstra (Intel)" Date: Mon, 18 Jan 2021 15:12:21 +0100 Subject: preempt/dynamic: Provide preempt_schedule[_notrace]() static calls Provide static calls to control preempt_schedule[_notrace]() (called in CONFIG_PREEMPT) so that we can override their behaviour when preempt= is overriden. Since the default behaviour is full preemption, both their calls are initialized to the arch provided wrapper, if any. 
[fweisbec: only define static calls when PREEMPT_DYNAMIC, make it less dependent on x86 with __preempt_schedule_func] Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Frederic Weisbecker Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lkml.kernel.org/r/20210118141223.123667-7-frederic@kernel.org --- arch/x86/include/asm/preempt.h | 34 ++++++++++++++++++++++++++-------- kernel/sched/core.c | 12 ++++++++++++ 2 files changed, 38 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/arch/x86/include/asm/preempt.h b/arch/x86/include/asm/preempt.h index 69485ca13665..9b12dce9bda5 100644 --- a/arch/x86/include/asm/preempt.h +++ b/arch/x86/include/asm/preempt.h @@ -5,6 +5,7 @@ #include #include #include +#include DECLARE_PER_CPU(int, __preempt_count); @@ -103,16 +104,33 @@ static __always_inline bool should_resched(int preempt_offset) } #ifdef CONFIG_PREEMPTION - extern asmlinkage void preempt_schedule_thunk(void); -# define __preempt_schedule() \ - asm volatile ("call preempt_schedule_thunk" : ASM_CALL_CONSTRAINT) - extern asmlinkage void preempt_schedule(void); - extern asmlinkage void preempt_schedule_notrace_thunk(void); -# define __preempt_schedule_notrace() \ - asm volatile ("call preempt_schedule_notrace_thunk" : ASM_CALL_CONSTRAINT) +extern asmlinkage void preempt_schedule(void); +extern asmlinkage void preempt_schedule_thunk(void); + +#define __preempt_schedule_func preempt_schedule_thunk + +DECLARE_STATIC_CALL(preempt_schedule, __preempt_schedule_func); + +#define __preempt_schedule() \ +do { \ + __ADDRESSABLE(STATIC_CALL_KEY(preempt_schedule)); \ + asm volatile ("call " STATIC_CALL_TRAMP_STR(preempt_schedule) : ASM_CALL_CONSTRAINT); \ +} while (0) + +extern asmlinkage void preempt_schedule_notrace(void); +extern asmlinkage void preempt_schedule_notrace_thunk(void); + +#define __preempt_schedule_notrace_func preempt_schedule_notrace_thunk + +DECLARE_STATIC_CALL(preempt_schedule_notrace, __preempt_schedule_notrace_func); + +#define __preempt_schedule_notrace() \ +do { \ + __ADDRESSABLE(STATIC_CALL_KEY(preempt_schedule_notrace)); \ + asm volatile ("call " STATIC_CALL_TRAMP_STR(preempt_schedule_notrace) : ASM_CALL_CONSTRAINT); \ +} while (0) - extern asmlinkage void preempt_schedule_notrace(void); #endif #endif /* __ASM_PREEMPT_H */ diff --git a/kernel/sched/core.c b/kernel/sched/core.c index f7c8fd8fa177..880611c701d3 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5265,6 +5265,12 @@ asmlinkage __visible void __sched notrace preempt_schedule(void) NOKPROBE_SYMBOL(preempt_schedule); EXPORT_SYMBOL(preempt_schedule); +#ifdef CONFIG_PREEMPT_DYNAMIC +DEFINE_STATIC_CALL(preempt_schedule, __preempt_schedule_func); +EXPORT_STATIC_CALL(preempt_schedule); +#endif + + /** * preempt_schedule_notrace - preempt_schedule called by tracing * @@ -5317,6 +5323,12 @@ asmlinkage __visible void __sched notrace preempt_schedule_notrace(void) } EXPORT_SYMBOL_GPL(preempt_schedule_notrace); +#ifdef CONFIG_PREEMPT_DYNAMIC +DEFINE_STATIC_CALL(preempt_schedule_notrace, __preempt_schedule_notrace_func); +EXPORT_STATIC_CALL(preempt_schedule_notrace); +#endif + + #endif /* CONFIG_PREEMPTION */ /* -- cgit v1.2.3 From 40607ee97e4eec5655cc0f76a720bdc4c63a6434 Mon Sep 17 00:00:00 2001 From: "Peter Zijlstra (Intel)" Date: Mon, 18 Jan 2021 15:12:22 +0100 Subject: preempt/dynamic: Provide irqentry_exit_cond_resched() static call Provide static call to control IRQ preemption (called in CONFIG_PREEMPT) so that we can override its behaviour when preempt= is overriden. 
Since the default behaviour is full preemption, its call is initialized to provide IRQ preemption when preempt= isn't passed. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Frederic Weisbecker Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lkml.kernel.org/r/20210118141223.123667-8-frederic@kernel.org --- include/linux/entry-common.h | 4 ++++ kernel/entry/common.c | 10 +++++++++- 2 files changed, 13 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h index a104b298019a..883acef895bc 100644 --- a/include/linux/entry-common.h +++ b/include/linux/entry-common.h @@ -2,6 +2,7 @@ #ifndef __LINUX_ENTRYCOMMON_H #define __LINUX_ENTRYCOMMON_H +#include #include #include #include @@ -454,6 +455,9 @@ irqentry_state_t noinstr irqentry_enter(struct pt_regs *regs); * Conditional reschedule with additional sanity checks. */ void irqentry_exit_cond_resched(void); +#ifdef CONFIG_PREEMPT_DYNAMIC +DECLARE_STATIC_CALL(irqentry_exit_cond_resched, irqentry_exit_cond_resched); +#endif /** * irqentry_exit - Handle return from exception that used irqentry_enter() diff --git a/kernel/entry/common.c b/kernel/entry/common.c index f9d491b17b78..f09cae37ddd5 100644 --- a/kernel/entry/common.c +++ b/kernel/entry/common.c @@ -385,6 +385,9 @@ void irqentry_exit_cond_resched(void) preempt_schedule_irq(); } } +#ifdef CONFIG_PREEMPT_DYNAMIC +DEFINE_STATIC_CALL(irqentry_exit_cond_resched, irqentry_exit_cond_resched); +#endif noinstr void irqentry_exit(struct pt_regs *regs, irqentry_state_t state) { @@ -411,8 +414,13 @@ noinstr void irqentry_exit(struct pt_regs *regs, irqentry_state_t state) } instrumentation_begin(); - if (IS_ENABLED(CONFIG_PREEMPTION)) + if (IS_ENABLED(CONFIG_PREEMPTION)) { +#ifdef CONFIG_PREEMT_DYNAMIC + static_call(irqentry_exit_cond_resched)(); +#else irqentry_exit_cond_resched(); +#endif + } /* Covers both tracing and lockdep */ trace_hardirqs_on(); instrumentation_end(); -- cgit v1.2.3 From 826bfeb37bb4302ee6042f330c4c0c757152bdb8 Mon Sep 17 00:00:00 2001 From: "Peter Zijlstra (Intel)" Date: Mon, 18 Jan 2021 15:12:23 +0100 Subject: preempt/dynamic: Support dynamic preempt with preempt= boot option Support the preempt= boot option and patch the static call sites accordingly. 
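As a usage note, the flavour that was requested can be checked from userspace by looking for the parameter on the kernel command line. A hedged sketch, assuming only a readable /proc/cmdline; the helper itself is illustrative and not part of the patch:

#include <stdio.h>
#include <string.h>

int main(void)
{
	char cmdline[4096], mode[16];
	FILE *f = fopen("/proc/cmdline", "r");

	if (!f) {
		perror("/proc/cmdline");
		return 1;
	}
	if (!fgets(cmdline, sizeof(cmdline), f)) {
		fclose(f);
		return 1;
	}
	fclose(f);

	const char *opt = strstr(cmdline, "preempt=");
	if (opt && sscanf(opt, "preempt=%15s", mode) == 1)
		printf("requested preemption mode: %s\n", mode); /* none, voluntary or full */
	else
		printf("preempt= not passed; the compile-time default applies\n");
	return 0;
}

Note that this only reports what was requested; the "Dynamic Preempt: ..." line printed by the handler in the diff below is what confirms the mode actually applied.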
Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Frederic Weisbecker Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lkml.kernel.org/r/20210118141223.123667-9-frederic@kernel.org --- kernel/sched/core.c | 68 ++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 67 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 880611c701d3..0c067176edc5 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5328,9 +5328,75 @@ DEFINE_STATIC_CALL(preempt_schedule_notrace, __preempt_schedule_notrace_func); EXPORT_STATIC_CALL(preempt_schedule_notrace); #endif - #endif /* CONFIG_PREEMPTION */ +#ifdef CONFIG_PREEMPT_DYNAMIC + +#include + +/* + * SC:cond_resched + * SC:might_resched + * SC:preempt_schedule + * SC:preempt_schedule_notrace + * SC:irqentry_exit_cond_resched + * + * + * NONE: + * cond_resched <- __cond_resched + * might_resched <- RET0 + * preempt_schedule <- NOP + * preempt_schedule_notrace <- NOP + * irqentry_exit_cond_resched <- NOP + * + * VOLUNTARY: + * cond_resched <- __cond_resched + * might_resched <- __cond_resched + * preempt_schedule <- NOP + * preempt_schedule_notrace <- NOP + * irqentry_exit_cond_resched <- NOP + * + * FULL: + * cond_resched <- RET0 + * might_resched <- RET0 + * preempt_schedule <- preempt_schedule + * preempt_schedule_notrace <- preempt_schedule_notrace + * irqentry_exit_cond_resched <- irqentry_exit_cond_resched + */ +static int __init setup_preempt_mode(char *str) +{ + if (!strcmp(str, "none")) { + static_call_update(cond_resched, __cond_resched); + static_call_update(might_resched, (typeof(&__cond_resched)) __static_call_return0); + static_call_update(preempt_schedule, (typeof(&preempt_schedule)) NULL); + static_call_update(preempt_schedule_notrace, (typeof(&preempt_schedule_notrace)) NULL); + static_call_update(irqentry_exit_cond_resched, (typeof(&irqentry_exit_cond_resched)) NULL); + pr_info("Dynamic Preempt: %s\n", str); + } else if (!strcmp(str, "voluntary")) { + static_call_update(cond_resched, __cond_resched); + static_call_update(might_resched, __cond_resched); + static_call_update(preempt_schedule, (typeof(&preempt_schedule)) NULL); + static_call_update(preempt_schedule_notrace, (typeof(&preempt_schedule_notrace)) NULL); + static_call_update(irqentry_exit_cond_resched, (typeof(&irqentry_exit_cond_resched)) NULL); + pr_info("Dynamic Preempt: %s\n", str); + } else if (!strcmp(str, "full")) { + static_call_update(cond_resched, (typeof(&__cond_resched)) __static_call_return0); + static_call_update(might_resched, (typeof(&__cond_resched)) __static_call_return0); + static_call_update(preempt_schedule, __preempt_schedule_func); + static_call_update(preempt_schedule_notrace, __preempt_schedule_notrace_func); + static_call_update(irqentry_exit_cond_resched, irqentry_exit_cond_resched); + pr_info("Dynamic Preempt: %s\n", str); + } else { + pr_warn("Dynamic Preempt: Unsupported preempt mode %s, default to full\n", str); + return 1; + } + return 0; +} +__setup("preempt=", setup_preempt_mode); + +#endif /* CONFIG_PREEMPT_DYNAMIC */ + + /* * This is the entry point to schedule() from kernel preemption * off of irq context. -- cgit v1.2.3 From e59e10f8ef63d42fbb99776a5a112841e798b3b5 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 22 Jan 2021 13:01:58 +0100 Subject: sched: Add /debug/sched_preempt Add a debugfs file to muck about with the preempt mode at runtime. 
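A hedged userspace sketch of exercising the new file; it assumes debugfs is mounted at /sys/kernel/debug and root privileges, and the program itself is illustrative rather than part of the patch. The read side lists the modes with the active one in parentheses, e.g. "none (voluntary) full":

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

static const char *path = "/sys/kernel/debug/sched_preempt";

int main(void)
{
	char buf[64];
	ssize_t n;
	int fd;

	/* Select a mode; valid strings mirror the preempt= boot parameter. */
	fd = open(path, O_WRONLY);
	if (fd < 0) {
		perror(path);
		return 1;
	}
	if (write(fd, "voluntary", strlen("voluntary")) < 0)
		perror("write");
	close(fd);

	/* Read back which mode is now active. */
	fd = open(path, O_RDONLY);
	if (fd < 0) {
		perror(path);
		return 1;
	}
	n = read(fd, buf, sizeof(buf) - 1);
	if (n > 0) {
		buf[n] = '\0';
		fputs(buf, stdout);
	}
	close(fd);
	return 0;
}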
Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lkml.kernel.org/r/YAsGiUYf6NyaTplX@hirez.programming.kicks-ass.net --- kernel/sched/core.c | 135 ++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 126 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 0c067176edc5..4a17bb5f28b0 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5363,37 +5363,154 @@ EXPORT_STATIC_CALL(preempt_schedule_notrace); * preempt_schedule_notrace <- preempt_schedule_notrace * irqentry_exit_cond_resched <- irqentry_exit_cond_resched */ -static int __init setup_preempt_mode(char *str) + +enum { + preempt_dynamic_none = 0, + preempt_dynamic_voluntary, + preempt_dynamic_full, +}; + +static int preempt_dynamic_mode = preempt_dynamic_full; + +static int sched_dynamic_mode(const char *str) { - if (!strcmp(str, "none")) { + if (!strcmp(str, "none")) + return 0; + + if (!strcmp(str, "voluntary")) + return 1; + + if (!strcmp(str, "full")) + return 2; + + return -1; +} + +static void sched_dynamic_update(int mode) +{ + /* + * Avoid {NONE,VOLUNTARY} -> FULL transitions from ever ending up in + * the ZERO state, which is invalid. + */ + static_call_update(cond_resched, __cond_resched); + static_call_update(might_resched, __cond_resched); + static_call_update(preempt_schedule, __preempt_schedule_func); + static_call_update(preempt_schedule_notrace, __preempt_schedule_notrace_func); + static_call_update(irqentry_exit_cond_resched, irqentry_exit_cond_resched); + + switch (mode) { + case preempt_dynamic_none: static_call_update(cond_resched, __cond_resched); static_call_update(might_resched, (typeof(&__cond_resched)) __static_call_return0); static_call_update(preempt_schedule, (typeof(&preempt_schedule)) NULL); static_call_update(preempt_schedule_notrace, (typeof(&preempt_schedule_notrace)) NULL); static_call_update(irqentry_exit_cond_resched, (typeof(&irqentry_exit_cond_resched)) NULL); - pr_info("Dynamic Preempt: %s\n", str); - } else if (!strcmp(str, "voluntary")) { + pr_info("Dynamic Preempt: none\n"); + break; + + case preempt_dynamic_voluntary: static_call_update(cond_resched, __cond_resched); static_call_update(might_resched, __cond_resched); static_call_update(preempt_schedule, (typeof(&preempt_schedule)) NULL); static_call_update(preempt_schedule_notrace, (typeof(&preempt_schedule_notrace)) NULL); static_call_update(irqentry_exit_cond_resched, (typeof(&irqentry_exit_cond_resched)) NULL); - pr_info("Dynamic Preempt: %s\n", str); - } else if (!strcmp(str, "full")) { + pr_info("Dynamic Preempt: voluntary\n"); + break; + + case preempt_dynamic_full: static_call_update(cond_resched, (typeof(&__cond_resched)) __static_call_return0); static_call_update(might_resched, (typeof(&__cond_resched)) __static_call_return0); static_call_update(preempt_schedule, __preempt_schedule_func); static_call_update(preempt_schedule_notrace, __preempt_schedule_notrace_func); static_call_update(irqentry_exit_cond_resched, irqentry_exit_cond_resched); - pr_info("Dynamic Preempt: %s\n", str); - } else { - pr_warn("Dynamic Preempt: Unsupported preempt mode %s, default to full\n", str); + pr_info("Dynamic Preempt: full\n"); + break; + } + + preempt_dynamic_mode = mode; +} + +static int __init setup_preempt_mode(char *str) +{ + int mode = sched_dynamic_mode(str); + if (mode < 0) { + pr_warn("Dynamic Preempt: unsupported mode: %s\n", str); return 1; } + + sched_dynamic_update(mode); return 0; } __setup("preempt=", setup_preempt_mode); 
+#ifdef CONFIG_SCHED_DEBUG + +static ssize_t sched_dynamic_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + char buf[16]; + int mode; + + if (cnt > 15) + cnt = 15; + + if (copy_from_user(&buf, ubuf, cnt)) + return -EFAULT; + + buf[cnt] = 0; + mode = sched_dynamic_mode(strstrip(buf)); + if (mode < 0) + return mode; + + sched_dynamic_update(mode); + + *ppos += cnt; + + return cnt; +} + +static int sched_dynamic_show(struct seq_file *m, void *v) +{ + static const char * preempt_modes[] = { + "none", "voluntary", "full" + }; + int i; + + for (i = 0; i < ARRAY_SIZE(preempt_modes); i++) { + if (preempt_dynamic_mode == i) + seq_puts(m, "("); + seq_puts(m, preempt_modes[i]); + if (preempt_dynamic_mode == i) + seq_puts(m, ")"); + + seq_puts(m, " "); + } + + seq_puts(m, "\n"); + return 0; +} + +static int sched_dynamic_open(struct inode *inode, struct file *filp) +{ + return single_open(filp, sched_dynamic_show, NULL); +} + +static const struct file_operations sched_dynamic_fops = { + .open = sched_dynamic_open, + .write = sched_dynamic_write, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static __init int sched_init_debug_dynamic(void) +{ + debugfs_create_file("sched_preempt", 0644, NULL, NULL, &sched_dynamic_fops); + return 0; +} +late_initcall(sched_init_debug_dynamic); + +#endif /* CONFIG_SCHED_DEBUG */ #endif /* CONFIG_PREEMPT_DYNAMIC */ -- cgit v1.2.3 From 73f44fe19d359635a607e8e8daa0da4001c1cfc2 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Wed, 27 Jan 2021 17:18:37 -0600 Subject: static_call: Allow module use without exposing static_call_key When exporting static_call_key; with EXPORT_STATIC_CALL*(), the module can use static_call_update() to change the function called. This is not desirable in general. Not exporting static_call_key however also disallows usage of static_call(), since objtool needs the key to construct the static_call_site. Solve this by allowing objtool to create the static_call_site using the trampoline address when it builds a module and cannot find the static_call_key symbol. The module loader will then try and map the trampole back to a key before it constructs the normal sites list. Doing this requires a trampoline -> key associsation, so add another magic section that keeps those. Originally-by: Peter Zijlstra (Intel) Signed-off-by: Josh Poimboeuf Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lkml.kernel.org/r/20210127231837.ifddpn7rhwdaepiu@treble --- arch/x86/include/asm/static_call.h | 7 +++++ include/asm-generic/vmlinux.lds.h | 5 ++- include/linux/static_call.h | 22 +++++++++++-- include/linux/static_call_types.h | 27 ++++++++++++++-- kernel/static_call.c | 55 +++++++++++++++++++++++++++++++-- tools/include/linux/static_call_types.h | 27 ++++++++++++++-- tools/objtool/check.c | 17 ++++++++-- 7 files changed, 149 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/arch/x86/include/asm/static_call.h b/arch/x86/include/asm/static_call.h index c37f11999d0c..cbb67b6030f9 100644 --- a/arch/x86/include/asm/static_call.h +++ b/arch/x86/include/asm/static_call.h @@ -37,4 +37,11 @@ #define ARCH_DEFINE_STATIC_CALL_NULL_TRAMP(name) \ __ARCH_DEFINE_STATIC_CALL_TRAMP(name, "ret; nop; nop; nop; nop") + +#define ARCH_ADD_TRAMP_KEY(name) \ + asm(".pushsection .static_call_tramp_key, \"a\" \n" \ + ".long " STATIC_CALL_TRAMP_STR(name) " - . \n" \ + ".long " STATIC_CALL_KEY_STR(name) " - . 
\n" \ + ".popsection \n") + #endif /* _ASM_STATIC_CALL_H */ diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index b97c628ad91f..3f747de1934d 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -393,7 +393,10 @@ . = ALIGN(8); \ __start_static_call_sites = .; \ KEEP(*(.static_call_sites)) \ - __stop_static_call_sites = .; + __stop_static_call_sites = .; \ + __start_static_call_tramp_key = .; \ + KEEP(*(.static_call_tramp_key)) \ + __stop_static_call_tramp_key = .; /* * Allow architectures to handle ro_after_init data on their diff --git a/include/linux/static_call.h b/include/linux/static_call.h index d69dd8b976ca..85ecc789f4ff 100644 --- a/include/linux/static_call.h +++ b/include/linux/static_call.h @@ -138,6 +138,12 @@ struct static_call_key { }; }; +/* For finding the key associated with a trampoline */ +struct static_call_tramp_key { + s32 tramp; + s32 key; +}; + extern void __static_call_update(struct static_call_key *key, void *tramp, void *func); extern int static_call_mod_init(struct module *mod); extern int static_call_text_reserved(void *start, void *end); @@ -165,11 +171,18 @@ extern long __static_call_return0(void); #define EXPORT_STATIC_CALL(name) \ EXPORT_SYMBOL(STATIC_CALL_KEY(name)); \ EXPORT_SYMBOL(STATIC_CALL_TRAMP(name)) - #define EXPORT_STATIC_CALL_GPL(name) \ EXPORT_SYMBOL_GPL(STATIC_CALL_KEY(name)); \ EXPORT_SYMBOL_GPL(STATIC_CALL_TRAMP(name)) +/* Leave the key unexported, so modules can't change static call targets: */ +#define EXPORT_STATIC_CALL_TRAMP(name) \ + EXPORT_SYMBOL(STATIC_CALL_TRAMP(name)); \ + ARCH_ADD_TRAMP_KEY(name) +#define EXPORT_STATIC_CALL_TRAMP_GPL(name) \ + EXPORT_SYMBOL_GPL(STATIC_CALL_TRAMP(name)); \ + ARCH_ADD_TRAMP_KEY(name) + #elif defined(CONFIG_HAVE_STATIC_CALL) static inline int static_call_init(void) { return 0; } @@ -216,11 +229,16 @@ static inline long __static_call_return0(void) #define EXPORT_STATIC_CALL(name) \ EXPORT_SYMBOL(STATIC_CALL_KEY(name)); \ EXPORT_SYMBOL(STATIC_CALL_TRAMP(name)) - #define EXPORT_STATIC_CALL_GPL(name) \ EXPORT_SYMBOL_GPL(STATIC_CALL_KEY(name)); \ EXPORT_SYMBOL_GPL(STATIC_CALL_TRAMP(name)) +/* Leave the key unexported, so modules can't change static call targets: */ +#define EXPORT_STATIC_CALL_TRAMP(name) \ + EXPORT_SYMBOL(STATIC_CALL_TRAMP(name)) +#define EXPORT_STATIC_CALL_TRAMP_GPL(name) \ + EXPORT_SYMBOL_GPL(STATIC_CALL_TRAMP(name)) + #else /* Generic implementation */ static inline int static_call_init(void) { return 0; } diff --git a/include/linux/static_call_types.h b/include/linux/static_call_types.h index 08f78b1b88b4..ae5662d368b9 100644 --- a/include/linux/static_call_types.h +++ b/include/linux/static_call_types.h @@ -10,6 +10,7 @@ #define STATIC_CALL_KEY_PREFIX_STR __stringify(STATIC_CALL_KEY_PREFIX) #define STATIC_CALL_KEY_PREFIX_LEN (sizeof(STATIC_CALL_KEY_PREFIX_STR) - 1) #define STATIC_CALL_KEY(name) __PASTE(STATIC_CALL_KEY_PREFIX, name) +#define STATIC_CALL_KEY_STR(name) __stringify(STATIC_CALL_KEY(name)) #define STATIC_CALL_TRAMP_PREFIX __SCT__ #define STATIC_CALL_TRAMP_PREFIX_STR __stringify(STATIC_CALL_TRAMP_PREFIX) @@ -39,17 +40,39 @@ struct static_call_site { #ifdef CONFIG_HAVE_STATIC_CALL +#define __raw_static_call(name) (&STATIC_CALL_TRAMP(name)) + +#ifdef CONFIG_HAVE_STATIC_CALL_INLINE + /* * __ADDRESSABLE() is used to ensure the key symbol doesn't get stripped from * the symbol table so that objtool can reference it when it generates the * .static_call_sites section. 
*/ +#define __STATIC_CALL_ADDRESSABLE(name) \ + __ADDRESSABLE(STATIC_CALL_KEY(name)) + #define __static_call(name) \ ({ \ - __ADDRESSABLE(STATIC_CALL_KEY(name)); \ - &STATIC_CALL_TRAMP(name); \ + __STATIC_CALL_ADDRESSABLE(name); \ + __raw_static_call(name); \ }) +#else /* !CONFIG_HAVE_STATIC_CALL_INLINE */ + +#define __STATIC_CALL_ADDRESSABLE(name) +#define __static_call(name) __raw_static_call(name) + +#endif /* CONFIG_HAVE_STATIC_CALL_INLINE */ + +#ifdef MODULE +#define __STATIC_CALL_MOD_ADDRESSABLE(name) +#define static_call_mod(name) __raw_static_call(name) +#else +#define __STATIC_CALL_MOD_ADDRESSABLE(name) __STATIC_CALL_ADDRESSABLE(name) +#define static_call_mod(name) __static_call(name) +#endif + #define static_call(name) __static_call(name) #else diff --git a/kernel/static_call.c b/kernel/static_call.c index 0bc11b5ce681..6906c6ec4c97 100644 --- a/kernel/static_call.c +++ b/kernel/static_call.c @@ -12,6 +12,8 @@ extern struct static_call_site __start_static_call_sites[], __stop_static_call_sites[]; +extern struct static_call_tramp_key __start_static_call_tramp_key[], + __stop_static_call_tramp_key[]; static bool static_call_initialized; @@ -323,10 +325,59 @@ static int __static_call_mod_text_reserved(void *start, void *end) return ret; } +static unsigned long tramp_key_lookup(unsigned long addr) +{ + struct static_call_tramp_key *start = __start_static_call_tramp_key; + struct static_call_tramp_key *stop = __stop_static_call_tramp_key; + struct static_call_tramp_key *tramp_key; + + for (tramp_key = start; tramp_key != stop; tramp_key++) { + unsigned long tramp; + + tramp = (long)tramp_key->tramp + (long)&tramp_key->tramp; + if (tramp == addr) + return (long)tramp_key->key + (long)&tramp_key->key; + } + + return 0; +} + static int static_call_add_module(struct module *mod) { - return __static_call_init(mod, mod->static_call_sites, - mod->static_call_sites + mod->num_static_call_sites); + struct static_call_site *start = mod->static_call_sites; + struct static_call_site *stop = start + mod->num_static_call_sites; + struct static_call_site *site; + + for (site = start; site != stop; site++) { + unsigned long addr = (unsigned long)static_call_key(site); + unsigned long key; + + /* + * Is the key is exported, 'addr' points to the key, which + * means modules are allowed to call static_call_update() on + * it. + * + * Otherwise, the key isn't exported, and 'addr' points to the + * trampoline so we need to lookup the key. + * + * We go through this dance to prevent crazy modules from + * abusing sensitive static calls. 
+ */ + if (!kernel_text_address(addr)) + continue; + + key = tramp_key_lookup(addr); + if (!key) { + pr_warn("Failed to fixup __raw_static_call() usage at: %ps\n", + static_call_addr(site)); + return -EINVAL; + } + + site->key = (key - (long)&site->key) | + (site->key & STATIC_CALL_SITE_FLAGS); + } + + return __static_call_init(mod, start, stop); } static void static_call_del_module(struct module *mod) diff --git a/tools/include/linux/static_call_types.h b/tools/include/linux/static_call_types.h index 08f78b1b88b4..ae5662d368b9 100644 --- a/tools/include/linux/static_call_types.h +++ b/tools/include/linux/static_call_types.h @@ -10,6 +10,7 @@ #define STATIC_CALL_KEY_PREFIX_STR __stringify(STATIC_CALL_KEY_PREFIX) #define STATIC_CALL_KEY_PREFIX_LEN (sizeof(STATIC_CALL_KEY_PREFIX_STR) - 1) #define STATIC_CALL_KEY(name) __PASTE(STATIC_CALL_KEY_PREFIX, name) +#define STATIC_CALL_KEY_STR(name) __stringify(STATIC_CALL_KEY(name)) #define STATIC_CALL_TRAMP_PREFIX __SCT__ #define STATIC_CALL_TRAMP_PREFIX_STR __stringify(STATIC_CALL_TRAMP_PREFIX) @@ -39,17 +40,39 @@ struct static_call_site { #ifdef CONFIG_HAVE_STATIC_CALL +#define __raw_static_call(name) (&STATIC_CALL_TRAMP(name)) + +#ifdef CONFIG_HAVE_STATIC_CALL_INLINE + /* * __ADDRESSABLE() is used to ensure the key symbol doesn't get stripped from * the symbol table so that objtool can reference it when it generates the * .static_call_sites section. */ +#define __STATIC_CALL_ADDRESSABLE(name) \ + __ADDRESSABLE(STATIC_CALL_KEY(name)) + #define __static_call(name) \ ({ \ - __ADDRESSABLE(STATIC_CALL_KEY(name)); \ - &STATIC_CALL_TRAMP(name); \ + __STATIC_CALL_ADDRESSABLE(name); \ + __raw_static_call(name); \ }) +#else /* !CONFIG_HAVE_STATIC_CALL_INLINE */ + +#define __STATIC_CALL_ADDRESSABLE(name) +#define __static_call(name) __raw_static_call(name) + +#endif /* CONFIG_HAVE_STATIC_CALL_INLINE */ + +#ifdef MODULE +#define __STATIC_CALL_MOD_ADDRESSABLE(name) +#define static_call_mod(name) __raw_static_call(name) +#else +#define __STATIC_CALL_MOD_ADDRESSABLE(name) __STATIC_CALL_ADDRESSABLE(name) +#define static_call_mod(name) __static_call(name) +#endif + #define static_call(name) __static_call(name) #else diff --git a/tools/objtool/check.c b/tools/objtool/check.c index 4bd30315eb62..f2e5e5ce1a05 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -502,8 +502,21 @@ static int create_static_call_sections(struct objtool_file *file) key_sym = find_symbol_by_name(file->elf, tmp); if (!key_sym) { - WARN("static_call: can't find static_call_key symbol: %s", tmp); - return -1; + if (!module) { + WARN("static_call: can't find static_call_key symbol: %s", tmp); + return -1; + } + + /* + * For modules(), the key might not be exported, which + * means the module can make static calls but isn't + * allowed to change them. + * + * In that case we temporarily set the key to be the + * trampoline address. This is fixed up in + * static_call_add_module(). + */ + key_sym = insn->call_dest; } free(key_name); -- cgit v1.2.3 From ef72661e28c64ad610f89acc2832ec67b27ba438 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 25 Jan 2021 16:26:50 +0100 Subject: sched: Harden PREEMPT_DYNAMIC Use the new EXPORT_STATIC_CALL_TRAMP() / static_call_mod() to unexport the static_call_key for the PREEMPT_DYNAMIC calls such that modules can no longer update these calls. Having modules change/hi-jack the preemption calls would be horrible. 
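From a module author's point of view the hardening means the call can still be made but no longer redirected. A minimal sketch, assuming the core kernel defined the call and exported it with EXPORT_STATIC_CALL_TRAMP(); the names my_call and my_func are invented for illustration:

#include <linux/module.h>
#include <linux/static_call.h>

/* Declared by the core kernel; only the trampoline symbol is exported. */
extern int my_func(int arg);
DECLARE_STATIC_CALL(my_call, my_func);

static int __init demo_init(void)
{
	/*
	 * Fine: static_call_mod() references only the exported trampoline,
	 * not the static_call_key, so a module may invoke the call.
	 */
	pr_info("demo: %d\n", static_call_mod(my_call)(1));

	/*
	 * Not possible: static_call_update(my_call, some_other_func) would
	 * reference STATIC_CALL_KEY(my_call), which is deliberately left
	 * unexported, so such a module fails at modpost or module load with
	 * an undefined symbol.
	 */
	return 0;
}

static void __exit demo_exit(void)
{
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");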
Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar --- arch/x86/include/asm/preempt.h | 4 ++-- include/linux/kernel.h | 2 +- include/linux/sched.h | 2 +- kernel/sched/core.c | 8 ++++---- 4 files changed, 8 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/arch/x86/include/asm/preempt.h b/arch/x86/include/asm/preempt.h index 9b12dce9bda5..0aa96f824af1 100644 --- a/arch/x86/include/asm/preempt.h +++ b/arch/x86/include/asm/preempt.h @@ -114,7 +114,7 @@ DECLARE_STATIC_CALL(preempt_schedule, __preempt_schedule_func); #define __preempt_schedule() \ do { \ - __ADDRESSABLE(STATIC_CALL_KEY(preempt_schedule)); \ + __STATIC_CALL_MOD_ADDRESSABLE(preempt_schedule); \ asm volatile ("call " STATIC_CALL_TRAMP_STR(preempt_schedule) : ASM_CALL_CONSTRAINT); \ } while (0) @@ -127,7 +127,7 @@ DECLARE_STATIC_CALL(preempt_schedule_notrace, __preempt_schedule_notrace_func); #define __preempt_schedule_notrace() \ do { \ - __ADDRESSABLE(STATIC_CALL_KEY(preempt_schedule_notrace)); \ + __STATIC_CALL_MOD_ADDRESSABLE(preempt_schedule_notrace); \ asm volatile ("call " STATIC_CALL_TRAMP_STR(preempt_schedule_notrace) : ASM_CALL_CONSTRAINT); \ } while (0) diff --git a/include/linux/kernel.h b/include/linux/kernel.h index cfd3d349f905..5b7ed6dc99ac 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -93,7 +93,7 @@ DECLARE_STATIC_CALL(might_resched, __cond_resched); static __always_inline void might_resched(void) { - static_call(might_resched)(); + static_call_mod(might_resched)(); } #else diff --git a/include/linux/sched.h b/include/linux/sched.h index 2f35594b8b53..4d568288abf9 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1880,7 +1880,7 @@ DECLARE_STATIC_CALL(cond_resched, __cond_resched); static __always_inline int _cond_resched(void) { - return static_call(cond_resched)(); + return static_call_mod(cond_resched)(); } #else diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 4a17bb5f28b0..cec507be460c 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5267,7 +5267,7 @@ EXPORT_SYMBOL(preempt_schedule); #ifdef CONFIG_PREEMPT_DYNAMIC DEFINE_STATIC_CALL(preempt_schedule, __preempt_schedule_func); -EXPORT_STATIC_CALL(preempt_schedule); +EXPORT_STATIC_CALL_TRAMP(preempt_schedule); #endif @@ -5325,7 +5325,7 @@ EXPORT_SYMBOL_GPL(preempt_schedule_notrace); #ifdef CONFIG_PREEMPT_DYNAMIC DEFINE_STATIC_CALL(preempt_schedule_notrace, __preempt_schedule_notrace_func); -EXPORT_STATIC_CALL(preempt_schedule_notrace); +EXPORT_STATIC_CALL_TRAMP(preempt_schedule_notrace); #endif #endif /* CONFIG_PREEMPTION */ @@ -6997,10 +6997,10 @@ EXPORT_SYMBOL(__cond_resched); #ifdef CONFIG_PREEMPT_DYNAMIC DEFINE_STATIC_CALL_RET0(cond_resched, __cond_resched); -EXPORT_STATIC_CALL(cond_resched); +EXPORT_STATIC_CALL_TRAMP(cond_resched); DEFINE_STATIC_CALL_RET0(might_resched, __cond_resched); -EXPORT_STATIC_CALL(might_resched); +EXPORT_STATIC_CALL_TRAMP(might_resched); #endif /* -- cgit v1.2.3 From f9d34595ae4feed38856b88769e2ba5af22d2548 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Sat, 23 Jan 2021 21:10:25 +0100 Subject: smp: Process pending softirqs in flush_smp_call_function_from_idle() send_call_function_single_ipi() may wake an idle CPU without sending an IPI. The woken up CPU will process the SMP-functions in flush_smp_call_function_from_idle(). Any raised softirq from within the SMP-function call will not be processed. Should the CPU have no tasks assigned, then it will go back to idle with pending softirqs and the NOHZ will rightfully complain. 
Process pending softirqs on return from flush_smp_call_function_queue(). Fixes: b2a02fc43a1f4 ("smp: Optimize send_call_function_single_ipi()") Reported-by: Jens Axboe Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lkml.kernel.org/r/20210123201027.3262800-2-bigeasy@linutronix.de --- kernel/smp.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/smp.c b/kernel/smp.c index 1b6070bf97bb..aeb0adfa0606 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -449,6 +450,9 @@ void flush_smp_call_function_from_idle(void) local_irq_save(flags); flush_smp_call_function_queue(true); + if (local_softirq_pending()) + do_softirq(); + local_irq_restore(flags); } -- cgit v1.2.3 From b0d6d4789677d128b1933af023083054f0973574 Mon Sep 17 00:00:00 2001 From: Sven Schnelle Date: Tue, 9 Feb 2021 16:07:11 +0100 Subject: uprobes: (Re)add missing get_uprobe() in __find_uprobe() commit c6bc9bd06dff ("rbtree, uprobes: Use rbtree helpers") accidentally removed the refcount increase. Add it again. Fixes: c6bc9bd06dff ("rbtree, uprobes: Use rbtree helpers") Signed-off-by: Sven Schnelle Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lkml.kernel.org/r/20210209150711.36778-1-svens@linux.ibm.com --- kernel/events/uprobes.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index fd5160dee2b4..3ea7f8f92f1d 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -661,7 +661,7 @@ static struct uprobe *__find_uprobe(struct inode *inode, loff_t offset) struct rb_node *node = rb_find(&key, &uprobes_tree, __uprobe_cmp_key); if (node) - return __node_2_uprobe(node); + return get_uprobe(__node_2_uprobe(node)); return NULL; } -- cgit v1.2.3 From de40f33e788b0c016bfde512ace2f76339ef7ddb Mon Sep 17 00:00:00 2001 From: Dietmar Eggemann Date: Tue, 19 Jan 2021 09:35:42 +0100 Subject: sched/deadline: Reduce rq lock contention in dl_add_task_root_domain() dl_add_task_root_domain() is called during sched domain rebuild: rebuild_sched_domains_locked() partition_and_rebuild_sched_domains() rebuild_root_domains() for all top_cpuset descendants: update_tasks_root_domain() for all tasks of cpuset: dl_add_task_root_domain() Change it so that only the task pi lock is taken to check if the task has a SCHED_DEADLINE (DL) policy. In case that p is a DL task take the rq lock as well to be able to safely de-reference root domain's DL bandwidth structure. Most of the tasks will have another policy (namely SCHED_NORMAL) and can now bail without taking the rq lock. One thing to note here: Even in case that there aren't any DL user tasks, a slow frequency switching system with cpufreq gov schedutil has a DL task (sugov) per frequency domain running which participates in DL bandwidth management. 
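Stripped of the scheduler specifics, the optimization described above has a simple shape: test the filter condition under the cheap per-object lock and bail out early, so the common case never touches the contended shared lock. An illustrative sketch with invented names (the actual change to dl_add_task_root_domain() follows in the diff below):

#include <linux/spinlock.h>

struct item {
	raw_spinlock_t lock;		/* cheap: one per item */
	bool needs_accounting;		/* rare */
};

static DEFINE_RAW_SPINLOCK(shared_lock);	/* expensive: shared, contended */

static void account_item(struct item *it)
{
	unsigned long flags;

	raw_spin_lock_irqsave(&it->lock, flags);
	if (!it->needs_accounting) {
		/* Common case: done without ever taking shared_lock. */
		raw_spin_unlock_irqrestore(&it->lock, flags);
		return;
	}

	raw_spin_lock(&shared_lock);	/* only the rare case pays for this */
	/* ... update the shared accounting here ... */
	raw_spin_unlock(&shared_lock);

	raw_spin_unlock_irqrestore(&it->lock, flags);
}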
Signed-off-by: Dietmar Eggemann Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Reviewed-by: Quentin Perret Reviewed-by: Valentin Schneider Reviewed-by: Daniel Bristot de Oliveira Acked-by: Juri Lelli Link: https://lkml.kernel.org/r/20210119083542.19856-1-dietmar.eggemann@arm.com --- kernel/sched/deadline.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 1508d126e88b..6f377969fa71 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -2388,9 +2388,13 @@ void dl_add_task_root_domain(struct task_struct *p) struct rq *rq; struct dl_bw *dl_b; - rq = task_rq_lock(p, &rf); - if (!dl_task(p)) - goto unlock; + raw_spin_lock_irqsave(&p->pi_lock, rf.flags); + if (!dl_task(p)) { + raw_spin_unlock_irqrestore(&p->pi_lock, rf.flags); + return; + } + + rq = __task_rq_lock(p, &rf); dl_b = &rq->rd->dl_bw; raw_spin_lock(&dl_b->lock); @@ -2399,7 +2403,6 @@ void dl_add_task_root_domain(struct task_struct *p) raw_spin_unlock(&dl_b->lock); -unlock: task_rq_unlock(rq, p, &rf); } -- cgit v1.2.3 From 156ec6f42b8d300dbbf382738ff35c8bad8f4c3a Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Mon, 8 Feb 2021 08:35:53 +0100 Subject: sched/features: Fix hrtick reprogramming Hung tasks and RCU stall cases were reported on systems which were not 100% busy. Investigation of such unexpected cases (no sign of potential starvation caused by tasks hogging the system) pointed out that the periodic sched tick timer wasn't serviced anymore after a certain point and that caused all machinery that depends on it (timers, RCU, etc.) to stop working as well. This issues was however only reproducible if HRTICK was enabled. Looking at core dumps it was found that the rbtree of the hrtimer base used also for the hrtick was corrupted (i.e. next as seen from the base root and actual leftmost obtained by traversing the tree are different). Same base is also used for periodic tick hrtimer, which might get "lost" if the rbtree gets corrupted. Much alike what described in commit 1f71addd34f4c ("tick/sched: Do not mess with an enqueued hrtimer") there is a race window between hrtimer_set_expires() in hrtick_start and hrtimer_start_expires() in __hrtick_restart() in which the former might be operating on an already queued hrtick hrtimer, which might lead to corruption of the base. Use hrtick_start() (which removes the timer before enqueuing it back) to ensure hrtick hrtimer reprogramming is entirely guarded by the base lock, so that no race conditions can occur. Signed-off-by: Juri Lelli Signed-off-by: Luis Claudio R. 
Goncalves Signed-off-by: Daniel Bristot de Oliveira Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lkml.kernel.org/r/20210208073554.14629-2-juri.lelli@redhat.com --- kernel/sched/core.c | 8 +++----- kernel/sched/sched.h | 1 + 2 files changed, 4 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index cec507be460c..18d51ab3d2b9 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -355,8 +355,9 @@ static enum hrtimer_restart hrtick(struct hrtimer *timer) static void __hrtick_restart(struct rq *rq) { struct hrtimer *timer = &rq->hrtick_timer; + ktime_t time = rq->hrtick_time; - hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED_HARD); + hrtimer_start(timer, time, HRTIMER_MODE_ABS_PINNED_HARD); } /* @@ -380,7 +381,6 @@ static void __hrtick_start(void *arg) void hrtick_start(struct rq *rq, u64 delay) { struct hrtimer *timer = &rq->hrtick_timer; - ktime_t time; s64 delta; /* @@ -388,9 +388,7 @@ void hrtick_start(struct rq *rq, u64 delay) * doesn't make sense and can cause timer DoS. */ delta = max_t(s64, delay, 10000LL); - time = ktime_add_ns(timer->base->get_time(), delta); - - hrtimer_set_expires(timer, time); + rq->hrtick_time = ktime_add_ns(timer->base->get_time(), delta); if (rq == this_rq()) __hrtick_restart(rq); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 2185b3b435a9..0dfdd52799c7 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1031,6 +1031,7 @@ struct rq { call_single_data_t hrtick_csd; #endif struct hrtimer hrtick_timer; + ktime_t hrtick_time; #endif #ifdef CONFIG_SCHEDSTATS -- cgit v1.2.3 From e0ee463c93c43b1657ad69cf2678ff5bf1b754fe Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Mon, 8 Feb 2021 08:35:54 +0100 Subject: sched/features: Distinguish between NORMAL and DEADLINE hrtick The HRTICK feature has traditionally been servicing configurations that need precise preemptions point for NORMAL tasks. More recently, the feature has been extended to also service DEADLINE tasks with stringent runtime enforcement needs (e.g., runtime < 1ms with HZ=1000). Enabling HRTICK sched feature currently enables the additional timer and task tick for both classes, which might introduced undesired overhead for no additional benefit if one needed it only for one of the cases. Separate HRTICK sched feature in two (and leave the traditional case name unmodified) so that it can be selectively enabled when needed. With: $ echo HRTICK > /sys/kernel/debug/sched_features the NORMAL/fair hrtick gets enabled. With: $ echo HRTICK_DL > /sys/kernel/debug/sched_features the DEADLINE hrtick gets enabled. Signed-off-by: Juri Lelli Signed-off-by: Luis Claudio R. 
Goncalves Signed-off-by: Daniel Bristot de Oliveira Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lkml.kernel.org/r/20210208073554.14629-3-juri.lelli@redhat.com --- kernel/sched/core.c | 2 +- kernel/sched/deadline.c | 4 ++-- kernel/sched/fair.c | 4 ++-- kernel/sched/features.h | 1 + kernel/sched/sched.h | 26 ++++++++++++++++++++++++-- 5 files changed, 30 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 18d51ab3d2b9..88a2e2bdbabe 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4969,7 +4969,7 @@ static void __sched notrace __schedule(bool preempt) schedule_debug(prev, preempt); - if (sched_feat(HRTICK)) + if (sched_feat(HRTICK) || sched_feat(HRTICK_DL)) hrtick_clear(rq); local_irq_disable(); diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 6f377969fa71..aac3539aa0fe 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1832,7 +1832,7 @@ static void set_next_task_dl(struct rq *rq, struct task_struct *p, bool first) if (!first) return; - if (hrtick_enabled(rq)) + if (hrtick_enabled_dl(rq)) start_hrtick_dl(rq, p); if (rq->curr->sched_class != &dl_sched_class) @@ -1895,7 +1895,7 @@ static void task_tick_dl(struct rq *rq, struct task_struct *p, int queued) * not being the leftmost task anymore. In that case NEED_RESCHED will * be set and schedule() will start a new hrtick for the next task. */ - if (hrtick_enabled(rq) && queued && p->dl.runtime > 0 && + if (hrtick_enabled_dl(rq) && queued && p->dl.runtime > 0 && is_leftmost(p, &rq->dl)) start_hrtick_dl(rq, p); } diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 59b645e3c4fd..8a8bd7b13634 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5429,7 +5429,7 @@ static void hrtick_update(struct rq *rq) { struct task_struct *curr = rq->curr; - if (!hrtick_enabled(rq) || curr->sched_class != &fair_sched_class) + if (!hrtick_enabled_fair(rq) || curr->sched_class != &fair_sched_class) return; if (cfs_rq_of(&curr->se)->nr_running < sched_nr_latency) @@ -7116,7 +7116,7 @@ done: __maybe_unused; list_move(&p->se.group_node, &rq->cfs_tasks); #endif - if (hrtick_enabled(rq)) + if (hrtick_enabled_fair(rq)) hrtick_start_fair(rq, p); update_misfit_status(p, rq); diff --git a/kernel/sched/features.h b/kernel/sched/features.h index e875eabb6600..1bc2b158fc51 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -38,6 +38,7 @@ SCHED_FEAT(CACHE_HOT_BUDDY, true) SCHED_FEAT(WAKEUP_PREEMPTION, true) SCHED_FEAT(HRTICK, false) +SCHED_FEAT(HRTICK_DL, false) SCHED_FEAT(DOUBLE_TICK, false) /* diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 0dfdd52799c7..10a1522b1e30 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2105,17 +2105,39 @@ extern const_debug unsigned int sysctl_sched_migration_cost; */ static inline int hrtick_enabled(struct rq *rq) { - if (!sched_feat(HRTICK)) - return 0; if (!cpu_active(cpu_of(rq))) return 0; return hrtimer_is_hres_active(&rq->hrtick_timer); } +static inline int hrtick_enabled_fair(struct rq *rq) +{ + if (!sched_feat(HRTICK)) + return 0; + return hrtick_enabled(rq); +} + +static inline int hrtick_enabled_dl(struct rq *rq) +{ + if (!sched_feat(HRTICK_DL)) + return 0; + return hrtick_enabled(rq); +} + void hrtick_start(struct rq *rq, u64 delay); #else +static inline int hrtick_enabled_fair(struct rq *rq) +{ + return 0; +} + +static inline int hrtick_enabled_dl(struct rq *rq) +{ + return 0; +} + static inline int hrtick_enabled(struct 
rq *rq) { return 0; -- cgit v1.2.3 From 54b7429efffc99e845ba9381bee3244f012a06c2 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 1 Feb 2021 00:05:44 +0100 Subject: rcu: Pull deferred rcuog wake up to rcu_eqs_enter() callers Deferred wakeup of rcuog kthreads upon RCU idle mode entry is going to be handled differently whether initiated by idle, user or guest. Prepare with pulling that control up to rcu_eqs_enter() callers. Signed-off-by: Frederic Weisbecker Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20210131230548.32970-2-frederic@kernel.org --- kernel/rcu/tree.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 40e5e3dd253e..63032e5620b9 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -644,7 +644,6 @@ static noinstr void rcu_eqs_enter(bool user) trace_rcu_dyntick(TPS("Start"), rdp->dynticks_nesting, 0, atomic_read(&rdp->dynticks)); WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current)); rdp = this_cpu_ptr(&rcu_data); - do_nocb_deferred_wakeup(rdp); rcu_prepare_for_idle(); rcu_preempt_deferred_qs(current); @@ -672,7 +671,10 @@ static noinstr void rcu_eqs_enter(bool user) */ void rcu_idle_enter(void) { + struct rcu_data *rdp = this_cpu_ptr(&rcu_data); + lockdep_assert_irqs_disabled(); + do_nocb_deferred_wakeup(rdp); rcu_eqs_enter(false); } EXPORT_SYMBOL_GPL(rcu_idle_enter); @@ -691,7 +693,14 @@ EXPORT_SYMBOL_GPL(rcu_idle_enter); */ noinstr void rcu_user_enter(void) { + struct rcu_data *rdp = this_cpu_ptr(&rcu_data); + lockdep_assert_irqs_disabled(); + + instrumentation_begin(); + do_nocb_deferred_wakeup(rdp); + instrumentation_end(); + rcu_eqs_enter(true); } #endif /* CONFIG_NO_HZ_FULL */ -- cgit v1.2.3 From 43789ef3f7d61aa7bed0cb2764e588fc990c30ef Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 1 Feb 2021 00:05:45 +0100 Subject: rcu/nocb: Perform deferred wake up before last idle's need_resched() check Entering RCU idle mode may cause a deferred wake up of an RCU NOCB_GP kthread (rcuog) to be serviced. Usually a local wake up happening while running the idle task is handled in one of the need_resched() checks carefully placed within the idle loop that can break to the scheduler. Unfortunately the call to rcu_idle_enter() is already beyond the last generic need_resched() check and we may halt the CPU with a resched request unhandled, leaving the task hanging. Fix this with splitting the rcuog wakeup handling from rcu_idle_enter() and place it before the last generic need_resched() check in the idle loop. It is then assumed that no call to call_rcu() will be performed after that in the idle loop until the CPU is put in low power mode. Fixes: 96d3fd0d315a (rcu: Break call_rcu() deadlock involving scheduler and perf) Reported-by: Paul E. 
McKenney Signed-off-by: Frederic Weisbecker Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20210131230548.32970-3-frederic@kernel.org --- include/linux/rcupdate.h | 2 ++ kernel/rcu/tree.c | 3 --- kernel/rcu/tree_plugin.h | 5 +++++ kernel/sched/idle.c | 1 + 4 files changed, 8 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index fd02c5fa60cb..36c2119de702 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -110,8 +110,10 @@ static inline void rcu_user_exit(void) { } #ifdef CONFIG_RCU_NOCB_CPU void rcu_init_nohz(void); +void rcu_nocb_flush_deferred_wakeup(void); #else /* #ifdef CONFIG_RCU_NOCB_CPU */ static inline void rcu_init_nohz(void) { } +static inline void rcu_nocb_flush_deferred_wakeup(void) { } #endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */ /** diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 63032e5620b9..82838e93b498 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -671,10 +671,7 @@ static noinstr void rcu_eqs_enter(bool user) */ void rcu_idle_enter(void) { - struct rcu_data *rdp = this_cpu_ptr(&rcu_data); - lockdep_assert_irqs_disabled(); - do_nocb_deferred_wakeup(rdp); rcu_eqs_enter(false); } EXPORT_SYMBOL_GPL(rcu_idle_enter); diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 7e291ce0a1d6..d5b38c28abd1 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -2187,6 +2187,11 @@ static void do_nocb_deferred_wakeup(struct rcu_data *rdp) do_nocb_deferred_wakeup_common(rdp); } +void rcu_nocb_flush_deferred_wakeup(void) +{ + do_nocb_deferred_wakeup(this_cpu_ptr(&rcu_data)); +} + void __init rcu_init_nohz(void) { int cpu; diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 305727ea0677..7199e6f23789 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -285,6 +285,7 @@ static void do_idle(void) } arch_cpu_idle_enter(); + rcu_nocb_flush_deferred_wakeup(); /* * In poll mode we reenable interrupts and spin. Also if we -- cgit v1.2.3 From f8bb5cae9616224a39cbb399de382d36ac41df10 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 1 Feb 2021 00:05:46 +0100 Subject: rcu/nocb: Trigger self-IPI on late deferred wake up before user resume Entering RCU idle mode may cause a deferred wake up of an RCU NOCB_GP kthread (rcuog) to be serviced. Unfortunately the call to rcu_user_enter() is already past the last rescheduling opportunity before we resume to userspace or to guest mode. We may escape there with the woken task ignored. The ultimate resort to fix every callsites is to trigger a self-IPI (nohz_full depends on arch to implement arch_irq_work_raise()) that will trigger a reschedule on IRQ tail or guest exit. Eventually every site that want a saner treatment will need to carefully place a call to rcu_nocb_flush_deferred_wakeup() before the last explicit need_resched() check upon resume. Fixes: 96d3fd0d315a (rcu: Break call_rcu() deadlock involving scheduler and perf) Reported-by: Paul E. 
McKenney Signed-off-by: Frederic Weisbecker Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20210131230548.32970-4-frederic@kernel.org --- kernel/rcu/tree.c | 21 ++++++++++++++++++++- kernel/rcu/tree.h | 2 +- kernel/rcu/tree_plugin.h | 25 ++++++++++++++++--------- 3 files changed, 37 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 82838e93b498..4b1e5bd16492 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -677,6 +677,18 @@ void rcu_idle_enter(void) EXPORT_SYMBOL_GPL(rcu_idle_enter); #ifdef CONFIG_NO_HZ_FULL + +/* + * An empty function that will trigger a reschedule on + * IRQ tail once IRQs get re-enabled on userspace resume. + */ +static void late_wakeup_func(struct irq_work *work) +{ +} + +static DEFINE_PER_CPU(struct irq_work, late_wakeup_work) = + IRQ_WORK_INIT(late_wakeup_func); + /** * rcu_user_enter - inform RCU that we are resuming userspace. * @@ -694,12 +706,19 @@ noinstr void rcu_user_enter(void) lockdep_assert_irqs_disabled(); + /* + * We may be past the last rescheduling opportunity in the entry code. + * Trigger a self IPI that will fire and reschedule once we resume to + * user/guest mode. + */ instrumentation_begin(); - do_nocb_deferred_wakeup(rdp); + if (do_nocb_deferred_wakeup(rdp) && need_resched()) + irq_work_queue(this_cpu_ptr(&late_wakeup_work)); instrumentation_end(); rcu_eqs_enter(true); } + #endif /* CONFIG_NO_HZ_FULL */ /** diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 7708ed161f4a..9226f4021a36 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -433,7 +433,7 @@ static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp, static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_empty, unsigned long flags); static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp); -static void do_nocb_deferred_wakeup(struct rcu_data *rdp); +static bool do_nocb_deferred_wakeup(struct rcu_data *rdp); static void rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp); static void rcu_spawn_cpu_nocb_kthread(int cpu); static void __init rcu_spawn_nocb_kthreads(void); diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index d5b38c28abd1..384856e4d13e 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1631,8 +1631,8 @@ bool rcu_is_nocb_cpu(int cpu) * Kick the GP kthread for this NOCB group. Caller holds ->nocb_lock * and this function releases it. */ -static void wake_nocb_gp(struct rcu_data *rdp, bool force, - unsigned long flags) +static bool wake_nocb_gp(struct rcu_data *rdp, bool force, + unsigned long flags) __releases(rdp->nocb_lock) { bool needwake = false; @@ -1643,7 +1643,7 @@ static void wake_nocb_gp(struct rcu_data *rdp, bool force, trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("AlreadyAwake")); rcu_nocb_unlock_irqrestore(rdp, flags); - return; + return false; } del_timer(&rdp->nocb_timer); rcu_nocb_unlock_irqrestore(rdp, flags); @@ -1656,6 +1656,8 @@ static void wake_nocb_gp(struct rcu_data *rdp, bool force, raw_spin_unlock_irqrestore(&rdp_gp->nocb_gp_lock, flags); if (needwake) wake_up_process(rdp_gp->nocb_gp_kthread); + + return needwake; } /* @@ -2152,20 +2154,23 @@ static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp) } /* Do a deferred wakeup of rcu_nocb_kthread(). 
*/ -static void do_nocb_deferred_wakeup_common(struct rcu_data *rdp) +static bool do_nocb_deferred_wakeup_common(struct rcu_data *rdp) { unsigned long flags; int ndw; + int ret; rcu_nocb_lock_irqsave(rdp, flags); if (!rcu_nocb_need_deferred_wakeup(rdp)) { rcu_nocb_unlock_irqrestore(rdp, flags); - return; + return false; } ndw = READ_ONCE(rdp->nocb_defer_wakeup); WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT); - wake_nocb_gp(rdp, ndw == RCU_NOCB_WAKE_FORCE, flags); + ret = wake_nocb_gp(rdp, ndw == RCU_NOCB_WAKE_FORCE, flags); trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("DeferredWake")); + + return ret; } /* Do a deferred wakeup of rcu_nocb_kthread() from a timer handler. */ @@ -2181,10 +2186,11 @@ static void do_nocb_deferred_wakeup_timer(struct timer_list *t) * This means we do an inexact common-case check. Note that if * we miss, ->nocb_timer will eventually clean things up. */ -static void do_nocb_deferred_wakeup(struct rcu_data *rdp) +static bool do_nocb_deferred_wakeup(struct rcu_data *rdp) { if (rcu_nocb_need_deferred_wakeup(rdp)) - do_nocb_deferred_wakeup_common(rdp); + return do_nocb_deferred_wakeup_common(rdp); + return false; } void rcu_nocb_flush_deferred_wakeup(void) @@ -2523,8 +2529,9 @@ static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp) return false; } -static void do_nocb_deferred_wakeup(struct rcu_data *rdp) +static bool do_nocb_deferred_wakeup(struct rcu_data *rdp) { + return false; } static void rcu_spawn_cpu_nocb_kthread(int cpu) -- cgit v1.2.3 From 47b8ff194c1fd73d58dc339b597d466fe48c8958 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 1 Feb 2021 00:05:47 +0100 Subject: entry: Explicitly flush pending rcuog wakeup before last rescheduling point Following the idle loop model, cleanly check for pending rcuog wakeup before the last rescheduling point on resuming to user mode. This way we can avoid doing it from rcu_user_enter() with the last resort self-IPI hack that enforces rescheduling. Signed-off-by: Frederic Weisbecker Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20210131230548.32970-5-frederic@kernel.org --- kernel/entry/common.c | 7 +++++++ kernel/rcu/tree.c | 12 +++++++----- 2 files changed, 14 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/entry/common.c b/kernel/entry/common.c index f09cae37ddd5..8442e5c9cfa2 100644 --- a/kernel/entry/common.c +++ b/kernel/entry/common.c @@ -184,6 +184,10 @@ static unsigned long exit_to_user_mode_loop(struct pt_regs *regs, * enabled above. */ local_irq_disable_exit_to_user(); + + /* Check if any of the above work has queued a deferred wakeup */ + rcu_nocb_flush_deferred_wakeup(); + ti_work = READ_ONCE(current_thread_info()->flags); } @@ -197,6 +201,9 @@ static void exit_to_user_mode_prepare(struct pt_regs *regs) lockdep_assert_irqs_disabled(); + /* Flush pending rcuog wakeup before the last need_resched() check */ + rcu_nocb_flush_deferred_wakeup(); + if (unlikely(ti_work & EXIT_TO_USER_MODE_WORK)) ti_work = exit_to_user_mode_loop(regs, ti_work); diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 4b1e5bd16492..2ebc211fffcb 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -707,13 +707,15 @@ noinstr void rcu_user_enter(void) lockdep_assert_irqs_disabled(); /* - * We may be past the last rescheduling opportunity in the entry code. - * Trigger a self IPI that will fire and reschedule once we resume to - * user/guest mode.
+ * Other than generic entry implementation, we may be past the last + * rescheduling opportunity in the entry code. Trigger a self IPI + * that will fire and reschedule once we resume in user/guest mode. */ instrumentation_begin(); - if (do_nocb_deferred_wakeup(rdp) && need_resched()) - irq_work_queue(this_cpu_ptr(&late_wakeup_work)); + if (!IS_ENABLED(CONFIG_GENERIC_ENTRY) || (current->flags & PF_VCPU)) { + if (do_nocb_deferred_wakeup(rdp) && need_resched()) + irq_work_queue(this_cpu_ptr(&late_wakeup_work)); + } instrumentation_end(); rcu_eqs_enter(true); -- cgit v1.2.3 From 4ae7dc97f726ea95c58ac58af71cc034ad22d7de Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 1 Feb 2021 00:05:48 +0100 Subject: entry/kvm: Explicitly flush pending rcuog wakeup before last rescheduling point Following the idle loop model, cleanly check for pending rcuog wakeup before the last rescheduling point upon resuming to guest mode. This way we can avoid doing it from rcu_user_enter() with the last resort self-IPI hack that enforces rescheduling. Suggested-by: Peter Zijlstra Signed-off-by: Frederic Weisbecker Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20210131230548.32970-6-frederic@kernel.org --- arch/x86/kvm/x86.c | 1 + include/linux/entry-kvm.h | 14 ++++++++++++++ kernel/rcu/tree.c | 44 ++++++++++++++++++++++++++++++++++---------- kernel/rcu/tree_plugin.h | 1 + 4 files changed, 50 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 1b404e4d7dd8..b967c1c774a1 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1782,6 +1782,7 @@ EXPORT_SYMBOL_GPL(kvm_emulate_wrmsr); bool kvm_vcpu_exit_request(struct kvm_vcpu *vcpu) { + xfer_to_guest_mode_prepare(); return vcpu->mode == EXITING_GUEST_MODE || kvm_request_pending(vcpu) || xfer_to_guest_mode_work_pending(); } diff --git a/include/linux/entry-kvm.h b/include/linux/entry-kvm.h index 9b93f8584ff7..8b2b1d68b954 100644 --- a/include/linux/entry-kvm.h +++ b/include/linux/entry-kvm.h @@ -46,6 +46,20 @@ static inline int arch_xfer_to_guest_mode_handle_work(struct kvm_vcpu *vcpu, */ int xfer_to_guest_mode_handle_work(struct kvm_vcpu *vcpu); +/** + * xfer_to_guest_mode_prepare - Perform last minute preparation work that + * need to be handled while IRQs are disabled + * upon entering to guest. + * + * Has to be invoked with interrupts disabled before the last call + * to xfer_to_guest_mode_work_pending(). + */ +static inline void xfer_to_guest_mode_prepare(void) +{ + lockdep_assert_irqs_disabled(); + rcu_nocb_flush_deferred_wakeup(); +} + /** * __xfer_to_guest_mode_work_pending - Check if work is pending * diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 2ebc211fffcb..ce17b8477442 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -678,9 +678,10 @@ EXPORT_SYMBOL_GPL(rcu_idle_enter); #ifdef CONFIG_NO_HZ_FULL +#if !defined(CONFIG_GENERIC_ENTRY) || !defined(CONFIG_KVM_XFER_TO_GUEST_WORK) /* * An empty function that will trigger a reschedule on - * IRQ tail once IRQs get re-enabled on userspace resume.
*/ static void late_wakeup_func(struct irq_work *work) { @@ -689,6 +690,37 @@ static void late_wakeup_func(struct irq_work *work) static DEFINE_PER_CPU(struct irq_work, late_wakeup_work) = IRQ_WORK_INIT(late_wakeup_func); +/* + * If either: + * + * 1) the task is about to enter in guest mode and $ARCH doesn't support KVM generic work + * 2) the task is about to enter in user mode and $ARCH doesn't support generic entry. + * + * In these cases the late RCU wake ups aren't supported in the resched loops and our + * last resort is to fire a local irq_work that will trigger a reschedule once IRQs + * get re-enabled again. + */ +noinstr static void rcu_irq_work_resched(void) +{ + struct rcu_data *rdp = this_cpu_ptr(&rcu_data); + + if (IS_ENABLED(CONFIG_GENERIC_ENTRY) && !(current->flags & PF_VCPU)) + return; + + if (IS_ENABLED(CONFIG_KVM_XFER_TO_GUEST_WORK) && (current->flags & PF_VCPU)) + return; + + instrumentation_begin(); + if (do_nocb_deferred_wakeup(rdp) && need_resched()) { + irq_work_queue(this_cpu_ptr(&late_wakeup_work)); + } + instrumentation_end(); +} + +#else +static inline void rcu_irq_work_resched(void) { } +#endif + /** * rcu_user_enter - inform RCU that we are resuming userspace. * @@ -702,8 +734,6 @@ static DEFINE_PER_CPU(struct irq_work, late_wakeup_work) = */ noinstr void rcu_user_enter(void) { - struct rcu_data *rdp = this_cpu_ptr(&rcu_data); - lockdep_assert_irqs_disabled(); /* @@ -711,13 +741,7 @@ noinstr void rcu_user_enter(void) * rescheduling opportunity in the entry code. Trigger a self IPI * that will fire and reschedule once we resume in user/guest mode. */ - instrumentation_begin(); - if (!IS_ENABLED(CONFIG_GENERIC_ENTRY) || (current->flags & PF_VCPU)) { - if (do_nocb_deferred_wakeup(rdp) && need_resched()) - irq_work_queue(this_cpu_ptr(&late_wakeup_work)); - } - instrumentation_end(); - + rcu_irq_work_resched(); rcu_eqs_enter(true); } diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 384856e4d13e..cdc1b7651c03 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -2197,6 +2197,7 @@ void rcu_nocb_flush_deferred_wakeup(void) { do_nocb_deferred_wakeup(this_cpu_ptr(&rcu_data)); } +EXPORT_SYMBOL_GPL(rcu_nocb_flush_deferred_wakeup); void __init rcu_init_nohz(void) { -- cgit v1.2.3
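
Editorial note on the series above: the patches enforce one rule. A deferred rcuog wakeup must be flushed with IRQs disabled before the last need_resched() check on each resume path (the idle loop, the generic entry code, and the KVM transfer-to-guest path), and when an architecture has neither CONFIG_GENERIC_ENTRY nor CONFIG_KVM_XFER_TO_GUEST_WORK, rcu_user_enter() falls back to queueing an irq_work self-IPI so the reschedule still happens on the IRQ tail. The standalone C sketch below only illustrates that ordering; it is not kernel code, and the names pending_rcuog_wakeup, flush_deferred_wakeup() and resume_to_user() are hypothetical stand-ins for the mechanisms in the diffs.

/*
 * Userspace sketch of the "flush before the last check" ordering.
 * All identifiers here are illustrative, not kernel APIs.
 */
#include <stdbool.h>
#include <stdio.h>

static bool pending_rcuog_wakeup;	/* stands in for rdp->nocb_defer_wakeup */
static bool need_resched_flag;		/* stands in for TIF_NEED_RESCHED */

/* Flushing the deferred wakeup wakes a kthread, which sets need-resched. */
static bool flush_deferred_wakeup(void)
{
	if (!pending_rcuog_wakeup)
		return false;
	pending_rcuog_wakeup = false;
	need_resched_flag = true;	/* the woken rcuog must get scheduled */
	return true;
}

static void resume_to_user(void)
{
	/*
	 * Flush before the final need-resched check, mirroring do_idle(),
	 * exit_to_user_mode_prepare() and xfer_to_guest_mode_prepare() in
	 * the diffs above: a wakeup noticed only after this point would be
	 * ignored until the next interrupt, which is what the irq_work
	 * fallback in rcu_user_enter() papers over on other architectures.
	 */
	flush_deferred_wakeup();

	if (need_resched_flag)
		printf("reschedule before resuming to user/guest mode\n");
	else
		printf("resume directly\n");
}

int main(void)
{
	/* e.g. left behind by an earlier call_rcu() that could not wake rcuog */
	pending_rcuog_wakeup = true;
	resume_to_user();
	return 0;
}

Built with any C99 compiler, the sketch takes the reschedule branch because the pending wakeup is flushed, and therefore noticed, before the final check; reordering the two calls in resume_to_user() reproduces the lost-wakeup window these patches close.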