Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Makefile | 3
-rw-r--r--  kernel/audit.c | 62
-rw-r--r--  kernel/audit.h | 29
-rw-r--r--  kernel/audit_watch.c | 14
-rw-r--r--  kernel/auditsc.c | 12
-rw-r--r--  kernel/bpf/arraymap.c | 55
-rw-r--r--  kernel/bpf/cgroup.c | 37
-rw-r--r--  kernel/bpf/core.c | 49
-rw-r--r--  kernel/bpf/hashtab.c | 51
-rw-r--r--  kernel/bpf/inode.c | 16
-rw-r--r--  kernel/bpf/map_in_map.c | 5
-rw-r--r--  kernel/bpf/map_in_map.h | 1
-rw-r--r--  kernel/bpf/syscall.c | 510
-rw-r--r--  kernel/bpf/verifier.c | 318
-rw-r--r--  kernel/cgroup/Makefile | 1
-rw-r--r--  kernel/cgroup/cgroup-internal.h | 5
-rw-r--r--  kernel/cgroup/cgroup-v1.c | 155
-rw-r--r--  kernel/cgroup/cgroup.c | 221
-rw-r--r--  kernel/cgroup/cpuset.c | 41
-rw-r--r--  kernel/cgroup/debug.c | 357
-rw-r--r--  kernel/compat.c | 398
-rw-r--r--  kernel/configs/android-base.config | 11
-rw-r--r--  kernel/configs/android-recommended.config | 5
-rw-r--r--  kernel/cpu.c | 39
-rw-r--r--  kernel/crash_core.c | 44
-rw-r--r--  kernel/cred.c | 2
-rw-r--r--  kernel/events/core.c | 266
-rw-r--r--  kernel/events/internal.h | 6
-rw-r--r--  kernel/events/ring_buffer.c | 31
-rw-r--r--  kernel/events/uprobes.c | 2
-rw-r--r--  kernel/exit.c | 333
-rw-r--r--  kernel/extable.c | 5
-rw-r--r--  kernel/fork.c | 47
-rw-r--r--  kernel/futex.c | 49
-rw-r--r--  kernel/groups.c | 35
-rw-r--r--  kernel/irq/affinity.c | 13
-rw-r--r--  kernel/irq/chip.c | 24
-rw-r--r--  kernel/irq/cpuhotplug.c | 9
-rw-r--r--  kernel/irq/handle.c | 2
-rw-r--r--  kernel/irq/internals.h | 14
-rw-r--r--  kernel/irq/ipi.c | 4
-rw-r--r--  kernel/irq/irqdesc.c | 3
-rw-r--r--  kernel/irq/irqdomain.c | 19
-rw-r--r--  kernel/irq/manage.c | 111
-rw-r--r--  kernel/irq/pm.c | 2
-rw-r--r--  kernel/kallsyms.c | 10
-rw-r--r--  kernel/kcmp.c | 57
-rw-r--r--  kernel/kexec.c | 8
-rw-r--r--  kernel/kexec_core.c | 39
-rw-r--r--  kernel/kexec_file.c | 29
-rw-r--r--  kernel/kexec_internal.h | 2
-rw-r--r--  kernel/kmod.c | 75
-rw-r--r--  kernel/kprobes.c | 42
-rw-r--r--  kernel/ksysfs.c | 4
-rw-r--r--  kernel/kthread.c | 1
-rw-r--r--  kernel/locking/mutex.c | 6
-rw-r--r--  kernel/locking/qrwlock.c | 1
-rw-r--r--  kernel/locking/qspinlock.c | 118
-rw-r--r--  kernel/locking/qspinlock_paravirt.h | 3
-rw-r--r--  kernel/locking/rtmutex.c | 1
-rw-r--r--  kernel/locking/rwsem-spinlock.c | 4
-rw-r--r--  kernel/membarrier.c | 70
-rw-r--r--  kernel/memremap.c | 6
-rw-r--r--  kernel/module.c | 102
-rw-r--r--  kernel/pid.c | 19
-rw-r--r--  kernel/power/hibernate.c | 2
-rw-r--r--  kernel/power/main.c | 2
-rw-r--r--  kernel/power/process.c | 2
-rw-r--r--  kernel/power/snapshot.c | 15
-rw-r--r--  kernel/power/suspend.c | 35
-rw-r--r--  kernel/printk/internal.h | 6
-rw-r--r--  kernel/printk/printk.c | 19
-rw-r--r--  kernel/printk/printk_safe.c | 36
-rw-r--r--  kernel/rcu/Kconfig | 3
-rw-r--r--  kernel/rcu/rcu.h | 128
-rw-r--r--  kernel/rcu/rcu_segcblist.c | 108
-rw-r--r--  kernel/rcu/rcu_segcblist.h | 28
-rw-r--r--  kernel/rcu/rcuperf.c | 17
-rw-r--r--  kernel/rcu/rcutorture.c | 83
-rw-r--r--  kernel/rcu/srcutiny.c | 8
-rw-r--r--  kernel/rcu/srcutree.c | 50
-rw-r--r--  kernel/rcu/tiny.c | 2
-rw-r--r--  kernel/rcu/tiny_plugin.h | 47
-rw-r--r--  kernel/rcu/tree.c | 213
-rw-r--r--  kernel/rcu/tree.h | 15
-rw-r--r--  kernel/rcu/tree_exp.h | 2
-rw-r--r--  kernel/rcu/tree_plugin.h | 238
-rw-r--r--  kernel/rcu/update.c | 18
-rw-r--r--  kernel/sched/Makefile | 1
-rw-r--r--  kernel/sched/autogroup.c | 3
-rw-r--r--  kernel/sched/completion.c | 19
-rw-r--r--  kernel/sched/core.c | 62
-rw-r--r--  kernel/sched/cpudeadline.c | 27
-rw-r--r--  kernel/sched/cpufreq_schedutil.c | 5
-rw-r--r--  kernel/sched/cpupri.c | 2
-rw-r--r--  kernel/sched/cputime.c | 180
-rw-r--r--  kernel/sched/deadline.c | 47
-rw-r--r--  kernel/sched/debug.c | 83
-rw-r--r--  kernel/sched/fair.c | 491
-rw-r--r--  kernel/sched/membarrier.c | 152
-rw-r--r--  kernel/sched/sched.h | 4
-rw-r--r--  kernel/sched/topology.c | 39
-rw-r--r--  kernel/sched/wait.c | 7
-rw-r--r--  kernel/seccomp.c | 16
-rw-r--r--  kernel/signal.c | 168
-rw-r--r--  kernel/sys.c | 122
-rw-r--r--  kernel/sysctl.c | 335
-rw-r--r--  kernel/sysctl_binary.c | 2
-rw-r--r--  kernel/task_work.c | 8
-rw-r--r--  kernel/time/alarmtimer.c | 4
-rw-r--r--  kernel/time/hrtimer.c | 30
-rw-r--r--  kernel/time/posix-cpu-timers.c | 8
-rw-r--r--  kernel/time/posix-stubs.c | 96
-rw-r--r--  kernel/time/posix-timers.c | 127
-rw-r--r--  kernel/time/time.c | 58
-rw-r--r--  kernel/time/timekeeping.c | 4
-rw-r--r--  kernel/time/timer.c | 52
-rw-r--r--  kernel/torture.c | 2
-rw-r--r--  kernel/trace/Kconfig | 22
-rw-r--r--  kernel/trace/bpf_trace.c | 100
-rw-r--r--  kernel/trace/ftrace.c | 415
-rw-r--r--  kernel/trace/ring_buffer.c | 24
-rw-r--r--  kernel/trace/ring_buffer_benchmark.c | 2
-rw-r--r--  kernel/trace/trace.c | 489
-rw-r--r--  kernel/trace/trace.h | 36
-rw-r--r--  kernel/trace/trace_event_perf.c | 4
-rw-r--r--  kernel/trace/trace_events.c | 66
-rw-r--r--  kernel/trace/trace_events_filter.c | 4
-rw-r--r--  kernel/trace/trace_kprobe.c | 15
-rw-r--r--  kernel/trace/trace_output.c | 27
-rw-r--r--  kernel/trace/trace_sched_switch.c | 72
-rw-r--r--  kernel/trace/trace_stack.c | 6
-rw-r--r--  kernel/trace/trace_syscalls.c | 4
-rw-r--r--  kernel/trace/trace_uprobe.c | 2
-rw-r--r--  kernel/trace/tracing_map.c | 11
-rw-r--r--  kernel/watchdog.c | 290
-rw-r--r--  kernel/watchdog_hld.c | 96
-rw-r--r--  kernel/workqueue.c | 30
138 files changed, 5800 insertions, 3159 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 72aa080f91f0..9c323a6daa46 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -82,7 +82,7 @@ obj-$(CONFIG_KPROBES) += kprobes.o
obj-$(CONFIG_KGDB) += debug/
obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o
obj-$(CONFIG_LOCKUP_DETECTOR) += watchdog.o
-obj-$(CONFIG_HARDLOCKUP_DETECTOR) += watchdog_hld.o
+obj-$(CONFIG_HARDLOCKUP_DETECTOR_PERF) += watchdog_hld.o
obj-$(CONFIG_SECCOMP) += seccomp.o
obj-$(CONFIG_RELAY) += relay.o
obj-$(CONFIG_SYSCTL) += utsname_sysctl.o
@@ -108,7 +108,6 @@ obj-$(CONFIG_CRASH_DUMP) += crash_dump.o
obj-$(CONFIG_JUMP_LABEL) += jump_label.o
obj-$(CONFIG_CONTEXT_TRACKING) += context_tracking.o
obj-$(CONFIG_TORTURE_TEST) += torture.o
-obj-$(CONFIG_MEMBARRIER) += membarrier.o
obj-$(CONFIG_HAS_IOMEM) += memremap.o
diff --git a/kernel/audit.c b/kernel/audit.c
index 4b7d49868ce1..6dd556931739 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -575,12 +575,16 @@ static void kauditd_retry_skb(struct sk_buff *skb)
/**
* auditd_reset - Disconnect the auditd connection
+ * @ac: auditd connection state
*
* Description:
* Break the auditd/kauditd connection and move all the queued records into the
- * hold queue in case auditd reconnects.
+ * hold queue in case auditd reconnects. It is important to note that the @ac
+ * pointer should never be dereferenced inside this function as it may be NULL
+ * or invalid; you can only compare the memory address. If @ac is NULL then
+ * the connection will always be reset.
*/
-static void auditd_reset(void)
+static void auditd_reset(const struct auditd_connection *ac)
{
unsigned long flags;
struct sk_buff *skb;
@@ -590,17 +594,21 @@ static void auditd_reset(void)
spin_lock_irqsave(&auditd_conn_lock, flags);
ac_old = rcu_dereference_protected(auditd_conn,
lockdep_is_held(&auditd_conn_lock));
+ if (ac && ac != ac_old) {
+ /* someone already registered a new auditd connection */
+ spin_unlock_irqrestore(&auditd_conn_lock, flags);
+ return;
+ }
rcu_assign_pointer(auditd_conn, NULL);
spin_unlock_irqrestore(&auditd_conn_lock, flags);
if (ac_old)
call_rcu(&ac_old->rcu, auditd_conn_free);
- /* flush all of the main and retry queues to the hold queue */
+ /* flush the retry queue to the hold queue, but don't touch the main
+ * queue since we need to process that normally for multicast */
while ((skb = skb_dequeue(&audit_retry_queue)))
kauditd_hold_skb(skb);
- while ((skb = skb_dequeue(&audit_queue)))
- kauditd_hold_skb(skb);
}
/**
@@ -633,6 +641,7 @@ static int auditd_send_unicast_skb(struct sk_buff *skb)
ac = rcu_dereference(auditd_conn);
if (!ac) {
rcu_read_unlock();
+ kfree_skb(skb);
rc = -ECONNREFUSED;
goto err;
}
@@ -649,8 +658,8 @@ static int auditd_send_unicast_skb(struct sk_buff *skb)
return rc;
err:
- if (rc == -ECONNREFUSED)
- auditd_reset();
+ if (ac && rc == -ECONNREFUSED)
+ auditd_reset(ac);
return rc;
}
@@ -795,9 +804,9 @@ static int kauditd_thread(void *dummy)
rc = kauditd_send_queue(sk, portid,
&audit_hold_queue, UNICAST_RETRIES,
NULL, kauditd_rehold_skb);
- if (rc < 0) {
+ if (ac && rc < 0) {
sk = NULL;
- auditd_reset();
+ auditd_reset(ac);
goto main_queue;
}
@@ -805,9 +814,9 @@ static int kauditd_thread(void *dummy)
rc = kauditd_send_queue(sk, portid,
&audit_retry_queue, UNICAST_RETRIES,
NULL, kauditd_hold_skb);
- if (rc < 0) {
+ if (ac && rc < 0) {
sk = NULL;
- auditd_reset();
+ auditd_reset(ac);
goto main_queue;
}
@@ -815,12 +824,13 @@ main_queue:
/* process the main queue - do the multicast send and attempt
* unicast, dump failed record sends to the retry queue; if
* sk == NULL due to previous failures we will just do the
- * multicast send and move the record to the retry queue */
+ * multicast send and move the record to the hold queue */
rc = kauditd_send_queue(sk, portid, &audit_queue, 1,
kauditd_send_multicast_skb,
- kauditd_retry_skb);
- if (sk == NULL || rc < 0)
- auditd_reset();
+ (sk ?
+ kauditd_retry_skb : kauditd_hold_skb));
+ if (ac && rc < 0)
+ auditd_reset(ac);
sk = NULL;
/* drop our netns reference, no auditd sends past this line */
@@ -1230,7 +1240,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
auditd_pid, 1);
/* unregister the auditd connection */
- auditd_reset();
+ auditd_reset(NULL);
}
}
if (s.mask & AUDIT_STATUS_RATE_LIMIT) {
@@ -1999,22 +2009,10 @@ void audit_log_cap(struct audit_buffer *ab, char *prefix, kernel_cap_t *cap)
static void audit_log_fcaps(struct audit_buffer *ab, struct audit_names *name)
{
- kernel_cap_t *perm = &name->fcap.permitted;
- kernel_cap_t *inh = &name->fcap.inheritable;
- int log = 0;
-
- if (!cap_isclear(*perm)) {
- audit_log_cap(ab, "cap_fp", perm);
- log = 1;
- }
- if (!cap_isclear(*inh)) {
- audit_log_cap(ab, "cap_fi", inh);
- log = 1;
- }
-
- if (log)
- audit_log_format(ab, " cap_fe=%d cap_fver=%x",
- name->fcap.fE, name->fcap_ver);
+ audit_log_cap(ab, "cap_fp", &name->fcap.permitted);
+ audit_log_cap(ab, "cap_fi", &name->fcap.inheritable);
+ audit_log_format(ab, " cap_fe=%d cap_fver=%x",
+ name->fcap.fE, name->fcap_ver);
}
static inline int audit_copy_fcaps(struct audit_names *name,
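
The auditd_reset() change above turns the reset into a compare-before-teardown: callers snapshot the connection they actually used and pass it back, and the reset is skipped if a newer connection has been registered in the meantime. A minimal caller-side sketch of that pattern, with a hypothetical helper name (the snapshot is only ever compared by address, never dereferenced, matching the kernel-doc above):

static void example_send_and_maybe_reset(void)
{
	const struct auditd_connection *ac;
	int rc;

	rcu_read_lock();
	ac = rcu_dereference(auditd_conn);	/* remember which connection we used */
	rcu_read_unlock();

	rc = -ECONNREFUSED;			/* pretend the unicast send failed */
	if (ac && rc < 0)
		auditd_reset(ac);		/* no-op if auditd already reconnected */
}
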
diff --git a/kernel/audit.h b/kernel/audit.h
index ddfce2ea4891..b331d9b83f63 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -68,6 +68,7 @@ struct audit_cap_data {
unsigned int fE; /* effective bit of file cap */
kernel_cap_t effective; /* effective set of process */
};
+ kernel_cap_t ambient;
};
/* When fs/namei.c:getname() is called, we store the pointer in name and bump
@@ -247,13 +248,13 @@ struct audit_netlink_list {
struct sk_buff_head q;
};
-int audit_send_list(void *);
+int audit_send_list(void *_dest);
extern int selinux_audit_rule_update(void);
extern struct mutex audit_filter_mutex;
-extern int audit_del_rule(struct audit_entry *);
-extern void audit_free_rule_rcu(struct rcu_head *);
+extern int audit_del_rule(struct audit_entry *entry);
+extern void audit_free_rule_rcu(struct rcu_head *head);
extern struct list_head audit_filter_list[];
extern struct audit_entry *audit_dupe_rule(struct audit_krule *old);
@@ -301,17 +302,17 @@ extern int audit_exe_compare(struct task_struct *tsk, struct audit_fsnotify_mark
#endif /* CONFIG_AUDIT_WATCH */
#ifdef CONFIG_AUDIT_TREE
-extern struct audit_chunk *audit_tree_lookup(const struct inode *);
-extern void audit_put_chunk(struct audit_chunk *);
-extern bool audit_tree_match(struct audit_chunk *, struct audit_tree *);
-extern int audit_make_tree(struct audit_krule *, char *, u32);
-extern int audit_add_tree_rule(struct audit_krule *);
-extern int audit_remove_tree_rule(struct audit_krule *);
+extern struct audit_chunk *audit_tree_lookup(const struct inode *inode);
+extern void audit_put_chunk(struct audit_chunk *chunk);
+extern bool audit_tree_match(struct audit_chunk *chunk, struct audit_tree *tree);
+extern int audit_make_tree(struct audit_krule *rule, char *pathname, u32 op);
+extern int audit_add_tree_rule(struct audit_krule *rule);
+extern int audit_remove_tree_rule(struct audit_krule *rule);
extern void audit_trim_trees(void);
extern int audit_tag_tree(char *old, char *new);
-extern const char *audit_tree_path(struct audit_tree *);
-extern void audit_put_tree(struct audit_tree *);
-extern void audit_kill_trees(struct list_head *);
+extern const char *audit_tree_path(struct audit_tree *tree);
+extern void audit_put_tree(struct audit_tree *tree);
+extern void audit_kill_trees(struct list_head *list);
#else
#define audit_remove_tree_rule(rule) BUG()
#define audit_add_tree_rule(rule) -EINVAL
@@ -323,7 +324,7 @@ extern void audit_kill_trees(struct list_head *);
#define audit_kill_trees(list) BUG()
#endif
-extern char *audit_unpack_string(void **, size_t *, size_t);
+extern char *audit_unpack_string(void **bufp, size_t *remain, size_t len);
extern pid_t audit_sig_pid;
extern kuid_t audit_sig_uid;
@@ -333,7 +334,7 @@ extern int audit_filter(int msgtype, unsigned int listtype);
#ifdef CONFIG_AUDITSYSCALL
extern int audit_signal_info(int sig, struct task_struct *t);
-extern void audit_filter_inodes(struct task_struct *, struct audit_context *);
+extern void audit_filter_inodes(struct task_struct *tsk, struct audit_context *ctx);
extern struct list_head *audit_killed_trees(void);
#else
#define audit_signal_info(s,t) AUDIT_DISABLED
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index 62d686d96581..9eb8b3511636 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
@@ -66,7 +66,7 @@ static struct fsnotify_group *audit_watch_group;
/* fsnotify events we care about. */
#define AUDIT_FS_WATCH (FS_MOVE | FS_CREATE | FS_DELETE | FS_DELETE_SELF |\
- FS_MOVE_SELF | FS_EVENT_ON_CHILD)
+ FS_MOVE_SELF | FS_EVENT_ON_CHILD | FS_UNMOUNT)
static void audit_free_parent(struct audit_parent *parent)
{
@@ -457,13 +457,15 @@ void audit_remove_watch_rule(struct audit_krule *krule)
list_del(&krule->rlist);
if (list_empty(&watch->rules)) {
+ /*
+ * audit_remove_watch() drops our reference to 'parent' which
+ * can get freed. Grab our own reference to be safe.
+ */
+ audit_get_parent(parent);
audit_remove_watch(watch);
-
- if (list_empty(&parent->watches)) {
- audit_get_parent(parent);
+ if (list_empty(&parent->watches))
fsnotify_destroy_mark(&parent->mark, audit_watch_group);
- audit_put_parent(parent);
- }
+ audit_put_parent(parent);
}
}
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index bb724baa7ac9..3260ba2312a9 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -1261,6 +1261,7 @@ static void show_special(struct audit_context *context, int *call_panic)
audit_log_cap(ab, "cap_pi", &context->capset.cap.inheritable);
audit_log_cap(ab, "cap_pp", &context->capset.cap.permitted);
audit_log_cap(ab, "cap_pe", &context->capset.cap.effective);
+ audit_log_cap(ab, "cap_pa", &context->capset.cap.ambient);
break;
case AUDIT_MMAP:
audit_log_format(ab, "fd=%d flags=0x%x", context->mmap.fd,
@@ -1382,9 +1383,11 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
audit_log_cap(ab, "old_pp", &axs->old_pcap.permitted);
audit_log_cap(ab, "old_pi", &axs->old_pcap.inheritable);
audit_log_cap(ab, "old_pe", &axs->old_pcap.effective);
- audit_log_cap(ab, "new_pp", &axs->new_pcap.permitted);
- audit_log_cap(ab, "new_pi", &axs->new_pcap.inheritable);
- audit_log_cap(ab, "new_pe", &axs->new_pcap.effective);
+ audit_log_cap(ab, "old_pa", &axs->old_pcap.ambient);
+ audit_log_cap(ab, "pp", &axs->new_pcap.permitted);
+ audit_log_cap(ab, "pi", &axs->new_pcap.inheritable);
+ audit_log_cap(ab, "pe", &axs->new_pcap.effective);
+ audit_log_cap(ab, "pa", &axs->new_pcap.ambient);
break; }
}
@@ -2342,10 +2345,12 @@ int __audit_log_bprm_fcaps(struct linux_binprm *bprm,
ax->old_pcap.permitted = old->cap_permitted;
ax->old_pcap.inheritable = old->cap_inheritable;
ax->old_pcap.effective = old->cap_effective;
+ ax->old_pcap.ambient = old->cap_ambient;
ax->new_pcap.permitted = new->cap_permitted;
ax->new_pcap.inheritable = new->cap_inheritable;
ax->new_pcap.effective = new->cap_effective;
+ ax->new_pcap.ambient = new->cap_ambient;
return 0;
}
@@ -2364,6 +2369,7 @@ void __audit_log_capset(const struct cred *new, const struct cred *old)
context->capset.cap.effective = new->cap_effective;
context->capset.cap.inheritable = new->cap_effective;
context->capset.cap.permitted = new->cap_permitted;
+ context->capset.cap.ambient = new->cap_ambient;
context->type = AUDIT_CAPSET;
}
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 172dc8ee0e3b..d771a3872500 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -335,6 +335,26 @@ static void *fd_array_map_lookup_elem(struct bpf_map *map, void *key)
}
/* only called from syscall */
+int bpf_fd_array_map_lookup_elem(struct bpf_map *map, void *key, u32 *value)
+{
+ void **elem, *ptr;
+ int ret = 0;
+
+ if (!map->ops->map_fd_sys_lookup_elem)
+ return -ENOTSUPP;
+
+ rcu_read_lock();
+ elem = array_map_lookup_elem(map, key);
+ if (elem && (ptr = READ_ONCE(*elem)))
+ *value = map->ops->map_fd_sys_lookup_elem(ptr);
+ else
+ ret = -ENOENT;
+ rcu_read_unlock();
+
+ return ret;
+}
+
+/* only called from syscall */
int bpf_fd_array_map_update_elem(struct bpf_map *map, struct file *map_file,
void *key, void *value, u64 map_flags)
{
@@ -400,6 +420,11 @@ static void prog_fd_array_put_ptr(void *ptr)
bpf_prog_put(ptr);
}
+static u32 prog_fd_array_sys_lookup_elem(void *ptr)
+{
+ return ((struct bpf_prog *)ptr)->aux->id;
+}
+
/* decrement refcnt of all bpf_progs that are stored in this map */
void bpf_fd_array_map_clear(struct bpf_map *map)
{
@@ -418,6 +443,7 @@ const struct bpf_map_ops prog_array_map_ops = {
.map_delete_elem = fd_array_map_delete_elem,
.map_fd_get_ptr = prog_fd_array_get_ptr,
.map_fd_put_ptr = prog_fd_array_put_ptr,
+ .map_fd_sys_lookup_elem = prog_fd_array_sys_lookup_elem,
};
static struct bpf_event_entry *bpf_event_entry_gen(struct file *perf_file,
@@ -452,38 +478,24 @@ static void bpf_event_entry_free_rcu(struct bpf_event_entry *ee)
static void *perf_event_fd_array_get_ptr(struct bpf_map *map,
struct file *map_file, int fd)
{
- const struct perf_event_attr *attr;
struct bpf_event_entry *ee;
struct perf_event *event;
struct file *perf_file;
+ u64 value;
perf_file = perf_event_get(fd);
if (IS_ERR(perf_file))
return perf_file;
+ ee = ERR_PTR(-EOPNOTSUPP);
event = perf_file->private_data;
- ee = ERR_PTR(-EINVAL);
-
- attr = perf_event_attrs(event);
- if (IS_ERR(attr) || attr->inherit)
+ if (perf_event_read_local(event, &value) == -EOPNOTSUPP)
goto err_out;
- switch (attr->type) {
- case PERF_TYPE_SOFTWARE:
- if (attr->config != PERF_COUNT_SW_BPF_OUTPUT)
- goto err_out;
- /* fall-through */
- case PERF_TYPE_RAW:
- case PERF_TYPE_HARDWARE:
- ee = bpf_event_entry_gen(perf_file, map_file);
- if (ee)
- return ee;
- ee = ERR_PTR(-ENOMEM);
- /* fall-through */
- default:
- break;
- }
-
+ ee = bpf_event_entry_gen(perf_file, map_file);
+ if (ee)
+ return ee;
+ ee = ERR_PTR(-ENOMEM);
err_out:
fput(perf_file);
return ee;
@@ -599,4 +611,5 @@ const struct bpf_map_ops array_of_maps_map_ops = {
.map_delete_elem = fd_array_map_delete_elem,
.map_fd_get_ptr = bpf_map_fd_get_ptr,
.map_fd_put_ptr = bpf_map_fd_put_ptr,
+ .map_fd_sys_lookup_elem = bpf_map_fd_sys_lookup_elem,
};
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index ea6033cba947..546113430049 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -236,3 +236,40 @@ int __cgroup_bpf_run_filter_sk(struct sock *sk,
return ret;
}
EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk);
+
+/**
+ * __cgroup_bpf_run_filter_sock_ops() - Run a program on a sock
+ * @sk: socket to get cgroup from
+ * @sock_ops: bpf_sock_ops_kern struct to pass to program. Contains
+ * sk with connection information (IP addresses, etc.); may not contain
+ * cgroup info if it is a req sock.
+ * @type: The type of program to be executed
+ *
+ * socket passed is expected to be of type INET or INET6.
+ *
+ * The program type passed in via @type must be suitable for sock_ops
+ * filtering. No further check is performed to assert that.
+ *
+ * This function will return %-EPERM if an attached program was found and it
+ * returned != 1 during execution. In all other cases, 0 is returned.
+ */
+int __cgroup_bpf_run_filter_sock_ops(struct sock *sk,
+ struct bpf_sock_ops_kern *sock_ops,
+ enum bpf_attach_type type)
+{
+ struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
+ struct bpf_prog *prog;
+ int ret = 0;
+
+ rcu_read_lock();
+
+ prog = rcu_dereference(cgrp->bpf.effective[type]);
+ if (prog)
+ ret = BPF_PROG_RUN(prog, sock_ops) == 1 ? 0 : -EPERM;
+
+ rcu_read_unlock();
+
+ return ret;
+}
+EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_ops);
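
A hedged caller sketch for the new hook: zero the context, point it at the socket, and map the program's verdict to an error code. The .sk member is taken from the kernel-doc above; any other bpf_sock_ops_kern fields a real caller would fill are omitted here:

static int example_run_sock_ops(struct sock *sk)
{
	struct bpf_sock_ops_kern sock_ops;

	memset(&sock_ops, 0, sizeof(sock_ops));
	sock_ops.sk = sk;

	/* 0 if no program is attached or the program returned 1,
	 * -EPERM otherwise.
	 */
	return __cgroup_bpf_run_filter_sock_ops(sk, &sock_ops,
						BPF_CGROUP_SOCK_OPS);
}
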
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index dedf367f59bb..ad5f55922a13 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -763,10 +763,10 @@ EXPORT_SYMBOL_GPL(__bpf_call_base);
*
* Decode and execute eBPF instructions.
*/
-static unsigned int __bpf_prog_run(void *ctx, const struct bpf_insn *insn)
+static unsigned int ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn,
+ u64 *stack)
{
- u64 stack[MAX_BPF_STACK / sizeof(u64)];
- u64 regs[MAX_BPF_REG], tmp;
+ u64 tmp;
static const void *jumptable[256] = {
[0 ... 255] = &&default_label,
/* Now overwrite non-defaults ... */
@@ -824,7 +824,7 @@ static unsigned int __bpf_prog_run(void *ctx, const struct bpf_insn *insn)
[BPF_ALU64 | BPF_NEG] = &&ALU64_NEG,
/* Call instruction */
[BPF_JMP | BPF_CALL] = &&JMP_CALL,
- [BPF_JMP | BPF_CALL | BPF_X] = &&JMP_TAIL_CALL,
+ [BPF_JMP | BPF_TAIL_CALL] = &&JMP_TAIL_CALL,
/* Jumps */
[BPF_JMP | BPF_JA] = &&JMP_JA,
[BPF_JMP | BPF_JEQ | BPF_X] = &&JMP_JEQ_X,
@@ -874,9 +874,6 @@ static unsigned int __bpf_prog_run(void *ctx, const struct bpf_insn *insn)
#define CONT ({ insn++; goto select_insn; })
#define CONT_JMP ({ insn++; goto select_insn; })
- FP = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)];
- ARG1 = (u64) (unsigned long) ctx;
-
select_insn:
goto *jumptable[insn->code];
@@ -1219,7 +1216,39 @@ load_byte:
WARN_RATELIMIT(1, "unknown opcode %02x\n", insn->code);
return 0;
}
-STACK_FRAME_NON_STANDARD(__bpf_prog_run); /* jump table */
+STACK_FRAME_NON_STANDARD(___bpf_prog_run); /* jump table */
+
+#define PROG_NAME(stack_size) __bpf_prog_run##stack_size
+#define DEFINE_BPF_PROG_RUN(stack_size) \
+static unsigned int PROG_NAME(stack_size)(const void *ctx, const struct bpf_insn *insn) \
+{ \
+ u64 stack[stack_size / sizeof(u64)]; \
+ u64 regs[MAX_BPF_REG]; \
+\
+ FP = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)]; \
+ ARG1 = (u64) (unsigned long) ctx; \
+ return ___bpf_prog_run(regs, insn, stack); \
+}
+
+#define EVAL1(FN, X) FN(X)
+#define EVAL2(FN, X, Y...) FN(X) EVAL1(FN, Y)
+#define EVAL3(FN, X, Y...) FN(X) EVAL2(FN, Y)
+#define EVAL4(FN, X, Y...) FN(X) EVAL3(FN, Y)
+#define EVAL5(FN, X, Y...) FN(X) EVAL4(FN, Y)
+#define EVAL6(FN, X, Y...) FN(X) EVAL5(FN, Y)
+
+EVAL6(DEFINE_BPF_PROG_RUN, 32, 64, 96, 128, 160, 192);
+EVAL6(DEFINE_BPF_PROG_RUN, 224, 256, 288, 320, 352, 384);
+EVAL4(DEFINE_BPF_PROG_RUN, 416, 448, 480, 512);
+
+#define PROG_NAME_LIST(stack_size) PROG_NAME(stack_size),
+
+static unsigned int (*interpreters[])(const void *ctx,
+ const struct bpf_insn *insn) = {
+EVAL6(PROG_NAME_LIST, 32, 64, 96, 128, 160, 192)
+EVAL6(PROG_NAME_LIST, 224, 256, 288, 320, 352, 384)
+EVAL4(PROG_NAME_LIST, 416, 448, 480, 512)
+};
bool bpf_prog_array_compatible(struct bpf_array *array,
const struct bpf_prog *fp)
@@ -1268,7 +1297,9 @@ static int bpf_check_tail_call(const struct bpf_prog *fp)
*/
struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err)
{
- fp->bpf_func = (void *) __bpf_prog_run;
+ u32 stack_depth = max_t(u32, fp->aux->stack_depth, 1);
+
+ fp->bpf_func = interpreters[(round_up(stack_depth, 32) / 32) - 1];
/* eBPF JITs can rewrite the program in case constant
* blinding is active. However, in case of error during
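
With the hunks above, the interpreter no longer reserves a fixed 512-byte stack; bpf_prog_select_runtime() picks one of sixteen generated __bpf_prog_run##stack_size variants based on the verifier-computed stack depth, rounded up to a 32-byte bucket. A standalone illustration of that index math (plain C, not kernel code):

static unsigned int example_interp_index(unsigned int stack_depth)
{
	if (stack_depth < 1)
		stack_depth = 1;		/* a depth of 0 still gets the smallest frame */
	return (stack_depth + 31) / 32 - 1;	/* 1..32 -> 0, 33..64 -> 1, ..., 481..512 -> 15 */
}
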
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 004334ea13ba..d11c8181f4c5 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -652,12 +652,27 @@ static void pcpu_copy_value(struct bpf_htab *htab, void __percpu *pptr,
}
}
+static bool fd_htab_map_needs_adjust(const struct bpf_htab *htab)
+{
+ return htab->map.map_type == BPF_MAP_TYPE_HASH_OF_MAPS &&
+ BITS_PER_LONG == 64;
+}
+
+static u32 htab_size_value(const struct bpf_htab *htab, bool percpu)
+{
+ u32 size = htab->map.value_size;
+
+ if (percpu || fd_htab_map_needs_adjust(htab))
+ size = round_up(size, 8);
+ return size;
+}
+
static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
void *value, u32 key_size, u32 hash,
bool percpu, bool onallcpus,
struct htab_elem *old_elem)
{
- u32 size = htab->map.value_size;
+ u32 size = htab_size_value(htab, percpu);
bool prealloc = htab_is_prealloc(htab);
struct htab_elem *l_new, **pl_new;
void __percpu *pptr;
@@ -696,9 +711,6 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
memcpy(l_new->key, key, key_size);
if (percpu) {
- /* round up value_size to 8 bytes */
- size = round_up(size, 8);
-
if (prealloc) {
pptr = htab_elem_get_ptr(l_new, key_size);
} else {
@@ -1209,17 +1221,9 @@ const struct bpf_map_ops htab_lru_percpu_map_ops = {
static struct bpf_map *fd_htab_map_alloc(union bpf_attr *attr)
{
- struct bpf_map *map;
-
if (attr->value_size != sizeof(u32))
return ERR_PTR(-EINVAL);
-
- /* pointer is stored internally */
- attr->value_size = sizeof(void *);
- map = htab_map_alloc(attr);
- attr->value_size = sizeof(u32);
-
- return map;
+ return htab_map_alloc(attr);
}
static void fd_htab_map_free(struct bpf_map *map)
@@ -1244,6 +1248,26 @@ static void fd_htab_map_free(struct bpf_map *map)
}
/* only called from syscall */
+int bpf_fd_htab_map_lookup_elem(struct bpf_map *map, void *key, u32 *value)
+{
+ void **ptr;
+ int ret = 0;
+
+ if (!map->ops->map_fd_sys_lookup_elem)
+ return -ENOTSUPP;
+
+ rcu_read_lock();
+ ptr = htab_map_lookup_elem(map, key);
+ if (ptr)
+ *value = map->ops->map_fd_sys_lookup_elem(READ_ONCE(*ptr));
+ else
+ ret = -ENOENT;
+ rcu_read_unlock();
+
+ return ret;
+}
+
+/* only called from syscall */
int bpf_fd_htab_map_update_elem(struct bpf_map *map, struct file *map_file,
void *key, void *value, u64 map_flags)
{
@@ -1305,4 +1329,5 @@ const struct bpf_map_ops htab_of_maps_map_ops = {
.map_delete_elem = htab_map_delete_elem,
.map_fd_get_ptr = bpf_map_fd_get_ptr,
.map_fd_put_ptr = bpf_map_fd_put_ptr,
+ .map_fd_sys_lookup_elem = bpf_map_fd_sys_lookup_elem,
};
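
A standalone sketch of the sizing rule factored out into htab_size_value() above: per-cpu maps always round the stored value up to 8 bytes, and on 64-bit so do hash-of-maps, since the element internally holds a struct bpf_map pointer even though userspace reads and writes a u32:

static unsigned int example_htab_value_size(unsigned int value_size,
					    int percpu, int fd_map_on_64bit)
{
	if (percpu || fd_map_on_64bit)
		return (value_size + 7) & ~7u;	/* round_up(value_size, 8) */
	return value_size;
}
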
diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index 9bbd33497d3d..e833ed914358 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -377,10 +377,22 @@ static void bpf_evict_inode(struct inode *inode)
bpf_any_put(inode->i_private, type);
}
+/*
+ * Display the mount options in /proc/mounts.
+ */
+static int bpf_show_options(struct seq_file *m, struct dentry *root)
+{
+ umode_t mode = d_inode(root)->i_mode & S_IALLUGO & ~S_ISVTX;
+
+ if (mode != S_IRWXUGO)
+ seq_printf(m, ",mode=%o", mode);
+ return 0;
+}
+
static const struct super_operations bpf_super_ops = {
.statfs = simple_statfs,
.drop_inode = generic_delete_inode,
- .show_options = generic_show_options,
+ .show_options = bpf_show_options,
.evict_inode = bpf_evict_inode,
};
@@ -434,8 +446,6 @@ static int bpf_fill_super(struct super_block *sb, void *data, int silent)
struct inode *inode;
int ret;
- save_mount_options(sb, data);
-
ret = bpf_parse_options(data, &opts);
if (ret)
return ret;
diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c
index 59bcdf821ae4..1da574612bea 100644
--- a/kernel/bpf/map_in_map.c
+++ b/kernel/bpf/map_in_map.c
@@ -95,3 +95,8 @@ void bpf_map_fd_put_ptr(void *ptr)
*/
bpf_map_put(ptr);
}
+
+u32 bpf_map_fd_sys_lookup_elem(void *ptr)
+{
+ return ((struct bpf_map *)ptr)->id;
+}
diff --git a/kernel/bpf/map_in_map.h b/kernel/bpf/map_in_map.h
index 177fadb689dc..6183db9ec08c 100644
--- a/kernel/bpf/map_in_map.h
+++ b/kernel/bpf/map_in_map.h
@@ -19,5 +19,6 @@ bool bpf_map_meta_equal(const struct bpf_map *meta0,
void *bpf_map_fd_get_ptr(struct bpf_map *map, struct file *map_file,
int ufd);
void bpf_map_fd_put_ptr(void *ptr);
+u32 bpf_map_fd_sys_lookup_elem(void *ptr);
#endif
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 265a0d854e33..6c772adabad2 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -22,8 +22,20 @@
#include <linux/filter.h>
#include <linux/version.h>
#include <linux/kernel.h>
+#include <linux/idr.h>
+
+#define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PROG_ARRAY || \
+ (map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \
+ (map)->map_type == BPF_MAP_TYPE_CGROUP_ARRAY || \
+ (map)->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS)
+#define IS_FD_HASH(map) ((map)->map_type == BPF_MAP_TYPE_HASH_OF_MAPS)
+#define IS_FD_MAP(map) (IS_FD_ARRAY(map) || IS_FD_HASH(map))
DEFINE_PER_CPU(int, bpf_prog_active);
+static DEFINE_IDR(prog_idr);
+static DEFINE_SPINLOCK(prog_idr_lock);
+static DEFINE_IDR(map_idr);
+static DEFINE_SPINLOCK(map_idr_lock);
int sysctl_unprivileged_bpf_disabled __read_mostly;
@@ -114,6 +126,37 @@ static void bpf_map_uncharge_memlock(struct bpf_map *map)
free_uid(user);
}
+static int bpf_map_alloc_id(struct bpf_map *map)
+{
+ int id;
+
+ spin_lock_bh(&map_idr_lock);
+ id = idr_alloc_cyclic(&map_idr, map, 1, INT_MAX, GFP_ATOMIC);
+ if (id > 0)
+ map->id = id;
+ spin_unlock_bh(&map_idr_lock);
+
+ if (WARN_ON_ONCE(!id))
+ return -ENOSPC;
+
+ return id > 0 ? 0 : id;
+}
+
+static void bpf_map_free_id(struct bpf_map *map, bool do_idr_lock)
+{
+ if (do_idr_lock)
+ spin_lock_bh(&map_idr_lock);
+ else
+ __acquire(&map_idr_lock);
+
+ idr_remove(&map_idr, map->id);
+
+ if (do_idr_lock)
+ spin_unlock_bh(&map_idr_lock);
+ else
+ __release(&map_idr_lock);
+}
+
/* called from workqueue */
static void bpf_map_free_deferred(struct work_struct *work)
{
@@ -135,14 +178,21 @@ static void bpf_map_put_uref(struct bpf_map *map)
/* decrement map refcnt and schedule it for freeing via workqueue
 * (underlying map implementation ops->map_free() might sleep)
*/
-void bpf_map_put(struct bpf_map *map)
+static void __bpf_map_put(struct bpf_map *map, bool do_idr_lock)
{
if (atomic_dec_and_test(&map->refcnt)) {
+ /* bpf_map_free_id() must be called first */
+ bpf_map_free_id(map, do_idr_lock);
INIT_WORK(&map->work, bpf_map_free_deferred);
schedule_work(&map->work);
}
}
+void bpf_map_put(struct bpf_map *map)
+{
+ __bpf_map_put(map, true);
+}
+
void bpf_map_put_with_uref(struct bpf_map *map)
{
bpf_map_put_uref(map);
@@ -166,10 +216,12 @@ static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp)
const struct bpf_map *map = filp->private_data;
const struct bpf_array *array;
u32 owner_prog_type = 0;
+ u32 owner_jited = 0;
if (map->map_type == BPF_MAP_TYPE_PROG_ARRAY) {
array = container_of(map, struct bpf_array, map);
owner_prog_type = array->owner_prog_type;
+ owner_jited = array->owner_jited;
}
seq_printf(m,
@@ -186,9 +238,12 @@ static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp)
map->map_flags,
map->pages * 1ULL << PAGE_SHIFT);
- if (owner_prog_type)
+ if (owner_prog_type) {
seq_printf(m, "owner_prog_type:\t%u\n",
owner_prog_type);
+ seq_printf(m, "owner_jited:\t%u\n",
+ owner_jited);
+ }
}
#endif
@@ -236,11 +291,22 @@ static int map_create(union bpf_attr *attr)
if (err)
goto free_map_nouncharge;
- err = bpf_map_new_fd(map);
- if (err < 0)
- /* failed to allocate fd */
+ err = bpf_map_alloc_id(map);
+ if (err)
goto free_map;
+ err = bpf_map_new_fd(map);
+ if (err < 0) {
+ /* failed to allocate fd.
+ * bpf_map_put() is needed because the above
+ * bpf_map_alloc_id() has published the map
+ * to the userspace and the userspace may
+ * have refcnt-ed it through BPF_MAP_GET_FD_BY_ID.
+ */
+ bpf_map_put(map);
+ return err;
+ }
+
trace_bpf_map_create(map, err);
return err;
@@ -295,6 +361,28 @@ struct bpf_map *bpf_map_get_with_uref(u32 ufd)
return map;
}
+/* map_idr_lock should have been held */
+static struct bpf_map *bpf_map_inc_not_zero(struct bpf_map *map,
+ bool uref)
+{
+ int refold;
+
+ refold = __atomic_add_unless(&map->refcnt, 1, 0);
+
+ if (refold >= BPF_MAX_REFCNT) {
+ __bpf_map_put(map, false);
+ return ERR_PTR(-EBUSY);
+ }
+
+ if (!refold)
+ return ERR_PTR(-ENOENT);
+
+ if (uref)
+ atomic_inc(&map->usercnt);
+
+ return map;
+}
+
int __weak bpf_stackmap_copy(struct bpf_map *map, void *key, void *value)
{
return -ENOTSUPP;
@@ -322,19 +410,18 @@ static int map_lookup_elem(union bpf_attr *attr)
if (IS_ERR(map))
return PTR_ERR(map);
- err = -ENOMEM;
- key = kmalloc(map->key_size, GFP_USER);
- if (!key)
+ key = memdup_user(ukey, map->key_size);
+ if (IS_ERR(key)) {
+ err = PTR_ERR(key);
goto err_put;
-
- err = -EFAULT;
- if (copy_from_user(key, ukey, map->key_size) != 0)
- goto free_key;
+ }
if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH ||
map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY)
value_size = round_up(map->value_size, 8) * num_possible_cpus();
+ else if (IS_FD_MAP(map))
+ value_size = sizeof(u32);
else
value_size = map->value_size;
@@ -350,9 +437,10 @@ static int map_lookup_elem(union bpf_attr *attr)
err = bpf_percpu_array_copy(map, key, value);
} else if (map->map_type == BPF_MAP_TYPE_STACK_TRACE) {
err = bpf_stackmap_copy(map, key, value);
- } else if (map->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS ||
- map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS) {
- err = -ENOTSUPP;
+ } else if (IS_FD_ARRAY(map)) {
+ err = bpf_fd_array_map_lookup_elem(map, key, value);
+ } else if (IS_FD_HASH(map)) {
+ err = bpf_fd_htab_map_lookup_elem(map, key, value);
} else {
rcu_read_lock();
ptr = map->ops->map_lookup_elem(map, key);
@@ -402,14 +490,11 @@ static int map_update_elem(union bpf_attr *attr)
if (IS_ERR(map))
return PTR_ERR(map);
- err = -ENOMEM;
- key = kmalloc(map->key_size, GFP_USER);
- if (!key)
+ key = memdup_user(ukey, map->key_size);
+ if (IS_ERR(key)) {
+ err = PTR_ERR(key);
goto err_put;
-
- err = -EFAULT;
- if (copy_from_user(key, ukey, map->key_size) != 0)
- goto free_key;
+ }
if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH ||
@@ -488,14 +573,11 @@ static int map_delete_elem(union bpf_attr *attr)
if (IS_ERR(map))
return PTR_ERR(map);
- err = -ENOMEM;
- key = kmalloc(map->key_size, GFP_USER);
- if (!key)
+ key = memdup_user(ukey, map->key_size);
+ if (IS_ERR(key)) {
+ err = PTR_ERR(key);
goto err_put;
-
- err = -EFAULT;
- if (copy_from_user(key, ukey, map->key_size) != 0)
- goto free_key;
+ }
preempt_disable();
__this_cpu_inc(bpf_prog_active);
@@ -507,7 +589,6 @@ static int map_delete_elem(union bpf_attr *attr)
if (!err)
trace_bpf_map_delete_elem(map, ufd, key);
-free_key:
kfree(key);
err_put:
fdput(f);
@@ -536,14 +617,11 @@ static int map_get_next_key(union bpf_attr *attr)
return PTR_ERR(map);
if (ukey) {
- err = -ENOMEM;
- key = kmalloc(map->key_size, GFP_USER);
- if (!key)
+ key = memdup_user(ukey, map->key_size);
+ if (IS_ERR(key)) {
+ err = PTR_ERR(key);
goto err_put;
-
- err = -EFAULT;
- if (copy_from_user(key, ukey, map->key_size) != 0)
- goto free_key;
+ }
} else {
key = NULL;
}
@@ -650,6 +728,42 @@ static void bpf_prog_uncharge_memlock(struct bpf_prog *prog)
free_uid(user);
}
+static int bpf_prog_alloc_id(struct bpf_prog *prog)
+{
+ int id;
+
+ spin_lock_bh(&prog_idr_lock);
+ id = idr_alloc_cyclic(&prog_idr, prog, 1, INT_MAX, GFP_ATOMIC);
+ if (id > 0)
+ prog->aux->id = id;
+ spin_unlock_bh(&prog_idr_lock);
+
+ /* id is in [1, INT_MAX) */
+ if (WARN_ON_ONCE(!id))
+ return -ENOSPC;
+
+ return id > 0 ? 0 : id;
+}
+
+static void bpf_prog_free_id(struct bpf_prog *prog, bool do_idr_lock)
+{
+ /* cBPF to eBPF migrations are currently not in the idr store. */
+ if (!prog->aux->id)
+ return;
+
+ if (do_idr_lock)
+ spin_lock_bh(&prog_idr_lock);
+ else
+ __acquire(&prog_idr_lock);
+
+ idr_remove(&prog_idr, prog->aux->id);
+
+ if (do_idr_lock)
+ spin_unlock_bh(&prog_idr_lock);
+ else
+ __release(&prog_idr_lock);
+}
+
static void __bpf_prog_put_rcu(struct rcu_head *rcu)
{
struct bpf_prog_aux *aux = container_of(rcu, struct bpf_prog_aux, rcu);
@@ -659,14 +773,21 @@ static void __bpf_prog_put_rcu(struct rcu_head *rcu)
bpf_prog_free(aux->prog);
}
-void bpf_prog_put(struct bpf_prog *prog)
+static void __bpf_prog_put(struct bpf_prog *prog, bool do_idr_lock)
{
if (atomic_dec_and_test(&prog->aux->refcnt)) {
trace_bpf_prog_put_rcu(prog);
+ /* bpf_prog_free_id() must be called first */
+ bpf_prog_free_id(prog, do_idr_lock);
bpf_prog_kallsyms_del(prog);
call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu);
}
}
+
+void bpf_prog_put(struct bpf_prog *prog)
+{
+ __bpf_prog_put(prog, true);
+}
EXPORT_SYMBOL_GPL(bpf_prog_put);
static int bpf_prog_release(struct inode *inode, struct file *filp)
@@ -748,6 +869,24 @@ struct bpf_prog *bpf_prog_inc(struct bpf_prog *prog)
}
EXPORT_SYMBOL_GPL(bpf_prog_inc);
+/* prog_idr_lock should have been held */
+static struct bpf_prog *bpf_prog_inc_not_zero(struct bpf_prog *prog)
+{
+ int refold;
+
+ refold = __atomic_add_unless(&prog->aux->refcnt, 1, 0);
+
+ if (refold >= BPF_MAX_REFCNT) {
+ __bpf_prog_put(prog, false);
+ return ERR_PTR(-EBUSY);
+ }
+
+ if (!refold)
+ return ERR_PTR(-ENOENT);
+
+ return prog;
+}
+
static struct bpf_prog *__bpf_prog_get(u32 ufd, enum bpf_prog_type *type)
{
struct fd f = fdget(ufd);
@@ -815,7 +954,9 @@ static int bpf_prog_load(union bpf_attr *attr)
attr->kern_version != LINUX_VERSION_CODE)
return -EINVAL;
- if (type != BPF_PROG_TYPE_SOCKET_FILTER && !capable(CAP_SYS_ADMIN))
+ if (type != BPF_PROG_TYPE_SOCKET_FILTER &&
+ type != BPF_PROG_TYPE_CGROUP_SKB &&
+ !capable(CAP_SYS_ADMIN))
return -EPERM;
/* plain bpf_prog allocation */
@@ -855,11 +996,22 @@ static int bpf_prog_load(union bpf_attr *attr)
if (err < 0)
goto free_used_maps;
- err = bpf_prog_new_fd(prog);
- if (err < 0)
- /* failed to allocate fd */
+ err = bpf_prog_alloc_id(prog);
+ if (err)
goto free_used_maps;
+ err = bpf_prog_new_fd(prog);
+ if (err < 0) {
+ /* failed to allocate fd.
+ * bpf_prog_put() is needed because the above
+ * bpf_prog_alloc_id() has published the prog
+ * to the userspace and the userspace may
+ * have refcnt-ed it through BPF_PROG_GET_FD_BY_ID.
+ */
+ bpf_prog_put(prog);
+ return err;
+ }
+
bpf_prog_kallsyms_add(prog);
trace_bpf_prog_load(prog, err);
return err;
@@ -919,6 +1071,9 @@ static int bpf_prog_attach(const union bpf_attr *attr)
case BPF_CGROUP_INET_SOCK_CREATE:
ptype = BPF_PROG_TYPE_CGROUP_SOCK;
break;
+ case BPF_CGROUP_SOCK_OPS:
+ ptype = BPF_PROG_TYPE_SOCK_OPS;
+ break;
default:
return -EINVAL;
}
@@ -959,6 +1114,7 @@ static int bpf_prog_detach(const union bpf_attr *attr)
case BPF_CGROUP_INET_INGRESS:
case BPF_CGROUP_INET_EGRESS:
case BPF_CGROUP_INET_SOCK_CREATE:
+ case BPF_CGROUP_SOCK_OPS:
cgrp = cgroup_get_from_fd(attr->target_fd);
if (IS_ERR(cgrp))
return PTR_ERR(cgrp);
@@ -973,6 +1129,7 @@ static int bpf_prog_detach(const union bpf_attr *attr)
return ret;
}
+
#endif /* CONFIG_CGROUP_BPF */
#define BPF_PROG_TEST_RUN_LAST_FIELD test.duration
@@ -997,6 +1154,237 @@ static int bpf_prog_test_run(const union bpf_attr *attr,
return ret;
}
+#define BPF_OBJ_GET_NEXT_ID_LAST_FIELD next_id
+
+static int bpf_obj_get_next_id(const union bpf_attr *attr,
+ union bpf_attr __user *uattr,
+ struct idr *idr,
+ spinlock_t *lock)
+{
+ u32 next_id = attr->start_id;
+ int err = 0;
+
+ if (CHECK_ATTR(BPF_OBJ_GET_NEXT_ID) || next_id >= INT_MAX)
+ return -EINVAL;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ next_id++;
+ spin_lock_bh(lock);
+ if (!idr_get_next(idr, &next_id))
+ err = -ENOENT;
+ spin_unlock_bh(lock);
+
+ if (!err)
+ err = put_user(next_id, &uattr->next_id);
+
+ return err;
+}
+
+#define BPF_PROG_GET_FD_BY_ID_LAST_FIELD prog_id
+
+static int bpf_prog_get_fd_by_id(const union bpf_attr *attr)
+{
+ struct bpf_prog *prog;
+ u32 id = attr->prog_id;
+ int fd;
+
+ if (CHECK_ATTR(BPF_PROG_GET_FD_BY_ID))
+ return -EINVAL;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ spin_lock_bh(&prog_idr_lock);
+ prog = idr_find(&prog_idr, id);
+ if (prog)
+ prog = bpf_prog_inc_not_zero(prog);
+ else
+ prog = ERR_PTR(-ENOENT);
+ spin_unlock_bh(&prog_idr_lock);
+
+ if (IS_ERR(prog))
+ return PTR_ERR(prog);
+
+ fd = bpf_prog_new_fd(prog);
+ if (fd < 0)
+ bpf_prog_put(prog);
+
+ return fd;
+}
+
+#define BPF_MAP_GET_FD_BY_ID_LAST_FIELD map_id
+
+static int bpf_map_get_fd_by_id(const union bpf_attr *attr)
+{
+ struct bpf_map *map;
+ u32 id = attr->map_id;
+ int fd;
+
+ if (CHECK_ATTR(BPF_MAP_GET_FD_BY_ID))
+ return -EINVAL;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ spin_lock_bh(&map_idr_lock);
+ map = idr_find(&map_idr, id);
+ if (map)
+ map = bpf_map_inc_not_zero(map, true);
+ else
+ map = ERR_PTR(-ENOENT);
+ spin_unlock_bh(&map_idr_lock);
+
+ if (IS_ERR(map))
+ return PTR_ERR(map);
+
+ fd = bpf_map_new_fd(map);
+ if (fd < 0)
+ bpf_map_put(map);
+
+ return fd;
+}
+
+static int check_uarg_tail_zero(void __user *uaddr,
+ size_t expected_size,
+ size_t actual_size)
+{
+ unsigned char __user *addr;
+ unsigned char __user *end;
+ unsigned char val;
+ int err;
+
+ if (actual_size <= expected_size)
+ return 0;
+
+ addr = uaddr + expected_size;
+ end = uaddr + actual_size;
+
+ for (; addr < end; addr++) {
+ err = get_user(val, addr);
+ if (err)
+ return err;
+ if (val)
+ return -E2BIG;
+ }
+
+ return 0;
+}
+
+static int bpf_prog_get_info_by_fd(struct bpf_prog *prog,
+ const union bpf_attr *attr,
+ union bpf_attr __user *uattr)
+{
+ struct bpf_prog_info __user *uinfo = u64_to_user_ptr(attr->info.info);
+ struct bpf_prog_info info = {};
+ u32 info_len = attr->info.info_len;
+ char __user *uinsns;
+ u32 ulen;
+ int err;
+
+ err = check_uarg_tail_zero(uinfo, sizeof(info), info_len);
+ if (err)
+ return err;
+ info_len = min_t(u32, sizeof(info), info_len);
+
+ if (copy_from_user(&info, uinfo, info_len))
+ return -EFAULT;
+
+ info.type = prog->type;
+ info.id = prog->aux->id;
+
+ memcpy(info.tag, prog->tag, sizeof(prog->tag));
+
+ if (!capable(CAP_SYS_ADMIN)) {
+ info.jited_prog_len = 0;
+ info.xlated_prog_len = 0;
+ goto done;
+ }
+
+ ulen = info.jited_prog_len;
+ info.jited_prog_len = prog->jited_len;
+ if (info.jited_prog_len && ulen) {
+ uinsns = u64_to_user_ptr(info.jited_prog_insns);
+ ulen = min_t(u32, info.jited_prog_len, ulen);
+ if (copy_to_user(uinsns, prog->bpf_func, ulen))
+ return -EFAULT;
+ }
+
+ ulen = info.xlated_prog_len;
+ info.xlated_prog_len = bpf_prog_insn_size(prog);
+ if (info.xlated_prog_len && ulen) {
+ uinsns = u64_to_user_ptr(info.xlated_prog_insns);
+ ulen = min_t(u32, info.xlated_prog_len, ulen);
+ if (copy_to_user(uinsns, prog->insnsi, ulen))
+ return -EFAULT;
+ }
+
+done:
+ if (copy_to_user(uinfo, &info, info_len) ||
+ put_user(info_len, &uattr->info.info_len))
+ return -EFAULT;
+
+ return 0;
+}
+
+static int bpf_map_get_info_by_fd(struct bpf_map *map,
+ const union bpf_attr *attr,
+ union bpf_attr __user *uattr)
+{
+ struct bpf_map_info __user *uinfo = u64_to_user_ptr(attr->info.info);
+ struct bpf_map_info info = {};
+ u32 info_len = attr->info.info_len;
+ int err;
+
+ err = check_uarg_tail_zero(uinfo, sizeof(info), info_len);
+ if (err)
+ return err;
+ info_len = min_t(u32, sizeof(info), info_len);
+
+ info.type = map->map_type;
+ info.id = map->id;
+ info.key_size = map->key_size;
+ info.value_size = map->value_size;
+ info.max_entries = map->max_entries;
+ info.map_flags = map->map_flags;
+
+ if (copy_to_user(uinfo, &info, info_len) ||
+ put_user(info_len, &uattr->info.info_len))
+ return -EFAULT;
+
+ return 0;
+}
+
+#define BPF_OBJ_GET_INFO_BY_FD_LAST_FIELD info.info
+
+static int bpf_obj_get_info_by_fd(const union bpf_attr *attr,
+ union bpf_attr __user *uattr)
+{
+ int ufd = attr->info.bpf_fd;
+ struct fd f;
+ int err;
+
+ if (CHECK_ATTR(BPF_OBJ_GET_INFO_BY_FD))
+ return -EINVAL;
+
+ f = fdget(ufd);
+ if (!f.file)
+ return -EBADFD;
+
+ if (f.file->f_op == &bpf_prog_fops)
+ err = bpf_prog_get_info_by_fd(f.file->private_data, attr,
+ uattr);
+ else if (f.file->f_op == &bpf_map_fops)
+ err = bpf_map_get_info_by_fd(f.file->private_data, attr,
+ uattr);
+ else
+ err = -EINVAL;
+
+ fdput(f);
+ return err;
+}
+
SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size)
{
union bpf_attr attr = {};
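
check_uarg_tail_zero(), added above and now shared by the syscall entry point, encodes the forward-compatibility rule for union bpf_attr: a userspace binary built against a newer, larger attribute layout is accepted only if every byte past the size this kernel understands is zero. A standalone, plain-memory illustration (the kernel variant walks userspace memory with get_user()):

static int example_tail_is_zero(const unsigned char *buf,
				unsigned long expected_size,
				unsigned long actual_size)
{
	unsigned long i;

	if (actual_size <= expected_size)
		return 0;			/* same-size or older userspace: accept */
	for (i = expected_size; i < actual_size; i++)
		if (buf[i])
			return -1;		/* the kernel returns -E2BIG here */
	return 0;
}
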
@@ -1016,23 +1404,10 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
* user-space does not rely on any kernel feature
 * extensions we don't know about yet.
*/
- if (size > sizeof(attr)) {
- unsigned char __user *addr;
- unsigned char __user *end;
- unsigned char val;
-
- addr = (void __user *)uattr + sizeof(attr);
- end = (void __user *)uattr + size;
-
- for (; addr < end; addr++) {
- err = get_user(val, addr);
- if (err)
- return err;
- if (val)
- return -E2BIG;
- }
- size = sizeof(attr);
- }
+ err = check_uarg_tail_zero(uattr, sizeof(attr), size);
+ if (err)
+ return err;
+ size = min_t(u32, size, sizeof(attr));
/* copy attributes from user space, may be less than sizeof(bpf_attr) */
if (copy_from_user(&attr, uattr, size) != 0)
@@ -1074,6 +1449,23 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
case BPF_PROG_TEST_RUN:
err = bpf_prog_test_run(&attr, uattr);
break;
+ case BPF_PROG_GET_NEXT_ID:
+ err = bpf_obj_get_next_id(&attr, uattr,
+ &prog_idr, &prog_idr_lock);
+ break;
+ case BPF_MAP_GET_NEXT_ID:
+ err = bpf_obj_get_next_id(&attr, uattr,
+ &map_idr, &map_idr_lock);
+ break;
+ case BPF_PROG_GET_FD_BY_ID:
+ err = bpf_prog_get_fd_by_id(&attr);
+ break;
+ case BPF_MAP_GET_FD_BY_ID:
+ err = bpf_map_get_fd_by_id(&attr);
+ break;
+ case BPF_OBJ_GET_INFO_BY_FD:
+ err = bpf_obj_get_info_by_fd(&attr, uattr);
+ break;
default:
err = -EINVAL;
break;
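
The new commands give userspace a way to enumerate loaded objects by id instead of needing an fd up front. A hedged userspace sketch, assuming a uapi <linux/bpf.h> that carries the BPF_*_GET_NEXT_ID / *_GET_FD_BY_ID / OBJ_GET_INFO_BY_FD definitions from this series and using a raw bpf(2) syscall wrapper (CAP_SYS_ADMIN required):

#include <linux/bpf.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>

static int sys_bpf(int cmd, union bpf_attr *attr, unsigned int size)
{
	return syscall(__NR_bpf, cmd, attr, size);
}

static void dump_all_prog_ids(void)
{
	union bpf_attr attr;
	struct bpf_prog_info info;
	__u32 id = 0;
	int fd;

	for (;;) {
		memset(&attr, 0, sizeof(attr));	/* unused tail bytes must stay zero */
		attr.start_id = id;
		if (sys_bpf(BPF_PROG_GET_NEXT_ID, &attr, sizeof(attr)))
			break;			/* -ENOENT: walked past the last id */
		id = attr.next_id;

		memset(&attr, 0, sizeof(attr));
		attr.prog_id = id;
		fd = sys_bpf(BPF_PROG_GET_FD_BY_ID, &attr, sizeof(attr));
		if (fd < 0)
			continue;		/* program went away in between */

		memset(&info, 0, sizeof(info));
		memset(&attr, 0, sizeof(attr));
		attr.info.bpf_fd = fd;
		attr.info.info_len = sizeof(info);
		attr.info.info = (__u64)(unsigned long)&info;
		if (!sys_bpf(BPF_OBJ_GET_INFO_BY_FD, &attr, sizeof(attr)))
			printf("prog id %u, type %u\n", info.id, info.type);
		close(fd);
	}
}
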
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index a8a725697bed..664d93972373 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -504,6 +504,7 @@ static void reset_reg_range_values(struct bpf_reg_state *regs, u32 regno)
{
regs[regno].min_value = BPF_REGISTER_MIN_RANGE;
regs[regno].max_value = BPF_REGISTER_MAX_RANGE;
+ regs[regno].value_from_signed = false;
regs[regno].min_align = 0;
}
@@ -546,20 +547,6 @@ static int check_reg_arg(struct bpf_reg_state *regs, u32 regno,
return 0;
}
-static int bpf_size_to_bytes(int bpf_size)
-{
- if (bpf_size == BPF_W)
- return 4;
- else if (bpf_size == BPF_H)
- return 2;
- else if (bpf_size == BPF_B)
- return 1;
- else if (bpf_size == BPF_DW)
- return 8;
- else
- return -EINVAL;
-}
-
static bool is_spillable_regtype(enum bpf_reg_type type)
{
switch (type) {
@@ -758,15 +745,29 @@ static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off,
}
/* check access to 'struct bpf_context' fields */
-static int check_ctx_access(struct bpf_verifier_env *env, int off, int size,
+static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off, int size,
enum bpf_access_type t, enum bpf_reg_type *reg_type)
{
+ struct bpf_insn_access_aux info = {
+ .reg_type = *reg_type,
+ };
+
/* for analyzer ctx accesses are already validated and converted */
if (env->analyzer_ops)
return 0;
if (env->prog->aux->ops->is_valid_access &&
- env->prog->aux->ops->is_valid_access(off, size, t, reg_type)) {
+ env->prog->aux->ops->is_valid_access(off, size, t, &info)) {
+ /* A non-zero info.ctx_field_size indicates that this field is a
+ * candidate for later verifier transformation to load the whole
+ * field and then apply a mask when accessed with a narrower
+ * access than actual ctx access size. A zero info.ctx_field_size
+ * will only allow for whole field access and rejects any other
+ * type of narrower access.
+ */
+ env->insn_aux_data[insn_idx].ctx_field_size = info.ctx_field_size;
+ *reg_type = info.reg_type;
+
/* remember the offset of last byte accessed in ctx */
if (env->prog->aux->max_ctx_offset < off + size)
env->prog->aux->max_ctx_offset = off + size;
@@ -777,12 +778,13 @@ static int check_ctx_access(struct bpf_verifier_env *env, int off, int size,
return -EACCES;
}
-static bool is_pointer_value(struct bpf_verifier_env *env, int regno)
+static bool __is_pointer_value(bool allow_ptr_leaks,
+ const struct bpf_reg_state *reg)
{
- if (env->allow_ptr_leaks)
+ if (allow_ptr_leaks)
return false;
- switch (env->cur_state.regs[regno].type) {
+ switch (reg->type) {
case UNKNOWN_VALUE:
case CONST_IMM:
return false;
@@ -791,6 +793,11 @@ static bool is_pointer_value(struct bpf_verifier_env *env, int regno)
}
}
+static bool is_pointer_value(struct bpf_verifier_env *env, int regno)
+{
+ return __is_pointer_value(env->allow_ptr_leaks, &env->cur_state.regs[regno]);
+}
+
static int check_pkt_ptr_alignment(const struct bpf_reg_state *reg,
int off, int size, bool strict)
{
@@ -868,7 +875,7 @@ static int check_ptr_alignment(struct bpf_verifier_env *env,
* if t==write && value_regno==-1, some unknown value is stored into memory
* if t==read && value_regno==-1, don't care what we read from memory
*/
-static int check_mem_access(struct bpf_verifier_env *env, u32 regno, int off,
+static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regno, int off,
int bpf_size, enum bpf_access_type t,
int value_regno)
{
@@ -911,7 +918,7 @@ static int check_mem_access(struct bpf_verifier_env *env, u32 regno, int off,
verbose("R%d leaks addr into ctx\n", value_regno);
return -EACCES;
}
- err = check_ctx_access(env, off, size, t, &reg_type);
+ err = check_ctx_access(env, insn_idx, off, size, t, &reg_type);
if (!err && t == BPF_READ && value_regno >= 0) {
mark_reg_unknown_value_and_range(state->regs,
value_regno);
@@ -926,6 +933,10 @@ static int check_mem_access(struct bpf_verifier_env *env, u32 regno, int off,
verbose("invalid stack off=%d size=%d\n", off, size);
return -EACCES;
}
+
+ if (env->prog->aux->stack_depth < -off)
+ env->prog->aux->stack_depth = -off;
+
if (t == BPF_WRITE) {
if (!env->allow_ptr_leaks &&
state->stack_slot_type[MAX_BPF_STACK + off] == STACK_SPILL &&
@@ -968,7 +979,7 @@ static int check_mem_access(struct bpf_verifier_env *env, u32 regno, int off,
return err;
}
-static int check_xadd(struct bpf_verifier_env *env, struct bpf_insn *insn)
+static int check_xadd(struct bpf_verifier_env *env, int insn_idx, struct bpf_insn *insn)
{
struct bpf_reg_state *regs = env->cur_state.regs;
int err;
@@ -995,13 +1006,13 @@ static int check_xadd(struct bpf_verifier_env *env, struct bpf_insn *insn)
}
/* check whether atomic_add can read the memory */
- err = check_mem_access(env, insn->dst_reg, insn->off,
+ err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off,
BPF_SIZE(insn->code), BPF_READ, -1);
if (err)
return err;
/* check whether atomic_add can write into the same memory */
- return check_mem_access(env, insn->dst_reg, insn->off,
+ return check_mem_access(env, insn_idx, insn->dst_reg, insn->off,
BPF_SIZE(insn->code), BPF_WRITE, -1);
}
@@ -1037,6 +1048,9 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno,
return -EACCES;
}
+ if (env->prog->aux->stack_depth < -off)
+ env->prog->aux->stack_depth = -off;
+
if (meta && meta->raw_mode) {
meta->access_size = access_size;
meta->regno = regno;
@@ -1344,8 +1358,8 @@ static void clear_all_pkt_pointers(struct bpf_verifier_env *env)
if (reg->type != PTR_TO_PACKET &&
reg->type != PTR_TO_PACKET_END)
continue;
- reg->type = UNKNOWN_VALUE;
- reg->imm = 0;
+ __mark_reg_unknown_value(state->spilled_regs,
+ i / BPF_REG_SIZE);
}
}
@@ -1414,7 +1428,7 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx)
* is inferred from register state.
*/
for (i = 0; i < meta.access_size; i++) {
- err = check_mem_access(env, meta.regno, i, BPF_B, BPF_WRITE, -1);
+ err = check_mem_access(env, insn_idx, meta.regno, i, BPF_B, BPF_WRITE, -1);
if (err)
return err;
}
@@ -1650,6 +1664,65 @@ static int evaluate_reg_alu(struct bpf_verifier_env *env, struct bpf_insn *insn)
return 0;
}
+static int evaluate_reg_imm_alu_unknown(struct bpf_verifier_env *env,
+ struct bpf_insn *insn)
+{
+ struct bpf_reg_state *regs = env->cur_state.regs;
+ struct bpf_reg_state *dst_reg = &regs[insn->dst_reg];
+ struct bpf_reg_state *src_reg = &regs[insn->src_reg];
+ u8 opcode = BPF_OP(insn->code);
+ s64 imm_log2 = __ilog2_u64((long long)dst_reg->imm);
+
+ /* BPF_X code with src_reg->type UNKNOWN_VALUE here. */
+ if (src_reg->imm > 0 && dst_reg->imm) {
+ switch (opcode) {
+ case BPF_ADD:
+ /* dreg += sreg
+ * where both have zero upper bits. Adding them
+ * can only result making one more bit non-zero
+ * in the larger value.
+ * Ex. 0xffff (imm=48) + 1 (imm=63) = 0x10000 (imm=47)
+ * 0xffff (imm=48) + 0xffff = 0x1fffe (imm=47)
+ */
+ dst_reg->imm = min(src_reg->imm, 63 - imm_log2);
+ dst_reg->imm--;
+ break;
+ case BPF_AND:
+ /* dreg &= sreg
+ * AND can not extend zero bits only shrink
+ * Ex. 0x00..00ffffff
+ * & 0x0f..ffffffff
+ * ----------------
+ * 0x00..00ffffff
+ */
+ dst_reg->imm = max(src_reg->imm, 63 - imm_log2);
+ break;
+ case BPF_OR:
+ /* dreg |= sreg
+ * OR can only extend zero bits
+ * Ex. 0x00..00ffffff
+ * | 0x0f..ffffffff
+ * ----------------
+ * 0x0f..00ffffff
+ */
+ dst_reg->imm = min(src_reg->imm, 63 - imm_log2);
+ break;
+ case BPF_SUB:
+ case BPF_MUL:
+ case BPF_RSH:
+ case BPF_LSH:
+ /* These may be flushed out later */
+ default:
+ mark_reg_unknown_value(regs, insn->dst_reg);
+ }
+ } else {
+ mark_reg_unknown_value(regs, insn->dst_reg);
+ }
+
+ dst_reg->type = UNKNOWN_VALUE;
+ return 0;
+}
+
static int evaluate_reg_imm_alu(struct bpf_verifier_env *env,
struct bpf_insn *insn)
{
@@ -1659,6 +1732,9 @@ static int evaluate_reg_imm_alu(struct bpf_verifier_env *env,
u8 opcode = BPF_OP(insn->code);
u64 dst_imm = dst_reg->imm;
+ if (BPF_SRC(insn->code) == BPF_X && src_reg->type == UNKNOWN_VALUE)
+ return evaluate_reg_imm_alu_unknown(env, insn);
+
/* dst_reg->type == CONST_IMM here. Simulate execution of insns
* containing ALU ops. Don't care about overflow or negative
* values, just add/sub/... them; registers are in u64.
@@ -1763,10 +1839,24 @@ static void adjust_reg_min_max_vals(struct bpf_verifier_env *env,
dst_align = dst_reg->min_align;
/* We don't know anything about what was done to this register, mark it
- * as unknown.
+ * as unknown. Also, if both derived bounds came from signed/unsigned
+ * mixed compares and one side is unbounded, we cannot really do anything
+ * with them as boundaries cannot be trusted. Thus, arithmetic of two
+ * regs of such kind will get invalidated bounds on the dst side.
*/
- if (min_val == BPF_REGISTER_MIN_RANGE &&
- max_val == BPF_REGISTER_MAX_RANGE) {
+ if ((min_val == BPF_REGISTER_MIN_RANGE &&
+ max_val == BPF_REGISTER_MAX_RANGE) ||
+ (BPF_SRC(insn->code) == BPF_X &&
+ ((min_val != BPF_REGISTER_MIN_RANGE &&
+ max_val == BPF_REGISTER_MAX_RANGE) ||
+ (min_val == BPF_REGISTER_MIN_RANGE &&
+ max_val != BPF_REGISTER_MAX_RANGE) ||
+ (dst_reg->min_value != BPF_REGISTER_MIN_RANGE &&
+ dst_reg->max_value == BPF_REGISTER_MAX_RANGE) ||
+ (dst_reg->min_value == BPF_REGISTER_MIN_RANGE &&
+ dst_reg->max_value != BPF_REGISTER_MAX_RANGE)) &&
+ regs[insn->dst_reg].value_from_signed !=
+ regs[insn->src_reg].value_from_signed)) {
reset_reg_range_values(regs, insn->dst_reg);
return;
}
@@ -1775,10 +1865,12 @@ static void adjust_reg_min_max_vals(struct bpf_verifier_env *env,
* do our normal operations to the register, we need to set the values
* to the min/max since they are undefined.
*/
- if (min_val == BPF_REGISTER_MIN_RANGE)
- dst_reg->min_value = BPF_REGISTER_MIN_RANGE;
- if (max_val == BPF_REGISTER_MAX_RANGE)
- dst_reg->max_value = BPF_REGISTER_MAX_RANGE;
+ if (opcode != BPF_SUB) {
+ if (min_val == BPF_REGISTER_MIN_RANGE)
+ dst_reg->min_value = BPF_REGISTER_MIN_RANGE;
+ if (max_val == BPF_REGISTER_MAX_RANGE)
+ dst_reg->max_value = BPF_REGISTER_MAX_RANGE;
+ }
switch (opcode) {
case BPF_ADD:
@@ -1789,10 +1881,17 @@ static void adjust_reg_min_max_vals(struct bpf_verifier_env *env,
dst_reg->min_align = min(src_align, dst_align);
break;
case BPF_SUB:
+ /* If one of our values was at the end of our ranges, then the
+ * _opposite_ value in the dst_reg goes to the end of our range.
+ */
+ if (min_val == BPF_REGISTER_MIN_RANGE)
+ dst_reg->max_value = BPF_REGISTER_MAX_RANGE;
+ if (max_val == BPF_REGISTER_MAX_RANGE)
+ dst_reg->min_value = BPF_REGISTER_MIN_RANGE;
if (dst_reg->min_value != BPF_REGISTER_MIN_RANGE)
- dst_reg->min_value -= min_val;
+ dst_reg->min_value -= max_val;
if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE)
- dst_reg->max_value -= max_val;
+ dst_reg->max_value -= min_val;
dst_reg->min_align = min(src_align, dst_align);
break;
case BPF_MUL:
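
The reworked BPF_SUB case above is ordinary interval arithmetic: for dst in [dst_min, dst_max] and src in [src_min, src_max], the difference lies in [dst_min - src_max, dst_max - src_min], i.e. each destination bound is pushed by the opposite source bound (which is also why an unbounded source bound now invalidates the opposite destination bound). A tiny standalone illustration:

struct range { long long min, max; };

static struct range range_sub(struct range dst, struct range src)
{
	struct range r;

	r.min = dst.min - src.max;	/* smallest result: smallest dst minus largest src */
	r.max = dst.max - src.min;	/* largest result: largest dst minus smallest src */
	return r;
}
/* e.g. dst = [10, 20], src = [3, 5]  ->  dst - src = [5, 17] */
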
@@ -1950,9 +2049,11 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
*/
regs[insn->dst_reg].type = CONST_IMM;
regs[insn->dst_reg].imm = insn->imm;
+ regs[insn->dst_reg].id = 0;
regs[insn->dst_reg].max_value = insn->imm;
regs[insn->dst_reg].min_value = insn->imm;
regs[insn->dst_reg].min_align = calc_align(insn->imm);
+ regs[insn->dst_reg].value_from_signed = false;
}
} else if (opcode > BPF_END) {
@@ -2128,40 +2229,63 @@ static void reg_set_min_max(struct bpf_reg_state *true_reg,
struct bpf_reg_state *false_reg, u64 val,
u8 opcode)
{
+ bool value_from_signed = true;
+ bool is_range = true;
+
switch (opcode) {
case BPF_JEQ:
/* If this is false then we know nothing Jon Snow, but if it is
* true then we know for sure.
*/
true_reg->max_value = true_reg->min_value = val;
+ is_range = false;
break;
case BPF_JNE:
/* If this is true we know nothing Jon Snow, but if it is false
* we know the value for sure;
*/
false_reg->max_value = false_reg->min_value = val;
+ is_range = false;
break;
case BPF_JGT:
- /* Unsigned comparison, the minimum value is 0. */
- false_reg->min_value = 0;
+ value_from_signed = false;
/* fallthrough */
case BPF_JSGT:
+ if (true_reg->value_from_signed != value_from_signed)
+ reset_reg_range_values(true_reg, 0);
+ if (false_reg->value_from_signed != value_from_signed)
+ reset_reg_range_values(false_reg, 0);
+ if (opcode == BPF_JGT) {
+ /* Unsigned comparison, the minimum value is 0. */
+ false_reg->min_value = 0;
+ }
/* If this is false then we know the maximum val is val,
* otherwise we know the min val is val+1.
*/
false_reg->max_value = val;
+ false_reg->value_from_signed = value_from_signed;
true_reg->min_value = val + 1;
+ true_reg->value_from_signed = value_from_signed;
break;
case BPF_JGE:
- /* Unsigned comparison, the minimum value is 0. */
- false_reg->min_value = 0;
+ value_from_signed = false;
/* fallthrough */
case BPF_JSGE:
+ if (true_reg->value_from_signed != value_from_signed)
+ reset_reg_range_values(true_reg, 0);
+ if (false_reg->value_from_signed != value_from_signed)
+ reset_reg_range_values(false_reg, 0);
+ if (opcode == BPF_JGE) {
+ /* Unsigned comparison, the minimum value is 0. */
+ false_reg->min_value = 0;
+ }
/* If this is false then we know the maximum value is val - 1,
* otherwise we know the minimum value is val.
*/
false_reg->max_value = val - 1;
+ false_reg->value_from_signed = value_from_signed;
true_reg->min_value = val;
+ true_reg->value_from_signed = value_from_signed;
break;
default:
break;
@@ -2169,6 +2293,12 @@ static void reg_set_min_max(struct bpf_reg_state *true_reg,
check_reg_overflow(false_reg);
check_reg_overflow(true_reg);
+ if (is_range) {
+ if (__is_pointer_value(false, false_reg))
+ reset_reg_range_values(false_reg, 0);
+ if (__is_pointer_value(false, true_reg))
+ reset_reg_range_values(true_reg, 0);
+ }
}
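
For reference, the ranges this derives for a simple unsigned branch; the
numbers are illustrative and assume the register was previously unbounded:

    /* Program fragment: "if r1 > 10 goto L1" (BPF_JGT against immediate 10). */
    static void jgt_bounds_example(void)
    {
        unsigned long long val = 10;

        /* taken branch:  r1 in [val + 1, BPF_REGISTER_MAX_RANGE] = [11, MAX] */
        /* fall-through:  r1 in [0, val] = [0, 10] (the unsigned compare pins the minimum) */
        /* Both branches record value_from_signed == false, so a later signed
         * compare (BPF_JSGT/JSGE) on r1 resets these ranges instead of
         * silently mixing signed and unsigned interpretations. */
        (void)val;
    }
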
/* Same as above, but for the case that dst_reg is a CONST_IMM reg and src_reg
@@ -2178,41 +2308,64 @@ static void reg_set_min_max_inv(struct bpf_reg_state *true_reg,
struct bpf_reg_state *false_reg, u64 val,
u8 opcode)
{
+ bool value_from_signed = true;
+ bool is_range = true;
+
switch (opcode) {
case BPF_JEQ:
/* If this is false then we know nothing Jon Snow, but if it is
* true then we know for sure.
*/
true_reg->max_value = true_reg->min_value = val;
+ is_range = false;
break;
case BPF_JNE:
/* If this is true we know nothing Jon Snow, but if it is false
* we know the value for sure;
*/
false_reg->max_value = false_reg->min_value = val;
+ is_range = false;
break;
case BPF_JGT:
- /* Unsigned comparison, the minimum value is 0. */
- true_reg->min_value = 0;
+ value_from_signed = false;
/* fallthrough */
case BPF_JSGT:
+ if (true_reg->value_from_signed != value_from_signed)
+ reset_reg_range_values(true_reg, 0);
+ if (false_reg->value_from_signed != value_from_signed)
+ reset_reg_range_values(false_reg, 0);
+ if (opcode == BPF_JGT) {
+ /* Unsigned comparison, the minimum value is 0. */
+ true_reg->min_value = 0;
+ }
/*
* If this is false, then the val is <= the register, if it is
* true the register is <= the val.
*/
false_reg->min_value = val;
+ false_reg->value_from_signed = value_from_signed;
true_reg->max_value = val - 1;
+ true_reg->value_from_signed = value_from_signed;
break;
case BPF_JGE:
- /* Unsigned comparison, the minimum value is 0. */
- true_reg->min_value = 0;
+ value_from_signed = false;
/* fallthrough */
case BPF_JSGE:
+ if (true_reg->value_from_signed != value_from_signed)
+ reset_reg_range_values(true_reg, 0);
+ if (false_reg->value_from_signed != value_from_signed)
+ reset_reg_range_values(false_reg, 0);
+ if (opcode == BPF_JGE) {
+ /* Unsigned comparison, the minimum value is 0. */
+ true_reg->min_value = 0;
+ }
/* If this is false then constant < register, if it is true then
* the register < constant.
*/
false_reg->min_value = val + 1;
+ false_reg->value_from_signed = value_from_signed;
true_reg->max_value = val;
+ true_reg->value_from_signed = value_from_signed;
break;
default:
break;
@@ -2220,6 +2373,12 @@ static void reg_set_min_max_inv(struct bpf_reg_state *true_reg,
check_reg_overflow(false_reg);
check_reg_overflow(true_reg);
+ if (is_range) {
+ if (__is_pointer_value(false, false_reg))
+ reset_reg_range_values(false_reg, 0);
+ if (__is_pointer_value(false, true_reg))
+ reset_reg_range_values(true_reg, 0);
+ }
}
static void mark_map_reg(struct bpf_reg_state *regs, u32 regno, u32 id,
@@ -2407,6 +2566,7 @@ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn)
regs[insn->dst_reg].type = CONST_IMM;
regs[insn->dst_reg].imm = imm;
+ regs[insn->dst_reg].id = 0;
return 0;
}
@@ -2826,6 +2986,8 @@ static bool states_equal(struct bpf_verifier_env *env,
return false;
if (i % BPF_REG_SIZE)
continue;
+ if (old->stack_slot_type[i] != STACK_SPILL)
+ continue;
if (memcmp(&old->spilled_regs[i / BPF_REG_SIZE],
&cur->spilled_regs[i / BPF_REG_SIZE],
sizeof(old->spilled_regs[0])))
@@ -2987,18 +3149,12 @@ static int do_check(struct bpf_verifier_env *env)
/* check that memory (src_reg + off) is readable,
* the state of dst_reg will be updated by this func
*/
- err = check_mem_access(env, insn->src_reg, insn->off,
+ err = check_mem_access(env, insn_idx, insn->src_reg, insn->off,
BPF_SIZE(insn->code), BPF_READ,
insn->dst_reg);
if (err)
return err;
- if (BPF_SIZE(insn->code) != BPF_W &&
- BPF_SIZE(insn->code) != BPF_DW) {
- insn_idx++;
- continue;
- }
-
prev_src_type = &env->insn_aux_data[insn_idx].ptr_type;
if (*prev_src_type == NOT_INIT) {
@@ -3026,7 +3182,7 @@ static int do_check(struct bpf_verifier_env *env)
enum bpf_reg_type *prev_dst_type, dst_reg_type;
if (BPF_MODE(insn->code) == BPF_XADD) {
- err = check_xadd(env, insn);
+ err = check_xadd(env, insn_idx, insn);
if (err)
return err;
insn_idx++;
@@ -3045,7 +3201,7 @@ static int do_check(struct bpf_verifier_env *env)
dst_reg_type = regs[insn->dst_reg].type;
/* check that memory (dst_reg + off) is writeable */
- err = check_mem_access(env, insn->dst_reg, insn->off,
+ err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off,
BPF_SIZE(insn->code), BPF_WRITE,
insn->src_reg);
if (err)
@@ -3074,7 +3230,7 @@ static int do_check(struct bpf_verifier_env *env)
return err;
/* check that memory (dst_reg + off) is writeable */
- err = check_mem_access(env, insn->dst_reg, insn->off,
+ err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off,
BPF_SIZE(insn->code), BPF_WRITE,
-1);
if (err)
@@ -3172,7 +3328,8 @@ process_bpf_exit:
insn_idx++;
}
- verbose("processed %d insns\n", insn_processed);
+ verbose("processed %d insns, stack depth %d\n",
+ insn_processed, env->prog->aux->stack_depth);
return 0;
}
@@ -3372,11 +3529,13 @@ static struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 of
static int convert_ctx_accesses(struct bpf_verifier_env *env)
{
const struct bpf_verifier_ops *ops = env->prog->aux->ops;
+ int i, cnt, size, ctx_field_size, delta = 0;
const int insn_cnt = env->prog->len;
struct bpf_insn insn_buf[16], *insn;
struct bpf_prog *new_prog;
enum bpf_access_type type;
- int i, cnt, delta = 0;
+ bool is_narrower_load;
+ u32 target_size;
if (ops->gen_prologue) {
cnt = ops->gen_prologue(insn_buf, env->seen_direct_write,
@@ -3416,12 +3575,52 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
if (env->insn_aux_data[i + delta].ptr_type != PTR_TO_CTX)
continue;
- cnt = ops->convert_ctx_access(type, insn, insn_buf, env->prog);
- if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf)) {
+ ctx_field_size = env->insn_aux_data[i + delta].ctx_field_size;
+ size = BPF_LDST_BYTES(insn);
+
+ /* If the read access is a narrower load of the field,
+ * convert to a 4/8-byte load, to minimize program-type-specific
+ * convert_ctx_access changes. If conversion is successful,
+ * we will apply the proper mask to the result.
+ */
+ is_narrower_load = size < ctx_field_size;
+ if (is_narrower_load) {
+ u32 off = insn->off;
+ u8 size_code;
+
+ if (type == BPF_WRITE) {
+ verbose("bpf verifier narrow ctx access misconfigured\n");
+ return -EINVAL;
+ }
+
+ size_code = BPF_H;
+ if (ctx_field_size == 4)
+ size_code = BPF_W;
+ else if (ctx_field_size == 8)
+ size_code = BPF_DW;
+
+ insn->off = off & ~(ctx_field_size - 1);
+ insn->code = BPF_LDX | BPF_MEM | size_code;
+ }
+
+ target_size = 0;
+ cnt = ops->convert_ctx_access(type, insn, insn_buf, env->prog,
+ &target_size);
+ if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf) ||
+ (ctx_field_size && !target_size)) {
verbose("bpf verifier is misconfigured\n");
return -EINVAL;
}
+ if (is_narrower_load && size < target_size) {
+ if (ctx_field_size <= 4)
+ insn_buf[cnt++] = BPF_ALU32_IMM(BPF_AND, insn->dst_reg,
+ (1 << size * 8) - 1);
+ else
+ insn_buf[cnt++] = BPF_ALU64_IMM(BPF_AND, insn->dst_reg,
+ (1 << size * 8) - 1);
+ }
+
new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
if (!new_prog)
return -ENOMEM;
@@ -3467,6 +3666,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
* the program array.
*/
prog->cb_access = 1;
+ env->prog->aux->stack_depth = MAX_BPF_STACK;
/* mark bpf_tail_call as different opcode to avoid
* conditional branch in the interpreter for every normal
@@ -3474,7 +3674,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
* that doesn't support bpf_tail_call yet
*/
insn->imm = 0;
- insn->code |= BPF_X;
+ insn->code = BPF_JMP | BPF_TAIL_CALL;
continue;
}
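
As a concrete illustration of the narrow-load rewrite (a sketch, not taken from
the patch): a 1-byte read of a 4-byte context field at offset 0 is widened to a
word load and then masked back down, using the insn macros from
<linux/filter.h>:

    #include <linux/filter.h>

    static void narrow_load_rewrite_example(void)
    {
        /* What the verifier conceptually produces for "r2 = *(u8 *)(r1 + 0)"
         * when the underlying context field is 4 bytes wide
         * (ctx_field_size == 4, size == 1). */
        struct bpf_insn rewritten[] = {
            /* widened load; off is rounded down with off & ~(ctx_field_size - 1) */
            BPF_LDX_MEM(BPF_W, BPF_REG_2, BPF_REG_1, 0),
            /* mask back to the single byte asked for: (1 << 1 * 8) - 1 == 0xff */
            BPF_ALU32_IMM(BPF_AND, BPF_REG_2, (1 << 8) - 1),
        };
        (void)rewritten;
    }

In the real flow the widened load still passes through the program-type's
convert_ctx_access() before the AND is appended, so the emitted sequence may
contain additional instructions.
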
diff --git a/kernel/cgroup/Makefile b/kernel/cgroup/Makefile
index 387348a40c64..ce693ccb8c58 100644
--- a/kernel/cgroup/Makefile
+++ b/kernel/cgroup/Makefile
@@ -4,3 +4,4 @@ obj-$(CONFIG_CGROUP_FREEZER) += freezer.o
obj-$(CONFIG_CGROUP_PIDS) += pids.o
obj-$(CONFIG_CGROUP_RDMA) += rdma.o
obj-$(CONFIG_CPUSETS) += cpuset.o
+obj-$(CONFIG_CGROUP_DEBUG) += debug.o
diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h
index 00f4d6bf048f..8b4c3c2f2509 100644
--- a/kernel/cgroup/cgroup-internal.h
+++ b/kernel/cgroup/cgroup-internal.h
@@ -33,6 +33,9 @@ struct cgroup_taskset {
struct list_head src_csets;
struct list_head dst_csets;
+ /* the number of tasks in the set */
+ int nr_tasks;
+
/* the subsys currently being processed */
int ssid;
@@ -192,6 +195,8 @@ int cgroup_rmdir(struct kernfs_node *kn);
int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node,
struct kernfs_root *kf_root);
+int cgroup_task_count(const struct cgroup *cgrp);
+
/*
* namespace.c
*/
diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c
index 85d75152402d..7bf4b1533f34 100644
--- a/kernel/cgroup/cgroup-v1.c
+++ b/kernel/cgroup/cgroup-v1.c
@@ -334,19 +334,15 @@ static struct cgroup_pidlist *cgroup_pidlist_find_create(struct cgroup *cgrp,
/**
* cgroup_task_count - count the number of tasks in a cgroup.
* @cgrp: the cgroup in question
- *
- * Return the number of tasks in the cgroup. The returned number can be
- * higher than the actual number of tasks due to css_set references from
- * namespace roots and temporary usages.
*/
-static int cgroup_task_count(const struct cgroup *cgrp)
+int cgroup_task_count(const struct cgroup *cgrp)
{
int count = 0;
struct cgrp_cset_link *link;
spin_lock_irq(&css_set_lock);
list_for_each_entry(link, &cgrp->cset_links, cset_link)
- count += refcount_read(&link->cset->refcount);
+ count += link->cset->nr_tasks;
spin_unlock_irq(&css_set_lock);
return count;
}
@@ -1263,150 +1259,3 @@ static int __init cgroup_no_v1(char *str)
return 1;
}
__setup("cgroup_no_v1=", cgroup_no_v1);
-
-
-#ifdef CONFIG_CGROUP_DEBUG
-static struct cgroup_subsys_state *
-debug_css_alloc(struct cgroup_subsys_state *parent_css)
-{
- struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL);
-
- if (!css)
- return ERR_PTR(-ENOMEM);
-
- return css;
-}
-
-static void debug_css_free(struct cgroup_subsys_state *css)
-{
- kfree(css);
-}
-
-static u64 debug_taskcount_read(struct cgroup_subsys_state *css,
- struct cftype *cft)
-{
- return cgroup_task_count(css->cgroup);
-}
-
-static u64 current_css_set_read(struct cgroup_subsys_state *css,
- struct cftype *cft)
-{
- return (u64)(unsigned long)current->cgroups;
-}
-
-static u64 current_css_set_refcount_read(struct cgroup_subsys_state *css,
- struct cftype *cft)
-{
- u64 count;
-
- rcu_read_lock();
- count = refcount_read(&task_css_set(current)->refcount);
- rcu_read_unlock();
- return count;
-}
-
-static int current_css_set_cg_links_read(struct seq_file *seq, void *v)
-{
- struct cgrp_cset_link *link;
- struct css_set *cset;
- char *name_buf;
-
- name_buf = kmalloc(NAME_MAX + 1, GFP_KERNEL);
- if (!name_buf)
- return -ENOMEM;
-
- spin_lock_irq(&css_set_lock);
- rcu_read_lock();
- cset = rcu_dereference(current->cgroups);
- list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
- struct cgroup *c = link->cgrp;
-
- cgroup_name(c, name_buf, NAME_MAX + 1);
- seq_printf(seq, "Root %d group %s\n",
- c->root->hierarchy_id, name_buf);
- }
- rcu_read_unlock();
- spin_unlock_irq(&css_set_lock);
- kfree(name_buf);
- return 0;
-}
-
-#define MAX_TASKS_SHOWN_PER_CSS 25
-static int cgroup_css_links_read(struct seq_file *seq, void *v)
-{
- struct cgroup_subsys_state *css = seq_css(seq);
- struct cgrp_cset_link *link;
-
- spin_lock_irq(&css_set_lock);
- list_for_each_entry(link, &css->cgroup->cset_links, cset_link) {
- struct css_set *cset = link->cset;
- struct task_struct *task;
- int count = 0;
-
- seq_printf(seq, "css_set %pK\n", cset);
-
- list_for_each_entry(task, &cset->tasks, cg_list) {
- if (count++ > MAX_TASKS_SHOWN_PER_CSS)
- goto overflow;
- seq_printf(seq, " task %d\n", task_pid_vnr(task));
- }
-
- list_for_each_entry(task, &cset->mg_tasks, cg_list) {
- if (count++ > MAX_TASKS_SHOWN_PER_CSS)
- goto overflow;
- seq_printf(seq, " task %d\n", task_pid_vnr(task));
- }
- continue;
- overflow:
- seq_puts(seq, " ...\n");
- }
- spin_unlock_irq(&css_set_lock);
- return 0;
-}
-
-static u64 releasable_read(struct cgroup_subsys_state *css, struct cftype *cft)
-{
- return (!cgroup_is_populated(css->cgroup) &&
- !css_has_online_children(&css->cgroup->self));
-}
-
-static struct cftype debug_files[] = {
- {
- .name = "taskcount",
- .read_u64 = debug_taskcount_read,
- },
-
- {
- .name = "current_css_set",
- .read_u64 = current_css_set_read,
- },
-
- {
- .name = "current_css_set_refcount",
- .read_u64 = current_css_set_refcount_read,
- },
-
- {
- .name = "current_css_set_cg_links",
- .seq_show = current_css_set_cg_links_read,
- },
-
- {
- .name = "cgroup_css_links",
- .seq_show = cgroup_css_links_read,
- },
-
- {
- .name = "releasable",
- .read_u64 = releasable_read,
- },
-
- { } /* terminate */
-};
-
-struct cgroup_subsys debug_cgrp_subsys = {
- .css_alloc = debug_css_alloc,
- .css_free = debug_css_free,
- .legacy_cftypes = debug_files,
-};
-#endif /* CONFIG_CGROUP_DEBUG */
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 8d4e85eae42c..df2e0f14a95d 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -573,6 +573,11 @@ static int css_set_count = 1; /* 1 for init_css_set */
/**
* css_set_populated - does a css_set contain any tasks?
* @cset: target css_set
+ *
+ * css_set_populated() should be the same as !!cset->nr_tasks at steady
+ * state. However, css_set_populated() can be called while a task is being
+ * added to or removed from the linked list before ->nr_tasks is
+ * properly updated. Hence, we can't just look at ->nr_tasks here.
*/
static bool css_set_populated(struct css_set *cset)
{
@@ -1542,10 +1547,56 @@ int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node,
return len;
}
+static int parse_cgroup_root_flags(char *data, unsigned int *root_flags)
+{
+ char *token;
+
+ *root_flags = 0;
+
+ if (!data)
+ return 0;
+
+ while ((token = strsep(&data, ",")) != NULL) {
+ if (!strcmp(token, "nsdelegate")) {
+ *root_flags |= CGRP_ROOT_NS_DELEGATE;
+ continue;
+ }
+
+ pr_err("cgroup2: unknown option \"%s\"\n", token);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static void apply_cgroup_root_flags(unsigned int root_flags)
+{
+ if (current->nsproxy->cgroup_ns == &init_cgroup_ns) {
+ if (root_flags & CGRP_ROOT_NS_DELEGATE)
+ cgrp_dfl_root.flags |= CGRP_ROOT_NS_DELEGATE;
+ else
+ cgrp_dfl_root.flags &= ~CGRP_ROOT_NS_DELEGATE;
+ }
+}
+
+static int cgroup_show_options(struct seq_file *seq, struct kernfs_root *kf_root)
+{
+ if (cgrp_dfl_root.flags & CGRP_ROOT_NS_DELEGATE)
+ seq_puts(seq, ",nsdelegate");
+ return 0;
+}
+
static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data)
{
- pr_err("remount is not allowed\n");
- return -EINVAL;
+ unsigned int root_flags;
+ int ret;
+
+ ret = parse_cgroup_root_flags(data, &root_flags);
+ if (ret)
+ return ret;
+
+ apply_cgroup_root_flags(root_flags);
+ return 0;
}
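
With cgroup_remount() now accepting options, the nsdelegate flag can be toggled
from userspace; a hedged sketch (the mount point and the example program are
conventional assumptions, not mandated by the patch):

    #include <stdio.h>
    #include <sys/mount.h>

    int main(void)
    {
        /* Remount the cgroup2 hierarchy with namespace delegation enabled;
         * parse_cgroup_root_flags() above turns "nsdelegate" into
         * CGRP_ROOT_NS_DELEGATE, and omitting the option on a later remount
         * clears it again. The flag only takes effect when the caller is in
         * the init cgroup namespace (see apply_cgroup_root_flags()). */
        if (mount(NULL, "/sys/fs/cgroup", NULL, MS_REMOUNT, "nsdelegate")) {
            perror("remount cgroup2");
            return 1;
        }
        return 0;
    }
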
/*
@@ -1598,6 +1649,7 @@ static void cgroup_enable_task_cg_lists(void)
css_set_update_populated(cset, true);
list_add_tail(&p->cg_list, &cset->tasks);
get_css_set(cset);
+ cset->nr_tasks++;
}
spin_unlock(&p->sighand->siglock);
} while_each_thread(g, p);
@@ -1784,6 +1836,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
{
struct cgroup_namespace *ns = current->nsproxy->cgroup_ns;
struct dentry *dentry;
+ int ret;
get_cgroup_ns(ns);
@@ -1801,16 +1854,21 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
cgroup_enable_task_cg_lists();
if (fs_type == &cgroup2_fs_type) {
- if (data) {
- pr_err("cgroup2: unknown option \"%s\"\n", (char *)data);
+ unsigned int root_flags;
+
+ ret = parse_cgroup_root_flags(data, &root_flags);
+ if (ret) {
put_cgroup_ns(ns);
- return ERR_PTR(-EINVAL);
+ return ERR_PTR(ret);
}
+
cgrp_dfl_visible = true;
cgroup_get_live(&cgrp_dfl_root.cgrp);
dentry = cgroup_do_mount(&cgroup2_fs_type, flags, &cgrp_dfl_root,
CGROUP2_SUPER_MAGIC, ns);
+ if (!IS_ERR(dentry))
+ apply_cgroup_root_flags(root_flags);
} else {
dentry = cgroup1_mount(&cgroup_fs_type, flags, data,
CGROUP_SUPER_MAGIC, ns);
@@ -1948,6 +2006,8 @@ static void cgroup_migrate_add_task(struct task_struct *task,
if (!cset->mg_src_cgrp)
return;
+ mgctx->tset.nr_tasks++;
+
list_move_tail(&task->cg_list, &cset->mg_tasks);
if (list_empty(&cset->mg_node))
list_add_tail(&cset->mg_node,
@@ -2036,21 +2096,19 @@ static int cgroup_migrate_execute(struct cgroup_mgctx *mgctx)
struct css_set *cset, *tmp_cset;
int ssid, failed_ssid, ret;
- /* methods shouldn't be called if no task is actually migrating */
- if (list_empty(&tset->src_csets))
- return 0;
-
/* check that we can legitimately attach to the cgroup */
- do_each_subsys_mask(ss, ssid, mgctx->ss_mask) {
- if (ss->can_attach) {
- tset->ssid = ssid;
- ret = ss->can_attach(tset);
- if (ret) {
- failed_ssid = ssid;
- goto out_cancel_attach;
+ if (tset->nr_tasks) {
+ do_each_subsys_mask(ss, ssid, mgctx->ss_mask) {
+ if (ss->can_attach) {
+ tset->ssid = ssid;
+ ret = ss->can_attach(tset);
+ if (ret) {
+ failed_ssid = ssid;
+ goto out_cancel_attach;
+ }
}
- }
- } while_each_subsys_mask();
+ } while_each_subsys_mask();
+ }
/*
* Now that we're guaranteed success, proceed to move all tasks to
@@ -2064,8 +2122,10 @@ static int cgroup_migrate_execute(struct cgroup_mgctx *mgctx)
struct css_set *to_cset = cset->mg_dst_cset;
get_css_set(to_cset);
+ to_cset->nr_tasks++;
css_set_move_task(task, from_cset, to_cset, true);
put_css_set_locked(from_cset);
+ from_cset->nr_tasks--;
}
}
spin_unlock_irq(&css_set_lock);
@@ -2077,25 +2137,29 @@ static int cgroup_migrate_execute(struct cgroup_mgctx *mgctx)
*/
tset->csets = &tset->dst_csets;
- do_each_subsys_mask(ss, ssid, mgctx->ss_mask) {
- if (ss->attach) {
- tset->ssid = ssid;
- ss->attach(tset);
- }
- } while_each_subsys_mask();
+ if (tset->nr_tasks) {
+ do_each_subsys_mask(ss, ssid, mgctx->ss_mask) {
+ if (ss->attach) {
+ tset->ssid = ssid;
+ ss->attach(tset);
+ }
+ } while_each_subsys_mask();
+ }
ret = 0;
goto out_release_tset;
out_cancel_attach:
- do_each_subsys_mask(ss, ssid, mgctx->ss_mask) {
- if (ssid == failed_ssid)
- break;
- if (ss->cancel_attach) {
- tset->ssid = ssid;
- ss->cancel_attach(tset);
- }
- } while_each_subsys_mask();
+ if (tset->nr_tasks) {
+ do_each_subsys_mask(ss, ssid, mgctx->ss_mask) {
+ if (ssid == failed_ssid)
+ break;
+ if (ss->cancel_attach) {
+ tset->ssid = ssid;
+ ss->cancel_attach(tset);
+ }
+ } while_each_subsys_mask();
+ }
out_release_tset:
spin_lock_irq(&css_set_lock);
list_splice_init(&tset->dst_csets, &tset->src_csets);
@@ -2355,27 +2419,14 @@ static int cgroup_procs_write_permission(struct task_struct *task,
struct cgroup *dst_cgrp,
struct kernfs_open_file *of)
{
- int ret = 0;
-
- if (cgroup_on_dfl(dst_cgrp)) {
- struct super_block *sb = of->file->f_path.dentry->d_sb;
- struct cgroup *cgrp;
- struct inode *inode;
-
- spin_lock_irq(&css_set_lock);
- cgrp = task_cgroup_from_root(task, &cgrp_dfl_root);
- spin_unlock_irq(&css_set_lock);
-
- while (!cgroup_is_descendant(dst_cgrp, cgrp))
- cgrp = cgroup_parent(cgrp);
+ struct super_block *sb = of->file->f_path.dentry->d_sb;
+ struct cgroup_namespace *ns = current->nsproxy->cgroup_ns;
+ struct cgroup *root_cgrp = ns->root_cset->dfl_cgrp;
+ struct cgroup *src_cgrp, *com_cgrp;
+ struct inode *inode;
+ int ret;
- ret = -ENOMEM;
- inode = kernfs_get_inode(sb, cgrp->procs_file.kn);
- if (inode) {
- ret = inode_permission(inode, MAY_WRITE);
- iput(inode);
- }
- } else {
+ if (!cgroup_on_dfl(dst_cgrp)) {
const struct cred *cred = current_cred();
const struct cred *tcred = get_task_cred(task);
@@ -2383,14 +2434,47 @@ static int cgroup_procs_write_permission(struct task_struct *task,
* even if we're attaching all tasks in the thread group,
* we only need to check permissions on one of them.
*/
- if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) &&
- !uid_eq(cred->euid, tcred->uid) &&
- !uid_eq(cred->euid, tcred->suid))
+ if (uid_eq(cred->euid, GLOBAL_ROOT_UID) ||
+ uid_eq(cred->euid, tcred->uid) ||
+ uid_eq(cred->euid, tcred->suid))
+ ret = 0;
+ else
ret = -EACCES;
+
put_cred(tcred);
+ return ret;
}
- return ret;
+ /* find the source cgroup */
+ spin_lock_irq(&css_set_lock);
+ src_cgrp = task_cgroup_from_root(task, &cgrp_dfl_root);
+ spin_unlock_irq(&css_set_lock);
+
+ /* and the common ancestor */
+ com_cgrp = src_cgrp;
+ while (!cgroup_is_descendant(dst_cgrp, com_cgrp))
+ com_cgrp = cgroup_parent(com_cgrp);
+
+ /* %current should be authorized to migrate to the common ancestor */
+ inode = kernfs_get_inode(sb, com_cgrp->procs_file.kn);
+ if (!inode)
+ return -ENOMEM;
+
+ ret = inode_permission(inode, MAY_WRITE);
+ iput(inode);
+ if (ret)
+ return ret;
+
+ /*
+ * If namespaces are delegation boundaries, %current must be able
+ * to see both source and destination cgroups from its namespace.
+ */
+ if ((cgrp_dfl_root.flags & CGRP_ROOT_NS_DELEGATE) &&
+ (!cgroup_is_descendant(src_cgrp, root_cgrp) ||
+ !cgroup_is_descendant(dst_cgrp, root_cgrp)))
+ return -ENOENT;
+
+ return 0;
}
/*
@@ -2917,11 +3001,11 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
cgrp->subtree_control &= ~disable;
ret = cgroup_apply_control(cgrp);
-
cgroup_finalize_control(cgrp, ret);
+ if (ret)
+ goto out_unlock;
kernfs_activate(cgrp->kn);
- ret = 0;
out_unlock:
cgroup_kn_unlock(of->kn);
return ret ?: nbytes;
@@ -2954,11 +3038,23 @@ static void cgroup_file_release(struct kernfs_open_file *of)
static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf,
size_t nbytes, loff_t off)
{
+ struct cgroup_namespace *ns = current->nsproxy->cgroup_ns;
struct cgroup *cgrp = of->kn->parent->priv;
struct cftype *cft = of->kn->priv;
struct cgroup_subsys_state *css;
int ret;
+ /*
+ * If namespaces are delegation boundaries, disallow writes to
+ * files in a non-init namespace root from inside the namespace
+ * except for the files explicitly marked delegatable -
+ * cgroup.procs and cgroup.subtree_control.
+ */
+ if ((cgrp->root->flags & CGRP_ROOT_NS_DELEGATE) &&
+ !(cft->flags & CFTYPE_NS_DELEGATABLE) &&
+ ns != &init_cgroup_ns && ns->root_cset->dfl_cgrp == cgrp)
+ return -EPERM;
+
if (cft->write)
return cft->write(of, buf, nbytes, off);
@@ -3792,6 +3888,7 @@ static int cgroup_procs_show(struct seq_file *s, void *v)
static struct cftype cgroup_base_files[] = {
{
.name = "cgroup.procs",
+ .flags = CFTYPE_NS_DELEGATABLE,
.file_offset = offsetof(struct cgroup, procs_file),
.release = cgroup_procs_release,
.seq_start = cgroup_procs_start,
@@ -3805,6 +3902,7 @@ static struct cftype cgroup_base_files[] = {
},
{
.name = "cgroup.subtree_control",
+ .flags = CFTYPE_NS_DELEGATABLE,
.seq_show = cgroup_subtree_control_show,
.write = cgroup_subtree_control_write,
},
@@ -4393,6 +4491,7 @@ int cgroup_rmdir(struct kernfs_node *kn)
}
static struct kernfs_syscall_ops cgroup_kf_syscall_ops = {
+ .show_options = cgroup_show_options,
.remount_fs = cgroup_remount,
.mkdir = cgroup_mkdir,
.rmdir = cgroup_rmdir,
@@ -4574,6 +4673,10 @@ int __init cgroup_init(void)
if (ss->bind)
ss->bind(init_css_set.subsys[ssid]);
+
+ mutex_lock(&cgroup_mutex);
+ css_populate_dir(init_css_set.subsys[ssid]);
+ mutex_unlock(&cgroup_mutex);
}
/* init_css_set.subsys[] has been updated, re-hash */
@@ -4789,6 +4892,7 @@ void cgroup_post_fork(struct task_struct *child)
cset = task_css_set(current);
if (list_empty(&child->cg_list)) {
get_css_set(cset);
+ cset->nr_tasks++;
css_set_move_task(child, NULL, cset, false);
}
spin_unlock_irq(&css_set_lock);
@@ -4838,6 +4942,7 @@ void cgroup_exit(struct task_struct *tsk)
if (!list_empty(&tsk->cg_list)) {
spin_lock_irq(&css_set_lock);
css_set_move_task(tsk, cset, NULL, false);
+ cset->nr_tasks--;
spin_unlock_irq(&css_set_lock);
} else {
get_css_set(cset);
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index ae643412948a..df403e97b073 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -63,6 +63,7 @@
#include <linux/cgroup.h>
#include <linux/wait.h>
+DEFINE_STATIC_KEY_FALSE(cpusets_pre_enable_key);
DEFINE_STATIC_KEY_FALSE(cpusets_enabled_key);
/* See "Frequency meter" comments, below. */
@@ -1038,40 +1039,25 @@ static void cpuset_post_attach(void)
* @tsk: the task to change
* @newmems: new nodes that the task will be set
*
- * In order to avoid seeing no nodes if the old and new nodes are disjoint,
- * we structure updates as setting all new allowed nodes, then clearing newly
- * disallowed ones.
+ * We use the mems_allowed_seq seqlock to safely update both tsk->mems_allowed
+ * and rebind the task's mempolicy, if it has one. If the task is allocating in
+ * parallel, it might temporarily see an empty intersection, which results in
+ * a seqlock check and retry before OOM or allocation failure.
*/
static void cpuset_change_task_nodemask(struct task_struct *tsk,
nodemask_t *newmems)
{
- bool need_loop;
-
task_lock(tsk);
- /*
- * Determine if a loop is necessary if another thread is doing
- * read_mems_allowed_begin(). If at least one node remains unchanged and
- * tsk does not have a mempolicy, then an empty nodemask will not be
- * possible when mems_allowed is larger than a word.
- */
- need_loop = task_has_mempolicy(tsk) ||
- !nodes_intersects(*newmems, tsk->mems_allowed);
- if (need_loop) {
- local_irq_disable();
- write_seqcount_begin(&tsk->mems_allowed_seq);
- }
+ local_irq_disable();
+ write_seqcount_begin(&tsk->mems_allowed_seq);
nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems);
- mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP1);
-
- mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP2);
+ mpol_rebind_task(tsk, newmems);
tsk->mems_allowed = *newmems;
- if (need_loop) {
- write_seqcount_end(&tsk->mems_allowed_seq);
- local_irq_enable();
- }
+ write_seqcount_end(&tsk->mems_allowed_seq);
+ local_irq_enable();
task_unlock(tsk);
}
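
The write side above pairs with the standard read_mems_allowed_begin()/retry()
readers; a rough sketch of the reader-side loop this relies on (not part of
this patch, and simplified relative to the page allocator's actual usage):

    #include <linux/cpuset.h>
    #include <linux/nodemask.h>
    #include <linux/sched.h>

    /* Retry until a consistent tsk->mems_allowed is observed, so a transiently
     * empty nodemask seen mid-update is simply re-read instead of triggering a
     * spurious allocation failure or OOM. */
    static nodemask_t snapshot_mems_allowed(void)
    {
        nodemask_t nodes;
        unsigned int seq;

        do {
            seq = read_mems_allowed_begin();
            nodes = current->mems_allowed;
        } while (read_mems_allowed_retry(seq));

        return nodes;
    }
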
@@ -1906,6 +1892,7 @@ static struct cftype files[] = {
{
.name = "memory_pressure",
.read_u64 = cpuset_read_u64,
+ .private = FILE_MEMORY_PRESSURE,
},
{
@@ -2357,13 +2344,7 @@ void cpuset_update_active_cpus(void)
* We're inside cpu hotplug critical region which usually nests
* inside cgroup synchronization. Bounce actual hotplug processing
* to a work item to avoid reverse locking order.
- *
- * We still need to do partition_sched_domains() synchronously;
- * otherwise, the scheduler will get confused and put tasks to the
- * dead CPU. Fall back to the default single domain.
- * cpuset_hotplug_workfn() will rebuild it as necessary.
*/
- partition_sched_domains(1, NULL, NULL);
schedule_work(&cpuset_hotplug_work);
}
diff --git a/kernel/cgroup/debug.c b/kernel/cgroup/debug.c
new file mode 100644
index 000000000000..dac46af22782
--- /dev/null
+++ b/kernel/cgroup/debug.c
@@ -0,0 +1,357 @@
+/*
+ * Debug controller
+ *
+ * WARNING: This controller is for cgroup core debugging only.
+ * Its interfaces are unstable and subject to changes at any time.
+ */
+#include <linux/ctype.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+
+#include "cgroup-internal.h"
+
+static struct cgroup_subsys_state *
+debug_css_alloc(struct cgroup_subsys_state *parent_css)
+{
+ struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL);
+
+ if (!css)
+ return ERR_PTR(-ENOMEM);
+
+ return css;
+}
+
+static void debug_css_free(struct cgroup_subsys_state *css)
+{
+ kfree(css);
+}
+
+/*
+ * debug_taskcount_read - return the number of tasks in a cgroup.
+ * @cgrp: the cgroup in question
+ */
+static u64 debug_taskcount_read(struct cgroup_subsys_state *css,
+ struct cftype *cft)
+{
+ return cgroup_task_count(css->cgroup);
+}
+
+static int current_css_set_read(struct seq_file *seq, void *v)
+{
+ struct kernfs_open_file *of = seq->private;
+ struct css_set *cset;
+ struct cgroup_subsys *ss;
+ struct cgroup_subsys_state *css;
+ int i, refcnt;
+
+ if (!cgroup_kn_lock_live(of->kn, false))
+ return -ENODEV;
+
+ spin_lock_irq(&css_set_lock);
+ rcu_read_lock();
+ cset = rcu_dereference(current->cgroups);
+ refcnt = refcount_read(&cset->refcount);
+ seq_printf(seq, "css_set %pK %d", cset, refcnt);
+ if (refcnt > cset->nr_tasks)
+ seq_printf(seq, " +%d", refcnt - cset->nr_tasks);
+ seq_puts(seq, "\n");
+
+ /*
+ * Print the css'es stored in the current css_set.
+ */
+ for_each_subsys(ss, i) {
+ css = cset->subsys[ss->id];
+ if (!css)
+ continue;
+ seq_printf(seq, "%2d: %-4s\t- %lx[%d]\n", ss->id, ss->name,
+ (unsigned long)css, css->id);
+ }
+ rcu_read_unlock();
+ spin_unlock_irq(&css_set_lock);
+ cgroup_kn_unlock(of->kn);
+ return 0;
+}
+
+static u64 current_css_set_refcount_read(struct cgroup_subsys_state *css,
+ struct cftype *cft)
+{
+ u64 count;
+
+ rcu_read_lock();
+ count = refcount_read(&task_css_set(current)->refcount);
+ rcu_read_unlock();
+ return count;
+}
+
+static int current_css_set_cg_links_read(struct seq_file *seq, void *v)
+{
+ struct cgrp_cset_link *link;
+ struct css_set *cset;
+ char *name_buf;
+
+ name_buf = kmalloc(NAME_MAX + 1, GFP_KERNEL);
+ if (!name_buf)
+ return -ENOMEM;
+
+ spin_lock_irq(&css_set_lock);
+ rcu_read_lock();
+ cset = rcu_dereference(current->cgroups);
+ list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
+ struct cgroup *c = link->cgrp;
+
+ cgroup_name(c, name_buf, NAME_MAX + 1);
+ seq_printf(seq, "Root %d group %s\n",
+ c->root->hierarchy_id, name_buf);
+ }
+ rcu_read_unlock();
+ spin_unlock_irq(&css_set_lock);
+ kfree(name_buf);
+ return 0;
+}
+
+#define MAX_TASKS_SHOWN_PER_CSS 25
+static int cgroup_css_links_read(struct seq_file *seq, void *v)
+{
+ struct cgroup_subsys_state *css = seq_css(seq);
+ struct cgrp_cset_link *link;
+ int dead_cnt = 0, extra_refs = 0;
+
+ spin_lock_irq(&css_set_lock);
+ list_for_each_entry(link, &css->cgroup->cset_links, cset_link) {
+ struct css_set *cset = link->cset;
+ struct task_struct *task;
+ int count = 0;
+ int refcnt = refcount_read(&cset->refcount);
+
+ seq_printf(seq, " %d", refcnt);
+ if (refcnt - cset->nr_tasks > 0) {
+ int extra = refcnt - cset->nr_tasks;
+
+ seq_printf(seq, " +%d", extra);
+ /*
+ * Take out the one additional reference in
+ * init_css_set.
+ */
+ if (cset == &init_css_set)
+ extra--;
+ extra_refs += extra;
+ }
+ seq_puts(seq, "\n");
+
+ list_for_each_entry(task, &cset->tasks, cg_list) {
+ if (count++ <= MAX_TASKS_SHOWN_PER_CSS)
+ seq_printf(seq, " task %d\n",
+ task_pid_vnr(task));
+ }
+
+ list_for_each_entry(task, &cset->mg_tasks, cg_list) {
+ if (count++ <= MAX_TASKS_SHOWN_PER_CSS)
+ seq_printf(seq, " task %d\n",
+ task_pid_vnr(task));
+ }
+ /* show # of overflowed tasks */
+ if (count > MAX_TASKS_SHOWN_PER_CSS)
+ seq_printf(seq, " ... (%d)\n",
+ count - MAX_TASKS_SHOWN_PER_CSS);
+
+ if (cset->dead) {
+ seq_puts(seq, " [dead]\n");
+ dead_cnt++;
+ }
+
+ WARN_ON(count != cset->nr_tasks);
+ }
+ spin_unlock_irq(&css_set_lock);
+
+ if (!dead_cnt && !extra_refs)
+ return 0;
+
+ seq_puts(seq, "\n");
+ if (extra_refs)
+ seq_printf(seq, "extra references = %d\n", extra_refs);
+ if (dead_cnt)
+ seq_printf(seq, "dead css_sets = %d\n", dead_cnt);
+
+ return 0;
+}
+
+static int cgroup_subsys_states_read(struct seq_file *seq, void *v)
+{
+ struct kernfs_open_file *of = seq->private;
+ struct cgroup *cgrp;
+ struct cgroup_subsys *ss;
+ struct cgroup_subsys_state *css;
+ char pbuf[16];
+ int i;
+
+ cgrp = cgroup_kn_lock_live(of->kn, false);
+ if (!cgrp)
+ return -ENODEV;
+
+ for_each_subsys(ss, i) {
+ css = rcu_dereference_check(cgrp->subsys[ss->id], true);
+ if (!css)
+ continue;
+
+ pbuf[0] = '\0';
+
+ /* Show the parent CSS if applicable */
+ if (css->parent)
+ snprintf(pbuf, sizeof(pbuf) - 1, " P=%d",
+ css->parent->id);
+ seq_printf(seq, "%2d: %-4s\t- %lx[%d] %d%s\n", ss->id, ss->name,
+ (unsigned long)css, css->id,
+ atomic_read(&css->online_cnt), pbuf);
+ }
+
+ cgroup_kn_unlock(of->kn);
+ return 0;
+}
+
+static void cgroup_masks_read_one(struct seq_file *seq, const char *name,
+ u16 mask)
+{
+ struct cgroup_subsys *ss;
+ int ssid;
+ bool first = true;
+
+ seq_printf(seq, "%-17s: ", name);
+ for_each_subsys(ss, ssid) {
+ if (!(mask & (1 << ssid)))
+ continue;
+ if (!first)
+ seq_puts(seq, ", ");
+ seq_puts(seq, ss->name);
+ first = false;
+ }
+ seq_putc(seq, '\n');
+}
+
+static int cgroup_masks_read(struct seq_file *seq, void *v)
+{
+ struct kernfs_open_file *of = seq->private;
+ struct cgroup *cgrp;
+
+ cgrp = cgroup_kn_lock_live(of->kn, false);
+ if (!cgrp)
+ return -ENODEV;
+
+ cgroup_masks_read_one(seq, "subtree_control", cgrp->subtree_control);
+ cgroup_masks_read_one(seq, "subtree_ss_mask", cgrp->subtree_ss_mask);
+
+ cgroup_kn_unlock(of->kn);
+ return 0;
+}
+
+static u64 releasable_read(struct cgroup_subsys_state *css, struct cftype *cft)
+{
+ return (!cgroup_is_populated(css->cgroup) &&
+ !css_has_online_children(&css->cgroup->self));
+}
+
+static struct cftype debug_legacy_files[] = {
+ {
+ .name = "taskcount",
+ .read_u64 = debug_taskcount_read,
+ },
+
+ {
+ .name = "current_css_set",
+ .seq_show = current_css_set_read,
+ .flags = CFTYPE_ONLY_ON_ROOT,
+ },
+
+ {
+ .name = "current_css_set_refcount",
+ .read_u64 = current_css_set_refcount_read,
+ .flags = CFTYPE_ONLY_ON_ROOT,
+ },
+
+ {
+ .name = "current_css_set_cg_links",
+ .seq_show = current_css_set_cg_links_read,
+ .flags = CFTYPE_ONLY_ON_ROOT,
+ },
+
+ {
+ .name = "cgroup_css_links",
+ .seq_show = cgroup_css_links_read,
+ },
+
+ {
+ .name = "cgroup_subsys_states",
+ .seq_show = cgroup_subsys_states_read,
+ },
+
+ {
+ .name = "cgroup_masks",
+ .seq_show = cgroup_masks_read,
+ },
+
+ {
+ .name = "releasable",
+ .read_u64 = releasable_read,
+ },
+
+ { } /* terminate */
+};
+
+static struct cftype debug_files[] = {
+ {
+ .name = "taskcount",
+ .read_u64 = debug_taskcount_read,
+ },
+
+ {
+ .name = "current_css_set",
+ .seq_show = current_css_set_read,
+ .flags = CFTYPE_ONLY_ON_ROOT,
+ },
+
+ {
+ .name = "current_css_set_refcount",
+ .read_u64 = current_css_set_refcount_read,
+ .flags = CFTYPE_ONLY_ON_ROOT,
+ },
+
+ {
+ .name = "current_css_set_cg_links",
+ .seq_show = current_css_set_cg_links_read,
+ .flags = CFTYPE_ONLY_ON_ROOT,
+ },
+
+ {
+ .name = "css_links",
+ .seq_show = cgroup_css_links_read,
+ },
+
+ {
+ .name = "csses",
+ .seq_show = cgroup_subsys_states_read,
+ },
+
+ {
+ .name = "masks",
+ .seq_show = cgroup_masks_read,
+ },
+
+ { } /* terminate */
+};
+
+struct cgroup_subsys debug_cgrp_subsys = {
+ .css_alloc = debug_css_alloc,
+ .css_free = debug_css_free,
+ .legacy_cftypes = debug_legacy_files,
+};
+
+/*
+ * On v2, debug is an implicit controller enabled by the "cgroup_debug"
+ * boot parameter.
+ */
+static int __init enable_cgroup_debug(char *str)
+{
+ debug_cgrp_subsys.dfl_cftypes = debug_files;
+ debug_cgrp_subsys.implicit_on_dfl = true;
+ return 1;
+}
+__setup("cgroup_debug", enable_cgroup_debug);
diff --git a/kernel/compat.c b/kernel/compat.c
index ebd8bdc3fd68..6f0a0e723a06 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -120,6 +120,50 @@ static int __compat_put_timespec(const struct timespec *ts, struct compat_timesp
__put_user(ts->tv_nsec, &cts->tv_nsec)) ? -EFAULT : 0;
}
+static int __compat_get_timespec64(struct timespec64 *ts64,
+ const struct compat_timespec __user *cts)
+{
+ struct compat_timespec ts;
+ int ret;
+
+ ret = copy_from_user(&ts, cts, sizeof(ts));
+ if (ret)
+ return -EFAULT;
+
+ ts64->tv_sec = ts.tv_sec;
+ ts64->tv_nsec = ts.tv_nsec;
+
+ return 0;
+}
+
+static int __compat_put_timespec64(const struct timespec64 *ts64,
+ struct compat_timespec __user *cts)
+{
+ struct compat_timespec ts = {
+ .tv_sec = ts64->tv_sec,
+ .tv_nsec = ts64->tv_nsec
+ };
+ return copy_to_user(cts, &ts, sizeof(ts)) ? -EFAULT : 0;
+}
+
+int compat_get_timespec64(struct timespec64 *ts, const void __user *uts)
+{
+ if (COMPAT_USE_64BIT_TIME)
+ return copy_from_user(ts, uts, sizeof(*ts)) ? -EFAULT : 0;
+ else
+ return __compat_get_timespec64(ts, uts);
+}
+EXPORT_SYMBOL_GPL(compat_get_timespec64);
+
+int compat_put_timespec64(const struct timespec64 *ts, void __user *uts)
+{
+ if (COMPAT_USE_64BIT_TIME)
+ return copy_to_user(uts, ts, sizeof(*ts)) ? -EFAULT : 0;
+ else
+ return __compat_put_timespec64(ts, uts);
+}
+EXPORT_SYMBOL_GPL(compat_put_timespec64);
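
A hedged sketch of a typical caller of the new helpers; the syscall name and
body are purely illustrative and do not appear in this patch:

    #include <linux/compat.h>
    #include <linux/time64.h>

    /* A compat syscall pulling a user-supplied compat_timespec into a kernel
     * timespec64 via the helper added above. */
    COMPAT_SYSCALL_DEFINE1(example_settime, struct compat_timespec __user *, uts)
    {
        struct timespec64 ts;

        if (compat_get_timespec64(&ts, uts))
            return -EFAULT;
        if (!timespec64_valid(&ts))
            return -EINVAL;
        /* ... consume ts ... */
        return 0;
    }
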
+
int compat_get_timeval(struct timeval *tv, const void __user *utv)
{
if (COMPAT_USE_64BIT_TIME)
@@ -203,53 +247,6 @@ int put_compat_itimerval(struct compat_itimerval __user *o, const struct itimerv
return copy_to_user(o, &v32, sizeof(struct compat_itimerval)) ? -EFAULT : 0;
}
-static compat_clock_t clock_t_to_compat_clock_t(clock_t x)
-{
- return compat_jiffies_to_clock_t(clock_t_to_jiffies(x));
-}
-
-COMPAT_SYSCALL_DEFINE1(times, struct compat_tms __user *, tbuf)
-{
- if (tbuf) {
- struct tms tms;
- struct compat_tms tmp;
-
- do_sys_times(&tms);
- /* Convert our struct tms to the compat version. */
- tmp.tms_utime = clock_t_to_compat_clock_t(tms.tms_utime);
- tmp.tms_stime = clock_t_to_compat_clock_t(tms.tms_stime);
- tmp.tms_cutime = clock_t_to_compat_clock_t(tms.tms_cutime);
- tmp.tms_cstime = clock_t_to_compat_clock_t(tms.tms_cstime);
- if (copy_to_user(tbuf, &tmp, sizeof(tmp)))
- return -EFAULT;
- }
- force_successful_syscall_return();
- return compat_jiffies_to_clock_t(jiffies);
-}
-
-#ifdef __ARCH_WANT_SYS_SIGPENDING
-
-/*
- * Assumption: old_sigset_t and compat_old_sigset_t are both
- * types that can be passed to put_user()/get_user().
- */
-
-COMPAT_SYSCALL_DEFINE1(sigpending, compat_old_sigset_t __user *, set)
-{
- old_sigset_t s;
- long ret;
- mm_segment_t old_fs = get_fs();
-
- set_fs(KERNEL_DS);
- ret = sys_sigpending((old_sigset_t __user *) &s);
- set_fs(old_fs);
- if (ret == 0)
- ret = put_user(s, set);
- return ret;
-}
-
-#endif
-
#ifdef __ARCH_WANT_SYS_SIGPROCMASK
/*
@@ -304,164 +301,33 @@ COMPAT_SYSCALL_DEFINE3(sigprocmask, int, how,
#endif
-COMPAT_SYSCALL_DEFINE2(setrlimit, unsigned int, resource,
- struct compat_rlimit __user *, rlim)
-{
- struct rlimit r;
-
- if (!access_ok(VERIFY_READ, rlim, sizeof(*rlim)) ||
- __get_user(r.rlim_cur, &rlim->rlim_cur) ||
- __get_user(r.rlim_max, &rlim->rlim_max))
- return -EFAULT;
-
- if (r.rlim_cur == COMPAT_RLIM_INFINITY)
- r.rlim_cur = RLIM_INFINITY;
- if (r.rlim_max == COMPAT_RLIM_INFINITY)
- r.rlim_max = RLIM_INFINITY;
- return do_prlimit(current, resource, &r, NULL);
-}
-
-#ifdef COMPAT_RLIM_OLD_INFINITY
-
-COMPAT_SYSCALL_DEFINE2(old_getrlimit, unsigned int, resource,
- struct compat_rlimit __user *, rlim)
-{
- struct rlimit r;
- int ret;
- mm_segment_t old_fs = get_fs();
-
- set_fs(KERNEL_DS);
- ret = sys_old_getrlimit(resource, (struct rlimit __user *)&r);
- set_fs(old_fs);
-
- if (!ret) {
- if (r.rlim_cur > COMPAT_RLIM_OLD_INFINITY)
- r.rlim_cur = COMPAT_RLIM_INFINITY;
- if (r.rlim_max > COMPAT_RLIM_OLD_INFINITY)
- r.rlim_max = COMPAT_RLIM_INFINITY;
-
- if (!access_ok(VERIFY_WRITE, rlim, sizeof(*rlim)) ||
- __put_user(r.rlim_cur, &rlim->rlim_cur) ||
- __put_user(r.rlim_max, &rlim->rlim_max))
- return -EFAULT;
- }
- return ret;
-}
-
-#endif
-
-COMPAT_SYSCALL_DEFINE2(getrlimit, unsigned int, resource,
- struct compat_rlimit __user *, rlim)
-{
- struct rlimit r;
- int ret;
-
- ret = do_prlimit(current, resource, NULL, &r);
- if (!ret) {
- if (r.rlim_cur > COMPAT_RLIM_INFINITY)
- r.rlim_cur = COMPAT_RLIM_INFINITY;
- if (r.rlim_max > COMPAT_RLIM_INFINITY)
- r.rlim_max = COMPAT_RLIM_INFINITY;
-
- if (!access_ok(VERIFY_WRITE, rlim, sizeof(*rlim)) ||
- __put_user(r.rlim_cur, &rlim->rlim_cur) ||
- __put_user(r.rlim_max, &rlim->rlim_max))
- return -EFAULT;
- }
- return ret;
-}
-
int put_compat_rusage(const struct rusage *r, struct compat_rusage __user *ru)
{
- if (!access_ok(VERIFY_WRITE, ru, sizeof(*ru)) ||
- __put_user(r->ru_utime.tv_sec, &ru->ru_utime.tv_sec) ||
- __put_user(r->ru_utime.tv_usec, &ru->ru_utime.tv_usec) ||
- __put_user(r->ru_stime.tv_sec, &ru->ru_stime.tv_sec) ||
- __put_user(r->ru_stime.tv_usec, &ru->ru_stime.tv_usec) ||
- __put_user(r->ru_maxrss, &ru->ru_maxrss) ||
- __put_user(r->ru_ixrss, &ru->ru_ixrss) ||
- __put_user(r->ru_idrss, &ru->ru_idrss) ||
- __put_user(r->ru_isrss, &ru->ru_isrss) ||
- __put_user(r->ru_minflt, &ru->ru_minflt) ||
- __put_user(r->ru_majflt, &ru->ru_majflt) ||
- __put_user(r->ru_nswap, &ru->ru_nswap) ||
- __put_user(r->ru_inblock, &ru->ru_inblock) ||
- __put_user(r->ru_oublock, &ru->ru_oublock) ||
- __put_user(r->ru_msgsnd, &ru->ru_msgsnd) ||
- __put_user(r->ru_msgrcv, &ru->ru_msgrcv) ||
- __put_user(r->ru_nsignals, &ru->ru_nsignals) ||
- __put_user(r->ru_nvcsw, &ru->ru_nvcsw) ||
- __put_user(r->ru_nivcsw, &ru->ru_nivcsw))
+ struct compat_rusage r32;
+ memset(&r32, 0, sizeof(r32));
+ r32.ru_utime.tv_sec = r->ru_utime.tv_sec;
+ r32.ru_utime.tv_usec = r->ru_utime.tv_usec;
+ r32.ru_stime.tv_sec = r->ru_stime.tv_sec;
+ r32.ru_stime.tv_usec = r->ru_stime.tv_usec;
+ r32.ru_maxrss = r->ru_maxrss;
+ r32.ru_ixrss = r->ru_ixrss;
+ r32.ru_idrss = r->ru_idrss;
+ r32.ru_isrss = r->ru_isrss;
+ r32.ru_minflt = r->ru_minflt;
+ r32.ru_majflt = r->ru_majflt;
+ r32.ru_nswap = r->ru_nswap;
+ r32.ru_inblock = r->ru_inblock;
+ r32.ru_oublock = r->ru_oublock;
+ r32.ru_msgsnd = r->ru_msgsnd;
+ r32.ru_msgrcv = r->ru_msgrcv;
+ r32.ru_nsignals = r->ru_nsignals;
+ r32.ru_nvcsw = r->ru_nvcsw;
+ r32.ru_nivcsw = r->ru_nivcsw;
+ if (copy_to_user(ru, &r32, sizeof(r32)))
return -EFAULT;
return 0;
}
-COMPAT_SYSCALL_DEFINE4(wait4,
- compat_pid_t, pid,
- compat_uint_t __user *, stat_addr,
- int, options,
- struct compat_rusage __user *, ru)
-{
- if (!ru) {
- return sys_wait4(pid, stat_addr, options, NULL);
- } else {
- struct rusage r;
- int ret;
- unsigned int status;
- mm_segment_t old_fs = get_fs();
-
- set_fs (KERNEL_DS);
- ret = sys_wait4(pid,
- (stat_addr ?
- (unsigned int __user *) &status : NULL),
- options, (struct rusage __user *) &r);
- set_fs (old_fs);
-
- if (ret > 0) {
- if (put_compat_rusage(&r, ru))
- return -EFAULT;
- if (stat_addr && put_user(status, stat_addr))
- return -EFAULT;
- }
- return ret;
- }
-}
-
-COMPAT_SYSCALL_DEFINE5(waitid,
- int, which, compat_pid_t, pid,
- struct compat_siginfo __user *, uinfo, int, options,
- struct compat_rusage __user *, uru)
-{
- siginfo_t info;
- struct rusage ru;
- long ret;
- mm_segment_t old_fs = get_fs();
-
- memset(&info, 0, sizeof(info));
-
- set_fs(KERNEL_DS);
- ret = sys_waitid(which, pid, (siginfo_t __user *)&info, options,
- uru ? (struct rusage __user *)&ru : NULL);
- set_fs(old_fs);
-
- if ((ret < 0) || (info.si_signo == 0))
- return ret;
-
- if (uru) {
- /* sys_waitid() overwrites everything in ru */
- if (COMPAT_USE_64BIT_TIME)
- ret = copy_to_user(uru, &ru, sizeof(ru));
- else
- ret = put_compat_rusage(&ru, uru);
- if (ret)
- return -EFAULT;
- }
-
- BUG_ON(info.si_code & __SI_MASK);
- info.si_code |= __SI_CHLD;
- return copy_siginfo_to_user32(uinfo, &info);
-}
-
static int compat_get_user_cpu_mask(compat_ulong_t __user *user_mask_ptr,
unsigned len, struct cpumask *new_mask)
{
@@ -542,6 +408,27 @@ int put_compat_itimerspec(struct compat_itimerspec __user *dst,
return 0;
}
+int get_compat_itimerspec64(struct itimerspec64 *its,
+ const struct compat_itimerspec __user *uits)
+{
+
+ if (__compat_get_timespec64(&its->it_interval, &uits->it_interval) ||
+ __compat_get_timespec64(&its->it_value, &uits->it_value))
+ return -EFAULT;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(get_compat_itimerspec64);
+
+int put_compat_itimerspec64(const struct itimerspec64 *its,
+ struct compat_itimerspec __user *uits)
+{
+ if (__compat_put_timespec64(&its->it_interval, &uits->it_interval) ||
+ __compat_put_timespec64(&its->it_value, &uits->it_value))
+ return -EFAULT;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(put_compat_itimerspec64);
+
/*
* We currently only need the following fields from the sigevent
* structure: sigev_value, sigev_signo, sig_notify and (sometimes
@@ -566,84 +453,59 @@ int get_compat_sigevent(struct sigevent *event,
long compat_get_bitmap(unsigned long *mask, const compat_ulong_t __user *umask,
unsigned long bitmap_size)
{
- int i, j;
- unsigned long m;
- compat_ulong_t um;
unsigned long nr_compat_longs;
/* align bitmap up to nearest compat_long_t boundary */
bitmap_size = ALIGN(bitmap_size, BITS_PER_COMPAT_LONG);
+ nr_compat_longs = BITS_TO_COMPAT_LONGS(bitmap_size);
if (!access_ok(VERIFY_READ, umask, bitmap_size / 8))
return -EFAULT;
- nr_compat_longs = BITS_TO_COMPAT_LONGS(bitmap_size);
-
- for (i = 0; i < BITS_TO_LONGS(bitmap_size); i++) {
- m = 0;
-
- for (j = 0; j < sizeof(m)/sizeof(um); j++) {
- /*
- * We dont want to read past the end of the userspace
- * bitmap. We must however ensure the end of the
- * kernel bitmap is zeroed.
- */
- if (nr_compat_longs) {
- nr_compat_longs--;
- if (__get_user(um, umask))
- return -EFAULT;
- } else {
- um = 0;
- }
-
- umask++;
- m |= (long)um << (j * BITS_PER_COMPAT_LONG);
- }
- *mask++ = m;
+ user_access_begin();
+ while (nr_compat_longs > 1) {
+ compat_ulong_t l1, l2;
+ unsafe_get_user(l1, umask++, Efault);
+ unsafe_get_user(l2, umask++, Efault);
+ *mask++ = ((unsigned long)l2 << BITS_PER_COMPAT_LONG) | l1;
+ nr_compat_longs -= 2;
}
-
+ if (nr_compat_longs)
+ unsafe_get_user(*mask, umask++, Efault);
+ user_access_end();
return 0;
+
+Efault:
+ user_access_end();
+ return -EFAULT;
}
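
A worked example of the word folding in the new loop (64-bit kernel, 32-bit
compat task; the constants are illustrative):

    #include <linux/compat.h>

    static void compat_bitmap_fold_example(void)
    {
        compat_ulong_t l1 = 0x11111111, l2 = 0x22222222;
        unsigned long word;

        /* Same expression as in the loop above: the first compat word fills
         * bits 0..31, the second bits 32..63 of the native bitmap word. */
        word = ((unsigned long)l2 << BITS_PER_COMPAT_LONG) | l1;
        /* word == 0x2222222211111111, exactly what the old per-word
         * __get_user() loop accumulated, but now done inside a single
         * user_access_begin() section with unsafe_get_user(). */
        (void)word;
    }
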
long compat_put_bitmap(compat_ulong_t __user *umask, unsigned long *mask,
unsigned long bitmap_size)
{
- int i, j;
- unsigned long m;
- compat_ulong_t um;
unsigned long nr_compat_longs;
/* align bitmap up to nearest compat_long_t boundary */
bitmap_size = ALIGN(bitmap_size, BITS_PER_COMPAT_LONG);
+ nr_compat_longs = BITS_TO_COMPAT_LONGS(bitmap_size);
if (!access_ok(VERIFY_WRITE, umask, bitmap_size / 8))
return -EFAULT;
- nr_compat_longs = BITS_TO_COMPAT_LONGS(bitmap_size);
-
- for (i = 0; i < BITS_TO_LONGS(bitmap_size); i++) {
- m = *mask++;
-
- for (j = 0; j < sizeof(m)/sizeof(um); j++) {
- um = m;
-
- /*
- * We dont want to write past the end of the userspace
- * bitmap.
- */
- if (nr_compat_longs) {
- nr_compat_longs--;
- if (__put_user(um, umask))
- return -EFAULT;
- }
-
- umask++;
- m >>= 4*sizeof(um);
- m >>= 4*sizeof(um);
- }
+ user_access_begin();
+ while (nr_compat_longs > 1) {
+ unsigned long m = *mask++;
+ unsafe_put_user((compat_ulong_t)m, umask++, Efault);
+ unsafe_put_user(m >> BITS_PER_COMPAT_LONG, umask++, Efault);
+ nr_compat_longs -= 2;
}
-
+ if (nr_compat_longs)
+ unsafe_put_user((compat_ulong_t)*mask, umask++, Efault);
+ user_access_end();
return 0;
+Efault:
+ user_access_end();
+ return -EFAULT;
}
void
@@ -669,38 +531,6 @@ sigset_to_compat(compat_sigset_t *compat, const sigset_t *set)
}
}
-COMPAT_SYSCALL_DEFINE4(rt_sigtimedwait, compat_sigset_t __user *, uthese,
- struct compat_siginfo __user *, uinfo,
- struct compat_timespec __user *, uts, compat_size_t, sigsetsize)
-{
- compat_sigset_t s32;
- sigset_t s;
- struct timespec t;
- siginfo_t info;
- long ret;
-
- if (sigsetsize != sizeof(sigset_t))
- return -EINVAL;
-
- if (copy_from_user(&s32, uthese, sizeof(compat_sigset_t)))
- return -EFAULT;
- sigset_from_compat(&s, &s32);
-
- if (uts) {
- if (compat_get_timespec(&t, uts))
- return -EFAULT;
- }
-
- ret = do_sigtimedwait(&s, &info, uts ? &t : NULL);
-
- if (ret > 0 && uinfo) {
- if (copy_siginfo_to_user32(uinfo, &info))
- ret = -EFAULT;
- }
-
- return ret;
-}
-
#ifdef CONFIG_NUMA
COMPAT_SYSCALL_DEFINE6(move_pages, pid_t, pid, compat_ulong_t, nr_pages,
compat_uptr_t __user *, pages32,
diff --git a/kernel/configs/android-base.config b/kernel/configs/android-base.config
index 26a06e09a5bd..d70829033bb7 100644
--- a/kernel/configs/android-base.config
+++ b/kernel/configs/android-base.config
@@ -1,10 +1,13 @@
# KEEP ALPHABETICALLY SORTED
# CONFIG_DEVKMEM is not set
# CONFIG_DEVMEM is not set
+# CONFIG_FHANDLE is not set
# CONFIG_INET_LRO is not set
-# CONFIG_MODULES is not set
+# CONFIG_NFSD is not set
+# CONFIG_NFS_FS is not set
# CONFIG_OABI_COMPAT is not set
# CONFIG_SYSVIPC is not set
+# CONFIG_USELIB is not set
CONFIG_ANDROID=y
CONFIG_ANDROID_BINDER_IPC=y
CONFIG_ANDROID_LOW_MEMORY_KILLER=y
@@ -13,6 +16,7 @@ CONFIG_ASHMEM=y
CONFIG_AUDIT=y
CONFIG_BLK_DEV_INITRD=y
CONFIG_CGROUPS=y
+CONFIG_CGROUP_BPF=y
CONFIG_CGROUP_CPUACCT=y
CONFIG_CGROUP_DEBUG=y
CONFIG_CGROUP_FREEZER=y
@@ -23,6 +27,8 @@ CONFIG_EMBEDDED=y
CONFIG_FB=y
CONFIG_HARDENED_USERCOPY=y
CONFIG_HIGH_RES_TIMERS=y
+CONFIG_IKCONFIG=y
+CONFIG_IKCONFIG_PROC=y
CONFIG_INET6_AH=y
CONFIG_INET6_ESP=y
CONFIG_INET6_IPCOMP=y
@@ -60,6 +66,9 @@ CONFIG_IP_NF_TARGET_MASQUERADE=y
CONFIG_IP_NF_TARGET_NETMAP=y
CONFIG_IP_NF_TARGET_REDIRECT=y
CONFIG_IP_NF_TARGET_REJECT=y
+CONFIG_MODULES=y
+CONFIG_MODULE_UNLOAD=y
+CONFIG_MODVERSIONS=y
CONFIG_NET=y
CONFIG_NETDEVICES=y
CONFIG_NETFILTER=y
diff --git a/kernel/configs/android-recommended.config b/kernel/configs/android-recommended.config
index 28ee064b6744..946fb92418f7 100644
--- a/kernel/configs/android-recommended.config
+++ b/kernel/configs/android-recommended.config
@@ -6,13 +6,15 @@
# CONFIG_NF_CONNTRACK_SIP is not set
# CONFIG_PM_WAKELOCKS_GC is not set
# CONFIG_VT is not set
+CONFIG_ARM64_SW_TTBR0_PAN=y
CONFIG_BACKLIGHT_LCD_SUPPORT=y
CONFIG_BLK_DEV_DM=y
CONFIG_BLK_DEV_LOOP=y
CONFIG_BLK_DEV_RAM=y
CONFIG_BLK_DEV_RAM_SIZE=8192
+CONFIG_CC_STACKPROTECTOR_STRONG=y
CONFIG_COMPACTION=y
-CONFIG_STRICT_KERNEL_RWX=y
+CONFIG_CPU_SW_DOMAIN_PAN=y
CONFIG_DM_CRYPT=y
CONFIG_DM_UEVENT=y
CONFIG_DM_VERITY=y
@@ -105,6 +107,7 @@ CONFIG_SCHEDSTATS=y
CONFIG_SMARTJOYPLUS_FF=y
CONFIG_SND=y
CONFIG_SOUND=y
+CONFIG_STRICT_KERNEL_RWX=y
CONFIG_SUSPEND_TIME=y
CONFIG_TABLET_USB_ACECAD=y
CONFIG_TABLET_USB_AIPTEK=y
diff --git a/kernel/cpu.c b/kernel/cpu.c
index b03a32595cfe..bfbd649ccdc8 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -271,11 +271,26 @@ void cpu_hotplug_enable(void)
EXPORT_SYMBOL_GPL(cpu_hotplug_enable);
#endif /* CONFIG_HOTPLUG_CPU */
+static void __cpuhp_kick_ap_work(struct cpuhp_cpu_state *st);
+
static int bringup_wait_for_ap(unsigned int cpu)
{
struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
+ /* Wait for the CPU to reach CPUHP_AP_ONLINE_IDLE */
wait_for_completion(&st->done);
+ if (WARN_ON_ONCE((!cpu_online(cpu))))
+ return -ECANCELED;
+
+ /* Unpark the stopper thread and the hotplug thread of the target cpu */
+ stop_machine_unpark(cpu);
+ kthread_unpark(st->thread);
+
+ /* Should we go further up ? */
+ if (st->target > CPUHP_AP_ONLINE_IDLE) {
+ __cpuhp_kick_ap_work(st);
+ wait_for_completion(&st->done);
+ }
return st->result;
}
@@ -296,9 +311,7 @@ static int bringup_cpu(unsigned int cpu)
irq_unlock_sparse();
if (ret)
return ret;
- ret = bringup_wait_for_ap(cpu);
- BUG_ON(!cpu_online(cpu));
- return ret;
+ return bringup_wait_for_ap(cpu);
}
/*
@@ -637,6 +650,7 @@ static int takedown_cpu(unsigned int cpu)
__cpu_die(cpu);
tick_cleanup_dead_cpu(cpu);
+ rcutree_migrate_callbacks(cpu);
return 0;
}
@@ -767,31 +781,20 @@ void notify_cpu_starting(unsigned int cpu)
}
/*
- * Called from the idle task. We need to set active here, so we can kick off
- * the stopper thread and unpark the smpboot threads. If the target state is
- * beyond CPUHP_AP_ONLINE_IDLE we kick cpuhp thread and let it bring up the
- * cpu further.
+ * Called from the idle task. Wake up the controlling task which brings the
+ * stopper and the hotplug thread of the upcoming CPU up and then delegates
+ * the rest of the online bringup to the hotplug thread.
*/
void cpuhp_online_idle(enum cpuhp_state state)
{
struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state);
- unsigned int cpu = smp_processor_id();
/* Happens for the boot cpu */
if (state != CPUHP_AP_ONLINE_IDLE)
return;
st->state = CPUHP_AP_ONLINE_IDLE;
-
- /* Unpark the stopper thread and the hotplug thread of this cpu */
- stop_machine_unpark(cpu);
- kthread_unpark(st->thread);
-
- /* Should we go further up ? */
- if (st->target > CPUHP_AP_ONLINE_IDLE)
- __cpuhp_kick_ap_work(st);
- else
- complete(&st->done);
+ complete(&st->done);
}
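
Taken together with bringup_wait_for_ap() earlier in this diff, the bringup now
follows a simple completion handshake: the AP only signals that it reached the
idle loop, and the controlling CPU does the unparking. A condensed, freestanding
sketch of the control-CPU half (function and parameter names are made up, not
the real cpuhp state machine):

    #include <linux/completion.h>
    #include <linux/cpu.h>
    #include <linux/errno.h>
    #include <linux/kthread.h>
    #include <linux/stop_machine.h>

    static int wait_then_finish_bringup(struct completion *done,
                                        struct task_struct *hotplug_thread,
                                        unsigned int cpu)
    {
        wait_for_completion(done);      /* AP reached CPUHP_AP_ONLINE_IDLE */
        if (!cpu_online(cpu))
            return -ECANCELED;
        stop_machine_unpark(cpu);       /* moved off the AP by this patch */
        kthread_unpark(hotplug_thread);
        return 0;
    }
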
/* Requires cpu_add_remove_lock to be held */
diff --git a/kernel/crash_core.c b/kernel/crash_core.c
index fcbd568f1e95..6db80fc0810b 100644
--- a/kernel/crash_core.c
+++ b/kernel/crash_core.c
@@ -14,10 +14,12 @@
#include <asm/sections.h>
/* vmcoreinfo stuff */
-static unsigned char vmcoreinfo_data[VMCOREINFO_BYTES];
-u32 vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4];
-size_t vmcoreinfo_size;
-size_t vmcoreinfo_max_size = sizeof(vmcoreinfo_data);
+static unsigned char *vmcoreinfo_data;
+static size_t vmcoreinfo_size;
+u32 *vmcoreinfo_note;
+
+/* trusted vmcoreinfo, e.g. a copy placed in the crash memory area */
+static unsigned char *vmcoreinfo_data_safecopy;
/*
* parsing the "crashkernel" commandline
@@ -324,8 +326,23 @@ static void update_vmcoreinfo_note(void)
final_note(buf);
}
+void crash_update_vmcoreinfo_safecopy(void *ptr)
+{
+ if (ptr)
+ memcpy(ptr, vmcoreinfo_data, vmcoreinfo_size);
+
+ vmcoreinfo_data_safecopy = ptr;
+}
+
void crash_save_vmcoreinfo(void)
{
+ if (!vmcoreinfo_note)
+ return;
+
+ /* Use the safe copy to generate the vmcoreinfo note, if one exists */
+ if (vmcoreinfo_data_safecopy)
+ vmcoreinfo_data = vmcoreinfo_data_safecopy;
+
vmcoreinfo_append_str("CRASHTIME=%ld\n", get_seconds());
update_vmcoreinfo_note();
}
@@ -340,7 +357,7 @@ void vmcoreinfo_append_str(const char *fmt, ...)
r = vscnprintf(buf, sizeof(buf), fmt, args);
va_end(args);
- r = min(r, vmcoreinfo_max_size - vmcoreinfo_size);
+ r = min(r, (size_t)VMCOREINFO_BYTES - vmcoreinfo_size);
memcpy(&vmcoreinfo_data[vmcoreinfo_size], buf, r);
@@ -356,11 +373,26 @@ void __weak arch_crash_save_vmcoreinfo(void)
phys_addr_t __weak paddr_vmcoreinfo_note(void)
{
- return __pa_symbol((unsigned long)(char *)&vmcoreinfo_note);
+ return __pa(vmcoreinfo_note);
}
static int __init crash_save_vmcoreinfo_init(void)
{
+ vmcoreinfo_data = (unsigned char *)get_zeroed_page(GFP_KERNEL);
+ if (!vmcoreinfo_data) {
+ pr_warn("Memory allocation for vmcoreinfo_data failed\n");
+ return -ENOMEM;
+ }
+
+ vmcoreinfo_note = alloc_pages_exact(VMCOREINFO_NOTE_SIZE,
+ GFP_KERNEL | __GFP_ZERO);
+ if (!vmcoreinfo_note) {
+ free_page((unsigned long)vmcoreinfo_data);
+ vmcoreinfo_data = NULL;
+ pr_warn("Memory allocation for vmcoreinfo_note failed\n");
+ return -ENOMEM;
+ }
+
VMCOREINFO_OSRELEASE(init_uts_ns.name.release);
VMCOREINFO_PAGESIZE(PAGE_SIZE);
diff --git a/kernel/cred.c b/kernel/cred.c
index 2bc66075740f..ecf03657e71c 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -1,4 +1,4 @@
-/* Task credentials management - see Documentation/security/credentials.txt
+/* Task credentials management - see Documentation/security/credentials.rst
*
* Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
* Written by David Howells (dhowells@redhat.com)
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 4d2c32f98482..ce64f3fed5c6 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1249,26 +1249,31 @@ unclone_ctx(struct perf_event_context *ctx)
return parent_ctx;
}
-static u32 perf_event_pid(struct perf_event *event, struct task_struct *p)
+static u32 perf_event_pid_type(struct perf_event *event, struct task_struct *p,
+ enum pid_type type)
{
+ u32 nr;
/*
* only top level events have the pid namespace they were created in
*/
if (event->parent)
event = event->parent;
- return task_tgid_nr_ns(p, event->ns);
+ nr = __task_pid_nr_ns(p, type, event->ns);
+ /* avoid -1 if it is the idle thread or runs in another ns */
+ if (!nr && !pid_alive(p))
+ nr = -1;
+ return nr;
}
-static u32 perf_event_tid(struct perf_event *event, struct task_struct *p)
+static u32 perf_event_pid(struct perf_event *event, struct task_struct *p)
{
- /*
- * only top level events have the pid namespace they were created in
- */
- if (event->parent)
- event = event->parent;
+ return perf_event_pid_type(event, p, __PIDTYPE_TGID);
+}
- return task_pid_nr_ns(p, event->ns);
+static u32 perf_event_tid(struct perf_event *event, struct task_struct *p)
+{
+ return perf_event_pid_type(event, p, PIDTYPE_PID);
}
/*
@@ -1452,6 +1457,13 @@ static enum event_type_t get_event_type(struct perf_event *event)
lockdep_assert_held(&ctx->lock);
+ /*
+ * It's 'group type', really, because if our group leader is
+ * pinned, so are we.
+ */
+ if (event->group_leader != event)
+ event = event->group_leader;
+
event_type = event->attr.pinned ? EVENT_PINNED : EVENT_FLEXIBLE;
if (!ctx->task)
event_type |= EVENT_CPU;
@@ -1563,6 +1575,9 @@ static void __perf_event_header_size(struct perf_event *event, u64 sample_type)
if (sample_type & PERF_SAMPLE_TRANSACTION)
size += sizeof(data->txn);
+ if (sample_type & PERF_SAMPLE_PHYS_ADDR)
+ size += sizeof(data->phys_addr);
+
event->header_size = size;
}
@@ -2210,6 +2225,33 @@ static int group_can_go_on(struct perf_event *event,
return can_add_hw;
}
+/*
+ * Complement to update_event_times(). This computes the tstamp_* values to
+ * continue 'enabled' state from @now, and effectively discards the time
+ * between the prior tstamp_stopped and now (as we were in the OFF state, or
+ * just switched (context) time base).
+ *
+ * This further assumes '@event->state == INACTIVE' (we just came from OFF) and
+ * cannot have been scheduled in yet. And going into INACTIVE state means
+ * '@event->tstamp_stopped = @now'.
+ *
+ * Thus given the rules of update_event_times():
+ *
+ * total_time_enabled = tstamp_stopped - tstamp_enabled
+ * total_time_running = tstamp_stopped - tstamp_running
+ *
+ * We can insert 'tstamp_stopped == now' and reverse them to compute new
+ * tstamp_* values.
+ */
+static void __perf_event_enable_time(struct perf_event *event, u64 now)
+{
+ WARN_ON_ONCE(event->state != PERF_EVENT_STATE_INACTIVE);
+
+ event->tstamp_stopped = now;
+ event->tstamp_enabled = now - event->total_time_enabled;
+ event->tstamp_running = now - event->total_time_running;
+}
+
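A worked example of the inversion described in the comment above, with illustrative numbers: suppose the event has total_time_enabled = 70 and total_time_running = 30 when it goes INACTIVE again at now = 100. Then:

    tstamp_stopped = 100
    tstamp_enabled = 100 - 70 = 30    /* stopped - enabled still yields 70 */
    tstamp_running = 100 - 30 = 70    /* stopped - running still yields 30 */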
static void add_event_to_ctx(struct perf_event *event,
struct perf_event_context *ctx)
{
@@ -2217,9 +2259,12 @@ static void add_event_to_ctx(struct perf_event *event,
list_add_event(event, ctx);
perf_group_attach(event);
- event->tstamp_enabled = tstamp;
- event->tstamp_running = tstamp;
- event->tstamp_stopped = tstamp;
+ /*
+ * We can be called with event->state == STATE_OFF when we create with
+ * .disabled = 1. In that case the IOC_ENABLE will call this function.
+ */
+ if (event->state == PERF_EVENT_STATE_INACTIVE)
+ __perf_event_enable_time(event, tstamp);
}
static void ctx_sched_out(struct perf_event_context *ctx,
@@ -2464,10 +2509,11 @@ static void __perf_event_mark_enabled(struct perf_event *event)
u64 tstamp = perf_event_time(event);
event->state = PERF_EVENT_STATE_INACTIVE;
- event->tstamp_enabled = tstamp - event->total_time_enabled;
+ __perf_event_enable_time(event, tstamp);
list_for_each_entry(sub, &event->sibling_list, group_entry) {
+ /* XXX should not be > INACTIVE if event isn't */
if (sub->state >= PERF_EVENT_STATE_INACTIVE)
- sub->tstamp_enabled = tstamp - sub->total_time_enabled;
+ __perf_event_enable_time(sub, tstamp);
}
}
@@ -3173,6 +3219,13 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx,
return;
perf_ctx_lock(cpuctx, ctx);
+ /*
+ * We must check ctx->nr_events while holding ctx->lock, such
+ * that we serialize against perf_install_in_context().
+ */
+ if (!ctx->nr_events)
+ goto unlock;
+
perf_pmu_disable(ctx->pmu);
/*
* We want to keep the following priority order:
@@ -3186,6 +3239,8 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx,
cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
perf_event_sched_in(cpuctx, ctx, task);
perf_pmu_enable(ctx->pmu);
+
+unlock:
perf_ctx_unlock(cpuctx, ctx);
}
@@ -3632,10 +3687,10 @@ static inline u64 perf_event_count(struct perf_event *event)
* will not be local and we cannot read them atomically
* - must not have a pmu::count method
*/
-u64 perf_event_read_local(struct perf_event *event)
+int perf_event_read_local(struct perf_event *event, u64 *value)
{
unsigned long flags;
- u64 val;
+ int ret = 0;
/*
* Disabling interrupts avoids all counter scheduling (context
@@ -3643,25 +3698,37 @@ u64 perf_event_read_local(struct perf_event *event)
*/
local_irq_save(flags);
- /* If this is a per-task event, it must be for current */
- WARN_ON_ONCE((event->attach_state & PERF_ATTACH_TASK) &&
- event->hw.target != current);
-
- /* If this is a per-CPU event, it must be for this CPU */
- WARN_ON_ONCE(!(event->attach_state & PERF_ATTACH_TASK) &&
- event->cpu != smp_processor_id());
-
/*
* It must not be an event with inherit set, we cannot read
* all child counters from atomic context.
*/
- WARN_ON_ONCE(event->attr.inherit);
+ if (event->attr.inherit) {
+ ret = -EOPNOTSUPP;
+ goto out;
+ }
/*
* It must not have a pmu::count method, those are not
* NMI safe.
*/
- WARN_ON_ONCE(event->pmu->count);
+ if (event->pmu->count) {
+ ret = -EOPNOTSUPP;
+ goto out;
+ }
+
+ /* If this is a per-task event, it must be for current */
+ if ((event->attach_state & PERF_ATTACH_TASK) &&
+ event->hw.target != current) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ /* If this is a per-CPU event, it must be for this CPU */
+ if (!(event->attach_state & PERF_ATTACH_TASK) &&
+ event->cpu != smp_processor_id()) {
+ ret = -EINVAL;
+ goto out;
+ }
/*
* If the event is currently on this CPU, its either a per-task event,
@@ -3671,10 +3738,11 @@ u64 perf_event_read_local(struct perf_event *event)
if (event->oncpu == smp_processor_id())
event->pmu->read(event);
- val = local64_read(&event->count);
+ *value = local64_read(&event->count);
+out:
local_irq_restore(flags);
- return val;
+ return ret;
}
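With the reworked signature, in-kernel callers such as the BPF counter-read helper are expected to propagate the error instead of consuming a bogus value. A minimal sketch of a caller under that assumption:

    u64 value;
    int err;

    err = perf_event_read_local(event, &value);
    if (err)        /* -EINVAL or -EOPNOTSUPP per the checks above */
        return err;
    /* value is only meaningful when err == 0 */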
static int perf_event_read(struct perf_event *event, bool group)
@@ -4365,7 +4433,9 @@ EXPORT_SYMBOL_GPL(perf_event_read_value);
static int __perf_read_group_add(struct perf_event *leader,
u64 read_format, u64 *values)
{
+ struct perf_event_context *ctx = leader->ctx;
struct perf_event *sub;
+ unsigned long flags;
int n = 1; /* skip @nr */
int ret;
@@ -4395,12 +4465,15 @@ static int __perf_read_group_add(struct perf_event *leader,
if (read_format & PERF_FORMAT_ID)
values[n++] = primary_event_id(leader);
+ raw_spin_lock_irqsave(&ctx->lock, flags);
+
list_for_each_entry(sub, &leader->sibling_list, group_entry) {
values[n++] += perf_event_count(sub);
if (read_format & PERF_FORMAT_ID)
values[n++] = primary_event_id(sub);
}
+ raw_spin_unlock_irqrestore(&ctx->lock, flags);
return 0;
}
@@ -5065,7 +5138,7 @@ static void perf_mmap_open(struct vm_area_struct *vma)
atomic_inc(&event->rb->aux_mmap_count);
if (event->pmu->event_mapped)
- event->pmu->event_mapped(event);
+ event->pmu->event_mapped(event, vma->vm_mm);
}
static void perf_pmu_output_stop(struct perf_event *event);
@@ -5088,7 +5161,7 @@ static void perf_mmap_close(struct vm_area_struct *vma)
unsigned long size = perf_data_size(rb);
if (event->pmu->event_unmapped)
- event->pmu->event_unmapped(event);
+ event->pmu->event_unmapped(event, vma->vm_mm);
/*
* rb->aux_mmap_count will always drop before rb->mmap_count and
@@ -5386,7 +5459,7 @@ aux_unlock:
vma->vm_ops = &perf_mmap_vmops;
if (event->pmu->event_mapped)
- event->pmu->event_mapped(event);
+ event->pmu->event_mapped(event, vma->vm_mm);
return ret;
}
@@ -5947,6 +6020,9 @@ void perf_output_sample(struct perf_output_handle *handle,
}
}
+ if (sample_type & PERF_SAMPLE_PHYS_ADDR)
+ perf_output_put(handle, data->phys_addr);
+
if (!event->attr.watermark) {
int wakeup_events = event->attr.wakeup_events;
@@ -5962,6 +6038,38 @@ void perf_output_sample(struct perf_output_handle *handle,
}
}
+static u64 perf_virt_to_phys(u64 virt)
+{
+ u64 phys_addr = 0;
+ struct page *p = NULL;
+
+ if (!virt)
+ return 0;
+
+ if (virt >= TASK_SIZE) {
+ /* If it's vmalloc()d memory, leave phys_addr as 0 */
+ if (virt_addr_valid((void *)(uintptr_t)virt) &&
+ !(virt >= VMALLOC_START && virt < VMALLOC_END))
+ phys_addr = (u64)virt_to_phys((void *)(uintptr_t)virt);
+ } else {
+ /*
+ * Walk the page tables for the user address.
+ * Interrupts are disabled, which prevents any tear-down
+ * of the page tables.
+ * Try the IRQ-safe __get_user_pages_fast() first;
+ * if it fails, leave phys_addr as 0.
+ */
+ if ((current->mm != NULL) &&
+ (__get_user_pages_fast(virt, 1, 0, &p) == 1))
+ phys_addr = page_to_phys(p) + virt % PAGE_SIZE;
+
+ if (p)
+ put_page(p);
+ }
+
+ return phys_addr;
+}
+
void perf_prepare_sample(struct perf_event_header *header,
struct perf_sample_data *data,
struct perf_event *event,
@@ -6080,6 +6188,9 @@ void perf_prepare_sample(struct perf_event_header *header,
header->size += size;
}
+
+ if (sample_type & PERF_SAMPLE_PHYS_ADDR)
+ data->phys_addr = perf_virt_to_phys(data->addr);
}
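A hedged userspace sketch of requesting the new sample field; as the perf_event_open() hunk further down enforces, this needs CAP_SYS_ADMIN when perf_event_paranoid restricts kernel access. The choice of event and period is illustrative:

    struct perf_event_attr attr = {
        .size          = sizeof(attr),
        .type          = PERF_TYPE_HARDWARE,
        .config        = PERF_COUNT_HW_CACHE_MISSES,
        .sample_period = 10000,
        .sample_type   = PERF_SAMPLE_IP | PERF_SAMPLE_ADDR | PERF_SAMPLE_PHYS_ADDR,
    };
    /* fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0); */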
static void __always_inline
@@ -7231,6 +7342,11 @@ static void perf_log_throttle(struct perf_event *event, int enable)
perf_output_end(&handle);
}
+void perf_event_itrace_started(struct perf_event *event)
+{
+ event->attach_state |= PERF_ATTACH_ITRACE;
+}
+
static void perf_log_itrace_start(struct perf_event *event)
{
struct perf_output_handle handle;
@@ -7246,7 +7362,7 @@ static void perf_log_itrace_start(struct perf_event *event)
event = event->parent;
if (!(event->pmu->capabilities & PERF_PMU_CAP_ITRACE) ||
- event->hw.itrace_started)
+ event->attach_state & PERF_ATTACH_ITRACE)
return;
rec.header.type = PERF_RECORD_ITRACE_START;
@@ -7308,21 +7424,6 @@ int perf_event_account_interrupt(struct perf_event *event)
return __perf_event_account_interrupt(event, 1);
}
-static bool sample_is_allowed(struct perf_event *event, struct pt_regs *regs)
-{
- /*
- * Due to interrupt latency (AKA "skid"), we may enter the
- * kernel before taking an overflow, even if the PMU is only
- * counting user events.
- * To avoid leaking information to userspace, we must always
- * reject kernel samples when exclude_kernel is set.
- */
- if (event->attr.exclude_kernel && !user_mode(regs))
- return false;
-
- return true;
-}
-
/*
* Generic event overflow handling, sampling.
*/
@@ -7344,12 +7445,6 @@ static int __perf_event_overflow(struct perf_event *event,
ret = __perf_event_account_interrupt(event, throttle);
/*
- * For security, drop the skid kernel samples if necessary.
- */
- if (!sample_is_allowed(event, regs))
- return ret;
-
- /*
* XXX event_limit might not quite work as expected on inherited
* events
*/
@@ -7871,16 +7966,15 @@ void perf_trace_run_bpf_submit(void *raw_data, int size, int rctx,
}
}
perf_tp_event(call->event.type, count, raw_data, size, regs, head,
- rctx, task);
+ rctx, task, NULL);
}
EXPORT_SYMBOL_GPL(perf_trace_run_bpf_submit);
void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size,
struct pt_regs *regs, struct hlist_head *head, int rctx,
- struct task_struct *task)
+ struct task_struct *task, struct perf_event *event)
{
struct perf_sample_data data;
- struct perf_event *event;
struct perf_raw_record raw = {
.frag = {
@@ -7894,9 +7988,15 @@ void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size,
perf_trace_buf_update(record, event_type);
- hlist_for_each_entry_rcu(event, head, hlist_entry) {
+ /* Use the given event instead of the hlist */
+ if (event) {
if (perf_tp_event_match(event, &data, regs))
perf_swevent_event(event, count, &data, regs);
+ } else {
+ hlist_for_each_entry_rcu(event, head, hlist_entry) {
+ if (perf_tp_event_match(event, &data, regs))
+ perf_swevent_event(event, count, &data, regs);
+ }
}
/*
@@ -8049,12 +8149,8 @@ static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
bool is_kprobe, is_tracepoint;
struct bpf_prog *prog;
- if (event->attr.type == PERF_TYPE_HARDWARE ||
- event->attr.type == PERF_TYPE_SOFTWARE)
- return perf_event_set_bpf_handler(event, prog_fd);
-
if (event->attr.type != PERF_TYPE_TRACEPOINT)
- return -EINVAL;
+ return perf_event_set_bpf_handler(event, prog_fd);
if (event->tp_event->prog)
return -EEXIST;
@@ -9580,6 +9676,8 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr,
if (ret)
return -EFAULT;
+ attr->size = size;
+
if (attr->__reserved_1)
return -EINVAL;
@@ -9852,6 +9950,11 @@ SYSCALL_DEFINE5(perf_event_open,
return -EINVAL;
}
+ /* Only privileged users can get physical addresses */
+ if ((attr.sample_type & PERF_SAMPLE_PHYS_ADDR) &&
+ perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
+ return -EACCES;
+
if (!attr.sample_max_stack)
attr.sample_max_stack = sysctl_perf_event_max_stack;
@@ -10001,28 +10104,27 @@ SYSCALL_DEFINE5(perf_event_open,
goto err_context;
/*
- * Do not allow to attach to a group in a different
- * task or CPU context:
+ * Make sure we're both events for the same CPU;
+ * grouping events for different CPUs is broken, since
+ * you can never concurrently schedule them anyhow.
*/
- if (move_group) {
- /*
- * Make sure we're both on the same task, or both
- * per-cpu events.
- */
- if (group_leader->ctx->task != ctx->task)
- goto err_context;
+ if (group_leader->cpu != event->cpu)
+ goto err_context;
- /*
- * Make sure we're both events for the same CPU;
- * grouping events for different CPUs is broken; since
- * you can never concurrently schedule them anyhow.
- */
- if (group_leader->cpu != event->cpu)
- goto err_context;
- } else {
- if (group_leader->ctx != ctx)
- goto err_context;
- }
+ /*
+ * Make sure we're both on the same task, or both
+ * per-CPU events.
+ */
+ if (group_leader->ctx->task != ctx->task)
+ goto err_context;
+
+ /*
+ * Do not allow attaching to a group in a different task
+ * or CPU context. If we're moving SW events, we'll fix
+ * this up later, so allow that.
+ */
+ if (!move_group && group_leader->ctx != ctx)
+ goto err_context;
/*
* Only a group leader can be exclusive or pinned
diff --git a/kernel/events/internal.h b/kernel/events/internal.h
index 486fd78eb8d5..843e97047335 100644
--- a/kernel/events/internal.h
+++ b/kernel/events/internal.h
@@ -38,9 +38,9 @@ struct ring_buffer {
struct user_struct *mmap_user;
/* AUX area */
- local_t aux_head;
+ long aux_head;
local_t aux_nest;
- local_t aux_wakeup;
+ long aux_wakeup; /* last aux_watermark boundary crossed by aux_head */
unsigned long aux_pgoff;
int aux_nr_pages;
int aux_overwrite;
@@ -208,7 +208,7 @@ static inline int get_recursion_context(int *recursion)
{
int rctx;
- if (in_nmi())
+ if (unlikely(in_nmi()))
rctx = 3;
else if (in_irq())
rctx = 2;
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index ee97196bb151..af71a84e12ee 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -367,7 +367,7 @@ void *perf_aux_output_begin(struct perf_output_handle *handle,
if (WARN_ON_ONCE(local_xchg(&rb->aux_nest, 1)))
goto err_put;
- aux_head = local_read(&rb->aux_head);
+ aux_head = rb->aux_head;
handle->rb = rb;
handle->event = event;
@@ -382,7 +382,7 @@ void *perf_aux_output_begin(struct perf_output_handle *handle,
*/
if (!rb->aux_overwrite) {
aux_tail = ACCESS_ONCE(rb->user_page->aux_tail);
- handle->wakeup = local_read(&rb->aux_wakeup) + rb->aux_watermark;
+ handle->wakeup = rb->aux_wakeup + rb->aux_watermark;
if (aux_head - aux_tail < perf_aux_size(rb))
handle->size = CIRC_SPACE(aux_head, aux_tail, perf_aux_size(rb));
@@ -433,12 +433,12 @@ void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size)
handle->aux_flags |= PERF_AUX_FLAG_OVERWRITE;
aux_head = handle->head;
- local_set(&rb->aux_head, aux_head);
+ rb->aux_head = aux_head;
} else {
handle->aux_flags &= ~PERF_AUX_FLAG_OVERWRITE;
- aux_head = local_read(&rb->aux_head);
- local_add(size, &rb->aux_head);
+ aux_head = rb->aux_head;
+ rb->aux_head += size;
}
if (size || handle->aux_flags) {
@@ -450,11 +450,10 @@ void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size)
handle->aux_flags);
}
- aux_head = rb->user_page->aux_head = local_read(&rb->aux_head);
-
- if (aux_head - local_read(&rb->aux_wakeup) >= rb->aux_watermark) {
+ rb->user_page->aux_head = rb->aux_head;
+ if (rb->aux_head - rb->aux_wakeup >= rb->aux_watermark) {
wakeup = true;
- local_add(rb->aux_watermark, &rb->aux_wakeup);
+ rb->aux_wakeup = rounddown(rb->aux_head, rb->aux_watermark);
}
if (wakeup) {
@@ -478,22 +477,20 @@ void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size)
int perf_aux_output_skip(struct perf_output_handle *handle, unsigned long size)
{
struct ring_buffer *rb = handle->rb;
- unsigned long aux_head;
if (size > handle->size)
return -ENOSPC;
- local_add(size, &rb->aux_head);
+ rb->aux_head += size;
- aux_head = rb->user_page->aux_head = local_read(&rb->aux_head);
- if (aux_head - local_read(&rb->aux_wakeup) >= rb->aux_watermark) {
+ rb->user_page->aux_head = rb->aux_head;
+ if (rb->aux_head - rb->aux_wakeup >= rb->aux_watermark) {
perf_output_wakeup(handle);
- local_add(rb->aux_watermark, &rb->aux_wakeup);
- handle->wakeup = local_read(&rb->aux_wakeup) +
- rb->aux_watermark;
+ rb->aux_wakeup = rounddown(rb->aux_head, rb->aux_watermark);
+ handle->wakeup = rb->aux_wakeup + rb->aux_watermark;
}
- handle->head = aux_head;
+ handle->head = rb->aux_head;
handle->size -= size;
return 0;
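A worked example of the plain-integer wakeup bookkeeping, with illustrative sizes:

    aux_watermark = 64K, aux_wakeup = 128K, aux_head advances 180K -> 260K
    260K - 128K = 132K >= 64K            -> wake up the consumer
    aux_wakeup     = rounddown(260K, 64K) = 256K
    handle->wakeup = 256K + 64K           = 320K   (next boundary to cross)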
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 0e137f98a50c..267f6ef91d97 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -1262,8 +1262,6 @@ void uprobe_end_dup_mmap(void)
void uprobe_dup_mmap(struct mm_struct *oldmm, struct mm_struct *newmm)
{
- newmm->uprobes_state.xol_area = NULL;
-
if (test_bit(MMF_HAS_UPROBES, &oldmm->flags)) {
set_bit(MMF_HAS_UPROBES, &newmm->flags);
/* unconditionally, dup_mmap() skips VM_DONTCOPY vmas */
diff --git a/kernel/exit.c b/kernel/exit.c
index c63226283aef..f9ef3ecc78c1 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -51,7 +51,6 @@
#include <linux/task_io_accounting_ops.h>
#include <linux/tracehook.h>
#include <linux/fs_struct.h>
-#include <linux/userfaultfd_k.h>
#include <linux/init_task.h>
#include <linux/perf_event.h>
#include <trace/events/sched.h>
@@ -62,6 +61,7 @@
#include <linux/kcov.h>
#include <linux/random.h>
#include <linux/rcuwait.h>
+#include <linux/compat.h>
#include <linux/uaccess.h>
#include <asm/unistd.h>
@@ -764,7 +764,6 @@ void __noreturn do_exit(long code)
{
struct task_struct *tsk = current;
int group_dead;
- TASKS_RCU(int tasks_rcu_i);
profile_task_exit(tsk);
kcov_task_exit(tsk);
@@ -819,7 +818,8 @@ void __noreturn do_exit(long code)
* Ensure that we must observe the pi_state in exit_mm() ->
* mm_release() -> exit_pi_state_list().
*/
- raw_spin_unlock_wait(&tsk->pi_lock);
+ raw_spin_lock_irq(&tsk->pi_lock);
+ raw_spin_unlock_irq(&tsk->pi_lock);
if (unlikely(in_atomic())) {
pr_info("note: %s[%d] exited with preempt_count %d\n",
@@ -881,9 +881,7 @@ void __noreturn do_exit(long code)
*/
flush_ptrace_hw_breakpoint(tsk);
- TASKS_RCU(preempt_disable());
- TASKS_RCU(tasks_rcu_i = __srcu_read_lock(&tasks_rcu_exit_srcu));
- TASKS_RCU(preempt_enable());
+ exit_tasks_rcu_start();
exit_notify(tsk, group_dead);
proc_exit_connector(tsk);
mpol_put_task_policy(tsk);
@@ -918,7 +916,7 @@ void __noreturn do_exit(long code)
if (tsk->nr_dirtied)
__this_cpu_add(dirty_throttle_leaks, tsk->nr_dirtied);
exit_rcu();
- TASKS_RCU(__srcu_read_unlock(&tasks_rcu_exit_srcu, tasks_rcu_i));
+ exit_tasks_rcu_finish();
do_task_dead();
}
@@ -982,14 +980,21 @@ SYSCALL_DEFINE1(exit_group, int, error_code)
return 0;
}
+struct waitid_info {
+ pid_t pid;
+ uid_t uid;
+ int status;
+ int cause;
+};
+
struct wait_opts {
enum pid_type wo_type;
int wo_flags;
struct pid *wo_pid;
- struct siginfo __user *wo_info;
- int __user *wo_stat;
- struct rusage __user *wo_rusage;
+ struct waitid_info *wo_info;
+ int wo_stat;
+ struct rusage *wo_rusage;
wait_queue_entry_t child_wait;
int notask_error;
@@ -1036,34 +1041,6 @@ eligible_child(struct wait_opts *wo, bool ptrace, struct task_struct *p)
return 1;
}
-static int wait_noreap_copyout(struct wait_opts *wo, struct task_struct *p,
- pid_t pid, uid_t uid, int why, int status)
-{
- struct siginfo __user *infop;
- int retval = wo->wo_rusage
- ? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0;
-
- put_task_struct(p);
- infop = wo->wo_info;
- if (infop) {
- if (!retval)
- retval = put_user(SIGCHLD, &infop->si_signo);
- if (!retval)
- retval = put_user(0, &infop->si_errno);
- if (!retval)
- retval = put_user((short)why, &infop->si_code);
- if (!retval)
- retval = put_user(pid, &infop->si_pid);
- if (!retval)
- retval = put_user(uid, &infop->si_uid);
- if (!retval)
- retval = put_user(status, &infop->si_status);
- }
- if (!retval)
- retval = pid;
- return retval;
-}
-
/*
* Handle sys_wait4 work for one task in state EXIT_ZOMBIE. We hold
* read_lock(&tasklist_lock) on entry. If we return zero, we still hold
@@ -1072,30 +1049,23 @@ static int wait_noreap_copyout(struct wait_opts *wo, struct task_struct *p,
*/
static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
{
- int state, retval, status;
+ int state, status;
pid_t pid = task_pid_vnr(p);
uid_t uid = from_kuid_munged(current_user_ns(), task_uid(p));
- struct siginfo __user *infop;
+ struct waitid_info *infop;
if (!likely(wo->wo_flags & WEXITED))
return 0;
if (unlikely(wo->wo_flags & WNOWAIT)) {
- int exit_code = p->exit_code;
- int why;
-
+ status = p->exit_code;
get_task_struct(p);
read_unlock(&tasklist_lock);
sched_annotate_sleep();
-
- if ((exit_code & 0x7f) == 0) {
- why = CLD_EXITED;
- status = exit_code >> 8;
- } else {
- why = (exit_code & 0x80) ? CLD_DUMPED : CLD_KILLED;
- status = exit_code & 0x7f;
- }
- return wait_noreap_copyout(wo, p, pid, uid, why, status);
+ if (wo->wo_rusage)
+ getrusage(p, RUSAGE_BOTH, wo->wo_rusage);
+ put_task_struct(p);
+ goto out_info;
}
/*
* Move the task's state to DEAD/TRACE, only one thread can do this.
@@ -1168,38 +1138,11 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
spin_unlock_irq(&current->sighand->siglock);
}
- retval = wo->wo_rusage
- ? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0;
+ if (wo->wo_rusage)
+ getrusage(p, RUSAGE_BOTH, wo->wo_rusage);
status = (p->signal->flags & SIGNAL_GROUP_EXIT)
? p->signal->group_exit_code : p->exit_code;
- if (!retval && wo->wo_stat)
- retval = put_user(status, wo->wo_stat);
-
- infop = wo->wo_info;
- if (!retval && infop)
- retval = put_user(SIGCHLD, &infop->si_signo);
- if (!retval && infop)
- retval = put_user(0, &infop->si_errno);
- if (!retval && infop) {
- int why;
-
- if ((status & 0x7f) == 0) {
- why = CLD_EXITED;
- status >>= 8;
- } else {
- why = (status & 0x80) ? CLD_DUMPED : CLD_KILLED;
- status &= 0x7f;
- }
- retval = put_user((short)why, &infop->si_code);
- if (!retval)
- retval = put_user(status, &infop->si_status);
- }
- if (!retval && infop)
- retval = put_user(pid, &infop->si_pid);
- if (!retval && infop)
- retval = put_user(uid, &infop->si_uid);
- if (!retval)
- retval = pid;
+ wo->wo_stat = status;
if (state == EXIT_TRACE) {
write_lock_irq(&tasklist_lock);
@@ -1216,7 +1159,21 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
if (state == EXIT_DEAD)
release_task(p);
- return retval;
+out_info:
+ infop = wo->wo_info;
+ if (infop) {
+ if ((status & 0x7f) == 0) {
+ infop->cause = CLD_EXITED;
+ infop->status = status >> 8;
+ } else {
+ infop->cause = (status & 0x80) ? CLD_DUMPED : CLD_KILLED;
+ infop->status = status & 0x7f;
+ }
+ infop->pid = pid;
+ infop->uid = uid;
+ }
+
+ return pid;
}
static int *task_stopped_code(struct task_struct *p, bool ptrace)
@@ -1252,8 +1209,8 @@ static int *task_stopped_code(struct task_struct *p, bool ptrace)
static int wait_task_stopped(struct wait_opts *wo,
int ptrace, struct task_struct *p)
{
- struct siginfo __user *infop;
- int retval, exit_code, *p_code, why;
+ struct waitid_info *infop;
+ int exit_code, *p_code, why;
uid_t uid = 0; /* unneeded, required by compiler */
pid_t pid;
@@ -1298,34 +1255,21 @@ unlock_sig:
why = ptrace ? CLD_TRAPPED : CLD_STOPPED;
read_unlock(&tasklist_lock);
sched_annotate_sleep();
+ if (wo->wo_rusage)
+ getrusage(p, RUSAGE_BOTH, wo->wo_rusage);
+ put_task_struct(p);
- if (unlikely(wo->wo_flags & WNOWAIT))
- return wait_noreap_copyout(wo, p, pid, uid, why, exit_code);
-
- retval = wo->wo_rusage
- ? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0;
- if (!retval && wo->wo_stat)
- retval = put_user((exit_code << 8) | 0x7f, wo->wo_stat);
+ if (likely(!(wo->wo_flags & WNOWAIT)))
+ wo->wo_stat = (exit_code << 8) | 0x7f;
infop = wo->wo_info;
- if (!retval && infop)
- retval = put_user(SIGCHLD, &infop->si_signo);
- if (!retval && infop)
- retval = put_user(0, &infop->si_errno);
- if (!retval && infop)
- retval = put_user((short)why, &infop->si_code);
- if (!retval && infop)
- retval = put_user(exit_code, &infop->si_status);
- if (!retval && infop)
- retval = put_user(pid, &infop->si_pid);
- if (!retval && infop)
- retval = put_user(uid, &infop->si_uid);
- if (!retval)
- retval = pid;
- put_task_struct(p);
-
- BUG_ON(!retval);
- return retval;
+ if (infop) {
+ infop->cause = why;
+ infop->status = exit_code;
+ infop->pid = pid;
+ infop->uid = uid;
+ }
+ return pid;
}
/*
@@ -1336,7 +1280,7 @@ unlock_sig:
*/
static int wait_task_continued(struct wait_opts *wo, struct task_struct *p)
{
- int retval;
+ struct waitid_info *infop;
pid_t pid;
uid_t uid;
@@ -1361,22 +1305,20 @@ static int wait_task_continued(struct wait_opts *wo, struct task_struct *p)
get_task_struct(p);
read_unlock(&tasklist_lock);
sched_annotate_sleep();
+ if (wo->wo_rusage)
+ getrusage(p, RUSAGE_BOTH, wo->wo_rusage);
+ put_task_struct(p);
- if (!wo->wo_info) {
- retval = wo->wo_rusage
- ? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0;
- put_task_struct(p);
- if (!retval && wo->wo_stat)
- retval = put_user(0xffff, wo->wo_stat);
- if (!retval)
- retval = pid;
+ infop = wo->wo_info;
+ if (!infop) {
+ wo->wo_stat = 0xffff;
} else {
- retval = wait_noreap_copyout(wo, p, pid, uid,
- CLD_CONTINUED, SIGCONT);
- BUG_ON(retval == 0);
+ infop->cause = CLD_CONTINUED;
+ infop->pid = pid;
+ infop->uid = uid;
+ infop->status = SIGCONT;
}
-
- return retval;
+ return pid;
}
/*
@@ -1604,8 +1546,8 @@ end:
return retval;
}
-SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *,
- infop, int, options, struct rusage __user *, ru)
+static long kernel_waitid(int which, pid_t upid, struct waitid_info *infop,
+ int options, struct rusage *ru)
{
struct wait_opts wo;
struct pid *pid = NULL;
@@ -1643,38 +1585,48 @@ SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *,
wo.wo_pid = pid;
wo.wo_flags = options;
wo.wo_info = infop;
- wo.wo_stat = NULL;
wo.wo_rusage = ru;
ret = do_wait(&wo);
- if (ret > 0) {
- ret = 0;
- } else if (infop) {
- /*
- * For a WNOHANG return, clear out all the fields
- * we would set so the user can easily tell the
- * difference.
- */
- if (!ret)
- ret = put_user(0, &infop->si_signo);
- if (!ret)
- ret = put_user(0, &infop->si_errno);
- if (!ret)
- ret = put_user(0, &infop->si_code);
- if (!ret)
- ret = put_user(0, &infop->si_pid);
- if (!ret)
- ret = put_user(0, &infop->si_uid);
- if (!ret)
- ret = put_user(0, &infop->si_status);
- }
-
put_pid(pid);
return ret;
}
-SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr,
- int, options, struct rusage __user *, ru)
+SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *,
+ infop, int, options, struct rusage __user *, ru)
+{
+ struct rusage r;
+ struct waitid_info info = {.status = 0};
+ long err = kernel_waitid(which, upid, &info, options, ru ? &r : NULL);
+ int signo = 0;
+ if (err > 0) {
+ signo = SIGCHLD;
+ err = 0;
+ }
+
+ if (!err) {
+ if (ru && copy_to_user(ru, &r, sizeof(struct rusage)))
+ return -EFAULT;
+ }
+ if (!infop)
+ return err;
+
+ user_access_begin();
+ unsafe_put_user(signo, &infop->si_signo, Efault);
+ unsafe_put_user(0, &infop->si_errno, Efault);
+ unsafe_put_user((short)info.cause, &infop->si_code, Efault);
+ unsafe_put_user(info.pid, &infop->si_pid, Efault);
+ unsafe_put_user(info.uid, &infop->si_uid, Efault);
+ unsafe_put_user(info.status, &infop->si_status, Efault);
+ user_access_end();
+ return err;
+Efault:
+ user_access_end();
+ return -EFAULT;
+}
+
+long kernel_wait4(pid_t upid, int __user *stat_addr, int options,
+ struct rusage *ru)
{
struct wait_opts wo;
struct pid *pid = NULL;
@@ -1685,6 +1637,10 @@ SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr,
__WNOTHREAD|__WCLONE|__WALL))
return -EINVAL;
+ /* -INT_MIN is not defined */
+ if (upid == INT_MIN)
+ return -ESRCH;
+
if (upid == -1)
type = PIDTYPE_MAX;
else if (upid < 0) {
@@ -1702,14 +1658,29 @@ SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr,
wo.wo_pid = pid;
wo.wo_flags = options | WEXITED;
wo.wo_info = NULL;
- wo.wo_stat = stat_addr;
+ wo.wo_stat = 0;
wo.wo_rusage = ru;
ret = do_wait(&wo);
put_pid(pid);
+ if (ret > 0 && stat_addr && put_user(wo.wo_stat, stat_addr))
+ ret = -EFAULT;
return ret;
}
+SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr,
+ int, options, struct rusage __user *, ru)
+{
+ struct rusage r;
+ long err = kernel_wait4(upid, stat_addr, options, ru ? &r : NULL);
+
+ if (err > 0) {
+ if (ru && copy_to_user(ru, &r, sizeof(struct rusage)))
+ return -EFAULT;
+ }
+ return err;
+}
+
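With the split, only the syscall wrappers touch __user memory; in-kernel users can reap a child with a kernel-space struct rusage directly. A minimal sketch of such a caller (the logging is illustrative):

    struct rusage ru;
    long pid;

    pid = kernel_wait4(-1, NULL, WNOHANG, &ru);  /* no __user status pointer needed */
    if (pid > 0)
        pr_info("reaped %ld, maxrss=%ld\n", pid, ru.ru_maxrss);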
#ifdef __ARCH_WANT_SYS_WAITPID
/*
@@ -1722,3 +1693,61 @@ SYSCALL_DEFINE3(waitpid, pid_t, pid, int __user *, stat_addr, int, options)
}
#endif
+
+#ifdef CONFIG_COMPAT
+COMPAT_SYSCALL_DEFINE4(wait4,
+ compat_pid_t, pid,
+ compat_uint_t __user *, stat_addr,
+ int, options,
+ struct compat_rusage __user *, ru)
+{
+ struct rusage r;
+ long err = kernel_wait4(pid, stat_addr, options, ru ? &r : NULL);
+ if (err > 0) {
+ if (ru && put_compat_rusage(&r, ru))
+ return -EFAULT;
+ }
+ return err;
+}
+
+COMPAT_SYSCALL_DEFINE5(waitid,
+ int, which, compat_pid_t, pid,
+ struct compat_siginfo __user *, infop, int, options,
+ struct compat_rusage __user *, uru)
+{
+ struct rusage ru;
+ struct waitid_info info = {.status = 0};
+ long err = kernel_waitid(which, pid, &info, options, uru ? &ru : NULL);
+ int signo = 0;
+ if (err > 0) {
+ signo = SIGCHLD;
+ err = 0;
+ }
+
+ if (!err && uru) {
+ /* kernel_waitid() overwrites everything in ru */
+ if (COMPAT_USE_64BIT_TIME)
+ err = copy_to_user(uru, &ru, sizeof(ru));
+ else
+ err = put_compat_rusage(&ru, uru);
+ if (err)
+ return -EFAULT;
+ }
+
+ if (!infop)
+ return err;
+
+ user_access_begin();
+ unsafe_put_user(signo, &infop->si_signo, Efault);
+ unsafe_put_user(0, &infop->si_errno, Efault);
+ unsafe_put_user((short)info.cause, &infop->si_code, Efault);
+ unsafe_put_user(info.pid, &infop->si_pid, Efault);
+ unsafe_put_user(info.uid, &infop->si_uid, Efault);
+ unsafe_put_user(info.status, &infop->si_status, Efault);
+ user_access_end();
+ return err;
+Efault:
+ user_access_end();
+ return -EFAULT;
+}
+#endif
diff --git a/kernel/extable.c b/kernel/extable.c
index 0fbdd8582f08..38c2412401a1 100644
--- a/kernel/extable.c
+++ b/kernel/extable.c
@@ -55,7 +55,8 @@ const struct exception_table_entry *search_exception_tables(unsigned long addr)
{
const struct exception_table_entry *e;
- e = search_extable(__start___ex_table, __stop___ex_table-1, addr);
+ e = search_extable(__start___ex_table,
+ __stop___ex_table - __start___ex_table, addr);
if (!e)
e = search_module_extables(addr);
return e;
@@ -69,7 +70,7 @@ static inline int init_kernel_text(unsigned long addr)
return 0;
}
-int core_kernel_text(unsigned long addr)
+int notrace core_kernel_text(unsigned long addr)
{
if (addr >= (unsigned long)_stext &&
addr < (unsigned long)_etext)
diff --git a/kernel/fork.c b/kernel/fork.c
index e53770d2bf95..b7e9e57b71ea 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -205,19 +205,17 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
void *stack;
int i;
- local_irq_disable();
for (i = 0; i < NR_CACHED_STACKS; i++) {
- struct vm_struct *s = this_cpu_read(cached_stacks[i]);
+ struct vm_struct *s;
+
+ s = this_cpu_xchg(cached_stacks[i], NULL);
if (!s)
continue;
- this_cpu_write(cached_stacks[i], NULL);
tsk->stack_vm_area = s;
- local_irq_enable();
return s->addr;
}
- local_irq_enable();
stack = __vmalloc_node_range(THREAD_SIZE, THREAD_SIZE,
VMALLOC_START, VMALLOC_END,
@@ -245,19 +243,15 @@ static inline void free_thread_stack(struct task_struct *tsk)
{
#ifdef CONFIG_VMAP_STACK
if (task_stack_vm_area(tsk)) {
- unsigned long flags;
int i;
- local_irq_save(flags);
for (i = 0; i < NR_CACHED_STACKS; i++) {
- if (this_cpu_read(cached_stacks[i]))
+ if (this_cpu_cmpxchg(cached_stacks[i],
+ NULL, tsk->stack_vm_area) != NULL)
continue;
- this_cpu_write(cached_stacks[i], tsk->stack_vm_area);
- local_irq_restore(flags);
return;
}
- local_irq_restore(flags);
vfree_atomic(tsk->stack);
return;
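The conversion above replaces the irq-off read/write pairs with single atomic per-CPU operations, so an interrupt can never observe a half-updated slot. A condensed sketch of the resulting claim/return protocol:

    /* claim: atomically take the slot, leaving NULL behind */
    s = this_cpu_xchg(cached_stacks[i], NULL);

    /* return: only install into a slot that is currently empty */
    if (this_cpu_cmpxchg(cached_stacks[i], NULL, tsk->stack_vm_area) == NULL)
        return;     /* stack cached for reuse on this CPU */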
@@ -326,8 +320,8 @@ static void account_kernel_stack(struct task_struct *tsk, int account)
}
/* All stack pages belong to the same memcg. */
- memcg_kmem_update_page_stat(vm->pages[0], MEMCG_KERNEL_STACK_KB,
- account * (THREAD_SIZE / 1024));
+ mod_memcg_page_state(vm->pages[0], MEMCG_KERNEL_STACK_KB,
+ account * (THREAD_SIZE / 1024));
} else {
/*
* All stack pages are in the same zone and belong to the
@@ -338,8 +332,8 @@ static void account_kernel_stack(struct task_struct *tsk, int account)
mod_zone_page_state(page_zone(first_page), NR_KERNEL_STACK_KB,
THREAD_SIZE / 1024 * account);
- memcg_kmem_update_page_stat(first_page, MEMCG_KERNEL_STACK_KB,
- account * (THREAD_SIZE / 1024));
+ mod_memcg_page_state(first_page, MEMCG_KERNEL_STACK_KB,
+ account * (THREAD_SIZE / 1024));
}
}
@@ -560,7 +554,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
set_task_stack_end_magic(tsk);
#ifdef CONFIG_CC_STACKPROTECTOR
- tsk->stack_canary = get_random_long();
+ tsk->stack_canary = get_random_canary();
#endif
/*
@@ -579,6 +573,10 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
kcov_task_init(tsk);
+#ifdef CONFIG_FAULT_INJECTION
+ tsk->fail_nth = 0;
+#endif
+
return tsk;
free_stack:
@@ -787,6 +785,13 @@ static void mm_init_owner(struct mm_struct *mm, struct task_struct *p)
#endif
}
+static void mm_init_uprobes_state(struct mm_struct *mm)
+{
+#ifdef CONFIG_UPROBES
+ mm->uprobes_state.xol_area = NULL;
+#endif
+}
+
static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
struct user_namespace *user_ns)
{
@@ -808,11 +813,13 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
mm_init_cpumask(mm);
mm_init_aio(mm);
mm_init_owner(mm, p);
+ RCU_INIT_POINTER(mm->exe_file, NULL);
mmu_notifier_mm_init(mm);
- clear_tlb_flush_pending(mm);
+ init_tlb_flush_pending(mm);
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
mm->pmd_huge_pte = NULL;
#endif
+ mm_init_uprobes_state(mm);
if (current->mm) {
mm->flags = current->mm->flags & MMF_INIT_MASK;
@@ -1637,9 +1644,9 @@ static __latent_entropy struct task_struct *copy_process(
prev_cputime_init(&p->prev_cputime);
#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
- seqcount_init(&p->vtime_seqcount);
- p->vtime_snap = 0;
- p->vtime_snap_whence = VTIME_INACTIVE;
+ seqcount_init(&p->vtime.seqcount);
+ p->vtime.starttime = 0;
+ p->vtime.state = VTIME_INACTIVE;
#endif
#if defined(SPLIT_RSS_COUNTING)
diff --git a/kernel/futex.c b/kernel/futex.c
index d6cf71d08f21..f50b434756c1 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -212,7 +212,7 @@ struct futex_pi_state {
atomic_t refcount;
union futex_key key;
-};
+} __randomize_layout;
/**
* struct futex_q - The hashed futex queue entry, one per waiting task
@@ -246,7 +246,7 @@ struct futex_q {
struct rt_mutex_waiter *rt_waiter;
union futex_key *requeue_pi_key;
u32 bitset;
-};
+} __randomize_layout;
static const struct futex_q futex_q_init = {
/* list gets initialized in queue_me()*/
@@ -488,7 +488,7 @@ static void drop_futex_key_refs(union futex_key *key)
*
* Return: a negative error code or 0
*
- * The key words are stored in *key on success.
+ * The key words are stored in @key on success.
*
* For shared mappings, it's (page->index, file_inode(vma->vm_file),
* offset_within_page). For private mappings, it's (uaddr, current->mm).
@@ -670,13 +670,14 @@ again:
* this reference was taken by ihold under the page lock
* pinning the inode in place so i_lock was unnecessary. The
* only way for this check to fail is if the inode was
- * truncated in parallel so warn for now if this happens.
+ * truncated in parallel, which is almost certainly an
+ * application bug. In such a case, just retry.
*
* We are not calling into get_futex_key_refs() in file-backed
* cases, therefore a successful atomic_inc return below will
* guarantee that get_futex_key() will still imply smp_mb(); (B).
*/
- if (WARN_ON_ONCE(!atomic_inc_not_zero(&inode->i_count))) {
+ if (!atomic_inc_not_zero(&inode->i_count)) {
rcu_read_unlock();
put_page(page);
@@ -1259,9 +1260,9 @@ static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval)
* @set_waiters: force setting the FUTEX_WAITERS bit (1) or not (0)
*
* Return:
- * 0 - ready to wait;
- * 1 - acquired the lock;
- * <0 - error
+ * - 0 - ready to wait;
+ * - 1 - acquired the lock;
+ * - <0 - error
*
* The hb->lock and futex_key refs shall be held by the caller.
*/
@@ -1717,9 +1718,9 @@ void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key,
* hb1 and hb2 must be held by the caller.
*
* Return:
- * 0 - failed to acquire the lock atomically;
- * >0 - acquired the lock, return value is vpid of the top_waiter
- * <0 - error
+ * - 0 - failed to acquire the lock atomically;
+ * - >0 - acquired the lock, return value is vpid of the top_waiter
+ * - <0 - error
*/
static int futex_proxy_trylock_atomic(u32 __user *pifutex,
struct futex_hash_bucket *hb1,
@@ -1785,8 +1786,8 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex,
* uaddr2 atomically on behalf of the top waiter.
*
* Return:
- * >=0 - on success, the number of tasks requeued or woken;
- * <0 - on error
+ * - >=0 - on success, the number of tasks requeued or woken;
+ * - <0 - on error
*/
static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
u32 __user *uaddr2, int nr_wake, int nr_requeue,
@@ -2142,8 +2143,8 @@ static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
* be paired with exactly one earlier call to queue_me().
*
* Return:
- * 1 - if the futex_q was still queued (and we removed unqueued it);
- * 0 - if the futex_q was already removed by the waking thread
+ * - 1 - if the futex_q was still queued (and we unqueued it);
+ * - 0 - if the futex_q was already removed by the waking thread
*/
static int unqueue_me(struct futex_q *q)
{
@@ -2333,9 +2334,9 @@ static long futex_wait_restart(struct restart_block *restart);
* acquire the lock. Must be called with the hb lock held.
*
* Return:
- * 1 - success, lock taken;
- * 0 - success, lock not taken;
- * <0 - on error (-EFAULT)
+ * - 1 - success, lock taken;
+ * - 0 - success, lock not taken;
+ * - <0 - on error (-EFAULT)
*/
static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked)
{
@@ -2422,8 +2423,8 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q,
* with no q.key reference on failure.
*
* Return:
- * 0 - uaddr contains val and hb has been locked;
- * <1 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlocked
+ * - 0 - uaddr contains val and hb has been locked;
+ * - <1 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlocked
*/
static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags,
struct futex_q *q, struct futex_hash_bucket **hb)
@@ -2895,8 +2896,8 @@ pi_faulted:
* called with the hb lock held.
*
* Return:
- * 0 = no early wakeup detected;
- * <0 = -ETIMEDOUT or -ERESTARTNOINTR
+ * - 0 = no early wakeup detected;
+ * - <0 = -ETIMEDOUT or -ERESTARTNOINTR
*/
static inline
int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
@@ -2968,8 +2969,8 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
* If 4 or 7, we cleanup and return with -ETIMEDOUT.
*
* Return:
- * 0 - On success;
- * <0 - On error
+ * - 0 - On success;
+ * - <0 - On error
*/
static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
u32 val, ktime_t *abs_time, u32 bitset,
diff --git a/kernel/groups.c b/kernel/groups.c
index d09727692a2a..434f6665f187 100644
--- a/kernel/groups.c
+++ b/kernel/groups.c
@@ -5,6 +5,7 @@
#include <linux/export.h>
#include <linux/slab.h>
#include <linux/security.h>
+#include <linux/sort.h>
#include <linux/syscalls.h>
#include <linux/user_namespace.h>
#include <linux/vmalloc.h>
@@ -76,32 +77,18 @@ static int groups_from_user(struct group_info *group_info,
return 0;
}
-/* a simple Shell sort */
+static int gid_cmp(const void *_a, const void *_b)
+{
+ kgid_t a = *(kgid_t *)_a;
+ kgid_t b = *(kgid_t *)_b;
+
+ return gid_gt(a, b) - gid_lt(a, b);
+}
+
static void groups_sort(struct group_info *group_info)
{
- int base, max, stride;
- int gidsetsize = group_info->ngroups;
-
- for (stride = 1; stride < gidsetsize; stride = 3 * stride + 1)
- ; /* nothing */
- stride /= 3;
-
- while (stride) {
- max = gidsetsize - stride;
- for (base = 0; base < max; base++) {
- int left = base;
- int right = left + stride;
- kgid_t tmp = group_info->gid[right];
-
- while (left >= 0 && gid_gt(group_info->gid[left], tmp)) {
- group_info->gid[right] = group_info->gid[left];
- right = left;
- left -= stride;
- }
- group_info->gid[right] = tmp;
- }
- stride /= 3;
- }
+ sort(group_info->gid, group_info->ngroups, sizeof(*group_info->gid),
+ gid_cmp, NULL);
}
/* a simple bsearch */
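The comparator follows the usual sort() convention of returning a negative, zero or positive value; since gid_gt() and gid_lt() each return 0 or 1, the subtraction maps directly onto it:

    a >  b : gid_gt = 1, gid_lt = 0  ->  1
    a == b : gid_gt = 0, gid_lt = 0  ->  0
    a <  b : gid_gt = 0, gid_lt = 1  -> -1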
diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c
index d2747f9c5707..d69bd77252a7 100644
--- a/kernel/irq/affinity.c
+++ b/kernel/irq/affinity.c
@@ -110,6 +110,13 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd)
struct cpumask *masks;
cpumask_var_t nmsk, *node_to_present_cpumask;
+ /*
+ * If there aren't any vectors left after applying the pre/post
+ * vectors, don't bother with assigning affinity.
+ */
+ if (!affv)
+ return NULL;
+
if (!zalloc_cpumask_var(&nmsk, GFP_KERNEL))
return NULL;
@@ -192,15 +199,19 @@ out:
/**
* irq_calc_affinity_vectors - Calculate the optimal number of vectors
+ * @minvec: The minimum number of vectors available
* @maxvec: The maximum number of vectors available
* @affd: Description of the affinity requirements
*/
-int irq_calc_affinity_vectors(int maxvec, const struct irq_affinity *affd)
+int irq_calc_affinity_vectors(int minvec, int maxvec, const struct irq_affinity *affd)
{
int resv = affd->pre_vectors + affd->post_vectors;
int vecs = maxvec - resv;
int ret;
+ if (resv > minvec)
+ return 0;
+
get_online_cpus();
ret = min_t(int, cpumask_weight(cpu_present_mask), vecs) + resv;
put_online_cpus();
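A worked example of the new minvec guard, with illustrative numbers (pre_vectors = 2, post_vectors = 1, 8 present CPUs):

    resv = 2 + 1 = 3
    minvec = 2:               resv > minvec -> return 0 (reservation can't be met)
    minvec = 4, maxvec = 16:  min(8, 16 - 3) + 3 = 11 vectors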
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 2e30d925a40d..3675c6004f2a 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -7,7 +7,7 @@
* This file contains the core interrupt handling code, for irq-chip
* based architectures.
*
- * Detailed information is available in Documentation/DocBook/genericirq
+ * Detailed information is available in Documentation/core-api/genericirq.rst
*/
#include <linux/irq.h>
@@ -170,21 +170,11 @@ static void irq_state_clr_disabled(struct irq_desc *desc)
irqd_clear(&desc->irq_data, IRQD_IRQ_DISABLED);
}
-static void irq_state_set_disabled(struct irq_desc *desc)
-{
- irqd_set(&desc->irq_data, IRQD_IRQ_DISABLED);
-}
-
static void irq_state_clr_masked(struct irq_desc *desc)
{
irqd_clear(&desc->irq_data, IRQD_IRQ_MASKED);
}
-static void irq_state_set_masked(struct irq_desc *desc)
-{
- irqd_set(&desc->irq_data, IRQD_IRQ_MASKED);
-}
-
static void irq_state_clr_started(struct irq_desc *desc)
{
irqd_clear(&desc->irq_data, IRQD_IRQ_STARTED);
@@ -234,7 +224,7 @@ __irq_startup_managed(struct irq_desc *desc, struct cpumask *aff, bool force)
return IRQ_STARTUP_MANAGED;
}
#else
-static int
+static __always_inline int
__irq_startup_managed(struct irq_desc *desc, struct cpumask *aff, bool force)
{
return IRQ_STARTUP_NORMAL;
@@ -1010,7 +1000,7 @@ EXPORT_SYMBOL_GPL(irq_set_chip_and_handler_name);
void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set)
{
- unsigned long flags;
+ unsigned long flags, trigger, tmp;
struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0);
if (!desc)
@@ -1024,6 +1014,8 @@ void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set)
irq_settings_clr_and_set(desc, clr, set);
+ trigger = irqd_get_trigger_type(&desc->irq_data);
+
irqd_clear(&desc->irq_data, IRQD_NO_BALANCING | IRQD_PER_CPU |
IRQD_TRIGGER_MASK | IRQD_LEVEL | IRQD_MOVE_PCNTXT);
if (irq_settings_has_no_balance_set(desc))
@@ -1035,7 +1027,11 @@ void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set)
if (irq_settings_is_level(desc))
irqd_set(&desc->irq_data, IRQD_LEVEL);
- irqd_set(&desc->irq_data, irq_settings_get_trigger_mask(desc));
+ tmp = irq_settings_get_trigger_mask(desc);
+ if (tmp != IRQ_TYPE_NONE)
+ trigger = tmp;
+
+ irqd_set(&desc->irq_data, trigger);
irq_put_desc_unlock(desc, flags);
}
diff --git a/kernel/irq/cpuhotplug.c b/kernel/irq/cpuhotplug.c
index aee8f7ec40af..638eb9c83d9f 100644
--- a/kernel/irq/cpuhotplug.c
+++ b/kernel/irq/cpuhotplug.c
@@ -95,8 +95,13 @@ static bool migrate_one_irq(struct irq_desc *desc)
affinity = cpu_online_mask;
brokeaff = true;
}
-
- err = irq_do_set_affinity(d, affinity, true);
+ /*
+ * Do not set the force argument of irq_do_set_affinity() as this
+ * disables the masking of offline CPUs from the supplied affinity
+ * mask and therefore might keep/reassign the irq to the outgoing
+ * CPU.
+ */
+ err = irq_do_set_affinity(d, affinity, false);
if (err) {
pr_warn_ratelimited("IRQ%u: set affinity failed(%d).\n",
d->irq, err);
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index eb4d3e8945b8..79f987b942b8 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -6,7 +6,7 @@
*
* This file contains the core interrupt handling code.
*
- * Detailed information is available in Documentation/DocBook/genericirq
+ * Detailed information is available in Documentation/core-api/genericirq.rst
*
*/
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index 9da14d125df4..a2c48058354c 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -227,6 +227,16 @@ static inline bool irqd_has_set(struct irq_data *d, unsigned int mask)
return __irqd_to_state(d) & mask;
}
+static inline void irq_state_set_disabled(struct irq_desc *desc)
+{
+ irqd_set(&desc->irq_data, IRQD_IRQ_DISABLED);
+}
+
+static inline void irq_state_set_masked(struct irq_desc *desc)
+{
+ irqd_set(&desc->irq_data, IRQD_IRQ_MASKED);
+}
+
#undef __irqd_to_state
static inline void kstat_incr_irqs_this_cpu(struct irq_desc *desc)
@@ -437,7 +447,9 @@ static inline void irq_remove_debugfs_entry(struct irq_desc *desc)
# ifdef CONFIG_IRQ_DOMAIN
void irq_domain_debugfs_init(struct dentry *root);
# else
-static inline void irq_domain_debugfs_init(struct dentry *root);
+static inline void irq_domain_debugfs_init(struct dentry *root)
+{
+}
# endif
#else /* CONFIG_GENERIC_IRQ_DEBUGFS */
static inline void irq_add_debugfs_entry(unsigned int irq, struct irq_desc *d)
diff --git a/kernel/irq/ipi.c b/kernel/irq/ipi.c
index 1a9abc1c8ea0..259a22aa9934 100644
--- a/kernel/irq/ipi.c
+++ b/kernel/irq/ipi.c
@@ -165,7 +165,7 @@ irq_hw_number_t ipi_get_hwirq(unsigned int irq, unsigned int cpu)
struct irq_data *data = irq_get_irq_data(irq);
struct cpumask *ipimask = data ? irq_data_get_affinity_mask(data) : NULL;
- if (!data || !ipimask || cpu > nr_cpu_ids)
+ if (!data || !ipimask || cpu >= nr_cpu_ids)
return INVALID_HWIRQ;
if (!cpumask_test_cpu(cpu, ipimask))
@@ -195,7 +195,7 @@ static int ipi_send_verify(struct irq_chip *chip, struct irq_data *data,
if (!chip->ipi_send_single && !chip->ipi_send_mask)
return -EINVAL;
- if (cpu > nr_cpu_ids)
+ if (cpu >= nr_cpu_ids)
return -EINVAL;
if (dest) {
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 948b50e78549..73be2b3909bd 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -4,7 +4,7 @@
*
* This file contains the interrupt descriptor management code
*
- * Detailed information is available in Documentation/DocBook/genericirq
+ * Detailed information is available in Documentation/core-api/genericirq.rst
*
*/
#include <linux/irq.h>
@@ -373,6 +373,7 @@ static struct irq_desc *alloc_desc(int irq, int node, unsigned int flags,
raw_spin_lock_init(&desc->lock);
lockdep_set_class(&desc->lock, &irq_desc_lock_class);
+ mutex_init(&desc->request_mutex);
init_rcu_head(&desc->rcu);
desc_set_defaults(irq, desc, node, affinity, owner);
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index 14fe862aa2e3..f1f251479aa6 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -1,5 +1,6 @@
#define pr_fmt(fmt) "irq: " fmt
+#include <linux/acpi.h>
#include <linux/debugfs.h>
#include <linux/hardirq.h>
#include <linux/interrupt.h>
@@ -155,6 +156,21 @@ struct irq_domain *__irq_domain_add(struct fwnode_handle *fwnode, int size,
domain->name = fwid->name;
break;
}
+#ifdef CONFIG_ACPI
+ } else if (is_acpi_device_node(fwnode)) {
+ struct acpi_buffer buf = {
+ .length = ACPI_ALLOCATE_BUFFER,
+ };
+ acpi_handle handle;
+
+ handle = acpi_device_handle(to_acpi_device_node(fwnode));
+ if (acpi_get_name(handle, ACPI_FULL_PATHNAME, &buf) == AE_OK) {
+ domain->name = buf.pointer;
+ domain->flags |= IRQ_DOMAIN_NAME_ALLOCATED;
+ }
+
+ domain->fwnode = fwnode;
+#endif
} else if (of_node) {
char *name;
@@ -1667,8 +1683,7 @@ static void debugfs_add_domain_dir(struct irq_domain *d)
static void debugfs_remove_domain_dir(struct irq_domain *d)
{
- if (d->debugfs_file)
- debugfs_remove(d->debugfs_file);
+ debugfs_remove(d->debugfs_file);
}
void __init irq_domain_debugfs_init(struct dentry *root)
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 5c11c1730ba5..1d1a5b945ab4 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -1090,6 +1090,16 @@ setup_irq_thread(struct irqaction *new, unsigned int irq, bool secondary)
/*
* Internal function to register an irqaction - typically used to
* allocate special interrupts that are part of the architecture.
+ *
+ * Locking rules:
+ *
+ * desc->request_mutex Provides serialization against a concurrent free_irq()
+ * chip_bus_lock Provides serialization for slow bus operations
+ * desc->lock Provides serialization against hard interrupts
+ *
+ * chip_bus_lock and desc->lock are sufficient for all other management and
+ * interrupt related functions. desc->request_mutex solely serializes
+ * request/free_irq().
*/
static int
__setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
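A sketch of the resulting lock nesting on the request side, ordering only (the real function does far more between the locks):

    mutex_lock(&desc->request_mutex);          /* vs. a concurrent free_irq() */
    chip_bus_lock(desc);                       /* slow-bus (I2C/SPI) chip operations */
    raw_spin_lock_irqsave(&desc->lock, flags); /* vs. hard interrupts */
    ...
    raw_spin_unlock_irqrestore(&desc->lock, flags);
    chip_bus_sync_unlock(desc);
    mutex_unlock(&desc->request_mutex);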
@@ -1168,7 +1178,34 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
new->flags &= ~IRQF_ONESHOT;
/*
+ * Protects against a concurrent __free_irq() call which might wait
+ * for synchronize_irq() to complete without holding the optional
+ * chip bus lock and desc->lock.
+ */
+ mutex_lock(&desc->request_mutex);
+
+ /*
+ * Acquire bus lock as the irq_request_resources() callback below
+ * might rely on the serialization, or on the magic power management
+ * functions which abuse the irq_bus_lock() callback.
+ */
+ chip_bus_lock(desc);
+
+ /* First installed action requests resources. */
+ if (!desc->action) {
+ ret = irq_request_resources(desc);
+ if (ret) {
+ pr_err("Failed to request resources for %s (irq %d) on irqchip %s\n",
+ new->name, irq, desc->irq_data.chip->name);
+ goto out_bus_unlock;
+ }
+ }
+
+ /*
* The following block of code has to be executed atomically
+ * protected against a concurrent interrupt and any of the other
+ * management calls which are not serialized via
+ * desc->request_mutex or the optional bus lock.
*/
raw_spin_lock_irqsave(&desc->lock, flags);
old_ptr = &desc->action;
@@ -1267,13 +1304,6 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
}
if (!shared) {
- ret = irq_request_resources(desc);
- if (ret) {
- pr_err("Failed to request resources for %s (irq %d) on irqchip %s\n",
- new->name, irq, desc->irq_data.chip->name);
- goto out_unlock;
- }
-
init_waitqueue_head(&desc->wait_for_threads);
/* Setup the type (level, edge polarity) if configured: */
@@ -1281,10 +1311,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
ret = __irq_set_trigger(desc,
new->flags & IRQF_TRIGGER_MASK);
- if (ret) {
- irq_release_resources(desc);
+ if (ret)
goto out_unlock;
- }
}
desc->istate &= ~(IRQS_AUTODETECT | IRQS_SPURIOUS_DISABLED | \
@@ -1347,6 +1375,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
}
raw_spin_unlock_irqrestore(&desc->lock, flags);
+ chip_bus_sync_unlock(desc);
+ mutex_unlock(&desc->request_mutex);
irq_setup_timings(desc, new);
@@ -1378,6 +1408,12 @@ mismatch:
out_unlock:
raw_spin_unlock_irqrestore(&desc->lock, flags);
+ if (!desc->action)
+ irq_release_resources(desc);
+out_bus_unlock:
+ chip_bus_sync_unlock(desc);
+ mutex_unlock(&desc->request_mutex);
+
out_thread:
if (new->thread) {
struct task_struct *t = new->thread;
@@ -1417,9 +1453,7 @@ int setup_irq(unsigned int irq, struct irqaction *act)
if (retval < 0)
return retval;
- chip_bus_lock(desc);
retval = __setup_irq(irq, desc, act);
- chip_bus_sync_unlock(desc);
if (retval)
irq_chip_pm_put(&desc->irq_data);
@@ -1443,6 +1477,7 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
if (!desc)
return NULL;
+ mutex_lock(&desc->request_mutex);
chip_bus_lock(desc);
raw_spin_lock_irqsave(&desc->lock, flags);
@@ -1458,6 +1493,7 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
WARN(1, "Trying to free already-free IRQ %d\n", irq);
raw_spin_unlock_irqrestore(&desc->lock, flags);
chip_bus_sync_unlock(desc);
+ mutex_unlock(&desc->request_mutex);
return NULL;
}
@@ -1475,8 +1511,6 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
if (!desc->action) {
irq_settings_clr_disable_unlazy(desc);
irq_shutdown(desc);
- irq_release_resources(desc);
- irq_remove_timings(desc);
}
#ifdef CONFIG_SMP
@@ -1486,6 +1520,20 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
#endif
raw_spin_unlock_irqrestore(&desc->lock, flags);
+ /*
+ * Drop bus_lock here so the changes which were done in the chip
+ * callbacks above are synced out to the irq chips which hang
+ * behind a slow bus (I2C, SPI) before calling synchronize_irq().
+ *
+ * Aside from that, the bus_lock can also be taken from the threaded
+ * handler in irq_finalize_oneshot(), which results in a deadlock:
+ * synchronize_irq() would wait forever for the thread to complete,
+ * while the thread is blocked on the bus lock.
+ *
+ * The still-held desc->request_mutex protects against a concurrent
+ * request_irq() of this irq, so the release of resources and timing
+ * data is properly serialized.
+ */
chip_bus_sync_unlock(desc);
unregister_handler_proc(irq, action);
@@ -1518,6 +1566,20 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
}
}
+ /* Last action releases resources */
+ if (!desc->action) {
+ /*
+ * Reacquire the bus lock as irq_release_resources() might
+ * require it to deallocate resources over the slow bus.
+ */
+ chip_bus_lock(desc);
+ irq_release_resources(desc);
+ chip_bus_sync_unlock(desc);
+ irq_remove_timings(desc);
+ }
+
+ mutex_unlock(&desc->request_mutex);
+
irq_chip_pm_put(&desc->irq_data);
module_put(desc->owner);
kfree(action->secondary);
@@ -1674,9 +1736,7 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler,
return retval;
}
- chip_bus_lock(desc);
retval = __setup_irq(irq, desc, action);
- chip_bus_sync_unlock(desc);
if (retval) {
irq_chip_pm_put(&desc->irq_data);
@@ -1924,9 +1984,7 @@ int setup_percpu_irq(unsigned int irq, struct irqaction *act)
if (retval < 0)
return retval;
- chip_bus_lock(desc);
retval = __setup_irq(irq, desc, act);
- chip_bus_sync_unlock(desc);
if (retval)
irq_chip_pm_put(&desc->irq_data);
@@ -1935,9 +1993,10 @@ int setup_percpu_irq(unsigned int irq, struct irqaction *act)
}
/**
- * request_percpu_irq - allocate a percpu interrupt line
+ * __request_percpu_irq - allocate a percpu interrupt line
* @irq: Interrupt line to allocate
* @handler: Function to be called when the IRQ occurs.
+ * @flags: Interrupt type flags (IRQF_TIMER only)
* @devname: An ascii name for the claiming device
* @dev_id: A percpu cookie passed back to the handler function
*
@@ -1950,8 +2009,9 @@ int setup_percpu_irq(unsigned int irq, struct irqaction *act)
* the handler gets called with the interrupted CPU's instance of
* that variable.
*/
-int request_percpu_irq(unsigned int irq, irq_handler_t handler,
- const char *devname, void __percpu *dev_id)
+int __request_percpu_irq(unsigned int irq, irq_handler_t handler,
+ unsigned long flags, const char *devname,
+ void __percpu *dev_id)
{
struct irqaction *action;
struct irq_desc *desc;
@@ -1965,12 +2025,15 @@ int request_percpu_irq(unsigned int irq, irq_handler_t handler,
!irq_settings_is_per_cpu_devid(desc))
return -EINVAL;
+ if (flags && flags != IRQF_TIMER)
+ return -EINVAL;
+
action = kzalloc(sizeof(struct irqaction), GFP_KERNEL);
if (!action)
return -ENOMEM;
action->handler = handler;
- action->flags = IRQF_PERCPU | IRQF_NO_SUSPEND;
+ action->flags = flags | IRQF_PERCPU | IRQF_NO_SUSPEND;
action->name = devname;
action->percpu_dev_id = dev_id;
@@ -1980,9 +2043,7 @@ int request_percpu_irq(unsigned int irq, irq_handler_t handler,
return retval;
}
- chip_bus_lock(desc);
retval = __setup_irq(irq, desc, action);
- chip_bus_sync_unlock(desc);
if (retval) {
irq_chip_pm_put(&desc->irq_data);
@@ -1991,7 +2052,7 @@ int request_percpu_irq(unsigned int irq, irq_handler_t handler,
return retval;
}
-EXPORT_SYMBOL_GPL(request_percpu_irq);
+EXPORT_SYMBOL_GPL(__request_percpu_irq);
/**
* irq_get_irqchip_state - returns the irqchip state of a interrupt.
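A minimal usage sketch for the new flags argument (not part of the patch; the handler, device name and per-CPU cookie are illustrative only). A per-CPU timer driver can now pass IRQF_TIMER, which is the only extra flag the core accepts here:

	#include <linux/interrupt.h>
	#include <linux/percpu.h>

	static int __percpu *my_timer_cookie;

	static irqreturn_t my_timer_handler(int irq, void *dev_id)
	{
		int *cookie = dev_id;	/* this CPU's instance of the cookie */

		(*cookie)++;		/* acknowledge/reprogram the real timer here */
		return IRQ_HANDLED;
	}

	static int my_timer_setup(unsigned int irq)
	{
		my_timer_cookie = alloc_percpu(int);
		if (!my_timer_cookie)
			return -ENOMEM;

		return __request_percpu_irq(irq, my_timer_handler, IRQF_TIMER,
					    "my-timer", my_timer_cookie);
	}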
diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c
index cea1de0161f1..6bd9b58429cc 100644
--- a/kernel/irq/pm.c
+++ b/kernel/irq/pm.c
@@ -149,6 +149,8 @@ static void resume_irq(struct irq_desc *desc)
/* Pretend that it got disabled ! */
desc->depth++;
+ irq_state_set_disabled(desc);
+ irq_state_set_masked(desc);
resume:
desc->istate &= ~IRQS_SUSPENDED;
__enable_irq(desc);
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 6a3b249a2ae1..127e7cfafa55 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -28,12 +28,6 @@
#include <asm/sections.h>
-#ifdef CONFIG_KALLSYMS_ALL
-#define all_var 1
-#else
-#define all_var 0
-#endif
-
/*
* These will be re-linked against their real values
* during the second link stage.
@@ -82,7 +76,7 @@ static inline int is_kernel(unsigned long addr)
static int is_ksym_addr(unsigned long addr)
{
- if (all_var)
+ if (IS_ENABLED(CONFIG_KALLSYMS_ALL))
return is_kernel(addr);
return is_kernel_text(addr) || is_kernel_inittext(addr);
@@ -280,7 +274,7 @@ static unsigned long get_symbol_pos(unsigned long addr,
if (!symbol_end) {
if (is_kernel_inittext(addr))
symbol_end = (unsigned long)_einittext;
- else if (all_var)
+ else if (IS_ENABLED(CONFIG_KALLSYMS_ALL))
symbol_end = (unsigned long)_end;
else
symbol_end = (unsigned long)_etext;
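The kallsyms change above swaps a home-grown all_var constant for IS_ENABLED(). A stand-alone sketch of that pattern (the function name is illustrative):

	#include <linux/kconfig.h>
	#include <linux/types.h>

	static bool covers_all_symbols(void)
	{
		/* evaluates to a compile-time 0/1, no #ifdef block or helper #define needed */
		return IS_ENABLED(CONFIG_KALLSYMS_ALL);
	}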
diff --git a/kernel/kcmp.c b/kernel/kcmp.c
index 3a47fa998fe0..ea34ed8bb952 100644
--- a/kernel/kcmp.c
+++ b/kernel/kcmp.c
@@ -11,6 +11,10 @@
#include <linux/bug.h>
#include <linux/err.h>
#include <linux/kcmp.h>
+#include <linux/capability.h>
+#include <linux/list.h>
+#include <linux/eventpoll.h>
+#include <linux/file.h>
#include <asm/unistd.h>
@@ -94,6 +98,56 @@ static int kcmp_lock(struct mutex *m1, struct mutex *m2)
return err;
}
+#ifdef CONFIG_EPOLL
+static int kcmp_epoll_target(struct task_struct *task1,
+ struct task_struct *task2,
+ unsigned long idx1,
+ struct kcmp_epoll_slot __user *uslot)
+{
+ struct file *filp, *filp_epoll, *filp_tgt;
+ struct kcmp_epoll_slot slot;
+ struct files_struct *files;
+
+ if (copy_from_user(&slot, uslot, sizeof(slot)))
+ return -EFAULT;
+
+ filp = get_file_raw_ptr(task1, idx1);
+ if (!filp)
+ return -EBADF;
+
+ files = get_files_struct(task2);
+ if (!files)
+ return -EBADF;
+
+ spin_lock(&files->file_lock);
+ filp_epoll = fcheck_files(files, slot.efd);
+ if (filp_epoll)
+ get_file(filp_epoll);
+ else
+ filp_tgt = ERR_PTR(-EBADF);
+ spin_unlock(&files->file_lock);
+ put_files_struct(files);
+
+ if (filp_epoll) {
+ filp_tgt = get_epoll_tfile_raw_ptr(filp_epoll, slot.tfd, slot.toff);
+ fput(filp_epoll);
+ }
+
+ if (IS_ERR(filp_tgt))
+ return PTR_ERR(filp_tgt);
+
+ return kcmp_ptr(filp, filp_tgt, KCMP_FILE);
+}
+#else
+static int kcmp_epoll_target(struct task_struct *task1,
+ struct task_struct *task2,
+ unsigned long idx1,
+ struct kcmp_epoll_slot __user *uslot)
+{
+ return -EOPNOTSUPP;
+}
+#endif
+
SYSCALL_DEFINE5(kcmp, pid_t, pid1, pid_t, pid2, int, type,
unsigned long, idx1, unsigned long, idx2)
{
@@ -165,6 +219,9 @@ SYSCALL_DEFINE5(kcmp, pid_t, pid1, pid_t, pid2, int, type,
ret = -EOPNOTSUPP;
#endif
break;
+ case KCMP_EPOLL_TFD:
+ ret = kcmp_epoll_target(task1, task2, idx1, (void *)idx2);
+ break;
default:
ret = -EINVAL;
break;
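A hedged userspace sketch of the new KCMP_EPOLL_TFD mode (assumes an updated uapi <linux/kcmp.h> providing KCMP_EPOLL_TFD and struct kcmp_epoll_slot; there is no libc wrapper, so the raw syscall is used; the helper name is made up):

	#include <sys/syscall.h>
	#include <sys/types.h>
	#include <unistd.h>
	#include <linux/kcmp.h>

	/* Does fd1 of pid1 refer to the same file as target fd "tfd" found at
	 * offset 0 inside epoll instance "efd" of pid2?  Returns 0 for "equal". */
	static int kcmp_epoll_equal(pid_t pid1, pid_t pid2, int fd1, int efd, int tfd)
	{
		struct kcmp_epoll_slot slot = {
			.efd  = efd,
			.tfd  = tfd,
			.toff = 0,
		};

		return syscall(SYS_kcmp, pid1, pid2, KCMP_EPOLL_TFD,
			       fd1, (unsigned long)&slot);
	}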
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 980936a90ee6..e62ec4dc6620 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -144,6 +144,14 @@ static int do_kexec_load(unsigned long entry, unsigned long nr_segments,
if (ret)
goto out;
+ /*
+ * Some architectures (like S390) may touch the crash memory before
+ * machine_kexec_prepare(), so we must copy the vmcoreinfo data after it.
+ */
+ ret = kimage_crash_copy_vmcoreinfo(image);
+ if (ret)
+ goto out;
+
for (i = 0; i < nr_segments; i++) {
ret = kimage_load_segment(image, &image->segment[i]);
if (ret)
diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
index 154ffb489b93..1ae7c41c33c1 100644
--- a/kernel/kexec_core.c
+++ b/kernel/kexec_core.c
@@ -482,6 +482,40 @@ struct page *kimage_alloc_control_pages(struct kimage *image,
return pages;
}
+int kimage_crash_copy_vmcoreinfo(struct kimage *image)
+{
+ struct page *vmcoreinfo_page;
+ void *safecopy;
+
+ if (image->type != KEXEC_TYPE_CRASH)
+ return 0;
+
+ /*
+ * For kdump, allocate one vmcoreinfo safe copy from the
+ * crash memory. As we have arch_kexec_protect_crashkres()
+ * after the kexec syscall, it is naturally protected from write
+ * (even read) access under the kernel direct mapping. On the
+ * other hand, we still need to write to it when a crash happens
+ * in order to generate the vmcoreinfo note, so we rely on vmap
+ * for that purpose.
+ */
+ vmcoreinfo_page = kimage_alloc_control_pages(image, 0);
+ if (!vmcoreinfo_page) {
+ pr_warn("Could not allocate vmcoreinfo buffer\n");
+ return -ENOMEM;
+ }
+ safecopy = vmap(&vmcoreinfo_page, 1, VM_MAP, PAGE_KERNEL);
+ if (!safecopy) {
+ pr_warn("Could not vmap vmcoreinfo buffer\n");
+ return -ENOMEM;
+ }
+
+ image->vmcoreinfo_data_copy = safecopy;
+ crash_update_vmcoreinfo_safecopy(safecopy);
+
+ return 0;
+}
+
static int kimage_add_entry(struct kimage *image, kimage_entry_t entry)
{
if (*image->entry != 0)
@@ -569,6 +603,11 @@ void kimage_free(struct kimage *image)
if (!image)
return;
+ if (image->vmcoreinfo_data_copy) {
+ crash_update_vmcoreinfo_safecopy(NULL);
+ vunmap(image->vmcoreinfo_data_copy);
+ }
+
kimage_free_extra_pages(image);
for_each_kimage_entry(image, ptr, entry) {
if (entry & IND_INDIRECTION) {
diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c
index b118735fea9d..9f48f4412297 100644
--- a/kernel/kexec_file.c
+++ b/kernel/kexec_file.c
@@ -26,13 +26,6 @@
#include <linux/vmalloc.h>
#include "kexec_internal.h"
-/*
- * Declare these symbols weak so that if architecture provides a purgatory,
- * these will be overridden.
- */
-char __weak kexec_purgatory[0];
-size_t __weak kexec_purgatory_size = 0;
-
static int kexec_calculate_store_digests(struct kimage *image);
/* Architectures can provide this probe function */
@@ -162,16 +155,10 @@ kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd,
}
if (cmdline_len) {
- image->cmdline_buf = kzalloc(cmdline_len, GFP_KERNEL);
- if (!image->cmdline_buf) {
- ret = -ENOMEM;
- goto out;
- }
-
- ret = copy_from_user(image->cmdline_buf, cmdline_ptr,
- cmdline_len);
- if (ret) {
- ret = -EFAULT;
+ image->cmdline_buf = memdup_user(cmdline_ptr, cmdline_len);
+ if (IS_ERR(image->cmdline_buf)) {
+ ret = PTR_ERR(image->cmdline_buf);
+ image->cmdline_buf = NULL;
goto out;
}
@@ -304,6 +291,14 @@ SYSCALL_DEFINE5(kexec_file_load, int, kernel_fd, int, initrd_fd,
if (ret)
goto out;
+ /*
+ * Some architectures (like S390) may touch the crash memory before
+ * machine_kexec_prepare(), so we must copy the vmcoreinfo data after it.
+ */
+ ret = kimage_crash_copy_vmcoreinfo(image);
+ if (ret)
+ goto out;
+
ret = kexec_calculate_store_digests(image);
if (ret)
goto out;
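The cmdline hunk above is an instance of a generic cleanup: memdup_user() folds the open-coded allocate/copy_from_user/unwind sequence into one call. A sketch of the pattern in isolation (the helper name is illustrative):

	#include <linux/string.h>
	#include <linux/err.h>

	static void *dup_user_buf(const void __user *ubuf, size_t len)
	{
		void *buf = memdup_user(ubuf, len);	/* kmalloc() + copy_from_user() */

		if (IS_ERR(buf))
			return NULL;	/* PTR_ERR(buf) is -ENOMEM or -EFAULT */
		return buf;
	}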
diff --git a/kernel/kexec_internal.h b/kernel/kexec_internal.h
index 799a8a452187..50dfcb039a41 100644
--- a/kernel/kexec_internal.h
+++ b/kernel/kexec_internal.h
@@ -17,6 +17,8 @@ extern struct mutex kexec_mutex;
#ifdef CONFIG_KEXEC_FILE
#include <linux/purgatory.h>
void kimage_file_post_load_cleanup(struct kimage *image);
+extern char kexec_purgatory[];
+extern size_t kexec_purgatory_size;
#else /* CONFIG_KEXEC_FILE */
static inline void kimage_file_post_load_cleanup(struct kimage *image) { }
#endif /* CONFIG_KEXEC_FILE */
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 563f97e2be36..2f37acde640b 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -45,8 +45,6 @@
#include <trace/events/module.h>
-extern int max_threads;
-
#define CAP_BSET (void *)1
#define CAP_PI (void *)2
@@ -56,6 +54,33 @@ static DEFINE_SPINLOCK(umh_sysctl_lock);
static DECLARE_RWSEM(umhelper_sem);
#ifdef CONFIG_MODULES
+/*
+ * Assuming:
+ *
+ * threads = div64_u64((u64) totalram_pages * (u64) PAGE_SIZE,
+ * (u64) THREAD_SIZE * 8UL);
+ *
+ * Needing fewer than 50 threads would mean we're dealing with systems
+ * smaller than 3200 pages. This assumes you are capable of having ~13M memory,
+ * and this would only be an upper limit, after which the OOM killer
+ * would take effect. Systems like these are very unlikely if modules are
+ * enabled.
+ */
+#define MAX_KMOD_CONCURRENT 50
+static atomic_t kmod_concurrent_max = ATOMIC_INIT(MAX_KMOD_CONCURRENT);
+static DECLARE_WAIT_QUEUE_HEAD(kmod_wq);
+
+/*
+ * This is a restriction on having *all* MAX_KMOD_CONCURRENT threads
+ * running at the same time without returning. When this happens we
+ * believe you've somehow ended up with a recursive module dependency
+ * creating a loop.
+ *
+ * We have no option but to fail.
+ *
+ * Userspace should proactively try to detect and prevent these.
+ */
+#define MAX_KMOD_ALL_BUSY_TIMEOUT 5
/*
modprobe_path is set via /proc/sys.
@@ -127,11 +152,7 @@ int __request_module(bool wait, const char *fmt, ...)
{
va_list args;
char module_name[MODULE_NAME_LEN];
- unsigned int max_modprobes;
int ret;
- static atomic_t kmod_concurrent = ATOMIC_INIT(0);
-#define MAX_KMOD_CONCURRENT 50 /* Completely arbitrary value - KAO */
- static int kmod_loop_msg;
/*
* We don't allow synchronous module loading from async. Module
@@ -154,40 +175,34 @@ int __request_module(bool wait, const char *fmt, ...)
if (ret)
return ret;
- /* If modprobe needs a service that is in a module, we get a recursive
- * loop. Limit the number of running kmod threads to max_threads/2 or
- * MAX_KMOD_CONCURRENT, whichever is the smaller. A cleaner method
- * would be to run the parents of this process, counting how many times
- * kmod was invoked. That would mean accessing the internals of the
- * process tables to get the command line, proc_pid_cmdline is static
- * and it is not worth changing the proc code just to handle this case.
- * KAO.
- *
- * "trace the ppid" is simple, but will fail if someone's
- * parent exits. I think this is as good as it gets. --RR
- */
- max_modprobes = min(max_threads/2, MAX_KMOD_CONCURRENT);
- atomic_inc(&kmod_concurrent);
- if (atomic_read(&kmod_concurrent) > max_modprobes) {
- /* We may be blaming an innocent here, but unlikely */
- if (kmod_loop_msg < 5) {
- printk(KERN_ERR
- "request_module: runaway loop modprobe %s\n",
- module_name);
- kmod_loop_msg++;
+ if (atomic_dec_if_positive(&kmod_concurrent_max) < 0) {
+ pr_warn_ratelimited("request_module: kmod_concurrent_max (%u) close to 0 (max_modprobes: %u), for module %s, throttling...",
+ atomic_read(&kmod_concurrent_max),
+ MAX_KMOD_CONCURRENT, module_name);
+ ret = wait_event_killable_timeout(kmod_wq,
+ atomic_dec_if_positive(&kmod_concurrent_max) >= 0,
+ MAX_KMOD_ALL_BUSY_TIMEOUT * HZ);
+ if (!ret) {
+ pr_warn_ratelimited("request_module: modprobe %s cannot be processed, kmod busy with %d threads for more than %d seconds now",
+ module_name, MAX_KMOD_CONCURRENT, MAX_KMOD_ALL_BUSY_TIMEOUT);
+ return -ETIME;
+ } else if (ret == -ERESTARTSYS) {
+ pr_warn_ratelimited("request_module: sigkill sent for modprobe %s, giving up", module_name);
+ return ret;
}
- atomic_dec(&kmod_concurrent);
- return -ENOMEM;
}
trace_module_request(module_name, wait, _RET_IP_);
ret = call_modprobe(module_name, wait ? UMH_WAIT_PROC : UMH_WAIT_EXEC);
- atomic_dec(&kmod_concurrent);
+ atomic_inc(&kmod_concurrent_max);
+ wake_up(&kmod_wq);
+
return ret;
}
EXPORT_SYMBOL(__request_module);
+
#endif /* CONFIG_MODULES */
static void call_usermodehelper_freeinfo(struct subprocess_info *info)
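The new throttling logic above is a reusable idiom: take a slot with atomic_dec_if_positive(), and if none is free, sleep killably with a timeout until another user returns one. A stand-alone sketch (the limit, timeout and all names are illustrative):

	#include <linux/atomic.h>
	#include <linux/wait.h>
	#include <linux/errno.h>
	#include <linux/jiffies.h>

	#define MY_MAX_CONCURRENT	50

	static atomic_t my_slots = ATOMIC_INIT(MY_MAX_CONCURRENT);
	static DECLARE_WAIT_QUEUE_HEAD(my_wq);

	static int my_throttled_op(void)
	{
		long ret;

		if (atomic_dec_if_positive(&my_slots) < 0) {
			/* all slots busy: wait up to 5 seconds for a free one */
			ret = wait_event_killable_timeout(my_wq,
					atomic_dec_if_positive(&my_slots) >= 0,
					5 * HZ);
			if (!ret)
				return -ETIME;		/* timed out */
			if (ret == -ERESTARTSYS)
				return ret;		/* fatal signal */
		}

		/* ... the rate-limited work runs here ... */

		atomic_inc(&my_slots);			/* put the slot back */
		wake_up(&my_wq);
		return 0;
	}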
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 6756d750b31b..a1606a4224e1 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -1771,24 +1771,13 @@ unsigned long __weak arch_deref_entry_point(void *entry)
int register_jprobes(struct jprobe **jps, int num)
{
- struct jprobe *jp;
int ret = 0, i;
if (num <= 0)
return -EINVAL;
+
for (i = 0; i < num; i++) {
- unsigned long addr, offset;
- jp = jps[i];
- addr = arch_deref_entry_point(jp->entry);
-
- /* Verify probepoint is a function entry point */
- if (kallsyms_lookup_size_offset(addr, NULL, &offset) &&
- offset == 0) {
- jp->kp.pre_handler = setjmp_pre_handler;
- jp->kp.break_handler = longjmp_break_handler;
- ret = register_kprobe(&jp->kp);
- } else
- ret = -EINVAL;
+ ret = register_jprobe(jps[i]);
if (ret < 0) {
if (i > 0)
@@ -1796,13 +1785,30 @@ int register_jprobes(struct jprobe **jps, int num)
break;
}
}
+
return ret;
}
EXPORT_SYMBOL_GPL(register_jprobes);
int register_jprobe(struct jprobe *jp)
{
- return register_jprobes(&jp, 1);
+ unsigned long addr, offset;
+ struct kprobe *kp = &jp->kp;
+
+ /*
+ * Verify probepoint as well as the jprobe handler are
+ * valid function entry points.
+ */
+ addr = arch_deref_entry_point(jp->entry);
+
+ if (kallsyms_lookup_size_offset(addr, NULL, &offset) && offset == 0 &&
+ kprobe_on_func_entry(kp->addr, kp->symbol_name, kp->offset)) {
+ kp->pre_handler = setjmp_pre_handler;
+ kp->break_handler = longjmp_break_handler;
+ return register_kprobe(kp);
+ }
+
+ return -EINVAL;
}
EXPORT_SYMBOL_GPL(register_jprobe);
@@ -1888,12 +1894,12 @@ static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs)
}
NOKPROBE_SYMBOL(pre_handler_kretprobe);
-bool __weak arch_function_offset_within_entry(unsigned long offset)
+bool __weak arch_kprobe_on_func_entry(unsigned long offset)
{
return !offset;
}
-bool function_offset_within_entry(kprobe_opcode_t *addr, const char *sym, unsigned long offset)
+bool kprobe_on_func_entry(kprobe_opcode_t *addr, const char *sym, unsigned long offset)
{
kprobe_opcode_t *kp_addr = _kprobe_addr(addr, sym, offset);
@@ -1901,7 +1907,7 @@ bool function_offset_within_entry(kprobe_opcode_t *addr, const char *sym, unsign
return false;
if (!kallsyms_lookup_size_offset((unsigned long)kp_addr, NULL, &offset) ||
- !arch_function_offset_within_entry(offset))
+ !arch_kprobe_on_func_entry(offset))
return false;
return true;
@@ -1914,7 +1920,7 @@ int register_kretprobe(struct kretprobe *rp)
int i;
void *addr;
- if (!function_offset_within_entry(rp->kp.addr, rp->kp.symbol_name, rp->kp.offset))
+ if (!kprobe_on_func_entry(rp->kp.addr, rp->kp.symbol_name, rp->kp.offset))
return -EINVAL;
if (kretprobe_blacklist_size) {
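For reference, a hedged sketch of a caller that now depends on the renamed kprobe_on_func_entry() check (not from the patch; the symbol and handler names are illustrative):

	#include <linux/kprobes.h>

	static int my_ret_handler(struct kretprobe_instance *ri,
				  struct pt_regs *regs)
	{
		/* inspect regs or ri->ret_addr here */
		return 0;
	}

	static struct kretprobe my_krp = {
		.handler	= my_ret_handler,
		.kp.symbol_name	= "do_fork",	/* must resolve to a function entry */
	};

	/* register_kretprobe() now fails with -EINVAL unless the resolved
	 * address plus .kp.offset sits exactly on the function entry. */
	static int my_probe_init(void)
	{
		return register_kretprobe(&my_krp);
	}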
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index 23cd70651238..46ba853656f6 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -134,7 +134,7 @@ static ssize_t vmcoreinfo_show(struct kobject *kobj,
{
phys_addr_t vmcore_base = paddr_vmcoreinfo_note();
return sprintf(buf, "%pa %x\n", &vmcore_base,
- (unsigned int)sizeof(vmcoreinfo_note));
+ (unsigned int)VMCOREINFO_NOTE_SIZE);
}
KERNEL_ATTR_RO(vmcoreinfo);
@@ -234,7 +234,7 @@ static struct attribute * kernel_attrs[] = {
NULL
};
-static struct attribute_group kernel_attr_group = {
+static const struct attribute_group kernel_attr_group = {
.attrs = kernel_attrs,
};
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 26db528c1d88..1c19edf82427 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -637,6 +637,7 @@ repeat:
schedule();
try_to_freeze();
+ cond_resched();
goto repeat;
}
EXPORT_SYMBOL_GPL(kthread_worker_fn);
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index 198527a62149..858a07590e39 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -227,9 +227,9 @@ static void __sched __mutex_lock_slowpath(struct mutex *lock);
* (or statically defined) before it can be locked. memset()-ing
* the mutex to 0 is not allowed.
*
- * ( The CONFIG_DEBUG_MUTEXES .config option turns on debugging
- * checks that will enforce the restrictions and will also do
- * deadlock debugging. )
+ * (The CONFIG_DEBUG_MUTEXES .config option turns on debugging
+ * checks that will enforce the restrictions and will also do
+ * deadlock debugging)
*
* This function is similar to (but not equivalent to) down().
*/
diff --git a/kernel/locking/qrwlock.c b/kernel/locking/qrwlock.c
index cc3ed0ccdfa2..2655f26ec882 100644
--- a/kernel/locking/qrwlock.c
+++ b/kernel/locking/qrwlock.c
@@ -20,6 +20,7 @@
#include <linux/cpumask.h>
#include <linux/percpu.h>
#include <linux/hardirq.h>
+#include <linux/spinlock.h>
#include <asm/qrwlock.h>
/*
diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
index b2caec7315af..294294c71ba4 100644
--- a/kernel/locking/qspinlock.c
+++ b/kernel/locking/qspinlock.c
@@ -28,6 +28,7 @@
#include <linux/percpu.h>
#include <linux/hardirq.h>
#include <linux/mutex.h>
+#include <linux/prefetch.h>
#include <asm/byteorder.h>
#include <asm/qspinlock.h>
@@ -267,123 +268,6 @@ static __always_inline u32 __pv_wait_head_or_lock(struct qspinlock *lock,
#define queued_spin_lock_slowpath native_queued_spin_lock_slowpath
#endif
-/*
- * Various notes on spin_is_locked() and spin_unlock_wait(), which are
- * 'interesting' functions:
- *
- * PROBLEM: some architectures have an interesting issue with atomic ACQUIRE
- * operations in that the ACQUIRE applies to the LOAD _not_ the STORE (ARM64,
- * PPC). Also qspinlock has a similar issue per construction, the setting of
- * the locked byte can be unordered acquiring the lock proper.
- *
- * This gets to be 'interesting' in the following cases, where the /should/s
- * end up false because of this issue.
- *
- *
- * CASE 1:
- *
- * So the spin_is_locked() correctness issue comes from something like:
- *
- * CPU0 CPU1
- *
- * global_lock(); local_lock(i)
- * spin_lock(&G) spin_lock(&L[i])
- * for (i) if (!spin_is_locked(&G)) {
- * spin_unlock_wait(&L[i]); smp_acquire__after_ctrl_dep();
- * return;
- * }
- * // deal with fail
- *
- * Where it is important CPU1 sees G locked or CPU0 sees L[i] locked such
- * that there is exclusion between the two critical sections.
- *
- * The load from spin_is_locked(&G) /should/ be constrained by the ACQUIRE from
- * spin_lock(&L[i]), and similarly the load(s) from spin_unlock_wait(&L[i])
- * /should/ be constrained by the ACQUIRE from spin_lock(&G).
- *
- * Similarly, later stuff is constrained by the ACQUIRE from CTRL+RMB.
- *
- *
- * CASE 2:
- *
- * For spin_unlock_wait() there is a second correctness issue, namely:
- *
- * CPU0 CPU1
- *
- * flag = set;
- * smp_mb(); spin_lock(&l)
- * spin_unlock_wait(&l); if (!flag)
- * // add to lockless list
- * spin_unlock(&l);
- * // iterate lockless list
- *
- * Which wants to ensure that CPU1 will stop adding bits to the list and CPU0
- * will observe the last entry on the list (if spin_unlock_wait() had ACQUIRE
- * semantics etc..)
- *
- * Where flag /should/ be ordered against the locked store of l.
- */
-
-/*
- * queued_spin_lock_slowpath() can (load-)ACQUIRE the lock before
- * issuing an _unordered_ store to set _Q_LOCKED_VAL.
- *
- * This means that the store can be delayed, but no later than the
- * store-release from the unlock. This means that simply observing
- * _Q_LOCKED_VAL is not sufficient to determine if the lock is acquired.
- *
- * There are two paths that can issue the unordered store:
- *
- * (1) clear_pending_set_locked(): *,1,0 -> *,0,1
- *
- * (2) set_locked(): t,0,0 -> t,0,1 ; t != 0
- * atomic_cmpxchg_relaxed(): t,0,0 -> 0,0,1
- *
- * However, in both cases we have other !0 state we've set before to queue
- * ourseves:
- *
- * For (1) we have the atomic_cmpxchg_acquire() that set _Q_PENDING_VAL, our
- * load is constrained by that ACQUIRE to not pass before that, and thus must
- * observe the store.
- *
- * For (2) we have a more intersting scenario. We enqueue ourselves using
- * xchg_tail(), which ends up being a RELEASE. This in itself is not
- * sufficient, however that is followed by an smp_cond_acquire() on the same
- * word, giving a RELEASE->ACQUIRE ordering. This again constrains our load and
- * guarantees we must observe that store.
- *
- * Therefore both cases have other !0 state that is observable before the
- * unordered locked byte store comes through. This means we can use that to
- * wait for the lock store, and then wait for an unlock.
- */
-#ifndef queued_spin_unlock_wait
-void queued_spin_unlock_wait(struct qspinlock *lock)
-{
- u32 val;
-
- for (;;) {
- val = atomic_read(&lock->val);
-
- if (!val) /* not locked, we're done */
- goto done;
-
- if (val & _Q_LOCKED_MASK) /* locked, go wait for unlock */
- break;
-
- /* not locked, but pending, wait until we observe the lock */
- cpu_relax();
- }
-
- /* any unlock is good */
- while (atomic_read(&lock->val) & _Q_LOCKED_MASK)
- cpu_relax();
-
-done:
- smp_acquire__after_ctrl_dep();
-}
-EXPORT_SYMBOL(queued_spin_unlock_wait);
-#endif
-
#endif /* _GEN_PV_LOCK_SLOWPATH */
/**
diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h
index e6b2f7ad3e51..4ccfcaae5b89 100644
--- a/kernel/locking/qspinlock_paravirt.h
+++ b/kernel/locking/qspinlock_paravirt.h
@@ -193,7 +193,8 @@ void __init __pv_init_lock_hash(void)
*/
pv_lock_hash = alloc_large_system_hash("PV qspinlock",
sizeof(struct pv_hash_entry),
- pv_hash_size, 0, HASH_EARLY,
+ pv_hash_size, 0,
+ HASH_EARLY | HASH_ZERO,
&pv_lock_hash_bits, NULL,
pv_hash_size, pv_hash_size);
}
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index 78069895032a..649dc9d3951a 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -963,7 +963,6 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
return -EDEADLK;
raw_spin_lock(&task->pi_lock);
- rt_mutex_adjust_prio(task);
waiter->task = task;
waiter->lock = lock;
waiter->prio = task->prio;
diff --git a/kernel/locking/rwsem-spinlock.c b/kernel/locking/rwsem-spinlock.c
index c65f7989f850..20819df98125 100644
--- a/kernel/locking/rwsem-spinlock.c
+++ b/kernel/locking/rwsem-spinlock.c
@@ -231,8 +231,8 @@ int __sched __down_write_common(struct rw_semaphore *sem, int state)
out_nolock:
list_del(&waiter.list);
- if (!list_empty(&sem->wait_list))
- __rwsem_do_wake(sem, 1);
+ if (!list_empty(&sem->wait_list) && sem->count >= 0)
+ __rwsem_do_wake(sem, 0);
raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
return -EINTR;
diff --git a/kernel/membarrier.c b/kernel/membarrier.c
deleted file mode 100644
index 9f9284f37f8d..000000000000
--- a/kernel/membarrier.c
+++ /dev/null
@@ -1,70 +0,0 @@
-/*
- * Copyright (C) 2010, 2015 Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
- *
- * membarrier system call
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- */
-
-#include <linux/syscalls.h>
-#include <linux/membarrier.h>
-#include <linux/tick.h>
-
-/*
- * Bitmask made from a "or" of all commands within enum membarrier_cmd,
- * except MEMBARRIER_CMD_QUERY.
- */
-#define MEMBARRIER_CMD_BITMASK (MEMBARRIER_CMD_SHARED)
-
-/**
- * sys_membarrier - issue memory barriers on a set of threads
- * @cmd: Takes command values defined in enum membarrier_cmd.
- * @flags: Currently needs to be 0. For future extensions.
- *
- * If this system call is not implemented, -ENOSYS is returned. If the
- * command specified does not exist, or if the command argument is invalid,
- * this system call returns -EINVAL. For a given command, with flags argument
- * set to 0, this system call is guaranteed to always return the same value
- * until reboot.
- *
- * All memory accesses performed in program order from each targeted thread
- * is guaranteed to be ordered with respect to sys_membarrier(). If we use
- * the semantic "barrier()" to represent a compiler barrier forcing memory
- * accesses to be performed in program order across the barrier, and
- * smp_mb() to represent explicit memory barriers forcing full memory
- * ordering across the barrier, we have the following ordering table for
- * each pair of barrier(), sys_membarrier() and smp_mb():
- *
- * The pair ordering is detailed as (O: ordered, X: not ordered):
- *
- * barrier() smp_mb() sys_membarrier()
- * barrier() X X O
- * smp_mb() X O O
- * sys_membarrier() O O O
- */
-SYSCALL_DEFINE2(membarrier, int, cmd, int, flags)
-{
- /* MEMBARRIER_CMD_SHARED is not compatible with nohz_full. */
- if (tick_nohz_full_enabled())
- return -ENOSYS;
- if (unlikely(flags))
- return -EINVAL;
- switch (cmd) {
- case MEMBARRIER_CMD_QUERY:
- return MEMBARRIER_CMD_BITMASK;
- case MEMBARRIER_CMD_SHARED:
- if (num_online_cpus() > 1)
- synchronize_sched();
- return 0;
- default:
- return -EINVAL;
- }
-}
diff --git a/kernel/memremap.c b/kernel/memremap.c
index 23a6483c3666..124bed776532 100644
--- a/kernel/memremap.c
+++ b/kernel/memremap.c
@@ -358,7 +358,11 @@ void *devm_memremap_pages(struct device *dev, struct resource *res,
goto err_pfn_remap;
mem_hotplug_begin();
- error = arch_add_memory(nid, align_start, align_size, true);
+ error = arch_add_memory(nid, align_start, align_size, false);
+ if (!error)
+ move_pfn_range_to_zone(&NODE_DATA(nid)->node_zones[ZONE_DEVICE],
+ align_start >> PAGE_SHIFT,
+ align_size >> PAGE_SHIFT);
mem_hotplug_done();
if (error)
goto err_add_memory;
diff --git a/kernel/module.c b/kernel/module.c
index 4a3665f8f837..40f983cbea81 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -49,9 +49,7 @@
#include <linux/rculist.h>
#include <linux/uaccess.h>
#include <asm/cacheflush.h>
-#ifdef CONFIG_STRICT_MODULE_RWX
-#include <asm/set_memory.h>
-#endif
+#include <linux/set_memory.h>
#include <asm/mmu_context.h>
#include <linux/license.h>
#include <asm/sections.h>
@@ -302,6 +300,7 @@ int unregister_module_notifier(struct notifier_block *nb)
EXPORT_SYMBOL(unregister_module_notifier);
struct load_info {
+ const char *name;
Elf_Ehdr *hdr;
unsigned long len;
Elf_Shdr *sechdrs;
@@ -602,7 +601,7 @@ static struct module *find_module_all(const char *name, size_t len,
module_assert_mutex_or_preempt();
- list_for_each_entry(mod, &modules, list) {
+ list_for_each_entry_rcu(mod, &modules, list) {
if (!even_unformed && mod->state == MODULE_STATE_UNFORMED)
continue;
if (strlen(mod->name) == len && !memcmp(mod->name, name, len))
@@ -1202,10 +1201,7 @@ static ssize_t store_uevent(struct module_attribute *mattr,
struct module_kobject *mk,
const char *buffer, size_t count)
{
- enum kobject_action action;
-
- if (kobject_action_type(buffer, count, &action) == 0)
- kobject_uevent(&mk->kobj, action);
+ kobject_synth_uevent(&mk->kobj, buffer, count);
return count;
}
@@ -1278,12 +1274,13 @@ static u32 resolve_rel_crc(const s32 *crc)
return *(u32 *)((void *)crc + *crc);
}
-static int check_version(Elf_Shdr *sechdrs,
- unsigned int versindex,
+static int check_version(const struct load_info *info,
const char *symname,
struct module *mod,
const s32 *crc)
{
+ Elf_Shdr *sechdrs = info->sechdrs;
+ unsigned int versindex = info->index.vers;
unsigned int i, num_versions;
struct modversion_info *versions;
@@ -1317,17 +1314,16 @@ static int check_version(Elf_Shdr *sechdrs,
}
/* Broken toolchain. Warn once, then let it go.. */
- pr_warn_once("%s: no symbol version for %s\n", mod->name, symname);
+ pr_warn_once("%s: no symbol version for %s\n", info->name, symname);
return 1;
bad_version:
pr_warn("%s: disagrees about version of symbol %s\n",
- mod->name, symname);
+ info->name, symname);
return 0;
}
-static inline int check_modstruct_version(Elf_Shdr *sechdrs,
- unsigned int versindex,
+static inline int check_modstruct_version(const struct load_info *info,
struct module *mod)
{
const s32 *crc;
@@ -1343,8 +1339,8 @@ static inline int check_modstruct_version(Elf_Shdr *sechdrs,
BUG();
}
preempt_enable();
- return check_version(sechdrs, versindex,
- VMLINUX_SYMBOL_STR(module_layout), mod, crc);
+ return check_version(info, VMLINUX_SYMBOL_STR(module_layout),
+ mod, crc);
}
/* First part is kernel version, which we ignore if module has crcs. */
@@ -1358,8 +1354,7 @@ static inline int same_magic(const char *amagic, const char *bmagic,
return strcmp(amagic, bmagic) == 0;
}
#else
-static inline int check_version(Elf_Shdr *sechdrs,
- unsigned int versindex,
+static inline int check_version(const struct load_info *info,
const char *symname,
struct module *mod,
const s32 *crc)
@@ -1367,8 +1362,7 @@ static inline int check_version(Elf_Shdr *sechdrs,
return 1;
}
-static inline int check_modstruct_version(Elf_Shdr *sechdrs,
- unsigned int versindex,
+static inline int check_modstruct_version(const struct load_info *info,
struct module *mod)
{
return 1;
@@ -1404,7 +1398,7 @@ static const struct kernel_symbol *resolve_symbol(struct module *mod,
if (!sym)
goto unlock;
- if (!check_version(info->sechdrs, info->index.vers, name, mod, crc)) {
+ if (!check_version(info, name, mod, crc)) {
sym = ERR_PTR(-EINVAL);
goto getname;
}
@@ -1667,31 +1661,36 @@ static inline void remove_notes_attrs(struct module *mod)
}
#endif /* CONFIG_KALLSYMS */
-static void add_usage_links(struct module *mod)
+static void del_usage_links(struct module *mod)
{
#ifdef CONFIG_MODULE_UNLOAD
struct module_use *use;
- int nowarn;
mutex_lock(&module_mutex);
- list_for_each_entry(use, &mod->target_list, target_list) {
- nowarn = sysfs_create_link(use->target->holders_dir,
- &mod->mkobj.kobj, mod->name);
- }
+ list_for_each_entry(use, &mod->target_list, target_list)
+ sysfs_remove_link(use->target->holders_dir, mod->name);
mutex_unlock(&module_mutex);
#endif
}
-static void del_usage_links(struct module *mod)
+static int add_usage_links(struct module *mod)
{
+ int ret = 0;
#ifdef CONFIG_MODULE_UNLOAD
struct module_use *use;
mutex_lock(&module_mutex);
- list_for_each_entry(use, &mod->target_list, target_list)
- sysfs_remove_link(use->target->holders_dir, mod->name);
+ list_for_each_entry(use, &mod->target_list, target_list) {
+ ret = sysfs_create_link(use->target->holders_dir,
+ &mod->mkobj.kobj, mod->name);
+ if (ret)
+ break;
+ }
mutex_unlock(&module_mutex);
+ if (ret)
+ del_usage_links(mod);
#endif
+ return ret;
}
static int module_add_modinfo_attrs(struct module *mod)
@@ -1802,13 +1801,18 @@ static int mod_sysfs_setup(struct module *mod,
if (err)
goto out_unreg_param;
- add_usage_links(mod);
+ err = add_usage_links(mod);
+ if (err)
+ goto out_unreg_modinfo_attrs;
+
add_sect_attrs(mod, info);
add_notes_attrs(mod, info);
kobject_uevent(&mod->mkobj.kobj, KOBJ_ADD);
return 0;
+out_unreg_modinfo_attrs:
+ module_remove_modinfo_attrs(mod);
out_unreg_param:
module_param_sysfs_remove(mod);
out_unreg_holders:
@@ -2915,9 +2919,15 @@ static int rewrite_section_headers(struct load_info *info, int flags)
info->index.vers = 0; /* Pretend no __versions section! */
else
info->index.vers = find_sec(info, "__versions");
+ info->sechdrs[info->index.vers].sh_flags &= ~(unsigned long)SHF_ALLOC;
+
info->index.info = find_sec(info, ".modinfo");
+ if (!info->index.info)
+ info->name = "(missing .modinfo section)";
+ else
+ info->name = get_modinfo(info, "name");
info->sechdrs[info->index.info].sh_flags &= ~(unsigned long)SHF_ALLOC;
- info->sechdrs[info->index.vers].sh_flags &= ~(unsigned long)SHF_ALLOC;
+
return 0;
}
@@ -2957,21 +2967,29 @@ static struct module *setup_load_info(struct load_info *info, int flags)
info->index.mod = find_sec(info, ".gnu.linkonce.this_module");
if (!info->index.mod) {
- pr_warn("No module found in object\n");
+ pr_warn("%s: No module found in object\n",
+ info->name ?: "(missing .modinfo name field)");
return ERR_PTR(-ENOEXEC);
}
/* This is temporary: point mod into copy of data. */
mod = (void *)info->sechdrs[info->index.mod].sh_addr;
+ /*
+ * If we didn't load the .modinfo 'name' field, fall back to
+ * the on-disk struct mod 'name' field.
+ */
+ if (!info->name)
+ info->name = mod->name;
+
if (info->index.sym == 0) {
- pr_warn("%s: module has no symbols (stripped?)\n", mod->name);
+ pr_warn("%s: module has no symbols (stripped?)\n", info->name);
return ERR_PTR(-ENOEXEC);
}
info->index.pcpu = find_pcpusec(info);
/* Check module struct version now, before we try to use module. */
- if (!check_modstruct_version(info->sechdrs, info->index.vers, mod))
+ if (!check_modstruct_version(info, mod))
return ERR_PTR(-ENOEXEC);
return mod;
@@ -2992,7 +3010,7 @@ static int check_modinfo(struct module *mod, struct load_info *info, int flags)
return err;
} else if (!same_magic(modmagic, vermagic, info->index.vers)) {
pr_err("%s: version magic '%s' should be '%s'\n",
- mod->name, modmagic, vermagic);
+ info->name, modmagic, vermagic);
return -ENOEXEC;
}
@@ -3077,9 +3095,9 @@ static int find_module_sections(struct module *mod, struct load_info *info)
mod->trace_events = section_objs(info, "_ftrace_events",
sizeof(*mod->trace_events),
&mod->num_trace_events);
- mod->trace_enums = section_objs(info, "_ftrace_enum_map",
- sizeof(*mod->trace_enums),
- &mod->num_trace_enums);
+ mod->trace_evals = section_objs(info, "_ftrace_eval_map",
+ sizeof(*mod->trace_evals),
+ &mod->num_trace_evals);
#endif
#ifdef CONFIG_TRACING
mod->trace_bprintk_fmt_start = section_objs(info, "__trace_printk_fmt",
@@ -3242,7 +3260,7 @@ int __weak module_frob_arch_sections(Elf_Ehdr *hdr,
/* module_blacklist is a comma-separated list of module names */
static char *module_blacklist;
-static bool blacklisted(char *module_name)
+static bool blacklisted(const char *module_name)
{
const char *p;
size_t len;
@@ -3272,7 +3290,7 @@ static struct module *layout_and_allocate(struct load_info *info, int flags)
if (IS_ERR(mod))
return mod;
- if (blacklisted(mod->name))
+ if (blacklisted(info->name))
return ERR_PTR(-EPERM);
err = check_modinfo(mod, info, flags);
@@ -4201,7 +4219,7 @@ const struct exception_table_entry *search_module_extables(unsigned long addr)
goto out;
e = search_extable(mod->extable,
- mod->extable + mod->num_exentries - 1,
+ mod->num_exentries,
addr);
out:
preempt_enable();
diff --git a/kernel/pid.c b/kernel/pid.c
index fd1cde1e4576..020dedbdf066 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -527,8 +527,11 @@ pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type,
if (!ns)
ns = task_active_pid_ns(current);
if (likely(pid_alive(task))) {
- if (type != PIDTYPE_PID)
+ if (type != PIDTYPE_PID) {
+ if (type == __PIDTYPE_TGID)
+ type = PIDTYPE_PID;
task = task->group_leader;
+ }
nr = pid_nr_ns(rcu_dereference(task->pids[type].pid), ns);
}
rcu_read_unlock();
@@ -537,12 +540,6 @@ pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type,
}
EXPORT_SYMBOL(__task_pid_nr_ns);
-pid_t task_tgid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns)
-{
- return pid_nr_ns(task_tgid(tsk), ns);
-}
-EXPORT_SYMBOL(task_tgid_nr_ns);
-
struct pid_namespace *task_active_pid_ns(struct task_struct *tsk)
{
return ns_of_pid(task_pid(tsk));
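The removal of the exported task_tgid_nr_ns() relies on a counterpart header change; the assumption here is that include/linux/sched.h (not shown in this hunk) now provides an inline wrapper along these lines:

	static inline pid_t task_tgid_nr_ns(struct task_struct *tsk,
					    struct pid_namespace *ns)
	{
		return __task_pid_nr_ns(tsk, __PIDTYPE_TGID, ns);
	}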
@@ -575,16 +572,10 @@ struct pid *find_ge_pid(int nr, struct pid_namespace *ns)
*/
void __init pidhash_init(void)
{
- unsigned int i, pidhash_size;
-
pid_hash = alloc_large_system_hash("PID", sizeof(*pid_hash), 0, 18,
- HASH_EARLY | HASH_SMALL,
+ HASH_EARLY | HASH_SMALL | HASH_ZERO,
&pidhash_shift, NULL,
0, 4096);
- pidhash_size = 1U << pidhash_shift;
-
- for (i = 0; i < pidhash_size; i++)
- INIT_HLIST_HEAD(&pid_hash[i]);
}
void __init pidmap_init(void)
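The HASH_ZERO flag is what lets the explicit INIT_HLIST_HEAD() loop above disappear: the allocator hands back an already-zeroed table. A sketch of the pattern with illustrative names:

	#include <linux/bootmem.h>
	#include <linux/list.h>
	#include <linux/init.h>

	static struct hlist_head *my_hash;
	static unsigned int my_hash_shift;

	static void __init my_hash_init(void)
	{
		my_hash = alloc_large_system_hash("my-table", sizeof(*my_hash),
						  0, 18,
						  HASH_EARLY | HASH_SMALL | HASH_ZERO,
						  &my_hash_shift, NULL,
						  0, 4096);
		/* buckets are already zeroed, no per-bucket init needed */
	}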
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index a8b978c35a6a..e1914c7b85b1 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -1108,7 +1108,7 @@ static struct attribute * g[] = {
};
-static struct attribute_group attr_group = {
+static const struct attribute_group attr_group = {
.attrs = g,
};
diff --git a/kernel/power/main.c b/kernel/power/main.c
index d401c21136d1..42bd800a6755 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -705,7 +705,7 @@ static struct attribute * g[] = {
NULL,
};
-static struct attribute_group attr_group = {
+static const struct attribute_group attr_group = {
.attrs = g,
};
diff --git a/kernel/power/process.c b/kernel/power/process.c
index c7209f060eeb..78672d324a6e 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -132,7 +132,7 @@ int freeze_processes(void)
if (!pm_freezing)
atomic_inc(&system_freezing_cnt);
- pm_wakeup_clear();
+ pm_wakeup_clear(true);
pr_info("Freezing user space processes ... ");
pm_freezing = true;
error = try_to_freeze_tasks(true);
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index fa46606f3356..0972a8e09d08 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -30,19 +30,17 @@
#include <linux/slab.h>
#include <linux/compiler.h>
#include <linux/ktime.h>
+#include <linux/set_memory.h>
#include <linux/uaccess.h>
#include <asm/mmu_context.h>
#include <asm/pgtable.h>
#include <asm/tlbflush.h>
#include <asm/io.h>
-#ifdef CONFIG_STRICT_KERNEL_RWX
-#include <asm/set_memory.h>
-#endif
#include "power.h"
-#ifdef CONFIG_STRICT_KERNEL_RWX
+#if defined(CONFIG_STRICT_KERNEL_RWX) && defined(CONFIG_ARCH_HAS_SET_MEMORY)
static bool hibernate_restore_protection;
static bool hibernate_restore_protection_active;
@@ -77,7 +75,7 @@ static inline void hibernate_restore_protection_begin(void) {}
static inline void hibernate_restore_protection_end(void) {}
static inline void hibernate_restore_protect_page(void *page_address) {}
static inline void hibernate_restore_unprotect_page(void *page_address) {}
-#endif /* CONFIG_STRICT_KERNEL_RWX */
+#endif /* CONFIG_STRICT_KERNEL_RWX && CONFIG_ARCH_HAS_SET_MEMORY */
static int swsusp_page_is_free(struct page *);
static void swsusp_set_page_forbidden(struct page *);
@@ -1652,7 +1650,7 @@ static unsigned long minimum_image_size(unsigned long saveable)
{
unsigned long size;
- size = global_page_state(NR_SLAB_RECLAIMABLE)
+ size = global_node_page_state(NR_SLAB_RECLAIMABLE)
+ global_node_page_state(NR_ACTIVE_ANON)
+ global_node_page_state(NR_INACTIVE_ANON)
+ global_node_page_state(NR_ACTIVE_FILE)
@@ -1929,8 +1927,7 @@ static inline unsigned int alloc_highmem_pages(struct memory_bitmap *bm,
* also be located in the high memory, because of the way in which
* copy_data_pages() works.
*/
-static int swsusp_alloc(struct memory_bitmap *orig_bm,
- struct memory_bitmap *copy_bm,
+static int swsusp_alloc(struct memory_bitmap *copy_bm,
unsigned int nr_pages, unsigned int nr_highmem)
{
if (nr_highmem > 0) {
@@ -1976,7 +1973,7 @@ asmlinkage __visible int swsusp_save(void)
return -ENOMEM;
}
- if (swsusp_alloc(&orig_bm, &copy_bm, nr_pages, nr_highmem)) {
+ if (swsusp_alloc(&copy_bm, nr_pages, nr_highmem)) {
printk(KERN_ERR "PM: Memory allocation failed\n");
return -ENOMEM;
}
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 15e6baef5c73..3ecf275d7e44 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -72,6 +72,8 @@ static void freeze_begin(void)
static void freeze_enter(void)
{
+ trace_suspend_resume(TPS("machine_suspend"), PM_SUSPEND_FREEZE, true);
+
spin_lock_irq(&suspend_freeze_lock);
if (pm_wakeup_pending())
goto out;
@@ -84,11 +86,9 @@ static void freeze_enter(void)
/* Push all the CPUs into the idle loop. */
wake_up_all_idle_cpus();
- pr_debug("PM: suspend-to-idle\n");
/* Make the current CPU wait so it can enter the idle loop too. */
wait_event(suspend_freeze_wait_head,
suspend_freeze_state == FREEZE_STATE_WAKE);
- pr_debug("PM: resume from suspend-to-idle\n");
cpuidle_pause();
put_online_cpus();
@@ -98,6 +98,31 @@ static void freeze_enter(void)
out:
suspend_freeze_state = FREEZE_STATE_NONE;
spin_unlock_irq(&suspend_freeze_lock);
+
+ trace_suspend_resume(TPS("machine_suspend"), PM_SUSPEND_FREEZE, false);
+}
+
+static void s2idle_loop(void)
+{
+ pr_debug("PM: suspend-to-idle\n");
+
+ do {
+ freeze_enter();
+
+ if (freeze_ops && freeze_ops->wake)
+ freeze_ops->wake();
+
+ dpm_resume_noirq(PMSG_RESUME);
+ if (freeze_ops && freeze_ops->sync)
+ freeze_ops->sync();
+
+ if (pm_wakeup_pending())
+ break;
+
+ pm_wakeup_clear(false);
+ } while (!dpm_suspend_noirq(PMSG_SUSPEND));
+
+ pr_debug("PM: resume from suspend-to-idle\n");
}
void freeze_wake(void)
@@ -371,10 +396,8 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
* all the devices are suspended.
*/
if (state == PM_SUSPEND_FREEZE) {
- trace_suspend_resume(TPS("machine_suspend"), state, true);
- freeze_enter();
- trace_suspend_resume(TPS("machine_suspend"), state, false);
- goto Platform_wake;
+ s2idle_loop();
+ goto Platform_early_resume;
}
error = disable_nonboot_cpus();
diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h
index 1db044f808b7..2a7d04049af4 100644
--- a/kernel/printk/internal.h
+++ b/kernel/printk/internal.h
@@ -18,12 +18,14 @@
#ifdef CONFIG_PRINTK
-#define PRINTK_SAFE_CONTEXT_MASK 0x7fffffff
-#define PRINTK_NMI_CONTEXT_MASK 0x80000000
+#define PRINTK_SAFE_CONTEXT_MASK 0x3fffffff
+#define PRINTK_NMI_DEFERRED_CONTEXT_MASK 0x40000000
+#define PRINTK_NMI_CONTEXT_MASK 0x80000000
extern raw_spinlock_t logbuf_lock;
__printf(1, 0) int vprintk_default(const char *fmt, va_list args);
+__printf(1, 0) int vprintk_deferred(const char *fmt, va_list args);
__printf(1, 0) int vprintk_func(const char *fmt, va_list args);
void __printk_safe_enter(void);
void __printk_safe_exit(void);
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index bd53ea579dc8..fc47863f629c 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -2720,16 +2720,13 @@ void wake_up_klogd(void)
preempt_enable();
}
-int printk_deferred(const char *fmt, ...)
+int vprintk_deferred(const char *fmt, va_list args)
{
- va_list args;
int r;
- preempt_disable();
- va_start(args, fmt);
r = vprintk_emit(0, LOGLEVEL_SCHED, NULL, 0, fmt, args);
- va_end(args);
+ preempt_disable();
__this_cpu_or(printk_pending, PRINTK_PENDING_OUTPUT);
irq_work_queue(this_cpu_ptr(&wake_up_klogd_work));
preempt_enable();
@@ -2737,6 +2734,18 @@ int printk_deferred(const char *fmt, ...)
return r;
}
+int printk_deferred(const char *fmt, ...)
+{
+ va_list args;
+ int r;
+
+ va_start(args, fmt);
+ r = vprintk_deferred(fmt, args);
+ va_end(args);
+
+ return r;
+}
+
/*
* printk rate limiting, lifted from the networking subsystem.
*
diff --git a/kernel/printk/printk_safe.c b/kernel/printk/printk_safe.c
index 033e50a7d706..3cdaeaef9ce1 100644
--- a/kernel/printk/printk_safe.c
+++ b/kernel/printk/printk_safe.c
@@ -80,8 +80,8 @@ static void queue_flush_work(struct printk_safe_seq_buf *s)
* happen, printk_safe_log_store() will notice the buffer->len mismatch
* and repeat the write.
*/
-static int printk_safe_log_store(struct printk_safe_seq_buf *s,
- const char *fmt, va_list args)
+static __printf(2, 0) int printk_safe_log_store(struct printk_safe_seq_buf *s,
+ const char *fmt, va_list args)
{
int add;
size_t len;
@@ -299,7 +299,7 @@ void printk_safe_flush_on_panic(void)
* one writer running. But the buffer might get flushed from another
* CPU, so we need to be careful.
*/
-static int vprintk_nmi(const char *fmt, va_list args)
+static __printf(1, 0) int vprintk_nmi(const char *fmt, va_list args)
{
struct printk_safe_seq_buf *s = this_cpu_ptr(&nmi_print_seq);
@@ -308,17 +308,29 @@ static int vprintk_nmi(const char *fmt, va_list args)
void printk_nmi_enter(void)
{
- this_cpu_or(printk_context, PRINTK_NMI_CONTEXT_MASK);
+ /*
+ * The size of the extra per-CPU buffer is limited. Use it only when
+ * the main one is locked. If this CPU is not in the safe context,
+ * the lock must be taken on another CPU and we could wait for it.
+ */
+ if ((this_cpu_read(printk_context) & PRINTK_SAFE_CONTEXT_MASK) &&
+ raw_spin_is_locked(&logbuf_lock)) {
+ this_cpu_or(printk_context, PRINTK_NMI_CONTEXT_MASK);
+ } else {
+ this_cpu_or(printk_context, PRINTK_NMI_DEFERRED_CONTEXT_MASK);
+ }
}
void printk_nmi_exit(void)
{
- this_cpu_and(printk_context, ~PRINTK_NMI_CONTEXT_MASK);
+ this_cpu_and(printk_context,
+ ~(PRINTK_NMI_CONTEXT_MASK |
+ PRINTK_NMI_DEFERRED_CONTEXT_MASK));
}
#else
-static int vprintk_nmi(const char *fmt, va_list args)
+static __printf(1, 0) int vprintk_nmi(const char *fmt, va_list args)
{
return 0;
}
@@ -330,7 +342,7 @@ static int vprintk_nmi(const char *fmt, va_list args)
* into itself. It uses a per-CPU buffer to store the message, just like
* NMI.
*/
-static int vprintk_safe(const char *fmt, va_list args)
+static __printf(1, 0) int vprintk_safe(const char *fmt, va_list args)
{
struct printk_safe_seq_buf *s = this_cpu_ptr(&safe_print_seq);
@@ -351,12 +363,22 @@ void __printk_safe_exit(void)
__printf(1, 0) int vprintk_func(const char *fmt, va_list args)
{
+ /* Use extra buffer in NMI when logbuf_lock is taken or in safe mode. */
if (this_cpu_read(printk_context) & PRINTK_NMI_CONTEXT_MASK)
return vprintk_nmi(fmt, args);
+ /* Use extra buffer to prevent a recursion deadlock in safe mode. */
if (this_cpu_read(printk_context) & PRINTK_SAFE_CONTEXT_MASK)
return vprintk_safe(fmt, args);
+ /*
+ * Use the main logbuf when logbuf_lock is available in NMI.
+ * But avoid calling console drivers that might have their own locks.
+ */
+ if (this_cpu_read(printk_context) & PRINTK_NMI_DEFERRED_CONTEXT_MASK)
+ return vprintk_deferred(fmt, args);
+
+ /* No obstacles. */
return vprintk_default(fmt, args);
}
diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig
index be90c945063f..9210379c0353 100644
--- a/kernel/rcu/Kconfig
+++ b/kernel/rcu/Kconfig
@@ -69,8 +69,7 @@ config TREE_SRCU
This option selects the full-fledged version of SRCU.
config TASKS_RCU
- bool
- default n
+ def_bool PREEMPT
select SRCU
help
This option enables a task-based RCU implementation that uses
diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index 808b8c85f626..e4b43fef89f5 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -356,22 +356,10 @@ do { \
#ifdef CONFIG_TINY_RCU
/* Tiny RCU doesn't expedite, as its purpose in life is instead to be tiny. */
-static inline bool rcu_gp_is_normal(void) /* Internal RCU use. */
-{
- return true;
-}
-static inline bool rcu_gp_is_expedited(void) /* Internal RCU use. */
-{
- return false;
-}
-
-static inline void rcu_expedite_gp(void)
-{
-}
-
-static inline void rcu_unexpedite_gp(void)
-{
-}
+static inline bool rcu_gp_is_normal(void) { return true; }
+static inline bool rcu_gp_is_expedited(void) { return false; }
+static inline void rcu_expedite_gp(void) { }
+static inline void rcu_unexpedite_gp(void) { }
#else /* #ifdef CONFIG_TINY_RCU */
bool rcu_gp_is_normal(void); /* Internal RCU use. */
bool rcu_gp_is_expedited(void); /* Internal RCU use. */
@@ -419,12 +407,8 @@ static inline void rcutorture_get_gp_data(enum rcutorture_type test_type,
*gpnum = 0;
*completed = 0;
}
-static inline void rcutorture_record_test_transition(void)
-{
-}
-static inline void rcutorture_record_progress(unsigned long vernum)
-{
-}
+static inline void rcutorture_record_test_transition(void) { }
+static inline void rcutorture_record_progress(unsigned long vernum) { }
#ifdef CONFIG_RCU_TRACE
void do_trace_rcu_torture_read(const char *rcutorturename,
struct rcu_head *rhp,
@@ -460,92 +444,20 @@ void srcutorture_get_gp_data(enum rcutorture_type test_type,
#endif
#ifdef CONFIG_TINY_RCU
-
-/*
- * Return the number of grace periods started.
- */
-static inline unsigned long rcu_batches_started(void)
-{
- return 0;
-}
-
-/*
- * Return the number of bottom-half grace periods started.
- */
-static inline unsigned long rcu_batches_started_bh(void)
-{
- return 0;
-}
-
-/*
- * Return the number of sched grace periods started.
- */
-static inline unsigned long rcu_batches_started_sched(void)
-{
- return 0;
-}
-
-/*
- * Return the number of grace periods completed.
- */
-static inline unsigned long rcu_batches_completed(void)
-{
- return 0;
-}
-
-/*
- * Return the number of bottom-half grace periods completed.
- */
-static inline unsigned long rcu_batches_completed_bh(void)
-{
- return 0;
-}
-
-/*
- * Return the number of sched grace periods completed.
- */
-static inline unsigned long rcu_batches_completed_sched(void)
-{
- return 0;
-}
-
-/*
- * Return the number of expedited grace periods completed.
- */
-static inline unsigned long rcu_exp_batches_completed(void)
-{
- return 0;
-}
-
-/*
- * Return the number of expedited sched grace periods completed.
- */
-static inline unsigned long rcu_exp_batches_completed_sched(void)
-{
- return 0;
-}
-
-static inline unsigned long srcu_batches_completed(struct srcu_struct *sp)
-{
- return 0;
-}
-
-static inline void rcu_force_quiescent_state(void)
-{
-}
-
-static inline void rcu_bh_force_quiescent_state(void)
-{
-}
-
-static inline void rcu_sched_force_quiescent_state(void)
-{
-}
-
-static inline void show_rcu_gp_kthreads(void)
-{
-}
-
+static inline unsigned long rcu_batches_started(void) { return 0; }
+static inline unsigned long rcu_batches_started_bh(void) { return 0; }
+static inline unsigned long rcu_batches_started_sched(void) { return 0; }
+static inline unsigned long rcu_batches_completed(void) { return 0; }
+static inline unsigned long rcu_batches_completed_bh(void) { return 0; }
+static inline unsigned long rcu_batches_completed_sched(void) { return 0; }
+static inline unsigned long rcu_exp_batches_completed(void) { return 0; }
+static inline unsigned long rcu_exp_batches_completed_sched(void) { return 0; }
+static inline unsigned long
+srcu_batches_completed(struct srcu_struct *sp) { return 0; }
+static inline void rcu_force_quiescent_state(void) { }
+static inline void rcu_bh_force_quiescent_state(void) { }
+static inline void rcu_sched_force_quiescent_state(void) { }
+static inline void show_rcu_gp_kthreads(void) { }
#else /* #ifdef CONFIG_TINY_RCU */
extern unsigned long rcutorture_testseq;
extern unsigned long rcutorture_vernum;
diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c
index 2b62a38b080f..7649fcd2c4c7 100644
--- a/kernel/rcu/rcu_segcblist.c
+++ b/kernel/rcu/rcu_segcblist.c
@@ -36,24 +36,6 @@ void rcu_cblist_init(struct rcu_cblist *rclp)
}
/*
- * Debug function to actually count the number of callbacks.
- * If the number exceeds the limit specified, return -1.
- */
-long rcu_cblist_count_cbs(struct rcu_cblist *rclp, long lim)
-{
- int cnt = 0;
- struct rcu_head **rhpp = &rclp->head;
-
- for (;;) {
- if (!*rhpp)
- return cnt;
- if (++cnt > lim)
- return -1;
- rhpp = &(*rhpp)->next;
- }
-}
-
-/*
* Dequeue the oldest rcu_head structure from the specified callback
* list. This function assumes that the callback is non-lazy, but
* the caller can later invoke rcu_cblist_dequeued_lazy() if it
@@ -103,17 +85,6 @@ void rcu_segcblist_disable(struct rcu_segcblist *rsclp)
}
/*
- * Is the specified segment of the specified rcu_segcblist structure
- * empty of callbacks?
- */
-bool rcu_segcblist_segempty(struct rcu_segcblist *rsclp, int seg)
-{
- if (seg == RCU_DONE_TAIL)
- return &rsclp->head == rsclp->tails[RCU_DONE_TAIL];
- return rsclp->tails[seg - 1] == rsclp->tails[seg];
-}
-
-/*
* Does the specified rcu_segcblist structure contain callbacks that
* are ready to be invoked?
*/
@@ -134,50 +105,6 @@ bool rcu_segcblist_pend_cbs(struct rcu_segcblist *rsclp)
}
/*
- * Dequeue and return the first ready-to-invoke callback. If there
- * are no ready-to-invoke callbacks, return NULL. Disables interrupts
- * to avoid interference. Does not protect from interference from other
- * CPUs or tasks.
- */
-struct rcu_head *rcu_segcblist_dequeue(struct rcu_segcblist *rsclp)
-{
- unsigned long flags;
- int i;
- struct rcu_head *rhp;
-
- local_irq_save(flags);
- if (!rcu_segcblist_ready_cbs(rsclp)) {
- local_irq_restore(flags);
- return NULL;
- }
- rhp = rsclp->head;
- BUG_ON(!rhp);
- rsclp->head = rhp->next;
- for (i = RCU_DONE_TAIL; i < RCU_CBLIST_NSEGS; i++) {
- if (rsclp->tails[i] != &rhp->next)
- break;
- rsclp->tails[i] = &rsclp->head;
- }
- smp_mb(); /* Dequeue before decrement for rcu_barrier(). */
- WRITE_ONCE(rsclp->len, rsclp->len - 1);
- local_irq_restore(flags);
- return rhp;
-}
-
-/*
- * Account for the fact that a previously dequeued callback turned out
- * to be marked as lazy.
- */
-void rcu_segcblist_dequeued_lazy(struct rcu_segcblist *rsclp)
-{
- unsigned long flags;
-
- local_irq_save(flags);
- rsclp->len_lazy--;
- local_irq_restore(flags);
-}
-
-/*
* Return a pointer to the first callback in the specified rcu_segcblist
* structure. This is useful for diagnostics.
*/
@@ -203,17 +130,6 @@ struct rcu_head *rcu_segcblist_first_pend_cb(struct rcu_segcblist *rsclp)
}
/*
- * Does the specified rcu_segcblist structure contain callbacks that
- * have not yet been processed beyond having been posted, that is,
- * does it contain callbacks in its last segment?
- */
-bool rcu_segcblist_new_cbs(struct rcu_segcblist *rsclp)
-{
- return rcu_segcblist_is_enabled(rsclp) &&
- !rcu_segcblist_restempty(rsclp, RCU_NEXT_READY_TAIL);
-}
-
-/*
* Enqueue the specified callback onto the specified rcu_segcblist
* structure, updating accounting as needed. Note that the ->len
* field may be accessed locklessly, hence the WRITE_ONCE().
@@ -503,3 +419,27 @@ bool rcu_segcblist_future_gp_needed(struct rcu_segcblist *rsclp,
return true;
return false;
}
+
+/*
+ * Merge the source rcu_segcblist structure into the destination
+ * rcu_segcblist structure, then initialize the source. Any pending
+ * callbacks from the source get to start over. It is best to
+ * advance and accelerate both the destination and the source
+ * before merging.
+ */
+void rcu_segcblist_merge(struct rcu_segcblist *dst_rsclp,
+ struct rcu_segcblist *src_rsclp)
+{
+ struct rcu_cblist donecbs;
+ struct rcu_cblist pendcbs;
+
+ rcu_cblist_init(&donecbs);
+ rcu_cblist_init(&pendcbs);
+ rcu_segcblist_extract_count(src_rsclp, &donecbs);
+ rcu_segcblist_extract_done_cbs(src_rsclp, &donecbs);
+ rcu_segcblist_extract_pend_cbs(src_rsclp, &pendcbs);
+ rcu_segcblist_insert_count(dst_rsclp, &donecbs);
+ rcu_segcblist_insert_done_cbs(dst_rsclp, &donecbs);
+ rcu_segcblist_insert_pend_cbs(dst_rsclp, &pendcbs);
+ rcu_segcblist_init(src_rsclp);
+}
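A hedged sketch of how a caller might use the new helper (illustrative only; the intended user is the path that folds an outgoing CPU's callbacks into another CPU's list, where the grace-period sequence handling is more involved):

	#include "rcu_segcblist.h"	/* kernel-internal header */

	/* Advance (and ideally accelerate) both lists against the current
	 * grace-period sequence number before folding src into dst. */
	static void my_migrate_cbs(struct rcu_segcblist *dst,
				   struct rcu_segcblist *src,
				   unsigned long gp_seq)
	{
		rcu_segcblist_advance(dst, gp_seq);
		rcu_segcblist_advance(src, gp_seq);
		rcu_segcblist_merge(dst, src);	/* src ends up re-initialized */
	}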
diff --git a/kernel/rcu/rcu_segcblist.h b/kernel/rcu/rcu_segcblist.h
index 6e36e36478cd..581c12b63544 100644
--- a/kernel/rcu/rcu_segcblist.h
+++ b/kernel/rcu/rcu_segcblist.h
@@ -31,29 +31,7 @@ static inline void rcu_cblist_dequeued_lazy(struct rcu_cblist *rclp)
rclp->len_lazy--;
}
-/*
- * Interim function to return rcu_cblist head pointer. Longer term, the
- * rcu_cblist will be used more pervasively, removing the need for this
- * function.
- */
-static inline struct rcu_head *rcu_cblist_head(struct rcu_cblist *rclp)
-{
- return rclp->head;
-}
-
-/*
- * Interim function to return rcu_cblist head pointer. Longer term, the
- * rcu_cblist will be used more pervasively, removing the need for this
- * function.
- */
-static inline struct rcu_head **rcu_cblist_tail(struct rcu_cblist *rclp)
-{
- WARN_ON_ONCE(!rclp->head);
- return rclp->tail;
-}
-
void rcu_cblist_init(struct rcu_cblist *rclp);
-long rcu_cblist_count_cbs(struct rcu_cblist *rclp, long lim);
struct rcu_head *rcu_cblist_dequeue(struct rcu_cblist *rclp);
/*
@@ -134,14 +112,10 @@ static inline struct rcu_head **rcu_segcblist_tail(struct rcu_segcblist *rsclp)
void rcu_segcblist_init(struct rcu_segcblist *rsclp);
void rcu_segcblist_disable(struct rcu_segcblist *rsclp);
-bool rcu_segcblist_segempty(struct rcu_segcblist *rsclp, int seg);
bool rcu_segcblist_ready_cbs(struct rcu_segcblist *rsclp);
bool rcu_segcblist_pend_cbs(struct rcu_segcblist *rsclp);
-struct rcu_head *rcu_segcblist_dequeue(struct rcu_segcblist *rsclp);
-void rcu_segcblist_dequeued_lazy(struct rcu_segcblist *rsclp);
struct rcu_head *rcu_segcblist_first_cb(struct rcu_segcblist *rsclp);
struct rcu_head *rcu_segcblist_first_pend_cb(struct rcu_segcblist *rsclp);
-bool rcu_segcblist_new_cbs(struct rcu_segcblist *rsclp);
void rcu_segcblist_enqueue(struct rcu_segcblist *rsclp,
struct rcu_head *rhp, bool lazy);
bool rcu_segcblist_entrain(struct rcu_segcblist *rsclp,
@@ -162,3 +136,5 @@ void rcu_segcblist_advance(struct rcu_segcblist *rsclp, unsigned long seq);
bool rcu_segcblist_accelerate(struct rcu_segcblist *rsclp, unsigned long seq);
bool rcu_segcblist_future_gp_needed(struct rcu_segcblist *rsclp,
unsigned long seq);
+void rcu_segcblist_merge(struct rcu_segcblist *dst_rsclp,
+ struct rcu_segcblist *src_rsclp);
diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c
index 3cc18110b612..1f87a02c3399 100644
--- a/kernel/rcu/rcuperf.c
+++ b/kernel/rcu/rcuperf.c
@@ -317,8 +317,6 @@ static struct rcu_perf_ops sched_ops = {
.name = "sched"
};
-#ifdef CONFIG_TASKS_RCU
-
/*
* Definitions for RCU-tasks perf testing.
*/
@@ -346,24 +344,11 @@ static struct rcu_perf_ops tasks_ops = {
.name = "tasks"
};
-#define RCUPERF_TASKS_OPS &tasks_ops,
-
static bool __maybe_unused torturing_tasks(void)
{
return cur_ops == &tasks_ops;
}
-#else /* #ifdef CONFIG_TASKS_RCU */
-
-#define RCUPERF_TASKS_OPS
-
-static bool __maybe_unused torturing_tasks(void)
-{
- return false;
-}
-
-#endif /* #else #ifdef CONFIG_TASKS_RCU */
-
/*
* If performance tests complete, wait for shutdown to commence.
*/
@@ -658,7 +643,7 @@ rcu_perf_init(void)
int firsterr = 0;
static struct rcu_perf_ops *perf_ops[] = {
&rcu_ops, &rcu_bh_ops, &srcu_ops, &srcud_ops, &sched_ops,
- RCUPERF_TASKS_OPS
+ &tasks_ops,
};
if (!torture_init_begin(perf_type, verbose, &perf_runnable))
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index b8f7f8ce8575..45f2ffbc1e78 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -199,7 +199,8 @@ MODULE_PARM_DESC(torture_runnable, "Start rcutorture at boot");
static u64 notrace rcu_trace_clock_local(void)
{
u64 ts = trace_clock_local();
- unsigned long __maybe_unused ts_rem = do_div(ts, NSEC_PER_USEC);
+
+ (void)do_div(ts, NSEC_PER_USEC);
return ts;
}
#else /* #ifdef CONFIG_RCU_TRACE */
@@ -496,7 +497,7 @@ static struct rcu_torture_ops rcu_busted_ops = {
.fqs = NULL,
.stats = NULL,
.irq_capable = 1,
- .name = "rcu_busted"
+ .name = "busted"
};
/*
@@ -522,7 +523,7 @@ static void srcu_read_delay(struct torture_random_state *rrsp)
delay = torture_random(rrsp) %
(nrealreaders * 2 * longdelay * uspertick);
- if (!delay)
+ if (!delay && in_task())
schedule_timeout_interruptible(longdelay);
else
rcu_read_delay(rrsp);
@@ -561,44 +562,7 @@ static void srcu_torture_barrier(void)
static void srcu_torture_stats(void)
{
- int __maybe_unused cpu;
- int idx;
-
-#ifdef CONFIG_TREE_SRCU
- idx = srcu_ctlp->srcu_idx & 0x1;
- pr_alert("%s%s Tree SRCU per-CPU(idx=%d):",
- torture_type, TORTURE_FLAG, idx);
- for_each_possible_cpu(cpu) {
- unsigned long l0, l1;
- unsigned long u0, u1;
- long c0, c1;
- struct srcu_data *counts;
-
- counts = per_cpu_ptr(srcu_ctlp->sda, cpu);
- u0 = counts->srcu_unlock_count[!idx];
- u1 = counts->srcu_unlock_count[idx];
-
- /*
- * Make sure that a lock is always counted if the corresponding
- * unlock is counted.
- */
- smp_rmb();
-
- l0 = counts->srcu_lock_count[!idx];
- l1 = counts->srcu_lock_count[idx];
-
- c0 = l0 - u0;
- c1 = l1 - u1;
- pr_cont(" %d(%ld,%ld)", cpu, c0, c1);
- }
- pr_cont("\n");
-#elif defined(CONFIG_TINY_SRCU)
- idx = READ_ONCE(srcu_ctlp->srcu_idx) & 0x1;
- pr_alert("%s%s Tiny SRCU per-CPU(idx=%d): (%hd,%hd)\n",
- torture_type, TORTURE_FLAG, idx,
- READ_ONCE(srcu_ctlp->srcu_lock_nesting[!idx]),
- READ_ONCE(srcu_ctlp->srcu_lock_nesting[idx]));
-#endif
+ srcu_torture_stats_print(srcu_ctlp, torture_type, TORTURE_FLAG);
}
static void srcu_torture_synchronize_expedited(void)
@@ -620,6 +584,7 @@ static struct rcu_torture_ops srcu_ops = {
.call = srcu_torture_call,
.cb_barrier = srcu_torture_barrier,
.stats = srcu_torture_stats,
+ .irq_capable = 1,
.name = "srcu"
};
@@ -652,6 +617,7 @@ static struct rcu_torture_ops srcud_ops = {
.call = srcu_torture_call,
.cb_barrier = srcu_torture_barrier,
.stats = srcu_torture_stats,
+ .irq_capable = 1,
.name = "srcud"
};
@@ -696,8 +662,6 @@ static struct rcu_torture_ops sched_ops = {
.name = "sched"
};
-#ifdef CONFIG_TASKS_RCU
-
/*
* Definitions for RCU-tasks torture testing.
*/
@@ -735,24 +699,11 @@ static struct rcu_torture_ops tasks_ops = {
.name = "tasks"
};
-#define RCUTORTURE_TASKS_OPS &tasks_ops,
-
static bool __maybe_unused torturing_tasks(void)
{
return cur_ops == &tasks_ops;
}
-#else /* #ifdef CONFIG_TASKS_RCU */
-
-#define RCUTORTURE_TASKS_OPS
-
-static bool __maybe_unused torturing_tasks(void)
-{
- return false;
-}
-
-#endif /* #else #ifdef CONFIG_TASKS_RCU */
-
/*
* RCU torture priority-boost testing. Runs one real-time thread per
* CPU for moderate bursts, repeatedly registering RCU callbacks and
@@ -1114,6 +1065,11 @@ rcu_torture_fakewriter(void *arg)
return 0;
}
+static void rcu_torture_timer_cb(struct rcu_head *rhp)
+{
+ kfree(rhp);
+}
+
/*
* RCU torture reader from timer handler. Dereferences rcu_torture_current,
* incrementing the corresponding element of the pipeline array. The
@@ -1176,6 +1132,14 @@ static void rcu_torture_timer(unsigned long unused)
__this_cpu_inc(rcu_torture_batch[completed]);
preempt_enable();
cur_ops->readunlock(idx);
+
+ /* Test call_rcu() invocation from interrupt handler. */
+ if (cur_ops->call) {
+ struct rcu_head *rhp = kmalloc(sizeof(*rhp), GFP_NOWAIT);
+
+ if (rhp)
+ cur_ops->call(rhp, rcu_torture_timer_cb);
+ }
}
/*
@@ -1354,11 +1318,12 @@ rcu_torture_stats_print(void)
srcutorture_get_gp_data(cur_ops->ttype, srcu_ctlp,
&flags, &gpnum, &completed);
wtp = READ_ONCE(writer_task);
- pr_alert("??? Writer stall state %s(%d) g%lu c%lu f%#x ->state %#lx\n",
+ pr_alert("??? Writer stall state %s(%d) g%lu c%lu f%#x ->state %#lx cpu %d\n",
rcu_torture_writer_state_getname(),
rcu_torture_writer_state,
gpnum, completed, flags,
- wtp == NULL ? ~0UL : wtp->state);
+ wtp == NULL ? ~0UL : wtp->state,
+ wtp == NULL ? -1 : (int)task_cpu(wtp));
show_rcu_gp_kthreads();
rcu_ftrace_dump(DUMP_ALL);
}
@@ -1749,7 +1714,7 @@ rcu_torture_init(void)
int firsterr = 0;
static struct rcu_torture_ops *torture_ops[] = {
&rcu_ops, &rcu_bh_ops, &rcu_busted_ops, &srcu_ops, &srcud_ops,
- &sched_ops, RCUTORTURE_TASKS_OPS
+ &sched_ops, &tasks_ops,
};
if (!torture_init_begin(torture_type, verbose, &torture_runnable))
diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c
index 1a1c1047d2ed..76ac5f50b2c7 100644
--- a/kernel/rcu/srcutiny.c
+++ b/kernel/rcu/srcutiny.c
@@ -33,6 +33,8 @@
#include "rcu_segcblist.h"
#include "rcu.h"
+int rcu_scheduler_active __read_mostly;
+
static int init_srcu_struct_fields(struct srcu_struct *sp)
{
sp->srcu_lock_nesting[0] = 0;
@@ -193,3 +195,9 @@ void synchronize_srcu(struct srcu_struct *sp)
destroy_rcu_head_on_stack(&rs.head);
}
EXPORT_SYMBOL_GPL(synchronize_srcu);
+
+/* Lockdep diagnostics. */
+void __init rcu_scheduler_starting(void)
+{
+ rcu_scheduler_active = RCU_SCHEDULER_RUNNING;
+}
diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
index d0ca524bf042..729a8706751d 100644
--- a/kernel/rcu/srcutree.c
+++ b/kernel/rcu/srcutree.c
@@ -51,6 +51,7 @@ module_param(counter_wrap_check, ulong, 0444);
static void srcu_invoke_callbacks(struct work_struct *work);
static void srcu_reschedule(struct srcu_struct *sp, unsigned long delay);
+static void process_srcu(struct work_struct *work);
/*
* Initialize SRCU combining tree. Note that statically allocated
@@ -896,6 +897,15 @@ static void __synchronize_srcu(struct srcu_struct *sp, bool do_norm)
__call_srcu(sp, &rcu.head, wakeme_after_rcu, do_norm);
wait_for_completion(&rcu.completion);
destroy_rcu_head_on_stack(&rcu.head);
+
+ /*
+ * Make sure that later code is ordered after the SRCU grace
+ * period. This pairs with the raw_spin_lock_irq_rcu_node()
+ * in srcu_invoke_callbacks(). Unlike Tree RCU, this is needed
+ * because the current CPU might have been totally uninvolved with
+ * (and thus unordered against) that grace period.
+ */
+ smp_mb();
}
/**
@@ -1194,7 +1204,7 @@ static void srcu_reschedule(struct srcu_struct *sp, unsigned long delay)
/*
* This is the work-queue function that handles SRCU grace periods.
*/
-void process_srcu(struct work_struct *work)
+static void process_srcu(struct work_struct *work)
{
struct srcu_struct *sp;
@@ -1203,7 +1213,6 @@ void process_srcu(struct work_struct *work)
srcu_advance_state(sp);
srcu_reschedule(sp, srcu_get_delay(sp));
}
-EXPORT_SYMBOL_GPL(process_srcu);
void srcutorture_get_gp_data(enum rcutorture_type test_type,
struct srcu_struct *sp, int *flags,
@@ -1217,6 +1226,43 @@ void srcutorture_get_gp_data(enum rcutorture_type test_type,
}
EXPORT_SYMBOL_GPL(srcutorture_get_gp_data);
+void srcu_torture_stats_print(struct srcu_struct *sp, char *tt, char *tf)
+{
+ int cpu;
+ int idx;
+ unsigned long s0 = 0, s1 = 0;
+
+ idx = sp->srcu_idx & 0x1;
+ pr_alert("%s%s Tree SRCU per-CPU(idx=%d):", tt, tf, idx);
+ for_each_possible_cpu(cpu) {
+ unsigned long l0, l1;
+ unsigned long u0, u1;
+ long c0, c1;
+ struct srcu_data *counts;
+
+ counts = per_cpu_ptr(sp->sda, cpu);
+ u0 = counts->srcu_unlock_count[!idx];
+ u1 = counts->srcu_unlock_count[idx];
+
+ /*
+ * Make sure that a lock is always counted if the corresponding
+ * unlock is counted.
+ */
+ smp_rmb();
+
+ l0 = counts->srcu_lock_count[!idx];
+ l1 = counts->srcu_lock_count[idx];
+
+ c0 = l0 - u0;
+ c1 = l1 - u1;
+ pr_cont(" %d(%ld,%ld)", cpu, c0, c1);
+ s0 += c0;
+ s1 += c1;
+ }
+ pr_cont(" T(%ld,%ld)\n", s0, s1);
+}
+EXPORT_SYMBOL_GPL(srcu_torture_stats_print);
+
static int __init srcu_bootup_announce(void)
{
pr_info("Hierarchical SRCU implementation.\n");
diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c
index f8488965250f..a64eee0db39e 100644
--- a/kernel/rcu/tiny.c
+++ b/kernel/rcu/tiny.c
@@ -56,8 +56,6 @@ static struct rcu_ctrlblk rcu_bh_ctrlblk = {
.curtail = &rcu_bh_ctrlblk.rcucblist,
};
-#include "tiny_plugin.h"
-
void rcu_barrier_bh(void)
{
wait_rcu_gp(call_rcu_bh);
diff --git a/kernel/rcu/tiny_plugin.h b/kernel/rcu/tiny_plugin.h
deleted file mode 100644
index f0a01b2a3062..000000000000
--- a/kernel/rcu/tiny_plugin.h
+++ /dev/null
@@ -1,47 +0,0 @@
-/*
- * Read-Copy Update mechanism for mutual exclusion, the Bloatwatch edition
- * Internal non-public definitions that provide either classic
- * or preemptible semantics.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, you can access it online at
- * http://www.gnu.org/licenses/gpl-2.0.html.
- *
- * Copyright (c) 2010 Linaro
- *
- * Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
- */
-
-#if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_SRCU)
-#include <linux/kernel_stat.h>
-
-int rcu_scheduler_active __read_mostly;
-EXPORT_SYMBOL_GPL(rcu_scheduler_active);
-
-/*
- * During boot, we forgive RCU lockdep issues. After this function is
- * invoked, we start taking RCU lockdep issues seriously. Note that unlike
- * Tree RCU, Tiny RCU transitions directly from RCU_SCHEDULER_INACTIVE
- * to RCU_SCHEDULER_RUNNING, skipping the RCU_SCHEDULER_INIT stage.
- * The reason for this is that Tiny RCU does not need kthreads, so does
- * not have to care about the fact that the scheduler is half-initialized
- * at a certain phase of the boot process. Unless SRCU is in the mix.
- */
-void __init rcu_scheduler_starting(void)
-{
- WARN_ON(nr_context_switches() > 0);
- rcu_scheduler_active = IS_ENABLED(CONFIG_SRCU)
- ? RCU_SCHEDULER_INIT : RCU_SCHEDULER_RUNNING;
-}
-
-#endif /* #if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_SRCU) */
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 51d4c3acf32d..84fe96641b2e 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -97,9 +97,6 @@ struct rcu_state sname##_state = { \
.gp_state = RCU_GP_IDLE, \
.gpnum = 0UL - 300UL, \
.completed = 0UL - 300UL, \
- .orphan_lock = __RAW_SPIN_LOCK_UNLOCKED(&sname##_state.orphan_lock), \
- .orphan_pend = RCU_CBLIST_INITIALIZER(sname##_state.orphan_pend), \
- .orphan_done = RCU_CBLIST_INITIALIZER(sname##_state.orphan_done), \
.barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \
.name = RCU_STATE_NAME(sname), \
.abbr = sabbr, \
@@ -843,13 +840,9 @@ static void rcu_eqs_enter(bool user)
*/
void rcu_idle_enter(void)
{
- unsigned long flags;
-
- local_irq_save(flags);
+ RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_idle_enter() invoked with irqs enabled!!!");
rcu_eqs_enter(false);
- local_irq_restore(flags);
}
-EXPORT_SYMBOL_GPL(rcu_idle_enter);
#ifdef CONFIG_NO_HZ_FULL
/**
@@ -862,7 +855,8 @@ EXPORT_SYMBOL_GPL(rcu_idle_enter);
*/
void rcu_user_enter(void)
{
- rcu_eqs_enter(1);
+ RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_user_enter() invoked with irqs enabled!!!");
+ rcu_eqs_enter(true);
}
#endif /* CONFIG_NO_HZ_FULL */
@@ -955,8 +949,10 @@ static void rcu_eqs_exit(bool user)
if (oldval & DYNTICK_TASK_NEST_MASK) {
rdtp->dynticks_nesting += DYNTICK_TASK_NEST_VALUE;
} else {
+ __this_cpu_inc(disable_rcu_irq_enter);
rdtp->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE;
rcu_eqs_exit_common(oldval, user);
+ __this_cpu_dec(disable_rcu_irq_enter);
}
}
@@ -979,7 +975,6 @@ void rcu_idle_exit(void)
rcu_eqs_exit(false);
local_irq_restore(flags);
}
-EXPORT_SYMBOL_GPL(rcu_idle_exit);
#ifdef CONFIG_NO_HZ_FULL
/**
@@ -1358,12 +1353,13 @@ static void rcu_check_gp_kthread_starvation(struct rcu_state *rsp)
j = jiffies;
gpa = READ_ONCE(rsp->gp_activity);
if (j - gpa > 2 * HZ) {
- pr_err("%s kthread starved for %ld jiffies! g%lu c%lu f%#x %s(%d) ->state=%#lx\n",
+ pr_err("%s kthread starved for %ld jiffies! g%lu c%lu f%#x %s(%d) ->state=%#lx ->cpu=%d\n",
rsp->name, j - gpa,
rsp->gpnum, rsp->completed,
rsp->gp_flags,
gp_state_getname(rsp->gp_state), rsp->gp_state,
- rsp->gp_kthread ? rsp->gp_kthread->state : ~0);
+ rsp->gp_kthread ? rsp->gp_kthread->state : ~0,
+ rsp->gp_kthread ? task_cpu(rsp->gp_kthread) : -1);
if (rsp->gp_kthread) {
sched_show_task(rsp->gp_kthread);
wake_up_process(rsp->gp_kthread);
@@ -2067,8 +2063,8 @@ static bool rcu_gp_init(struct rcu_state *rsp)
}
/*
- * Helper function for wait_event_interruptible_timeout() wakeup
- * at force-quiescent-state time.
+ * Helper function for swait_event_idle() wakeup at force-quiescent-state
+ * time.
*/
static bool rcu_gp_fqs_check_wake(struct rcu_state *rsp, int *gfp)
{
@@ -2206,9 +2202,8 @@ static int __noreturn rcu_gp_kthread(void *arg)
READ_ONCE(rsp->gpnum),
TPS("reqwait"));
rsp->gp_state = RCU_GP_WAIT_GPS;
- swait_event_interruptible(rsp->gp_wq,
- READ_ONCE(rsp->gp_flags) &
- RCU_GP_FLAG_INIT);
+ swait_event_idle(rsp->gp_wq, READ_ONCE(rsp->gp_flags) &
+ RCU_GP_FLAG_INIT);
rsp->gp_state = RCU_GP_DONE_GPS;
/* Locking provides needed memory barrier. */
if (rcu_gp_init(rsp))
@@ -2239,7 +2234,7 @@ static int __noreturn rcu_gp_kthread(void *arg)
READ_ONCE(rsp->gpnum),
TPS("fqswait"));
rsp->gp_state = RCU_GP_WAIT_FQS;
- ret = swait_event_interruptible_timeout(rsp->gp_wq,
+ ret = swait_event_idle_timeout(rsp->gp_wq,
rcu_gp_fqs_check_wake(rsp, &gf), j);
rsp->gp_state = RCU_GP_DOING_FQS;
/* Locking provides needed memory barriers. */
@@ -2409,6 +2404,8 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp,
return;
}
WARN_ON_ONCE(oldmask); /* Any child must be all zeroed! */
+ WARN_ON_ONCE(rnp->level != rcu_num_lvls - 1 &&
+ rcu_preempt_blocked_readers_cgp(rnp));
rnp->qsmask &= ~mask;
trace_rcu_quiescent_state_report(rsp->name, rnp->gpnum,
mask, rnp->qsmask, rnp->level,
@@ -2563,85 +2560,6 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp)
}
/*
- * Send the specified CPU's RCU callbacks to the orphanage. The
- * specified CPU must be offline, and the caller must hold the
- * ->orphan_lock.
- */
-static void
-rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp,
- struct rcu_node *rnp, struct rcu_data *rdp)
-{
- lockdep_assert_held(&rsp->orphan_lock);
-
- /* No-CBs CPUs do not have orphanable callbacks. */
- if (!IS_ENABLED(CONFIG_HOTPLUG_CPU) || rcu_is_nocb_cpu(rdp->cpu))
- return;
-
- /*
- * Orphan the callbacks. First adjust the counts. This is safe
- * because _rcu_barrier() excludes CPU-hotplug operations, so it
- * cannot be running now. Thus no memory barrier is required.
- */
- rdp->n_cbs_orphaned += rcu_segcblist_n_cbs(&rdp->cblist);
- rcu_segcblist_extract_count(&rdp->cblist, &rsp->orphan_done);
-
- /*
- * Next, move those callbacks still needing a grace period to
- * the orphanage, where some other CPU will pick them up.
- * Some of the callbacks might have gone partway through a grace
- * period, but that is too bad. They get to start over because we
- * cannot assume that grace periods are synchronized across CPUs.
- */
- rcu_segcblist_extract_pend_cbs(&rdp->cblist, &rsp->orphan_pend);
-
- /*
- * Then move the ready-to-invoke callbacks to the orphanage,
- * where some other CPU will pick them up. These will not be
- * required to pass though another grace period: They are done.
- */
- rcu_segcblist_extract_done_cbs(&rdp->cblist, &rsp->orphan_done);
-
- /* Finally, disallow further callbacks on this CPU. */
- rcu_segcblist_disable(&rdp->cblist);
-}
-
-/*
- * Adopt the RCU callbacks from the specified rcu_state structure's
- * orphanage. The caller must hold the ->orphan_lock.
- */
-static void rcu_adopt_orphan_cbs(struct rcu_state *rsp, unsigned long flags)
-{
- struct rcu_data *rdp = raw_cpu_ptr(rsp->rda);
-
- lockdep_assert_held(&rsp->orphan_lock);
-
- /* No-CBs CPUs are handled specially. */
- if (!IS_ENABLED(CONFIG_HOTPLUG_CPU) ||
- rcu_nocb_adopt_orphan_cbs(rsp, rdp, flags))
- return;
-
- /* Do the accounting first. */
- rdp->n_cbs_adopted += rsp->orphan_done.len;
- if (rsp->orphan_done.len_lazy != rsp->orphan_done.len)
- rcu_idle_count_callbacks_posted();
- rcu_segcblist_insert_count(&rdp->cblist, &rsp->orphan_done);
-
- /*
- * We do not need a memory barrier here because the only way we
- * can get here if there is an rcu_barrier() in flight is if
- * we are the task doing the rcu_barrier().
- */
-
- /* First adopt the ready-to-invoke callbacks, then the done ones. */
- rcu_segcblist_insert_done_cbs(&rdp->cblist, &rsp->orphan_done);
- WARN_ON_ONCE(rsp->orphan_done.head);
- rcu_segcblist_insert_pend_cbs(&rdp->cblist, &rsp->orphan_pend);
- WARN_ON_ONCE(rsp->orphan_pend.head);
- WARN_ON_ONCE(rcu_segcblist_empty(&rdp->cblist) !=
- !rcu_segcblist_n_cbs(&rdp->cblist));
-}
-
-/*
* Trace the fact that this CPU is going offline.
*/
static void rcu_cleanup_dying_cpu(struct rcu_state *rsp)
@@ -2704,14 +2622,12 @@ static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf)
/*
* The CPU has been completely removed, and some other CPU is reporting
- * this fact from process context. Do the remainder of the cleanup,
- * including orphaning the outgoing CPU's RCU callbacks, and also
- * adopting them. There can only be one CPU hotplug operation at a time,
- * so no other CPU can be attempting to update rcu_cpu_kthread_task.
+ * this fact from process context. Do the remainder of the cleanup.
+ * There can only be one CPU hotplug operation at a time, so no need for
+ * explicit locking.
*/
static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
{
- unsigned long flags;
struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */
@@ -2720,18 +2636,6 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
/* Adjust any no-longer-needed kthreads. */
rcu_boost_kthread_setaffinity(rnp, -1);
-
- /* Orphan the dead CPU's callbacks, and adopt them if appropriate. */
- raw_spin_lock_irqsave(&rsp->orphan_lock, flags);
- rcu_send_cbs_to_orphanage(cpu, rsp, rnp, rdp);
- rcu_adopt_orphan_cbs(rsp, flags);
- raw_spin_unlock_irqrestore(&rsp->orphan_lock, flags);
-
- WARN_ONCE(rcu_segcblist_n_cbs(&rdp->cblist) != 0 ||
- !rcu_segcblist_empty(&rdp->cblist),
- "rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, 1stCB=%p\n",
- cpu, rcu_segcblist_n_cbs(&rdp->cblist),
- rcu_segcblist_first_cb(&rdp->cblist));
}
/*
@@ -3569,10 +3473,11 @@ static void rcu_barrier_callback(struct rcu_head *rhp)
struct rcu_state *rsp = rdp->rsp;
if (atomic_dec_and_test(&rsp->barrier_cpu_count)) {
- _rcu_barrier_trace(rsp, "LastCB", -1, rsp->barrier_sequence);
+ _rcu_barrier_trace(rsp, TPS("LastCB"), -1,
+ rsp->barrier_sequence);
complete(&rsp->barrier_completion);
} else {
- _rcu_barrier_trace(rsp, "CB", -1, rsp->barrier_sequence);
+ _rcu_barrier_trace(rsp, TPS("CB"), -1, rsp->barrier_sequence);
}
}
@@ -3584,14 +3489,15 @@ static void rcu_barrier_func(void *type)
struct rcu_state *rsp = type;
struct rcu_data *rdp = raw_cpu_ptr(rsp->rda);
- _rcu_barrier_trace(rsp, "IRQ", -1, rsp->barrier_sequence);
+ _rcu_barrier_trace(rsp, TPS("IRQ"), -1, rsp->barrier_sequence);
rdp->barrier_head.func = rcu_barrier_callback;
debug_rcu_head_queue(&rdp->barrier_head);
if (rcu_segcblist_entrain(&rdp->cblist, &rdp->barrier_head, 0)) {
atomic_inc(&rsp->barrier_cpu_count);
} else {
debug_rcu_head_unqueue(&rdp->barrier_head);
- _rcu_barrier_trace(rsp, "IRQNQ", -1, rsp->barrier_sequence);
+ _rcu_barrier_trace(rsp, TPS("IRQNQ"), -1,
+ rsp->barrier_sequence);
}
}
@@ -3605,14 +3511,15 @@ static void _rcu_barrier(struct rcu_state *rsp)
struct rcu_data *rdp;
unsigned long s = rcu_seq_snap(&rsp->barrier_sequence);
- _rcu_barrier_trace(rsp, "Begin", -1, s);
+ _rcu_barrier_trace(rsp, TPS("Begin"), -1, s);
/* Take mutex to serialize concurrent rcu_barrier() requests. */
mutex_lock(&rsp->barrier_mutex);
/* Did someone else do our work for us? */
if (rcu_seq_done(&rsp->barrier_sequence, s)) {
- _rcu_barrier_trace(rsp, "EarlyExit", -1, rsp->barrier_sequence);
+ _rcu_barrier_trace(rsp, TPS("EarlyExit"), -1,
+ rsp->barrier_sequence);
smp_mb(); /* caller's subsequent code after above check. */
mutex_unlock(&rsp->barrier_mutex);
return;
@@ -3620,7 +3527,7 @@ static void _rcu_barrier(struct rcu_state *rsp)
/* Mark the start of the barrier operation. */
rcu_seq_start(&rsp->barrier_sequence);
- _rcu_barrier_trace(rsp, "Inc1", -1, rsp->barrier_sequence);
+ _rcu_barrier_trace(rsp, TPS("Inc1"), -1, rsp->barrier_sequence);
/*
* Initialize the count to one rather than to zero in order to
@@ -3643,10 +3550,10 @@ static void _rcu_barrier(struct rcu_state *rsp)
rdp = per_cpu_ptr(rsp->rda, cpu);
if (rcu_is_nocb_cpu(cpu)) {
if (!rcu_nocb_cpu_needs_barrier(rsp, cpu)) {
- _rcu_barrier_trace(rsp, "OfflineNoCB", cpu,
+ _rcu_barrier_trace(rsp, TPS("OfflineNoCB"), cpu,
rsp->barrier_sequence);
} else {
- _rcu_barrier_trace(rsp, "OnlineNoCB", cpu,
+ _rcu_barrier_trace(rsp, TPS("OnlineNoCB"), cpu,
rsp->barrier_sequence);
smp_mb__before_atomic();
atomic_inc(&rsp->barrier_cpu_count);
@@ -3654,11 +3561,11 @@ static void _rcu_barrier(struct rcu_state *rsp)
rcu_barrier_callback, rsp, cpu, 0);
}
} else if (rcu_segcblist_n_cbs(&rdp->cblist)) {
- _rcu_barrier_trace(rsp, "OnlineQ", cpu,
+ _rcu_barrier_trace(rsp, TPS("OnlineQ"), cpu,
rsp->barrier_sequence);
smp_call_function_single(cpu, rcu_barrier_func, rsp, 1);
} else {
- _rcu_barrier_trace(rsp, "OnlineNQ", cpu,
+ _rcu_barrier_trace(rsp, TPS("OnlineNQ"), cpu,
rsp->barrier_sequence);
}
}
@@ -3675,7 +3582,7 @@ static void _rcu_barrier(struct rcu_state *rsp)
wait_for_completion(&rsp->barrier_completion);
/* Mark the end of the barrier operation. */
- _rcu_barrier_trace(rsp, "Inc2", -1, rsp->barrier_sequence);
+ _rcu_barrier_trace(rsp, TPS("Inc2"), -1, rsp->barrier_sequence);
rcu_seq_end(&rsp->barrier_sequence);
/* Other rcu_barrier() invocations can now safely proceed. */
@@ -3777,8 +3684,6 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp)
*/
rnp = rdp->mynode;
raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */
- if (!rdp->beenonline)
- WRITE_ONCE(rsp->ncpus, READ_ONCE(rsp->ncpus) + 1);
rdp->beenonline = true; /* We have now been online. */
rdp->gpnum = rnp->completed; /* Make CPU later note any new GP. */
rdp->completed = rnp->completed;
@@ -3882,6 +3787,8 @@ void rcu_cpu_starting(unsigned int cpu)
{
unsigned long flags;
unsigned long mask;
+ int nbits;
+ unsigned long oldmask;
struct rcu_data *rdp;
struct rcu_node *rnp;
struct rcu_state *rsp;
@@ -3892,9 +3799,15 @@ void rcu_cpu_starting(unsigned int cpu)
mask = rdp->grpmask;
raw_spin_lock_irqsave_rcu_node(rnp, flags);
rnp->qsmaskinitnext |= mask;
+ oldmask = rnp->expmaskinitnext;
rnp->expmaskinitnext |= mask;
+ oldmask ^= rnp->expmaskinitnext;
+ nbits = bitmap_weight(&oldmask, BITS_PER_LONG);
+ /* Allow lockless access for expedited grace periods. */
+ smp_store_release(&rsp->ncpus, rsp->ncpus + nbits); /* ^^^ */
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
}
+ smp_mb(); /* Ensure RCU read-side usage follows above initialization. */
}
#ifdef CONFIG_HOTPLUG_CPU
@@ -3937,6 +3850,50 @@ void rcu_report_dead(unsigned int cpu)
for_each_rcu_flavor(rsp)
rcu_cleanup_dying_idle_cpu(cpu, rsp);
}
+
+/* Migrate the dead CPU's callbacks to the current CPU. */
+static void rcu_migrate_callbacks(int cpu, struct rcu_state *rsp)
+{
+ unsigned long flags;
+ struct rcu_data *my_rdp;
+ struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
+ struct rcu_node *rnp_root = rcu_get_root(rdp->rsp);
+
+ if (rcu_is_nocb_cpu(cpu) || rcu_segcblist_empty(&rdp->cblist))
+ return; /* No callbacks to migrate. */
+
+ local_irq_save(flags);
+ my_rdp = this_cpu_ptr(rsp->rda);
+ if (rcu_nocb_adopt_orphan_cbs(my_rdp, rdp, flags)) {
+ local_irq_restore(flags);
+ return;
+ }
+ raw_spin_lock_rcu_node(rnp_root); /* irqs already disabled. */
+ rcu_advance_cbs(rsp, rnp_root, rdp); /* Leverage recent GPs. */
+ rcu_advance_cbs(rsp, rnp_root, my_rdp); /* Assign GP to pending CBs. */
+ rcu_segcblist_merge(&my_rdp->cblist, &rdp->cblist);
+ WARN_ON_ONCE(rcu_segcblist_empty(&my_rdp->cblist) !=
+ !rcu_segcblist_n_cbs(&my_rdp->cblist));
+ raw_spin_unlock_irqrestore_rcu_node(rnp_root, flags);
+ WARN_ONCE(rcu_segcblist_n_cbs(&rdp->cblist) != 0 ||
+ !rcu_segcblist_empty(&rdp->cblist),
+ "rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, 1stCB=%p\n",
+ cpu, rcu_segcblist_n_cbs(&rdp->cblist),
+ rcu_segcblist_first_cb(&rdp->cblist));
+}
+
+/*
+ * The outgoing CPU has just passed through the dying-idle state,
+ * and we are being invoked from the CPU that was IPIed to continue the
+ * offline operation. We need to migrate the outgoing CPU's callbacks.
+ */
+void rcutree_migrate_callbacks(int cpu)
+{
+ struct rcu_state *rsp;
+
+ for_each_rcu_flavor(rsp)
+ rcu_migrate_callbacks(cpu, rsp);
+}
#endif
/*
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index 9af0f31d6847..8e1f285f0a70 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -219,8 +219,6 @@ struct rcu_data {
/* qlen at last check for QS forcing */
unsigned long n_cbs_invoked; /* count of RCU cbs invoked. */
unsigned long n_nocbs_invoked; /* count of no-CBs RCU cbs invoked. */
- unsigned long n_cbs_orphaned; /* RCU cbs orphaned by dying CPU */
- unsigned long n_cbs_adopted; /* RCU cbs adopted from dying CPU */
unsigned long n_force_qs_snap;
/* did other CPU force QS recently? */
long blimit; /* Upper limit on a processed batch */
@@ -268,7 +266,9 @@ struct rcu_data {
struct rcu_head **nocb_follower_tail;
struct swait_queue_head nocb_wq; /* For nocb kthreads to sleep on. */
struct task_struct *nocb_kthread;
+ raw_spinlock_t nocb_lock; /* Guard following pair of fields. */
int nocb_defer_wakeup; /* Defer wakeup of nocb_kthread. */
+ struct timer_list nocb_timer; /* Enforce finite deferral. */
/* The following fields are used by the leader, hence own cacheline. */
struct rcu_head *nocb_gp_head ____cacheline_internodealigned_in_smp;
@@ -350,15 +350,6 @@ struct rcu_state {
/* End of fields guarded by root rcu_node's lock. */
- raw_spinlock_t orphan_lock ____cacheline_internodealigned_in_smp;
- /* Protect following fields. */
- struct rcu_cblist orphan_pend; /* Orphaned callbacks that */
- /* need a grace period. */
- struct rcu_cblist orphan_done; /* Orphaned callbacks that */
- /* are ready to invoke. */
- /* (Contains counts.) */
- /* End of fields guarded by orphan_lock. */
-
struct mutex barrier_mutex; /* Guards barrier fields. */
atomic_t barrier_cpu_count; /* # CPUs waiting on. */
struct completion barrier_completion; /* Wake at barrier end. */
@@ -495,7 +486,7 @@ static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq);
static void rcu_init_one_nocb(struct rcu_node *rnp);
static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp,
bool lazy, unsigned long flags);
-static bool rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp,
+static bool rcu_nocb_adopt_orphan_cbs(struct rcu_data *my_rdp,
struct rcu_data *rdp,
unsigned long flags);
static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp);
diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
index dd21ca47e4b4..46d61b597731 100644
--- a/kernel/rcu/tree_exp.h
+++ b/kernel/rcu/tree_exp.h
@@ -73,7 +73,7 @@ static void sync_exp_reset_tree_hotplug(struct rcu_state *rsp)
unsigned long flags;
unsigned long mask;
unsigned long oldmask;
- int ncpus = READ_ONCE(rsp->ncpus);
+ int ncpus = smp_load_acquire(&rsp->ncpus); /* Order against locking. */
struct rcu_node *rnp;
struct rcu_node *rnp_up;
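
The smp_store_release() on ->ncpus in rcu_cpu_starting() pairs with the smp_load_acquire() above, so a reader that observes the incremented CPU count also observes the initialization done before the count was published. A rough userspace analogue of that publish/consume pairing, using C11 atomics and hypothetical names (cpu_data, ncpus), is:

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static int cpu_data[4];		/* initialized before being published */
static atomic_int ncpus;	/* published count, release/acquire ordered */

static void *writer(void *arg)
{
	(void)arg;
	for (int i = 0; i < 4; i++) {
		cpu_data[i] = 100 + i;			/* initialize first... */
		atomic_store_explicit(&ncpus, i + 1,
				      memory_order_release); /* ...then publish */
	}
	return NULL;
}

static void *reader(void *arg)
{
	(void)arg;
	int n = atomic_load_explicit(&ncpus, memory_order_acquire);

	/*
	 * The acquire load orders the reads below after the writer's stores:
	 * every entry below the published count is guaranteed initialized.
	 */
	for (int i = 0; i < n; i++)
		printf("cpu %d -> %d\n", i, cpu_data[i]);
	return NULL;
}

int main(void)
{
	pthread_t w, r;

	pthread_create(&w, NULL, writer, NULL);
	pthread_create(&r, NULL, reader, NULL);
	pthread_join(w, NULL);
	pthread_join(r, NULL);
	return 0;	/* build with -pthread */
}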
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 908b309d60d7..55bde94b9572 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -180,6 +180,8 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp)
struct task_struct *t = current;
lockdep_assert_held(&rnp->lock);
+ WARN_ON_ONCE(rdp->mynode != rnp);
+ WARN_ON_ONCE(rnp->level != rcu_num_lvls - 1);
/*
* Decide where to queue the newly blocked task. In theory,
@@ -261,6 +263,10 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp)
rnp->gp_tasks = &t->rcu_node_entry;
if (!rnp->exp_tasks && (blkd_state & RCU_EXP_BLKD))
rnp->exp_tasks = &t->rcu_node_entry;
+ WARN_ON_ONCE(!(blkd_state & RCU_GP_BLKD) !=
+ !(rnp->qsmask & rdp->grpmask));
+ WARN_ON_ONCE(!(blkd_state & RCU_EXP_BLKD) !=
+ !(rnp->expmask & rdp->grpmask));
raw_spin_unlock_rcu_node(rnp); /* interrupts remain disabled. */
/*
@@ -482,6 +488,7 @@ void rcu_read_unlock_special(struct task_struct *t)
rnp = t->rcu_blocked_node;
raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */
WARN_ON_ONCE(rnp != t->rcu_blocked_node);
+ WARN_ON_ONCE(rnp->level != rcu_num_lvls - 1);
empty_norm = !rcu_preempt_blocked_readers_cgp(rnp);
empty_exp = sync_rcu_preempt_exp_done(rnp);
smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */
@@ -495,10 +502,10 @@ void rcu_read_unlock_special(struct task_struct *t)
if (&t->rcu_node_entry == rnp->exp_tasks)
rnp->exp_tasks = np;
if (IS_ENABLED(CONFIG_RCU_BOOST)) {
- if (&t->rcu_node_entry == rnp->boost_tasks)
- rnp->boost_tasks = np;
/* Snapshot ->boost_mtx ownership w/rnp->lock held. */
drop_boost_mutex = rt_mutex_owner(&rnp->boost_mtx) == t;
+ if (&t->rcu_node_entry == rnp->boost_tasks)
+ rnp->boost_tasks = np;
}
/*
@@ -636,10 +643,17 @@ static int rcu_print_task_exp_stall(struct rcu_node *rnp)
*/
static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp)
{
+ struct task_struct *t;
+
RCU_LOCKDEP_WARN(preemptible(), "rcu_preempt_check_blocked_tasks() invoked with preemption enabled!!!\n");
WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp));
- if (rcu_preempt_has_tasks(rnp))
+ if (rcu_preempt_has_tasks(rnp)) {
rnp->gp_tasks = rnp->blkd_tasks.next;
+ t = container_of(rnp->gp_tasks, struct task_struct,
+ rcu_node_entry);
+ trace_rcu_unlock_preempted_task(TPS("rcu_preempt-GPS"),
+ rnp->gpnum, t->pid);
+ }
WARN_ON_ONCE(rnp->qsmask);
}
@@ -1788,23 +1802,62 @@ bool rcu_is_nocb_cpu(int cpu)
}
/*
- * Kick the leader kthread for this NOCB group.
+ * Kick the leader kthread for this NOCB group. Caller holds ->nocb_lock
+ * and this function releases it.
*/
-static void wake_nocb_leader(struct rcu_data *rdp, bool force)
+static void __wake_nocb_leader(struct rcu_data *rdp, bool force,
+ unsigned long flags)
+ __releases(rdp->nocb_lock)
{
struct rcu_data *rdp_leader = rdp->nocb_leader;
- if (!READ_ONCE(rdp_leader->nocb_kthread))
+ lockdep_assert_held(&rdp->nocb_lock);
+ if (!READ_ONCE(rdp_leader->nocb_kthread)) {
+ raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags);
return;
- if (READ_ONCE(rdp_leader->nocb_leader_sleep) || force) {
+ }
+ if (rdp_leader->nocb_leader_sleep || force) {
/* Prior smp_mb__after_atomic() orders against prior enqueue. */
WRITE_ONCE(rdp_leader->nocb_leader_sleep, false);
+ del_timer(&rdp->nocb_timer);
+ raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags);
smp_mb(); /* ->nocb_leader_sleep before swake_up(). */
swake_up(&rdp_leader->nocb_wq);
+ } else {
+ raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags);
}
}
/*
+ * Kick the leader kthread for this NOCB group, but caller has not
+ * acquired locks.
+ */
+static void wake_nocb_leader(struct rcu_data *rdp, bool force)
+{
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&rdp->nocb_lock, flags);
+ __wake_nocb_leader(rdp, force, flags);
+}
+
+/*
+ * Arrange to wake the leader kthread for this NOCB group at some
+ * future time when it is safe to do so.
+ */
+static void wake_nocb_leader_defer(struct rcu_data *rdp, int waketype,
+ const char *reason)
+{
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&rdp->nocb_lock, flags);
+ if (rdp->nocb_defer_wakeup == RCU_NOCB_WAKE_NOT)
+ mod_timer(&rdp->nocb_timer, jiffies + 1);
+ WRITE_ONCE(rdp->nocb_defer_wakeup, waketype);
+ trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, reason);
+ raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags);
+}
+
+/*
* Does the specified CPU need an RCU callback for the specified flavor
* of rcu_barrier()?
*/
@@ -1891,11 +1944,8 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp,
trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
TPS("WakeEmpty"));
} else {
- WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOCB_WAKE);
- /* Store ->nocb_defer_wakeup before ->rcu_urgent_qs. */
- smp_store_release(this_cpu_ptr(&rcu_dynticks.rcu_urgent_qs), true);
- trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
- TPS("WakeEmptyIsDeferred"));
+ wake_nocb_leader_defer(rdp, RCU_NOCB_WAKE,
+ TPS("WakeEmptyIsDeferred"));
}
rdp->qlen_last_fqs_check = 0;
} else if (len > rdp->qlen_last_fqs_check + qhimark) {
@@ -1905,11 +1955,8 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp,
trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
TPS("WakeOvf"));
} else {
- WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_FORCE);
- /* Store ->nocb_defer_wakeup before ->rcu_urgent_qs. */
- smp_store_release(this_cpu_ptr(&rcu_dynticks.rcu_urgent_qs), true);
- trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
- TPS("WakeOvfIsDeferred"));
+ wake_nocb_leader_defer(rdp, RCU_NOCB_WAKE,
+ TPS("WakeOvfIsDeferred"));
}
rdp->qlen_last_fqs_check = LONG_MAX / 2;
} else {
@@ -1961,30 +2008,19 @@ static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp,
* Adopt orphaned callbacks on a no-CBs CPU, or return 0 if this is
* not a no-CBs CPU.
*/
-static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp,
+static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_data *my_rdp,
struct rcu_data *rdp,
unsigned long flags)
{
- long ql = rsp->orphan_done.len;
- long qll = rsp->orphan_done.len_lazy;
-
- /* If this is not a no-CBs CPU, tell the caller to do it the old way. */
+ RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_nocb_adopt_orphan_cbs() invoked with irqs enabled!!!");
if (!rcu_is_nocb_cpu(smp_processor_id()))
- return false;
-
- /* First, enqueue the donelist, if any. This preserves CB ordering. */
- if (rsp->orphan_done.head) {
- __call_rcu_nocb_enqueue(rdp, rcu_cblist_head(&rsp->orphan_done),
- rcu_cblist_tail(&rsp->orphan_done),
- ql, qll, flags);
- }
- if (rsp->orphan_pend.head) {
- __call_rcu_nocb_enqueue(rdp, rcu_cblist_head(&rsp->orphan_pend),
- rcu_cblist_tail(&rsp->orphan_pend),
- ql, qll, flags);
- }
- rcu_cblist_init(&rsp->orphan_done);
- rcu_cblist_init(&rsp->orphan_pend);
+ return false; /* Not NOCBs CPU, caller must migrate CBs. */
+ __call_rcu_nocb_enqueue(my_rdp, rcu_segcblist_head(&rdp->cblist),
+ rcu_segcblist_tail(&rdp->cblist),
+ rcu_segcblist_n_cbs(&rdp->cblist),
+ rcu_segcblist_n_lazy_cbs(&rdp->cblist), flags);
+ rcu_segcblist_init(&rdp->cblist);
+ rcu_segcblist_disable(&rdp->cblist);
return true;
}
@@ -2031,6 +2067,7 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp)
static void nocb_leader_wait(struct rcu_data *my_rdp)
{
bool firsttime = true;
+ unsigned long flags;
bool gotcbs;
struct rcu_data *rdp;
struct rcu_head **tail;
@@ -2039,13 +2076,17 @@ wait_again:
/* Wait for callbacks to appear. */
if (!rcu_nocb_poll) {
- trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, "Sleep");
+ trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, TPS("Sleep"));
swait_event_interruptible(my_rdp->nocb_wq,
!READ_ONCE(my_rdp->nocb_leader_sleep));
- /* Memory barrier handled by smp_mb() calls below and repoll. */
+ raw_spin_lock_irqsave(&my_rdp->nocb_lock, flags);
+ my_rdp->nocb_leader_sleep = true;
+ WRITE_ONCE(my_rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT);
+ del_timer(&my_rdp->nocb_timer);
+ raw_spin_unlock_irqrestore(&my_rdp->nocb_lock, flags);
} else if (firsttime) {
firsttime = false; /* Don't drown trace log with "Poll"! */
- trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, "Poll");
+ trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, TPS("Poll"));
}
/*
@@ -2054,7 +2095,7 @@ wait_again:
* nocb_gp_head, where they await a grace period.
*/
gotcbs = false;
- smp_mb(); /* wakeup before ->nocb_head reads. */
+ smp_mb(); /* wakeup and _sleep before ->nocb_head reads. */
for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower) {
rdp->nocb_gp_head = READ_ONCE(rdp->nocb_head);
if (!rdp->nocb_gp_head)
@@ -2066,56 +2107,41 @@ wait_again:
gotcbs = true;
}
- /*
- * If there were no callbacks, sleep a bit, rescan after a
- * memory barrier, and go retry.
- */
+ /* No callbacks? Sleep a bit if polling, and go retry. */
if (unlikely(!gotcbs)) {
- if (!rcu_nocb_poll)
- trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu,
- "WokeEmpty");
WARN_ON(signal_pending(current));
- schedule_timeout_interruptible(1);
-
- /* Rescan in case we were a victim of memory ordering. */
- my_rdp->nocb_leader_sleep = true;
- smp_mb(); /* Ensure _sleep true before scan. */
- for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower)
- if (READ_ONCE(rdp->nocb_head)) {
- /* Found CB, so short-circuit next wait. */
- my_rdp->nocb_leader_sleep = false;
- break;
- }
+ if (rcu_nocb_poll) {
+ schedule_timeout_interruptible(1);
+ } else {
+ trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu,
+ TPS("WokeEmpty"));
+ }
goto wait_again;
}
/* Wait for one grace period. */
rcu_nocb_wait_gp(my_rdp);
- /*
- * We left ->nocb_leader_sleep unset to reduce cache thrashing.
- * We set it now, but recheck for new callbacks while
- * traversing our follower list.
- */
- my_rdp->nocb_leader_sleep = true;
- smp_mb(); /* Ensure _sleep true before scan of ->nocb_head. */
-
/* Each pass through the following loop wakes a follower, if needed. */
for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower) {
- if (READ_ONCE(rdp->nocb_head))
+ if (!rcu_nocb_poll &&
+ READ_ONCE(rdp->nocb_head) &&
+ READ_ONCE(my_rdp->nocb_leader_sleep)) {
+ raw_spin_lock_irqsave(&my_rdp->nocb_lock, flags);
my_rdp->nocb_leader_sleep = false;/* No need to sleep.*/
+ raw_spin_unlock_irqrestore(&my_rdp->nocb_lock, flags);
+ }
if (!rdp->nocb_gp_head)
continue; /* No CBs, so no need to wake follower. */
/* Append callbacks to follower's "done" list. */
- tail = xchg(&rdp->nocb_follower_tail, rdp->nocb_gp_tail);
+ raw_spin_lock_irqsave(&rdp->nocb_lock, flags);
+ tail = rdp->nocb_follower_tail;
+ rdp->nocb_follower_tail = rdp->nocb_gp_tail;
*tail = rdp->nocb_gp_head;
- smp_mb__after_atomic(); /* Store *tail before wakeup. */
+ raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags);
if (rdp != my_rdp && tail == &rdp->nocb_follower_head) {
- /*
- * List was empty, wake up the follower.
- * Memory barriers supplied by atomic_long_add().
- */
+ /* List was empty, so wake up the follower. */
swake_up(&rdp->nocb_wq);
}
}
@@ -2131,28 +2157,16 @@ wait_again:
*/
static void nocb_follower_wait(struct rcu_data *rdp)
{
- bool firsttime = true;
-
for (;;) {
- if (!rcu_nocb_poll) {
- trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
- "FollowerSleep");
- swait_event_interruptible(rdp->nocb_wq,
- READ_ONCE(rdp->nocb_follower_head));
- } else if (firsttime) {
- /* Don't drown trace log with "Poll"! */
- firsttime = false;
- trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, "Poll");
- }
+ trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("FollowerSleep"));
+ swait_event_interruptible(rdp->nocb_wq,
+ READ_ONCE(rdp->nocb_follower_head));
if (smp_load_acquire(&rdp->nocb_follower_head)) {
/* ^^^ Ensure CB invocation follows _head test. */
return;
}
- if (!rcu_nocb_poll)
- trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
- "WokeEmpty");
WARN_ON(signal_pending(current));
- schedule_timeout_interruptible(1);
+ trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WokeEmpty"));
}
}
@@ -2165,6 +2179,7 @@ static void nocb_follower_wait(struct rcu_data *rdp)
static int rcu_nocb_kthread(void *arg)
{
int c, cl;
+ unsigned long flags;
struct rcu_head *list;
struct rcu_head *next;
struct rcu_head **tail;
@@ -2179,11 +2194,14 @@ static int rcu_nocb_kthread(void *arg)
nocb_follower_wait(rdp);
/* Pull the ready-to-invoke callbacks onto local list. */
- list = READ_ONCE(rdp->nocb_follower_head);
+ raw_spin_lock_irqsave(&rdp->nocb_lock, flags);
+ list = rdp->nocb_follower_head;
+ rdp->nocb_follower_head = NULL;
+ tail = rdp->nocb_follower_tail;
+ rdp->nocb_follower_tail = &rdp->nocb_follower_head;
+ raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags);
BUG_ON(!list);
- trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, "WokeNonEmpty");
- WRITE_ONCE(rdp->nocb_follower_head, NULL);
- tail = xchg(&rdp->nocb_follower_tail, &rdp->nocb_follower_head);
+ trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WokeNonEmpty"));
/* Each pass through the following loop invokes a callback. */
trace_rcu_batch_start(rdp->rsp->name,
@@ -2226,18 +2244,39 @@ static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp)
}
/* Do a deferred wakeup of rcu_nocb_kthread(). */
-static void do_nocb_deferred_wakeup(struct rcu_data *rdp)
+static void do_nocb_deferred_wakeup_common(struct rcu_data *rdp)
{
+ unsigned long flags;
int ndw;
- if (!rcu_nocb_need_deferred_wakeup(rdp))
+ raw_spin_lock_irqsave(&rdp->nocb_lock, flags);
+ if (!rcu_nocb_need_deferred_wakeup(rdp)) {
+ raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags);
return;
+ }
ndw = READ_ONCE(rdp->nocb_defer_wakeup);
WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT);
- wake_nocb_leader(rdp, ndw == RCU_NOCB_WAKE_FORCE);
+ __wake_nocb_leader(rdp, ndw == RCU_NOCB_WAKE_FORCE, flags);
trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("DeferredWake"));
}
+/* Do a deferred wakeup of rcu_nocb_kthread() from a timer handler. */
+static void do_nocb_deferred_wakeup_timer(unsigned long x)
+{
+ do_nocb_deferred_wakeup_common((struct rcu_data *)x);
+}
+
+/*
+ * Do a deferred wakeup of rcu_nocb_kthread() from fastpath.
+ * This means we do an inexact common-case check. Note that if
+ * we miss, ->nocb_timer will eventually clean things up.
+ */
+static void do_nocb_deferred_wakeup(struct rcu_data *rdp)
+{
+ if (rcu_nocb_need_deferred_wakeup(rdp))
+ do_nocb_deferred_wakeup_common(rdp);
+}
+
void __init rcu_init_nohz(void)
{
int cpu;
@@ -2287,6 +2326,9 @@ static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp)
rdp->nocb_tail = &rdp->nocb_head;
init_swait_queue_head(&rdp->nocb_wq);
rdp->nocb_follower_tail = &rdp->nocb_follower_head;
+ raw_spin_lock_init(&rdp->nocb_lock);
+ setup_timer(&rdp->nocb_timer, do_nocb_deferred_wakeup_timer,
+ (unsigned long)rdp);
}
/*
@@ -2459,7 +2501,7 @@ static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp,
return false;
}
-static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp,
+static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_data *my_rdp,
struct rcu_data *rdp,
unsigned long flags)
{
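
The tree_plugin.h changes above replace the bare ->nocb_defer_wakeup flag with a flag-plus-timer scheme: the fast path records a deferred wakeup and arms ->nocb_timer as a backstop, and whichever runs first (the common-case fastpath or the timer handler) performs the wakeup under ->nocb_lock. A much-simplified pthread sketch of that shape is shown below; all names are hypothetical, and a sleeping thread stands in for mod_timer()/the timer handler.

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>
#include <unistd.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
static bool wake_pending;	/* analogue of ->nocb_defer_wakeup */
static bool got_wakeup;

/* Worker: sleeps until someone performs the (possibly deferred) wakeup. */
static void *worker(void *arg)
{
	(void)arg;
	pthread_mutex_lock(&lock);
	while (!got_wakeup)
		pthread_cond_wait(&cond, &lock);
	pthread_mutex_unlock(&lock);
	printf("worker: woke up\n");
	return NULL;
}

/* Slow path: perform the wakeup if one is still pending, then clear it. */
static void do_deferred_wakeup(void)
{
	pthread_mutex_lock(&lock);
	if (wake_pending) {
		wake_pending = false;
		got_wakeup = true;
		pthread_cond_signal(&cond);
	}
	pthread_mutex_unlock(&lock);
}

/* Backstop "timer": fires after a delay and runs the deferred wakeup. */
static void *backstop_timer(void *arg)
{
	(void)arg;
	usleep(1000);		/* stand-in for mod_timer(..., jiffies + 1) */
	do_deferred_wakeup();
	return NULL;
}

/* Fast path: cannot wake directly here, so just record the request. */
static void wake_worker_defer(void)
{
	pthread_mutex_lock(&lock);
	wake_pending = true;	/* the backstop cleans up if we never return */
	pthread_mutex_unlock(&lock);
}

int main(void)
{
	pthread_t w, t;

	pthread_create(&w, NULL, worker, NULL);
	wake_worker_defer();
	pthread_create(&t, NULL, backstop_timer, NULL);
	/* The common case would call do_deferred_wakeup() here; if it is
	 * missed, the backstop thread eventually does it instead. */
	pthread_join(w, NULL);
	pthread_join(t, NULL);
	return 0;		/* build with -pthread */
}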
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index 00e77c470017..5033b66d2753 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -568,7 +568,7 @@ static DECLARE_WAIT_QUEUE_HEAD(rcu_tasks_cbs_wq);
static DEFINE_RAW_SPINLOCK(rcu_tasks_cbs_lock);
/* Track exiting tasks in order to allow them to be waited for. */
-DEFINE_SRCU(tasks_rcu_exit_srcu);
+DEFINE_STATIC_SRCU(tasks_rcu_exit_srcu);
/* Control stall timeouts. Disable with <= 0, otherwise jiffies till stall. */
#define RCU_TASK_STALL_TIMEOUT (HZ * 60 * 10)
@@ -875,6 +875,22 @@ static void rcu_spawn_tasks_kthread(void)
mutex_unlock(&rcu_tasks_kthread_mutex);
}
+/* Do the srcu_read_lock() for the above synchronize_srcu(). */
+void exit_tasks_rcu_start(void)
+{
+ preempt_disable();
+ current->rcu_tasks_idx = __srcu_read_lock(&tasks_rcu_exit_srcu);
+ preempt_enable();
+}
+
+/* Do the srcu_read_unlock() for the above synchronize_srcu(). */
+void exit_tasks_rcu_finish(void)
+{
+ preempt_disable();
+ __srcu_read_unlock(&tasks_rcu_exit_srcu, current->rcu_tasks_idx);
+ preempt_enable();
+}
+
#endif /* #ifdef CONFIG_TASKS_RCU */
#ifndef CONFIG_TINY_RCU
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index 53f0164ed362..78f54932ea1d 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -25,3 +25,4 @@ obj-$(CONFIG_SCHED_DEBUG) += debug.o
obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o
obj-$(CONFIG_CPU_FREQ) += cpufreq.o
obj-$(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) += cpufreq_schedutil.o
+obj-$(CONFIG_MEMBARRIER) += membarrier.o
diff --git a/kernel/sched/autogroup.c b/kernel/sched/autogroup.c
index da39489d2d80..de6d7f4dfcb5 100644
--- a/kernel/sched/autogroup.c
+++ b/kernel/sched/autogroup.c
@@ -71,7 +71,6 @@ static inline struct autogroup *autogroup_create(void)
goto out_fail;
tg = sched_create_group(&root_task_group);
-
if (IS_ERR(tg))
goto out_free;
@@ -101,7 +100,7 @@ out_free:
out_fail:
if (printk_ratelimit()) {
printk(KERN_WARNING "autogroup_create: %s failure.\n",
- ag ? "sched_create_group()" : "kmalloc()");
+ ag ? "sched_create_group()" : "kzalloc()");
}
return autogroup_kref_get(&autogroup_default);
diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c
index 13fc5ae9bf2f..5d9131aa846f 100644
--- a/kernel/sched/completion.c
+++ b/kernel/sched/completion.c
@@ -47,6 +47,13 @@ EXPORT_SYMBOL(complete);
*
* It may be assumed that this function implies a write memory barrier before
* changing the task state if and only if any tasks are woken up.
+ *
+ * Since complete_all() sets the completion of @x permanently to done
+ * to allow multiple waiters to finish, a call to reinit_completion()
+ * must be used on @x if @x is to be used again. The code must make
+ * sure that all waiters have woken and finished before reinitializing
+ * @x. Also note that the function completion_done() can not be used
+ * to know if there are still waiters after complete_all() has been called.
*/
void complete_all(struct completion *x)
{
@@ -297,9 +304,12 @@ EXPORT_SYMBOL(try_wait_for_completion);
* Return: 0 if there are waiters (wait_for_completion() in progress)
* 1 if there are no waiters.
*
+ * Note, this will always return true if complete_all() was called on @X.
*/
bool completion_done(struct completion *x)
{
+ unsigned long flags;
+
if (!READ_ONCE(x->done))
return false;
@@ -307,14 +317,9 @@ bool completion_done(struct completion *x)
* If ->done, we need to wait for complete() to release ->wait.lock
* otherwise we can end up freeing the completion before complete()
* is done referencing it.
- *
- * The RMB pairs with complete()'s RELEASE of ->wait.lock and orders
- * the loads of ->done and ->wait.lock such that we cannot observe
- * the lock before complete() acquires it while observing the ->done
- * after it's acquired the lock.
*/
- smp_rmb();
- spin_unlock_wait(&x->wait.lock);
+ spin_lock_irqsave(&x->wait.lock, flags);
+ spin_unlock_irqrestore(&x->wait.lock, flags);
return true;
}
EXPORT_SYMBOL(completion_done);
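
The complete_all() documentation added above spells out the reuse rule: once complete_all() marks a completion permanently done, it may only be reused after reinit_completion(), and only once every waiter is known to have returned. A hedged kernel-context sketch of that pattern follows; it is illustrative only (not a standalone program), and setup_done plus the my_* helpers are hypothetical names.

#include <linux/completion.h>

static DECLARE_COMPLETION(setup_done);

static void my_setup_phase(void)
{
	/* ... perform the setup work ... */
	complete_all(&setup_done);	/* wake current and future waiters */
}

static void my_wait_for_setup(void)
{
	wait_for_completion(&setup_done); /* returns immediately once done */
}

static void my_restart_setup(void)
{
	/*
	 * Only once all waiters are known to have finished; note that
	 * completion_done() cannot be used to detect that after
	 * complete_all(), per the documentation above.
	 */
	reinit_completion(&setup_done);
}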
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 17c667b427b4..c1fcd96cf432 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -951,8 +951,13 @@ struct migration_arg {
static struct rq *__migrate_task(struct rq *rq, struct rq_flags *rf,
struct task_struct *p, int dest_cpu)
{
- if (unlikely(!cpu_active(dest_cpu)))
- return rq;
+ if (p->flags & PF_KTHREAD) {
+ if (unlikely(!cpu_online(dest_cpu)))
+ return rq;
+ } else {
+ if (unlikely(!cpu_active(dest_cpu)))
+ return rq;
+ }
/* Affinity changed (again). */
if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
@@ -2069,7 +2074,7 @@ out:
/**
* try_to_wake_up_local - try to wake up a local task with rq lock held
* @p: the thread to be awakened
- * @cookie: context's cookie for pinning
+ * @rf: request-queue flags for pinning
*
* Put @p on the run-queue if it's not already there. The caller must
* ensure that this_rq() is locked, @p is bound to this_rq() and not
@@ -2635,6 +2640,16 @@ static struct rq *finish_task_switch(struct task_struct *prev)
prev_state = prev->state;
vtime_task_switch(prev);
perf_event_task_sched_in(prev, current);
+ /*
+ * The membarrier system call requires a full memory barrier
+ * after storing to rq->curr, before going back to user-space.
+ *
+ * TODO: This smp_mb__after_unlock_lock can go away if PPC ends
+ * up adding a full barrier to switch_mm(), or we should figure
+ * out if a smp_mb__after_unlock_lock is really the proper API
+ * to use.
+ */
+ smp_mb__after_unlock_lock();
finish_lock_switch(rq, prev);
finish_arch_post_lock_switch();
@@ -3324,6 +3339,21 @@ static void __sched notrace __schedule(bool preempt)
if (likely(prev != next)) {
rq->nr_switches++;
rq->curr = next;
+ /*
+ * The membarrier system call requires each architecture
+ * to have a full memory barrier after updating
+ * rq->curr, before returning to user-space. For TSO
+ * (e.g. x86), the architecture must provide its own
+ * barrier in switch_mm(). For weakly ordered machines
+ * for which spin_unlock() acts as a full memory
+ * barrier, finish_lock_switch() in common code takes
+ * care of this barrier. For weakly ordered machines for
+ * which spin_unlock() acts as a RELEASE barrier (only
+ * arm64 and PowerPC), arm64 has a full barrier in
+ * switch_to(), and PowerPC has
+ * smp_mb__after_unlock_lock() before
+ * finish_lock_switch().
+ */
++*switch_count;
trace_sched_switch(preempt, prev, next);
@@ -3352,8 +3382,8 @@ void __noreturn do_task_dead(void)
* To avoid it, we have to wait for releasing tsk->pi_lock which
* is held by try_to_wake_up()
*/
- smp_mb();
- raw_spin_unlock_wait(&current->pi_lock);
+ raw_spin_lock_irq(&current->pi_lock);
+ raw_spin_unlock_irq(&current->pi_lock);
/* Causes final put_task_struct in finish_task_switch(): */
__set_current_state(TASK_DEAD);
@@ -5103,24 +5133,17 @@ out_unlock:
return retval;
}
-static const char stat_nam[] = TASK_STATE_TO_CHAR_STR;
-
void sched_show_task(struct task_struct *p)
{
unsigned long free = 0;
int ppid;
- unsigned long state = p->state;
-
- /* Make sure the string lines up properly with the number of task states: */
- BUILD_BUG_ON(sizeof(TASK_STATE_TO_CHAR_STR)-1 != ilog2(TASK_STATE_MAX)+1);
if (!try_get_task_stack(p))
return;
- if (state)
- state = __ffs(state) + 1;
- printk(KERN_INFO "%-15.15s %c", p->comm,
- state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
- if (state == TASK_RUNNING)
+
+ printk(KERN_INFO "%-15.15s %c", p->comm, task_state_to_char(p));
+
+ if (p->state == TASK_RUNNING)
printk(KERN_CONT " running task ");
#ifdef CONFIG_DEBUG_STACK_USAGE
free = stack_not_used(p);
@@ -5177,11 +5200,6 @@ void show_state_filter(unsigned long state_filter)
debug_show_all_locks();
}
-void init_idle_bootup_task(struct task_struct *idle)
-{
- idle->sched_class = &idle_sched_class;
-}
-
/**
* init_idle - set up an idle thread for a given CPU
* @idle: task in question
@@ -5438,7 +5456,7 @@ static void migrate_tasks(struct rq *dead_rq, struct rq_flags *rf)
*/
next = pick_next_task(rq, &fake_task, rf);
BUG_ON(!next);
- next->sched_class->put_prev_task(rq, next);
+ put_prev_task(rq, next);
/*
* Rules for changing task_struct::cpus_allowed are holding
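
The barrier comments added to finish_task_switch() and __schedule() above back the membarrier() system call's guarantee that a barrier is observed by all currently running threads. For reference, a minimal userspace caller might look like the sketch below; it assumes a kernel and headers that expose MEMBARRIER_CMD_SHARED, and error handling is abbreviated.

#include <linux/membarrier.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

static int sys_membarrier(int cmd, int flags)
{
	return syscall(__NR_membarrier, cmd, flags);
}

int main(void)
{
	int supported = sys_membarrier(MEMBARRIER_CMD_QUERY, 0);

	if (supported < 0) {
		perror("membarrier");
		return 1;
	}
	if (supported & MEMBARRIER_CMD_SHARED) {
		sys_membarrier(MEMBARRIER_CMD_SHARED, 0); /* global barrier */
		printf("issued a global memory barrier\n");
	}
	return 0;
}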
diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c
index fba235c7d026..8d9562d890d3 100644
--- a/kernel/sched/cpudeadline.c
+++ b/kernel/sched/cpudeadline.c
@@ -119,29 +119,29 @@ static inline int cpudl_maximum(struct cpudl *cp)
* @p: the task
* @later_mask: a mask to fill in with the selected CPUs (or NULL)
*
- * Returns: int - best CPU (heap maximum if suitable)
+ * Returns: int - 1 if suitable CPU(s) were found, 0 otherwise
*/
int cpudl_find(struct cpudl *cp, struct task_struct *p,
struct cpumask *later_mask)
{
- int best_cpu = -1;
const struct sched_dl_entity *dl_se = &p->dl;
if (later_mask &&
cpumask_and(later_mask, cp->free_cpus, &p->cpus_allowed)) {
- best_cpu = cpumask_any(later_mask);
- goto out;
- } else if (cpumask_test_cpu(cpudl_maximum(cp), &p->cpus_allowed) &&
- dl_time_before(dl_se->deadline, cp->elements[0].dl)) {
- best_cpu = cpudl_maximum(cp);
- if (later_mask)
- cpumask_set_cpu(best_cpu, later_mask);
- }
+ return 1;
+ } else {
+ int best_cpu = cpudl_maximum(cp);
+ WARN_ON(best_cpu != -1 && !cpu_present(best_cpu));
-out:
- WARN_ON(best_cpu != -1 && !cpu_present(best_cpu));
+ if (cpumask_test_cpu(best_cpu, &p->cpus_allowed) &&
+ dl_time_before(dl_se->deadline, cp->elements[0].dl)) {
+ if (later_mask)
+ cpumask_set_cpu(best_cpu, later_mask);
- return best_cpu;
+ return 1;
+ }
+ }
+ return 0;
}
/*
@@ -246,7 +246,6 @@ int cpudl_init(struct cpudl *cp)
{
int i;
- memset(cp, 0, sizeof(*cp));
raw_spin_lock_init(&cp->lock);
cp->size = 0;
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 076a2e31951c..29a397067ffa 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -610,6 +610,11 @@ static int sugov_start(struct cpufreq_policy *policy)
sg_cpu->sg_policy = sg_policy;
sg_cpu->flags = SCHED_CPUFREQ_RT;
sg_cpu->iowait_boost_max = policy->cpuinfo.max_freq;
+ }
+
+ for_each_cpu(cpu, policy->cpus) {
+ struct sugov_cpu *sg_cpu = &per_cpu(sugov_cpu, cpu);
+
cpufreq_add_update_util_hook(cpu, &sg_cpu->update_util,
policy_is_shared(policy) ?
sugov_update_shared :
diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c
index 981fcd7dc394..2511aba36b89 100644
--- a/kernel/sched/cpupri.c
+++ b/kernel/sched/cpupri.c
@@ -209,8 +209,6 @@ int cpupri_init(struct cpupri *cp)
{
int i;
- memset(cp, 0, sizeof(*cp));
-
for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) {
struct cpupri_vec *vec = &cp->pri_to_cpu[i];
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 67c70e287647..14d2dbf97c53 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -611,17 +611,23 @@ static void cputime_adjust(struct task_cputime *curr,
utime = curr->utime;
/*
- * If either stime or both stime and utime are 0, assume all runtime is
- * userspace. Once a task gets some ticks, the monotonicy code at
- * 'update' will ensure things converge to the observed ratio.
+ * If either stime or utime is 0, assume all runtime is userspace.
+ * Once a task gets some ticks, the monotonicity code at 'update:'
+ * will ensure things converge to the observed ratio.
*/
- if (stime != 0) {
- if (utime == 0)
- stime = rtime;
- else
- stime = scale_stime(stime, rtime, stime + utime);
+ if (stime == 0) {
+ utime = rtime;
+ goto update;
}
+ if (utime == 0) {
+ stime = rtime;
+ goto update;
+ }
+
+ stime = scale_stime(stime, rtime, stime + utime);
+
+update:
/*
* Make sure stime doesn't go backwards; this preserves monotonicity
* for utime because rtime is monotonic.
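
The restructured cputime_adjust() above splits the precise rtime between user and system time in proportion to the tick-based stime/utime samples, with the stime == 0 and utime == 0 cases short-circuited to the 'update:' label. A small standalone illustration of that arithmetic follows; scale_stime() here is a plain reimplementation of the proportional split and ignores the 64-bit overflow care the kernel helper takes.

#include <stdio.h>

/* stime * rtime / (stime + utime), as the kernel's scale_stime() computes. */
static unsigned long long scale_stime(unsigned long long stime,
				      unsigned long long rtime,
				      unsigned long long total)
{
	return stime * rtime / total;
}

int main(void)
{
	unsigned long long stime = 300, utime = 100;	/* tick-based samples */
	unsigned long long rtime = 1000;		/* precise total runtime */

	if (stime == 0) {
		printf("all %llu ns counted as user time\n", rtime);
	} else if (utime == 0) {
		printf("all %llu ns counted as system time\n", rtime);
	} else {
		unsigned long long s = scale_stime(stime, rtime, stime + utime);

		printf("stime=%llu utime=%llu\n", s, rtime - s);	/* 750/250 */
	}
	return 0;
}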
@@ -673,20 +679,21 @@ void thread_group_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st)
#endif /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
-static u64 vtime_delta(struct task_struct *tsk)
+static u64 vtime_delta(struct vtime *vtime)
{
- unsigned long now = READ_ONCE(jiffies);
+ unsigned long long clock;
- if (time_before(now, (unsigned long)tsk->vtime_snap))
+ clock = sched_clock();
+ if (clock < vtime->starttime)
return 0;
- return jiffies_to_nsecs(now - tsk->vtime_snap);
+ return clock - vtime->starttime;
}
-static u64 get_vtime_delta(struct task_struct *tsk)
+static u64 get_vtime_delta(struct vtime *vtime)
{
- unsigned long now = READ_ONCE(jiffies);
- u64 delta, other;
+ u64 delta = vtime_delta(vtime);
+ u64 other;
/*
* Unlike tick based timing, vtime based timing never has lost
@@ -695,104 +702,138 @@ static u64 get_vtime_delta(struct task_struct *tsk)
* elapsed time. Limit account_other_time to prevent rounding
* errors from causing elapsed vtime to go negative.
*/
- delta = jiffies_to_nsecs(now - tsk->vtime_snap);
other = account_other_time(delta);
- WARN_ON_ONCE(tsk->vtime_snap_whence == VTIME_INACTIVE);
- tsk->vtime_snap = now;
+ WARN_ON_ONCE(vtime->state == VTIME_INACTIVE);
+ vtime->starttime += delta;
return delta - other;
}
-static void __vtime_account_system(struct task_struct *tsk)
+static void __vtime_account_system(struct task_struct *tsk,
+ struct vtime *vtime)
+{
+ vtime->stime += get_vtime_delta(vtime);
+ if (vtime->stime >= TICK_NSEC) {
+ account_system_time(tsk, irq_count(), vtime->stime);
+ vtime->stime = 0;
+ }
+}
+
+static void vtime_account_guest(struct task_struct *tsk,
+ struct vtime *vtime)
{
- account_system_time(tsk, irq_count(), get_vtime_delta(tsk));
+ vtime->gtime += get_vtime_delta(vtime);
+ if (vtime->gtime >= TICK_NSEC) {
+ account_guest_time(tsk, vtime->gtime);
+ vtime->gtime = 0;
+ }
}
void vtime_account_system(struct task_struct *tsk)
{
- if (!vtime_delta(tsk))
+ struct vtime *vtime = &tsk->vtime;
+
+ if (!vtime_delta(vtime))
return;
- write_seqcount_begin(&tsk->vtime_seqcount);
- __vtime_account_system(tsk);
- write_seqcount_end(&tsk->vtime_seqcount);
+ write_seqcount_begin(&vtime->seqcount);
+ /* We might have scheduled out from guest path */
+ if (current->flags & PF_VCPU)
+ vtime_account_guest(tsk, vtime);
+ else
+ __vtime_account_system(tsk, vtime);
+ write_seqcount_end(&vtime->seqcount);
}
-void vtime_account_user(struct task_struct *tsk)
+void vtime_user_enter(struct task_struct *tsk)
{
- write_seqcount_begin(&tsk->vtime_seqcount);
- tsk->vtime_snap_whence = VTIME_SYS;
- if (vtime_delta(tsk))
- account_user_time(tsk, get_vtime_delta(tsk));
- write_seqcount_end(&tsk->vtime_seqcount);
+ struct vtime *vtime = &tsk->vtime;
+
+ write_seqcount_begin(&vtime->seqcount);
+ __vtime_account_system(tsk, vtime);
+ vtime->state = VTIME_USER;
+ write_seqcount_end(&vtime->seqcount);
}
-void vtime_user_enter(struct task_struct *tsk)
+void vtime_user_exit(struct task_struct *tsk)
{
- write_seqcount_begin(&tsk->vtime_seqcount);
- if (vtime_delta(tsk))
- __vtime_account_system(tsk);
- tsk->vtime_snap_whence = VTIME_USER;
- write_seqcount_end(&tsk->vtime_seqcount);
+ struct vtime *vtime = &tsk->vtime;
+
+ write_seqcount_begin(&vtime->seqcount);
+ vtime->utime += get_vtime_delta(vtime);
+ if (vtime->utime >= TICK_NSEC) {
+ account_user_time(tsk, vtime->utime);
+ vtime->utime = 0;
+ }
+ vtime->state = VTIME_SYS;
+ write_seqcount_end(&vtime->seqcount);
}
void vtime_guest_enter(struct task_struct *tsk)
{
+ struct vtime *vtime = &tsk->vtime;
/*
* The flags must be updated under the lock with
- * the vtime_snap flush and update.
+ * the vtime_starttime flush and update.
* That enforces a right ordering and update sequence
* synchronization against the reader (task_gtime())
* that can thus safely catch up with a tickless delta.
*/
- write_seqcount_begin(&tsk->vtime_seqcount);
- if (vtime_delta(tsk))
- __vtime_account_system(tsk);
+ write_seqcount_begin(&vtime->seqcount);
+ __vtime_account_system(tsk, vtime);
current->flags |= PF_VCPU;
- write_seqcount_end(&tsk->vtime_seqcount);
+ write_seqcount_end(&vtime->seqcount);
}
EXPORT_SYMBOL_GPL(vtime_guest_enter);
void vtime_guest_exit(struct task_struct *tsk)
{
- write_seqcount_begin(&tsk->vtime_seqcount);
- __vtime_account_system(tsk);
+ struct vtime *vtime = &tsk->vtime;
+
+ write_seqcount_begin(&vtime->seqcount);
+ vtime_account_guest(tsk, vtime);
current->flags &= ~PF_VCPU;
- write_seqcount_end(&tsk->vtime_seqcount);
+ write_seqcount_end(&vtime->seqcount);
}
EXPORT_SYMBOL_GPL(vtime_guest_exit);
void vtime_account_idle(struct task_struct *tsk)
{
- account_idle_time(get_vtime_delta(tsk));
+ account_idle_time(get_vtime_delta(&tsk->vtime));
}
void arch_vtime_task_switch(struct task_struct *prev)
{
- write_seqcount_begin(&prev->vtime_seqcount);
- prev->vtime_snap_whence = VTIME_INACTIVE;
- write_seqcount_end(&prev->vtime_seqcount);
+ struct vtime *vtime = &prev->vtime;
+
+ write_seqcount_begin(&vtime->seqcount);
+ vtime->state = VTIME_INACTIVE;
+ write_seqcount_end(&vtime->seqcount);
+
+ vtime = &current->vtime;
- write_seqcount_begin(&current->vtime_seqcount);
- current->vtime_snap_whence = VTIME_SYS;
- current->vtime_snap = jiffies;
- write_seqcount_end(&current->vtime_seqcount);
+ write_seqcount_begin(&vtime->seqcount);
+ vtime->state = VTIME_SYS;
+ vtime->starttime = sched_clock();
+ write_seqcount_end(&vtime->seqcount);
}
void vtime_init_idle(struct task_struct *t, int cpu)
{
+ struct vtime *vtime = &t->vtime;
unsigned long flags;
local_irq_save(flags);
- write_seqcount_begin(&t->vtime_seqcount);
- t->vtime_snap_whence = VTIME_SYS;
- t->vtime_snap = jiffies;
- write_seqcount_end(&t->vtime_seqcount);
+ write_seqcount_begin(&vtime->seqcount);
+ vtime->state = VTIME_SYS;
+ vtime->starttime = sched_clock();
+ write_seqcount_end(&vtime->seqcount);
local_irq_restore(flags);
}
u64 task_gtime(struct task_struct *t)
{
+ struct vtime *vtime = &t->vtime;
unsigned int seq;
u64 gtime;
@@ -800,13 +841,13 @@ u64 task_gtime(struct task_struct *t)
return t->gtime;
do {
- seq = read_seqcount_begin(&t->vtime_seqcount);
+ seq = read_seqcount_begin(&vtime->seqcount);
gtime = t->gtime;
- if (t->vtime_snap_whence == VTIME_SYS && t->flags & PF_VCPU)
- gtime += vtime_delta(t);
+ if (vtime->state == VTIME_SYS && t->flags & PF_VCPU)
+ gtime += vtime->gtime + vtime_delta(vtime);
- } while (read_seqcount_retry(&t->vtime_seqcount, seq));
+ } while (read_seqcount_retry(&vtime->seqcount, seq));
return gtime;
}
@@ -818,8 +859,9 @@ u64 task_gtime(struct task_struct *t)
*/
void task_cputime(struct task_struct *t, u64 *utime, u64 *stime)
{
- u64 delta;
+ struct vtime *vtime = &t->vtime;
unsigned int seq;
+ u64 delta;
if (!vtime_accounting_enabled()) {
*utime = t->utime;
@@ -828,25 +870,25 @@ void task_cputime(struct task_struct *t, u64 *utime, u64 *stime)
}
do {
- seq = read_seqcount_begin(&t->vtime_seqcount);
+ seq = read_seqcount_begin(&vtime->seqcount);
*utime = t->utime;
*stime = t->stime;
/* Task is sleeping, nothing to add */
- if (t->vtime_snap_whence == VTIME_INACTIVE || is_idle_task(t))
+ if (vtime->state == VTIME_INACTIVE || is_idle_task(t))
continue;
- delta = vtime_delta(t);
+ delta = vtime_delta(vtime);
/*
* Task runs either in user or kernel space, add pending nohz time to
* the right place.
*/
- if (t->vtime_snap_whence == VTIME_USER || t->flags & PF_VCPU)
- *utime += delta;
- else if (t->vtime_snap_whence == VTIME_SYS)
- *stime += delta;
- } while (read_seqcount_retry(&t->vtime_seqcount, seq));
+ if (vtime->state == VTIME_USER || t->flags & PF_VCPU)
+ *utime += vtime->utime + delta;
+ else if (vtime->state == VTIME_SYS)
+ *stime += vtime->stime + delta;
+ } while (read_seqcount_retry(&vtime->seqcount, seq));
}
#endif /* CONFIG_VIRT_CPU_ACCOUNTING_GEN */
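
The conversion above moves vtime from jiffies snapshots to sched_clock() nanoseconds, and the per-state utime/stime/gtime fields buffer sub-tick amounts that are only flushed to the regular accounting once at least TICK_NSEC has accumulated. A minimal userspace sketch of that accumulate-and-flush idea follows; the struct, the helper names and the 250 Hz tick length are illustrative, not the kernel's actual types.

/* Accumulate-until-a-full-tick sketch; vtime_sketch and TICK_NSEC here
 * are illustrative values, not taken from the patch. */
#include <stdio.h>

typedef unsigned long long u64;

#define TICK_NSEC 4000000ULL		/* 4 ms, i.e. a 250 Hz tick */

struct vtime_sketch {
	u64 starttime;			/* last clock snapshot, ns */
	u64 stime;			/* buffered, not yet flushed, ns */
};

static u64 flushed;			/* stands in for account_system_time() */

static void account_system(struct vtime_sketch *vt, u64 now)
{
	u64 delta = now > vt->starttime ? now - vt->starttime : 0;

	vt->starttime += delta;
	vt->stime += delta;
	/* flush the whole buffer once a tick's worth has built up */
	if (vt->stime >= TICK_NSEC) {
		flushed += vt->stime;
		vt->stime = 0;
	}
}

int main(void)
{
	struct vtime_sketch vt = { 0, 0 };

	account_system(&vt, 1000000);	/* 1 ms: stays buffered */
	account_system(&vt, 2500000);	/* 2.5 ms total: still buffered */
	account_system(&vt, 5000000);	/* crosses TICK_NSEC: flushed */
	printf("buffered=%llu flushed=%llu\n", vt.stime, flushed);
	return 0;
}
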
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index a84299f44b5d..d05bd9457a40 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -1392,17 +1392,19 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
struct sched_dl_entity *pi_se = &p->dl;
/*
- * Use the scheduling parameters of the top pi-waiter
- * task if we have one and its (absolute) deadline is
- * smaller than our one... OTW we keep our runtime and
- * deadline.
+ * Use the scheduling parameters of the top pi-waiter task if:
+ * - we have a top pi-waiter which is a SCHED_DEADLINE task AND
+ * - our dl_boosted is set (i.e. the pi-waiter's (absolute) deadline is
+ * smaller than our deadline OR we are a !SCHED_DEADLINE task getting
+ * boosted due to a SCHED_DEADLINE pi-waiter).
+ * Otherwise we keep our runtime and deadline.
*/
- if (pi_task && p->dl.dl_boosted && dl_prio(pi_task->normal_prio)) {
+ if (pi_task && dl_prio(pi_task->normal_prio) && p->dl.dl_boosted) {
pi_se = &pi_task->dl;
} else if (!dl_prio(p->normal_prio)) {
/*
* Special case in which we have a !SCHED_DEADLINE task
- * that is going to be deboosted, but exceedes its
+ * that is going to be deboosted, but exceeds its
* runtime while doing so. No point in replenishing
* it, as it's going to return back to its original
* scheduling class after this.
@@ -1592,7 +1594,7 @@ static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p)
* let's hope p can move out.
*/
if (rq->curr->nr_cpus_allowed == 1 ||
- cpudl_find(&rq->rd->cpudl, rq->curr, NULL) == -1)
+ !cpudl_find(&rq->rd->cpudl, rq->curr, NULL))
return;
/*
@@ -1600,7 +1602,7 @@ static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p)
* see if it is pushed or pulled somewhere else.
*/
if (p->nr_cpus_allowed != 1 &&
- cpudl_find(&rq->rd->cpudl, p, NULL) != -1)
+ cpudl_find(&rq->rd->cpudl, p, NULL))
return;
resched_curr(rq);
@@ -1653,7 +1655,7 @@ static struct sched_dl_entity *pick_next_dl_entity(struct rq *rq,
return rb_entry(left, struct sched_dl_entity, rb_node);
}
-struct task_struct *
+static struct task_struct *
pick_next_task_dl(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
{
struct sched_dl_entity *dl_se;
@@ -1796,7 +1798,7 @@ static int find_later_rq(struct task_struct *task)
struct sched_domain *sd;
struct cpumask *later_mask = this_cpu_cpumask_var_ptr(local_cpu_mask_dl);
int this_cpu = smp_processor_id();
- int best_cpu, cpu = task_cpu(task);
+ int cpu = task_cpu(task);
/* Make sure the mask is initialized first */
if (unlikely(!later_mask))
@@ -1809,17 +1811,14 @@ static int find_later_rq(struct task_struct *task)
* We have to consider system topology and task affinity
* first, then we can look for a suitable cpu.
*/
- best_cpu = cpudl_find(&task_rq(task)->rd->cpudl,
- task, later_mask);
- if (best_cpu == -1)
+ if (!cpudl_find(&task_rq(task)->rd->cpudl, task, later_mask))
return -1;
/*
- * If we are here, some target has been found,
- * the most suitable of which is cached in best_cpu.
- * This is, among the runqueues where the current tasks
- * have later deadlines than the task's one, the rq
- * with the latest possible one.
+	 * If we are here, some targets have been found, including
+	 * the most suitable one: among the runqueues whose current
+	 * tasks have later deadlines than this task's, the rq with
+	 * the latest possible deadline.
*
* Now we check how well this matches with task's
* affinity and system topology.
@@ -1839,6 +1838,7 @@ static int find_later_rq(struct task_struct *task)
rcu_read_lock();
for_each_domain(cpu, sd) {
if (sd->flags & SD_WAKE_AFFINE) {
+ int best_cpu;
/*
* If possible, preempting this_cpu is
@@ -1850,12 +1850,15 @@ static int find_later_rq(struct task_struct *task)
return this_cpu;
}
+ best_cpu = cpumask_first_and(later_mask,
+ sched_domain_span(sd));
/*
- * Last chance: if best_cpu is valid and is
- * in the mask, that becomes our choice.
+			 * Last chance: if a cpu that is in both later_mask
+			 * and the current sd span is found, it becomes our
+			 * choice. Of course, the latest possible cpu is
+			 * already under consideration through later_mask.
*/
- if (best_cpu < nr_cpu_ids &&
- cpumask_test_cpu(best_cpu, sched_domain_span(sd))) {
+ if (best_cpu < nr_cpu_ids) {
rcu_read_unlock();
return best_cpu;
}
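
With cpudl_find() now returning a boolean, callers that need an actual CPU pick one from the mask themselves, e.g. via cpumask_first_and() against the domain span as in the hunk above. A tiny userspace analogue of that caller pattern, with plain bitmasks standing in for struct cpumask and invented helper names:

#include <stdio.h>
#include <stdbool.h>

/* reports whether any CPU qualifies and fills a mask of candidates */
static bool find_later_cpus(unsigned long *mask)
{
	*mask = 0x16;			/* CPUs 1, 2 and 4 qualify */
	return *mask != 0;
}

static int first_cpu_in(unsigned long mask)
{
	for (int cpu = 0; cpu < 64; cpu++)
		if (mask & (1UL << cpu))
			return cpu;
	return -1;
}

int main(void)
{
	unsigned long later_mask;
	unsigned long domain_span = 0x1c;	/* CPUs 2-4 */

	if (!find_later_cpus(&later_mask))
		return 1;		/* nothing found, like the old -1 */

	/* analogue of cpumask_first_and(later_mask, sched_domain_span(sd)) */
	printf("best cpu in this domain: %d\n",
	       first_cpu_in(later_mask & domain_span));
	return 0;
}
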
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 4fa66de52bd6..4a23bbc3111b 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -327,38 +327,78 @@ static struct ctl_table *sd_alloc_ctl_cpu_table(int cpu)
return table;
}
+static cpumask_var_t sd_sysctl_cpus;
static struct ctl_table_header *sd_sysctl_header;
+
void register_sched_domain_sysctl(void)
{
- int i, cpu_num = num_possible_cpus();
- struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1);
+ static struct ctl_table *cpu_entries;
+ static struct ctl_table **cpu_idx;
char buf[32];
+ int i;
- WARN_ON(sd_ctl_dir[0].child);
- sd_ctl_dir[0].child = entry;
+ if (!cpu_entries) {
+ cpu_entries = sd_alloc_ctl_entry(num_possible_cpus() + 1);
+ if (!cpu_entries)
+ return;
- if (entry == NULL)
- return;
+ WARN_ON(sd_ctl_dir[0].child);
+ sd_ctl_dir[0].child = cpu_entries;
+ }
- for_each_possible_cpu(i) {
- snprintf(buf, 32, "cpu%d", i);
- entry->procname = kstrdup(buf, GFP_KERNEL);
- entry->mode = 0555;
- entry->child = sd_alloc_ctl_cpu_table(i);
- entry++;
+ if (!cpu_idx) {
+ struct ctl_table *e = cpu_entries;
+
+ cpu_idx = kcalloc(nr_cpu_ids, sizeof(struct ctl_table*), GFP_KERNEL);
+ if (!cpu_idx)
+ return;
+
+ /* deal with sparse possible map */
+ for_each_possible_cpu(i) {
+ cpu_idx[i] = e;
+ e++;
+ }
+ }
+
+ if (!cpumask_available(sd_sysctl_cpus)) {
+ if (!alloc_cpumask_var(&sd_sysctl_cpus, GFP_KERNEL))
+ return;
+
+		/* init to cpu_possible_mask so there are no holes in @cpu_entries */
+ cpumask_copy(sd_sysctl_cpus, cpu_possible_mask);
+ }
+
+ for_each_cpu(i, sd_sysctl_cpus) {
+ struct ctl_table *e = cpu_idx[i];
+
+ if (e->child)
+ sd_free_ctl_entry(&e->child);
+
+ if (!e->procname) {
+ snprintf(buf, 32, "cpu%d", i);
+ e->procname = kstrdup(buf, GFP_KERNEL);
+ }
+ e->mode = 0555;
+ e->child = sd_alloc_ctl_cpu_table(i);
+
+ __cpumask_clear_cpu(i, sd_sysctl_cpus);
}
WARN_ON(sd_sysctl_header);
sd_sysctl_header = register_sysctl_table(sd_ctl_root);
}
+void dirty_sched_domain_sysctl(int cpu)
+{
+ if (cpumask_available(sd_sysctl_cpus))
+ __cpumask_set_cpu(cpu, sd_sysctl_cpus);
+}
+
/* may be called multiple times per register */
void unregister_sched_domain_sysctl(void)
{
unregister_sysctl_table(sd_sysctl_header);
sd_sysctl_header = NULL;
- if (sd_ctl_dir[0].child)
- sd_free_ctl_entry(&sd_ctl_dir[0].child);
}
#endif /* CONFIG_SYSCTL */
#endif /* CONFIG_SMP */
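
register_sched_domain_sysctl() above now rebuilds only the per-CPU entries marked in sd_sysctl_cpus instead of freeing and recreating the whole tree on every domain rebuild. The pattern in isolation, as a small sketch with a plain bitmask and invented names:

#include <stdio.h>

#define NR_CPUS 8

/* start out all-dirty, mirroring the cpu_possible_mask copy above */
static unsigned int dirty_mask = (1u << NR_CPUS) - 1;

static void mark_dirty(int cpu)		/* ~ dirty_sched_domain_sysctl() */
{
	dirty_mask |= 1u << cpu;
}

static void rebuild_dirty(void)		/* ~ register_sched_domain_sysctl() */
{
	for (int cpu = 0; cpu < NR_CPUS; cpu++) {
		if (!(dirty_mask & (1u << cpu)))
			continue;
		printf("rebuilding sysctl entries for cpu%d\n", cpu);
		dirty_mask &= ~(1u << cpu);
	}
}

int main(void)
{
	rebuild_dirty();	/* first call rebuilds every CPU */
	mark_dirty(3);		/* a later domain rebuild touched cpu3 only */
	rebuild_dirty();	/* second call rebuilds just cpu3 */
	return 0;
}
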
@@ -421,13 +461,15 @@ static char *task_group_path(struct task_group *tg)
}
#endif
+static const char stat_nam[] = TASK_STATE_TO_CHAR_STR;
+
static void
print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
{
if (rq->curr == p)
- SEQ_printf(m, "R");
+ SEQ_printf(m, ">R");
else
- SEQ_printf(m, " ");
+ SEQ_printf(m, " %c", task_state_to_char(p));
SEQ_printf(m, "%15s %5d %9Ld.%06ld %9Ld %5d ",
p->comm, task_pid_nr(p),
@@ -456,9 +498,9 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu)
SEQ_printf(m,
"\nrunnable tasks:\n"
- " task PID tree-key switches prio"
+ " S task PID tree-key switches prio"
" wait-time sum-exec sum-sleep\n"
- "------------------------------------------------------"
+ "-------------------------------------------------------"
"----------------------------------------------------\n");
rcu_read_lock();
@@ -872,11 +914,12 @@ static void sched_show_numa(struct task_struct *p, struct seq_file *m)
#endif
}
-void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
+void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns,
+ struct seq_file *m)
{
unsigned long nr_switches;
- SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, task_pid_nr(p),
+ SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, task_pid_nr_ns(p, ns),
get_nr_threads(p));
SEQ_printf(m,
"---------------------------------------------------------"
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 008c514dc241..8d5868771cb3 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -806,7 +806,7 @@ void post_init_entity_util_avg(struct sched_entity *se)
/*
* For !fair tasks do:
*
- update_cfs_rq_load_avg(now, cfs_rq, false);
+ update_cfs_rq_load_avg(now, cfs_rq);
attach_entity_load_avg(cfs_rq, se);
switched_from_fair(rq, p);
*
@@ -1071,6 +1071,29 @@ unsigned int sysctl_numa_balancing_scan_size = 256;
/* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */
unsigned int sysctl_numa_balancing_scan_delay = 1000;
+struct numa_group {
+ atomic_t refcount;
+
+ spinlock_t lock; /* nr_tasks, tasks */
+ int nr_tasks;
+ pid_t gid;
+ int active_nodes;
+
+ struct rcu_head rcu;
+ unsigned long total_faults;
+ unsigned long max_faults_cpu;
+ /*
+ * Faults_cpu is used to decide whether memory should move
+ * towards the CPU. As a consequence, these stats are weighted
+ * more by CPU use than by memory faults.
+ */
+ unsigned long *faults_cpu;
+ unsigned long faults[0];
+};
+
+static inline unsigned long group_faults_priv(struct numa_group *ng);
+static inline unsigned long group_faults_shared(struct numa_group *ng);
+
static unsigned int task_nr_scan_windows(struct task_struct *p)
{
unsigned long rss = 0;
@@ -1107,13 +1130,47 @@ static unsigned int task_scan_min(struct task_struct *p)
return max_t(unsigned int, floor, scan);
}
+static unsigned int task_scan_start(struct task_struct *p)
+{
+ unsigned long smin = task_scan_min(p);
+ unsigned long period = smin;
+
+ /* Scale the maximum scan period with the amount of shared memory. */
+ if (p->numa_group) {
+ struct numa_group *ng = p->numa_group;
+ unsigned long shared = group_faults_shared(ng);
+ unsigned long private = group_faults_priv(ng);
+
+ period *= atomic_read(&ng->refcount);
+ period *= shared + 1;
+ period /= private + shared + 1;
+ }
+
+ return max(smin, period);
+}
+
static unsigned int task_scan_max(struct task_struct *p)
{
- unsigned int smin = task_scan_min(p);
- unsigned int smax;
+ unsigned long smin = task_scan_min(p);
+ unsigned long smax;
/* Watch for min being lower than max due to floor calculations */
smax = sysctl_numa_balancing_scan_period_max / task_nr_scan_windows(p);
+
+ /* Scale the maximum scan period with the amount of shared memory. */
+ if (p->numa_group) {
+ struct numa_group *ng = p->numa_group;
+ unsigned long shared = group_faults_shared(ng);
+ unsigned long private = group_faults_priv(ng);
+ unsigned long period = smax;
+
+ period *= atomic_read(&ng->refcount);
+ period *= shared + 1;
+ period /= private + shared + 1;
+
+ smax = max(smax, period);
+ }
+
return max(smin, smax);
}
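
The scaling applied in task_scan_start() and task_scan_max() can be sanity-checked with plain arithmetic: the base period is multiplied by the group's task count and by the shared fraction of faults, so a large, mostly-shared numa_group scans far less often. A standalone sketch of just that formula; the numbers are made up:

#include <stdio.h>

static unsigned long scaled_period(unsigned long base, unsigned long nr_tasks,
				   unsigned long shared, unsigned long private)
{
	unsigned long period = base;

	period *= nr_tasks;
	period *= shared + 1;
	period /= private + shared + 1;

	return period > base ? period : base;	/* never drop below the base */
}

int main(void)
{
	/* 1000 ms base period, 8-task numa_group */
	printf("mostly private: %lu ms\n", scaled_period(1000, 8, 10, 1000));
	printf("mostly shared:  %lu ms\n", scaled_period(1000, 8, 1000, 10));
	return 0;
}
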
@@ -1129,26 +1186,6 @@ static void account_numa_dequeue(struct rq *rq, struct task_struct *p)
rq->nr_preferred_running -= (p->numa_preferred_nid == task_node(p));
}
-struct numa_group {
- atomic_t refcount;
-
- spinlock_t lock; /* nr_tasks, tasks */
- int nr_tasks;
- pid_t gid;
- int active_nodes;
-
- struct rcu_head rcu;
- unsigned long total_faults;
- unsigned long max_faults_cpu;
- /*
- * Faults_cpu is used to decide whether memory should move
- * towards the CPU. As a consequence, these stats are weighted
- * more by CPU use than by memory faults.
- */
- unsigned long *faults_cpu;
- unsigned long faults[0];
-};
-
/* Shared or private faults. */
#define NR_NUMA_HINT_FAULT_TYPES 2
@@ -1198,6 +1235,30 @@ static inline unsigned long group_faults_cpu(struct numa_group *group, int nid)
group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 1)];
}
+static inline unsigned long group_faults_priv(struct numa_group *ng)
+{
+ unsigned long faults = 0;
+ int node;
+
+ for_each_online_node(node) {
+ faults += ng->faults[task_faults_idx(NUMA_MEM, node, 1)];
+ }
+
+ return faults;
+}
+
+static inline unsigned long group_faults_shared(struct numa_group *ng)
+{
+ unsigned long faults = 0;
+ int node;
+
+ for_each_online_node(node) {
+ faults += ng->faults[task_faults_idx(NUMA_MEM, node, 0)];
+ }
+
+ return faults;
+}
+
/*
* A node triggering more than 1/3 as many NUMA faults as the maximum is
* considered part of a numa group's pseudo-interleaving set. Migrations
@@ -1378,7 +1439,7 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
group_faults_cpu(ng, src_nid) * group_faults(p, dst_nid) * 4;
}
-static unsigned long weighted_cpuload(const int cpu);
+static unsigned long weighted_cpuload(struct rq *rq);
static unsigned long source_load(int cpu, int type);
static unsigned long target_load(int cpu, int type);
static unsigned long capacity_of(int cpu);
@@ -1409,7 +1470,7 @@ static void update_numa_stats(struct numa_stats *ns, int nid)
struct rq *rq = cpu_rq(cpu);
ns->nr_running += rq->nr_running;
- ns->load += weighted_cpuload(cpu);
+ ns->load += weighted_cpuload(rq);
ns->compute_capacity += capacity_of(cpu);
cpus++;
@@ -1808,7 +1869,7 @@ static int task_numa_migrate(struct task_struct *p)
* Reset the scan period if the task is being rescheduled on an
* alternative node to recheck if the tasks is now properly placed.
*/
- p->numa_scan_period = task_scan_min(p);
+ p->numa_scan_period = task_scan_start(p);
if (env.best_task == NULL) {
ret = migrate_task_to(p, env.best_cpu);
@@ -1892,7 +1953,7 @@ static void update_task_scan_period(struct task_struct *p,
unsigned long shared, unsigned long private)
{
unsigned int period_slot;
- int ratio;
+ int lr_ratio, ps_ratio;
int diff;
unsigned long remote = p->numa_faults_locality[0];
@@ -1922,25 +1983,36 @@ static void update_task_scan_period(struct task_struct *p,
* >= NUMA_PERIOD_THRESHOLD scan period increases (scan slower)
*/
period_slot = DIV_ROUND_UP(p->numa_scan_period, NUMA_PERIOD_SLOTS);
- ratio = (local * NUMA_PERIOD_SLOTS) / (local + remote);
- if (ratio >= NUMA_PERIOD_THRESHOLD) {
- int slot = ratio - NUMA_PERIOD_THRESHOLD;
+ lr_ratio = (local * NUMA_PERIOD_SLOTS) / (local + remote);
+ ps_ratio = (private * NUMA_PERIOD_SLOTS) / (private + shared);
+
+ if (ps_ratio >= NUMA_PERIOD_THRESHOLD) {
+ /*
+ * Most memory accesses are local. There is no need to
+ * do fast NUMA scanning, since memory is already local.
+ */
+ int slot = ps_ratio - NUMA_PERIOD_THRESHOLD;
+ if (!slot)
+ slot = 1;
+ diff = slot * period_slot;
+ } else if (lr_ratio >= NUMA_PERIOD_THRESHOLD) {
+ /*
+ * Most memory accesses are shared with other tasks.
+ * There is no point in continuing fast NUMA scanning,
+ * since other tasks may just move the memory elsewhere.
+ */
+ int slot = lr_ratio - NUMA_PERIOD_THRESHOLD;
if (!slot)
slot = 1;
diff = slot * period_slot;
} else {
- diff = -(NUMA_PERIOD_THRESHOLD - ratio) * period_slot;
-
/*
- * Scale scan rate increases based on sharing. There is an
- * inverse relationship between the degree of sharing and
- * the adjustment made to the scanning period. Broadly
- * speaking the intent is that there is little point
- * scanning faster if shared accesses dominate as it may
- * simply bounce migrations uselessly
+ * Private memory faults exceed (SLOTS-THRESHOLD)/SLOTS,
+ * yet they are not on the local NUMA node. Speed up
+ * NUMA scanning to get the memory moved over.
*/
- ratio = DIV_ROUND_UP(private * NUMA_PERIOD_SLOTS, (private + shared + 1));
- diff = (diff * ratio) / NUMA_PERIOD_SLOTS;
+ int ratio = max(lr_ratio, ps_ratio);
+ diff = -(NUMA_PERIOD_THRESHOLD - ratio) * period_slot;
}
p->numa_scan_period = clamp(p->numa_scan_period + diff,
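
The new period adjustment works off two ratios, local-vs-remote and private-vs-shared, and only speeds scanning up when neither crosses the threshold. A compact userspace sketch of that three-way decision; the constants mirror NUMA_PERIOD_SLOTS/NUMA_PERIOD_THRESHOLD but the function itself is illustrative:

#include <stdio.h>

#define SLOTS		10
#define THRESHOLD	7

static int period_diff(int period_slot,
		       unsigned long local, unsigned long remote,
		       unsigned long private, unsigned long shared)
{
	int lr_ratio = (local * SLOTS) / (local + remote);
	int ps_ratio = (private * SLOTS) / (private + shared);
	int slot;

	if (ps_ratio >= THRESHOLD)		/* first ratio high: scan slower */
		slot = ps_ratio - THRESHOLD;
	else if (lr_ratio >= THRESHOLD)		/* second ratio high: scan slower */
		slot = lr_ratio - THRESHOLD;
	else {					/* both low: scan faster */
		int ratio = lr_ratio > ps_ratio ? lr_ratio : ps_ratio;

		return -(THRESHOLD - ratio) * period_slot;
	}
	return (slot ? slot : 1) * period_slot;
}

int main(void)
{
	/* one ratio over the threshold: the period grows (+200) */
	printf("diff = %+d\n", period_diff(100, 90, 10, 50, 50));
	/* both ratios under the threshold: it shrinks (-200) */
	printf("diff = %+d\n", period_diff(100, 20, 80, 50, 50));
	return 0;
}
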
@@ -2448,7 +2520,7 @@ void task_numa_work(struct callback_head *work)
if (p->numa_scan_period == 0) {
p->numa_scan_period_max = task_scan_max(p);
- p->numa_scan_period = task_scan_min(p);
+ p->numa_scan_period = task_scan_start(p);
}
next_scan = now + msecs_to_jiffies(p->numa_scan_period);
@@ -2576,7 +2648,7 @@ void task_tick_numa(struct rq *rq, struct task_struct *curr)
if (now > curr->node_stamp + period) {
if (!curr->node_stamp)
- curr->numa_scan_period = task_scan_min(curr);
+ curr->numa_scan_period = task_scan_start(curr);
curr->node_stamp += period;
if (!time_before(jiffies, curr->mm->numa_next_scan)) {
@@ -2586,59 +2658,6 @@ void task_tick_numa(struct rq *rq, struct task_struct *curr)
}
}
-/*
- * Can a task be moved from prev_cpu to this_cpu without causing a load
- * imbalance that would trigger the load balancer?
- */
-static inline bool numa_wake_affine(struct sched_domain *sd,
- struct task_struct *p, int this_cpu,
- int prev_cpu, int sync)
-{
- struct numa_stats prev_load, this_load;
- s64 this_eff_load, prev_eff_load;
-
- update_numa_stats(&prev_load, cpu_to_node(prev_cpu));
- update_numa_stats(&this_load, cpu_to_node(this_cpu));
-
- /*
- * If sync wakeup then subtract the (maximum possible)
- * effect of the currently running task from the load
- * of the current CPU:
- */
- if (sync) {
- unsigned long current_load = task_h_load(current);
-
- if (this_load.load > current_load)
- this_load.load -= current_load;
- else
- this_load.load = 0;
- }
-
- /*
- * In low-load situations, where this_cpu's node is idle due to the
- * sync cause above having dropped this_load.load to 0, move the task.
- * Moving to an idle socket will not create a bad imbalance.
- *
- * Otherwise check if the nodes are near enough in load to allow this
- * task to be woken on this_cpu's node.
- */
- if (this_load.load > 0) {
- unsigned long task_load = task_h_load(p);
-
- this_eff_load = 100;
- this_eff_load *= prev_load.compute_capacity;
-
- prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
- prev_eff_load *= this_load.compute_capacity;
-
- this_eff_load *= this_load.load + task_load;
- prev_eff_load *= prev_load.load - task_load;
-
- return this_eff_load <= prev_eff_load;
- }
-
- return true;
-}
#else
static void task_tick_numa(struct rq *rq, struct task_struct *curr)
{
@@ -2652,14 +2671,6 @@ static inline void account_numa_dequeue(struct rq *rq, struct task_struct *p)
{
}
-#ifdef CONFIG_SMP
-static inline bool numa_wake_affine(struct sched_domain *sd,
- struct task_struct *p, int this_cpu,
- int prev_cpu, int sync)
-{
- return true;
-}
-#endif /* !SMP */
#endif /* CONFIG_NUMA_BALANCING */
static void
@@ -2790,6 +2801,29 @@ static inline void update_cfs_shares(struct sched_entity *se)
}
#endif /* CONFIG_FAIR_GROUP_SCHED */
+static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)
+{
+ if (&this_rq()->cfs == cfs_rq) {
+ /*
+ * There are a few boundary cases this might miss but it should
+ * get called often enough that that should (hopefully) not be
+ * a real problem -- added to that it only calls on the local
+ * CPU, so if we enqueue remotely we'll miss an update, but
+ * the next tick/schedule should update.
+ *
+ * It will not get called when we go idle, because the idle
+ * thread is a different class (!fair), nor will the utilization
+ * number include things like RT tasks.
+ *
+ * As is, the util number is not freq-invariant (we'd have to
+ * implement arch_scale_freq_capacity() for that).
+ *
+ * See cpu_util().
+ */
+ cpufreq_update_util(rq_of(cfs_rq), 0);
+ }
+}
+
#ifdef CONFIG_SMP
/*
* Approximate:
@@ -2968,6 +3002,18 @@ ___update_load_avg(u64 now, int cpu, struct sched_avg *sa,
sa->last_update_time += delta << 10;
/*
+ * running is a subset of runnable (weight) so running can't be set if
+ * runnable is clear. But there are some corner cases where the current
+ * se has been already dequeued but cfs_rq->curr still points to it.
+	 * This means that weight can be 0 while running is still set, both
+	 * for a sched_entity and for a cfs_rq if the latter becomes idle.
+	 * As an example, this happens during idle_balance(), which calls
+	 * update_blocked_averages().
+ */
+ if (!weight)
+ running = 0;
+
+ /*
* Now we know we crossed measurement unit boundaries. The *_avg
* accrues by two steps:
*
@@ -3276,29 +3322,6 @@ static inline void set_tg_cfs_propagate(struct cfs_rq *cfs_rq) {}
#endif /* CONFIG_FAIR_GROUP_SCHED */
-static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)
-{
- if (&this_rq()->cfs == cfs_rq) {
- /*
- * There are a few boundary cases this might miss but it should
- * get called often enough that that should (hopefully) not be
- * a real problem -- added to that it only calls on the local
- * CPU, so if we enqueue remotely we'll miss an update, but
- * the next tick/schedule should update.
- *
- * It will not get called when we go idle, because the idle
- * thread is a different class (!fair), nor will the utilization
- * number include things like RT tasks.
- *
- * As is, the util number is not freq-invariant (we'd have to
- * implement arch_scale_freq_capacity() for that).
- *
- * See cpu_util().
- */
- cpufreq_update_util(rq_of(cfs_rq), 0);
- }
-}
-
/*
* Unsigned subtract and clamp on underflow.
*
@@ -3320,7 +3343,6 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)
* update_cfs_rq_load_avg - update the cfs_rq's load/util averages
* @now: current time, as per cfs_rq_clock_task()
* @cfs_rq: cfs_rq to update
- * @update_freq: should we call cfs_rq_util_change() or will the call do so
*
* The cfs_rq avg is the direct sum of all its entities (blocked and runnable)
* avg. The immediate corollary is that all (fair) tasks must be attached, see
@@ -3334,7 +3356,7 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)
* call update_tg_load_avg() when this function returns true.
*/
static inline int
-update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
+update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
{
struct sched_avg *sa = &cfs_rq->avg;
int decayed, removed_load = 0, removed_util = 0;
@@ -3362,7 +3384,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
cfs_rq->load_last_update_time_copy = sa->last_update_time;
#endif
- if (update_freq && (decayed || removed_util))
+ if (decayed || removed_util)
cfs_rq_util_change(cfs_rq);
return decayed || removed_load;
@@ -3390,7 +3412,7 @@ static inline void update_load_avg(struct sched_entity *se, int flags)
if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD))
__update_load_avg_se(now, cpu, cfs_rq, se);
- decayed = update_cfs_rq_load_avg(now, cfs_rq, true);
+ decayed = update_cfs_rq_load_avg(now, cfs_rq);
decayed |= propagate_entity_load_avg(se);
if (decayed && (flags & UPDATE_TG))
@@ -3534,7 +3556,7 @@ static int idle_balance(struct rq *this_rq, struct rq_flags *rf);
#else /* CONFIG_SMP */
static inline int
-update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
+update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
{
return 0;
}
@@ -3544,7 +3566,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
static inline void update_load_avg(struct sched_entity *se, int not_used1)
{
- cpufreq_update_util(rq_of(cfs_rq_of(se)), 0);
+ cfs_rq_util_change(cfs_rq_of(se));
}
static inline void
@@ -5125,9 +5147,9 @@ static void cpu_load_update(struct rq *this_rq, unsigned long this_load,
}
/* Used instead of source_load when we know the type == 0 */
-static unsigned long weighted_cpuload(const int cpu)
+static unsigned long weighted_cpuload(struct rq *rq)
{
- return cfs_rq_runnable_load_avg(&cpu_rq(cpu)->cfs);
+ return cfs_rq_runnable_load_avg(&rq->cfs);
}
#ifdef CONFIG_NO_HZ_COMMON
@@ -5172,7 +5194,7 @@ static void cpu_load_update_idle(struct rq *this_rq)
/*
* bail if there's load or we're actually up-to-date.
*/
- if (weighted_cpuload(cpu_of(this_rq)))
+ if (weighted_cpuload(this_rq))
return;
cpu_load_update_nohz(this_rq, READ_ONCE(jiffies), 0);
@@ -5193,7 +5215,7 @@ void cpu_load_update_nohz_start(void)
* concurrently we'll exit nohz. And cpu_load write can race with
* cpu_load_update_idle() but both updater would be writing the same.
*/
- this_rq->cpu_load[0] = weighted_cpuload(cpu_of(this_rq));
+ this_rq->cpu_load[0] = weighted_cpuload(this_rq);
}
/*
@@ -5209,7 +5231,7 @@ void cpu_load_update_nohz_stop(void)
if (curr_jiffies == this_rq->last_load_update_tick)
return;
- load = weighted_cpuload(cpu_of(this_rq));
+ load = weighted_cpuload(this_rq);
rq_lock(this_rq, &rf);
update_rq_clock(this_rq);
cpu_load_update_nohz(this_rq, curr_jiffies, load);
@@ -5235,7 +5257,7 @@ static void cpu_load_update_periodic(struct rq *this_rq, unsigned long load)
*/
void cpu_load_update_active(struct rq *this_rq)
{
- unsigned long load = weighted_cpuload(cpu_of(this_rq));
+ unsigned long load = weighted_cpuload(this_rq);
if (tick_nohz_tick_stopped())
cpu_load_update_nohz(this_rq, READ_ONCE(jiffies), load);
@@ -5253,7 +5275,7 @@ void cpu_load_update_active(struct rq *this_rq)
static unsigned long source_load(int cpu, int type)
{
struct rq *rq = cpu_rq(cpu);
- unsigned long total = weighted_cpuload(cpu);
+ unsigned long total = weighted_cpuload(rq);
if (type == 0 || !sched_feat(LB_BIAS))
return total;
@@ -5268,7 +5290,7 @@ static unsigned long source_load(int cpu, int type)
static unsigned long target_load(int cpu, int type)
{
struct rq *rq = cpu_rq(cpu);
- unsigned long total = weighted_cpuload(cpu);
+ unsigned long total = weighted_cpuload(rq);
if (type == 0 || !sched_feat(LB_BIAS))
return total;
@@ -5290,7 +5312,7 @@ static unsigned long cpu_avg_load_per_task(int cpu)
{
struct rq *rq = cpu_rq(cpu);
unsigned long nr_running = READ_ONCE(rq->cfs.h_nr_running);
- unsigned long load_avg = weighted_cpuload(cpu);
+ unsigned long load_avg = weighted_cpuload(rq);
if (nr_running)
return load_avg / nr_running;
@@ -5345,20 +5367,115 @@ static int wake_wide(struct task_struct *p)
return 1;
}
+struct llc_stats {
+ unsigned long nr_running;
+ unsigned long load;
+ unsigned long capacity;
+ int has_capacity;
+};
+
+static bool get_llc_stats(struct llc_stats *stats, int cpu)
+{
+ struct sched_domain_shared *sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
+
+ if (!sds)
+ return false;
+
+ stats->nr_running = READ_ONCE(sds->nr_running);
+ stats->load = READ_ONCE(sds->load);
+ stats->capacity = READ_ONCE(sds->capacity);
+ stats->has_capacity = stats->nr_running < per_cpu(sd_llc_size, cpu);
+
+ return true;
+}
+
+/*
+ * Can a task be moved from prev_cpu to this_cpu without causing a load
+ * imbalance that would trigger the load balancer?
+ *
+ * Since we're running on 'stale' values, we might in fact create an imbalance
+ * but recomputing these values is expensive, as that'd mean iterating 2 cache
+ * domains worth of CPUs.
+ */
+static bool
+wake_affine_llc(struct sched_domain *sd, struct task_struct *p,
+ int this_cpu, int prev_cpu, int sync)
+{
+ struct llc_stats prev_stats, this_stats;
+ s64 this_eff_load, prev_eff_load;
+ unsigned long task_load;
+
+ if (!get_llc_stats(&prev_stats, prev_cpu) ||
+ !get_llc_stats(&this_stats, this_cpu))
+ return false;
+
+ /*
+ * If sync wakeup then subtract the (maximum possible)
+ * effect of the currently running task from the load
+ * of the current LLC.
+ */
+ if (sync) {
+ unsigned long current_load = task_h_load(current);
+
+ /* in this case load hits 0 and this LLC is considered 'idle' */
+ if (current_load > this_stats.load)
+ return true;
+
+ this_stats.load -= current_load;
+ }
+
+ /*
+ * The has_capacity stuff is not SMT aware, but by trying to balance
+ * the nr_running on both ends we try and fill the domain at equal
+ * rates, thereby first consuming cores before siblings.
+ */
+
+ /* if the old cache has capacity, stay there */
+ if (prev_stats.has_capacity && prev_stats.nr_running < this_stats.nr_running+1)
+ return false;
+
+ /* if this cache has capacity, come here */
+ if (this_stats.has_capacity && this_stats.nr_running < prev_stats.nr_running+1)
+ return true;
+
+ /*
+ * Check to see if we can move the load without causing too much
+ * imbalance.
+ */
+ task_load = task_h_load(p);
+
+ this_eff_load = 100;
+ this_eff_load *= prev_stats.capacity;
+
+ prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
+ prev_eff_load *= this_stats.capacity;
+
+ this_eff_load *= this_stats.load + task_load;
+ prev_eff_load *= prev_stats.load - task_load;
+
+ return this_eff_load <= prev_eff_load;
+}
+
static int wake_affine(struct sched_domain *sd, struct task_struct *p,
int prev_cpu, int sync)
{
int this_cpu = smp_processor_id();
- bool affine = false;
+ bool affine;
/*
- * Common case: CPUs are in the same socket, and select_idle_sibling()
- * will do its thing regardless of what we return:
+ * Default to no affine wakeups; wake_affine() should not effect a task
+ * placement the load-balancer feels inclined to undo. The conservative
+ * option is therefore to not move tasks when they wake up.
*/
- if (cpus_share_cache(prev_cpu, this_cpu))
- affine = true;
- else
- affine = numa_wake_affine(sd, p, this_cpu, prev_cpu, sync);
+ affine = false;
+
+ /*
+ * If the wakeup is across cache domains, try to evaluate if movement
+ * makes sense, otherwise rely on select_idle_siblings() to do
+ * placement inside the cache domain.
+ */
+ if (!cpus_share_cache(prev_cpu, this_cpu))
+ affine = wake_affine_llc(sd, p, this_cpu, prev_cpu, sync);
schedstat_inc(p->se.statistics.nr_wakeups_affine_attempts);
if (affine) {
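
The effective-load comparison at the end of wake_affine_llc() cross-weights each side's load by the other side's capacity and biases the previous LLC by half the imbalance percentage. A standalone sketch with made-up numbers shows when a pull is approved; struct llc and the 125% imbalance figure here are illustrative:

#include <stdio.h>
#include <stdbool.h>

struct llc { long long load, capacity; };

static bool pull_to_this_llc(struct llc this_llc, struct llc prev_llc,
			     long long task_load, int imbalance_pct)
{
	long long this_eff, prev_eff;

	this_eff  = 100;
	this_eff *= prev_llc.capacity;		/* cross-weighted by capacity */
	prev_eff  = 100 + (imbalance_pct - 100) / 2;	/* bias towards prev */
	prev_eff *= this_llc.capacity;

	this_eff *= this_llc.load + task_load;	/* load if the task moves here */
	prev_eff *= prev_llc.load - task_load;	/* load left behind */

	return this_eff <= prev_eff;
}

int main(void)
{
	struct llc busy = { .load = 4096, .capacity = 4096 };
	struct llc idle = { .load =  512, .capacity = 4096 };

	/* a task of weight 1024 waking on the idle LLC: pull is allowed */
	printf("pull: %d\n", pull_to_this_llc(idle, busy, 1024, 125));
	return 0;
}
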
@@ -5550,7 +5667,7 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
shallowest_idle_cpu = i;
}
} else if (shallowest_idle_cpu == -1) {
- load = weighted_cpuload(i);
+ load = weighted_cpuload(cpu_rq(i));
if (load < min_load || (load == min_load && i == this_cpu)) {
min_load = load;
least_loaded_cpu = i;
@@ -6187,10 +6304,10 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf
int new_tasks;
again:
-#ifdef CONFIG_FAIR_GROUP_SCHED
if (!cfs_rq->nr_running)
goto idle;
+#ifdef CONFIG_FAIR_GROUP_SCHED
if (prev->sched_class != &fair_sched_class)
goto simple;
@@ -6220,11 +6337,17 @@ again:
/*
* This call to check_cfs_rq_runtime() will do the
* throttle and dequeue its entity in the parent(s).
- * Therefore the 'simple' nr_running test will indeed
+ * Therefore the nr_running test will indeed
* be correct.
*/
- if (unlikely(check_cfs_rq_runtime(cfs_rq)))
+ if (unlikely(check_cfs_rq_runtime(cfs_rq))) {
+ cfs_rq = &rq->cfs;
+
+ if (!cfs_rq->nr_running)
+ goto idle;
+
goto simple;
+ }
}
se = pick_next_entity(cfs_rq, curr);
@@ -6264,12 +6387,8 @@ again:
return p;
simple:
- cfs_rq = &rq->cfs;
#endif
- if (!cfs_rq->nr_running)
- goto idle;
-
put_prev_task(rq, prev);
do {
@@ -6646,10 +6765,10 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
* our sched_group. We may want to revisit it if we couldn't
* meet load balance goals by pulling other tasks on src_cpu.
*
- * Also avoid computing new_dst_cpu if we have already computed
- * one in current iteration.
+ * Avoid computing new_dst_cpu for NEWLY_IDLE or if we have
+ * already computed one in current iteration.
*/
- if (!env->dst_grpmask || (env->flags & LBF_DST_PINNED))
+ if (env->idle == CPU_NEWLY_IDLE || (env->flags & LBF_DST_PINNED))
return 0;
/* Prevent to re-select dst_cpu via env's cpus */
@@ -6917,7 +7036,7 @@ static void update_blocked_averages(int cpu)
if (throttled_hierarchy(cfs_rq))
continue;
- if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq, true))
+ if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq))
update_tg_load_avg(cfs_rq, 0);
/* Propagate pending load changes to the parent, if any: */
@@ -6990,7 +7109,7 @@ static inline void update_blocked_averages(int cpu)
rq_lock_irqsave(rq, &rf);
update_rq_clock(rq);
- update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq, true);
+ update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq);
rq_unlock_irqrestore(rq, &rf);
}
@@ -7036,6 +7155,7 @@ struct sg_lb_stats {
struct sd_lb_stats {
struct sched_group *busiest; /* Busiest group in this sd */
struct sched_group *local; /* Local group in this sd */
+ unsigned long total_running;
unsigned long total_load; /* Total load of all groups in sd */
unsigned long total_capacity; /* Total capacity of all groups in sd */
unsigned long avg_load; /* Average load across all groups in sd */
@@ -7055,6 +7175,7 @@ static inline void init_sd_lb_stats(struct sd_lb_stats *sds)
*sds = (struct sd_lb_stats){
.busiest = NULL,
.local = NULL,
+ .total_running = 0UL,
.total_load = 0UL,
.total_capacity = 0UL,
.busiest_stat = {
@@ -7363,7 +7484,7 @@ static inline void update_sg_lb_stats(struct lb_env *env,
sgs->nr_numa_running += rq->nr_numa_running;
sgs->nr_preferred_running += rq->nr_preferred_running;
#endif
- sgs->sum_weighted_load += weighted_cpuload(i);
+ sgs->sum_weighted_load += weighted_cpuload(rq);
/*
* No need to call idle_cpu() if nr_running is not 0
*/
@@ -7490,6 +7611,7 @@ static inline enum fbq_type fbq_classify_rq(struct rq *rq)
*/
static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds)
{
+ struct sched_domain_shared *shared = env->sd->shared;
struct sched_domain *child = env->sd->child;
struct sched_group *sg = env->sd->groups;
struct sg_lb_stats *local = &sds->local_stat;
@@ -7546,6 +7668,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
next_group:
/* Now, start updating sd_lb_stats */
+ sds->total_running += sgs->sum_nr_running;
sds->total_load += sgs->group_load;
sds->total_capacity += sgs->group_capacity;
@@ -7561,6 +7684,21 @@ next_group:
env->dst_rq->rd->overload = overload;
}
+ if (!shared)
+ return;
+
+ /*
+ * Since these are sums over groups they can contain some CPUs
+ * multiple times for the NUMA domains.
+ *
+	 * Currently only wake_affine_llc() and find_busiest_group()
+	 * use these numbers; only the latter is affected by this problem.
+ *
+ * XXX fix that.
+ */
+ WRITE_ONCE(shared->nr_running, sds->total_running);
+ WRITE_ONCE(shared->load, sds->total_load);
+ WRITE_ONCE(shared->capacity, sds->total_capacity);
}
/**
@@ -7790,6 +7928,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
if (!sds.busiest || busiest->sum_nr_running == 0)
goto out_balanced;
+ /* XXX broken for overlapping NUMA groups */
sds.avg_load = (SCHED_CAPACITY_SCALE * sds.total_load)
/ sds.total_capacity;
@@ -7892,7 +8031,7 @@ static struct rq *find_busiest_queue(struct lb_env *env,
capacity = capacity_of(i);
- wl = weighted_cpuload(i);
+ wl = weighted_cpuload(rq);
/*
* When comparing with imbalance, use weighted_cpuload()
@@ -8022,14 +8161,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
.tasks = LIST_HEAD_INIT(env.tasks),
};
- /*
- * For NEWLY_IDLE load_balancing, we don't need to consider
- * other cpus in our group
- */
- if (idle == CPU_NEWLY_IDLE)
- env.dst_grpmask = NULL;
-
- cpumask_copy(cpus, cpu_active_mask);
+ cpumask_and(cpus, sched_domain_span(sd), cpu_active_mask);
schedstat_inc(sd->lb_count[idle]);
@@ -8151,7 +8283,15 @@ more_balance:
/* All tasks on this runqueue were pinned by CPU affinity */
if (unlikely(env.flags & LBF_ALL_PINNED)) {
cpumask_clear_cpu(cpu_of(busiest), cpus);
- if (!cpumask_empty(cpus)) {
+ /*
+ * Attempting to continue load balancing at the current
+ * sched_domain level only makes sense if there are
+ * active CPUs remaining as possible busiest CPUs to
+ * pull load from which are not contained within the
+ * destination group that is receiving any migrated
+ * load.
+ */
+ if (!cpumask_subset(cpus, env.dst_grpmask)) {
env.loop = 0;
env.loop_break = sched_nr_migrate_break;
goto redo;
@@ -8447,6 +8587,13 @@ static int active_load_balance_cpu_stop(void *data)
.src_cpu = busiest_rq->cpu,
.src_rq = busiest_rq,
.idle = CPU_IDLE,
+ /*
+ * can_migrate_task() doesn't need to compute new_dst_cpu
+ * for active balancing. Since we have CPU_IDLE, but no
+ * @dst_grpmask we need to make that test go away with lying
+ * about DST_PINNED.
+ */
+ .flags = LBF_DST_PINNED,
};
schedstat_inc(sd->alb_count);
diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c
new file mode 100644
index 000000000000..a92fddc22747
--- /dev/null
+++ b/kernel/sched/membarrier.c
@@ -0,0 +1,152 @@
+/*
+ * Copyright (C) 2010-2017 Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
+ *
+ * membarrier system call
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/syscalls.h>
+#include <linux/membarrier.h>
+#include <linux/tick.h>
+#include <linux/cpumask.h>
+
+#include "sched.h" /* for cpu_rq(). */
+
+/*
+ * Bitmask made from a bitwise OR of all commands within enum membarrier_cmd,
+ * except MEMBARRIER_CMD_QUERY.
+ */
+#define MEMBARRIER_CMD_BITMASK \
+ (MEMBARRIER_CMD_SHARED | MEMBARRIER_CMD_PRIVATE_EXPEDITED)
+
+static void ipi_mb(void *info)
+{
+ smp_mb(); /* IPIs should be serializing but paranoid. */
+}
+
+static void membarrier_private_expedited(void)
+{
+ int cpu;
+ bool fallback = false;
+ cpumask_var_t tmpmask;
+
+ if (num_online_cpus() == 1)
+ return;
+
+ /*
+ * Matches memory barriers around rq->curr modification in
+ * scheduler.
+ */
+ smp_mb(); /* system call entry is not a mb. */
+
+ /*
+ * Expedited membarrier commands guarantee that they won't
+ * block, hence the GFP_NOWAIT allocation flag and fallback
+ * implementation.
+ */
+ if (!zalloc_cpumask_var(&tmpmask, GFP_NOWAIT)) {
+ /* Fallback for OOM. */
+ fallback = true;
+ }
+
+ cpus_read_lock();
+ for_each_online_cpu(cpu) {
+ struct task_struct *p;
+
+ /*
+		 * Skipping the current CPU is OK even though we can be
+ * migrated at any point. The current CPU, at the point
+ * where we read raw_smp_processor_id(), is ensured to
+ * be in program order with respect to the caller
+ * thread. Therefore, we can skip this CPU from the
+ * iteration.
+ */
+ if (cpu == raw_smp_processor_id())
+ continue;
+ rcu_read_lock();
+ p = task_rcu_dereference(&cpu_rq(cpu)->curr);
+ if (p && p->mm == current->mm) {
+ if (!fallback)
+ __cpumask_set_cpu(cpu, tmpmask);
+ else
+ smp_call_function_single(cpu, ipi_mb, NULL, 1);
+ }
+ rcu_read_unlock();
+ }
+ if (!fallback) {
+ smp_call_function_many(tmpmask, ipi_mb, NULL, 1);
+ free_cpumask_var(tmpmask);
+ }
+ cpus_read_unlock();
+
+ /*
+ * Memory barrier on the caller thread _after_ we finished
+ * waiting for the last IPI. Matches memory barriers around
+ * rq->curr modification in scheduler.
+ */
+ smp_mb(); /* exit from system call is not a mb */
+}
+
+/**
+ * sys_membarrier - issue memory barriers on a set of threads
+ * @cmd: Takes command values defined in enum membarrier_cmd.
+ * @flags: Currently needs to be 0. For future extensions.
+ *
+ * If this system call is not implemented, -ENOSYS is returned. If the
+ * command specified does not exist, not available on the running
+ * kernel, or if the command argument is invalid, this system call
+ * returns -EINVAL. For a given command, with flags argument set to 0,
+ * this system call is guaranteed to always return the same value until
+ * reboot.
+ *
+ * All memory accesses performed in program order from each targeted thread
+ * are guaranteed to be ordered with respect to sys_membarrier(). If we use
+ * the semantic "barrier()" to represent a compiler barrier forcing memory
+ * accesses to be performed in program order across the barrier, and
+ * smp_mb() to represent explicit memory barriers forcing full memory
+ * ordering across the barrier, we have the following ordering table for
+ * each pair of barrier(), sys_membarrier() and smp_mb():
+ *
+ * The pair ordering is detailed as (O: ordered, X: not ordered):
+ *
+ * barrier() smp_mb() sys_membarrier()
+ * barrier() X X O
+ * smp_mb() X O O
+ * sys_membarrier() O O O
+ */
+SYSCALL_DEFINE2(membarrier, int, cmd, int, flags)
+{
+ if (unlikely(flags))
+ return -EINVAL;
+ switch (cmd) {
+ case MEMBARRIER_CMD_QUERY:
+ {
+ int cmd_mask = MEMBARRIER_CMD_BITMASK;
+
+ if (tick_nohz_full_enabled())
+ cmd_mask &= ~MEMBARRIER_CMD_SHARED;
+ return cmd_mask;
+ }
+ case MEMBARRIER_CMD_SHARED:
+ /* MEMBARRIER_CMD_SHARED is not compatible with nohz_full. */
+ if (tick_nohz_full_enabled())
+ return -EINVAL;
+ if (num_online_cpus() > 1)
+ synchronize_sched();
+ return 0;
+ case MEMBARRIER_CMD_PRIVATE_EXPEDITED:
+ membarrier_private_expedited();
+ return 0;
+ default:
+ return -EINVAL;
+ }
+}
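
From userspace the new command is reached through the existing membarrier(2) entry point; a small usage sketch, assuming kernel headers new enough to define MEMBARRIER_CMD_PRIVATE_EXPEDITED (later kernels additionally introduced a registration command, which this version of the patch does not require):

#include <linux/membarrier.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <stdio.h>

static long membarrier(int cmd, int flags)
{
	return syscall(__NR_membarrier, cmd, flags);
}

int main(void)
{
	long supported = membarrier(MEMBARRIER_CMD_QUERY, 0);

	if (supported < 0 ||
	    !(supported & MEMBARRIER_CMD_PRIVATE_EXPEDITED)) {
		fprintf(stderr, "private expedited membarrier unavailable\n");
		return 1;
	}

	/* issues memory barriers on all CPUs running threads of this mm */
	if (membarrier(MEMBARRIER_CMD_PRIVATE_EXPEDITED, 0)) {
		perror("membarrier");
		return 1;
	}
	return 0;
}
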
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index eeef1a3086d1..25e5cb1107f3 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1120,11 +1120,15 @@ extern int group_balance_cpu(struct sched_group *sg);
#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
void register_sched_domain_sysctl(void);
+void dirty_sched_domain_sysctl(int cpu);
void unregister_sched_domain_sysctl(void);
#else
static inline void register_sched_domain_sysctl(void)
{
}
+static inline void dirty_sched_domain_sysctl(int cpu)
+{
+}
static inline void unregister_sched_domain_sysctl(void)
{
}
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 79895aec281e..6f7b43982f73 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -261,8 +261,6 @@ void rq_attach_root(struct rq *rq, struct root_domain *rd)
static int init_rootdomain(struct root_domain *rd)
{
- memset(rd, 0, sizeof(*rd));
-
if (!zalloc_cpumask_var(&rd->span, GFP_KERNEL))
goto out;
if (!zalloc_cpumask_var(&rd->online, GFP_KERNEL))
@@ -311,7 +309,7 @@ static struct root_domain *alloc_rootdomain(void)
{
struct root_domain *rd;
- rd = kmalloc(sizeof(*rd), GFP_KERNEL);
+ rd = kzalloc(sizeof(*rd), GFP_KERNEL);
if (!rd)
return NULL;
@@ -337,7 +335,8 @@ static void free_sched_groups(struct sched_group *sg, int free_sgc)
if (free_sgc && atomic_dec_and_test(&sg->sgc->ref))
kfree(sg->sgc);
- kfree(sg);
+ if (atomic_dec_and_test(&sg->ref))
+ kfree(sg);
sg = tmp;
} while (sg != first);
}
@@ -345,15 +344,12 @@ static void free_sched_groups(struct sched_group *sg, int free_sgc)
static void destroy_sched_domain(struct sched_domain *sd)
{
/*
- * If its an overlapping domain it has private groups, iterate and
- * nuke them all.
+ * A normal sched domain may have multiple group references, an
+ * overlapping domain, having private groups, only one. Iterate,
+ * dropping group/capacity references, freeing where none remain.
*/
- if (sd->flags & SD_OVERLAP) {
- free_sched_groups(sd->groups, 1);
- } else if (atomic_dec_and_test(&sd->groups->ref)) {
- kfree(sd->groups->sgc);
- kfree(sd->groups);
- }
+ free_sched_groups(sd->groups, 1);
+
if (sd->shared && atomic_dec_and_test(&sd->shared->ref))
kfree(sd->shared);
kfree(sd);
@@ -463,6 +459,7 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
rq_attach_root(rq, rd);
tmp = rq->sd;
rcu_assign_pointer(rq->sd, sd);
+ dirty_sched_domain_sysctl(cpu);
destroy_sched_domains(tmp);
update_top_cache_domain(cpu);
@@ -670,6 +667,7 @@ build_group_from_child_sched_domain(struct sched_domain *sd, int cpu)
else
cpumask_copy(sg_span, sched_domain_span(sd));
+ atomic_inc(&sg->ref);
return sg;
}
@@ -1595,7 +1593,7 @@ static void __sdt_free(const struct cpumask *cpu_map)
}
}
-struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
+static struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
const struct cpumask *cpu_map, struct sched_domain_attr *attr,
struct sched_domain *child, int cpu)
{
@@ -1854,7 +1852,17 @@ void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
/* Let the architecture update CPU core mappings: */
new_topology = arch_update_cpu_topology();
- n = doms_new ? ndoms_new : 0;
+ if (!doms_new) {
+ WARN_ON_ONCE(dattr_new);
+ n = 0;
+ doms_new = alloc_sched_domains(1);
+ if (doms_new) {
+ n = 1;
+ cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map);
+ }
+ } else {
+ n = ndoms_new;
+ }
/* Destroy deleted domains: */
for (i = 0; i < ndoms_cur; i++) {
@@ -1870,11 +1878,10 @@ match1:
}
n = ndoms_cur;
- if (doms_new == NULL) {
+ if (!doms_new) {
n = 0;
doms_new = &fallback_doms;
cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map);
- WARN_ON_ONCE(dattr_new);
}
/* Build new domains: */
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index 17f11c6b0a9f..d6afed6d0752 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -70,9 +70,10 @@ static void __wake_up_common(struct wait_queue_head *wq_head, unsigned int mode,
list_for_each_entry_safe(curr, next, &wq_head->head, entry) {
unsigned flags = curr->flags;
-
- if (curr->func(curr, mode, wake_flags, key) &&
- (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
+ int ret = curr->func(curr, mode, wake_flags, key);
+ if (ret < 0)
+ break;
+ if (ret && (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
break;
}
}
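
The changed contract for wake functions is small but easy to get wrong when writing custom wait-queue entries: a negative return now aborts the whole walk, while a positive return from an exclusive waiter still consumes one of nr_exclusive. A minimal userspace sketch of that loop, with invented types:

#include <stdio.h>

#define WQ_FLAG_EXCLUSIVE 0x01

struct waiter {
	int flags;
	int (*func)(struct waiter *w);
};

static int wake_ok(struct waiter *w)   { (void)w; return 1; }
static int wake_fail(struct waiter *w) { (void)w; return -1; }

static void wake_up_common(struct waiter *w, int n, int nr_exclusive)
{
	for (int i = 0; i < n; i++) {
		int ret = w[i].func(&w[i]);

		if (ret < 0)
			break;		/* new: negative return aborts the walk */
		printf("woke waiter %d\n", i);
		if (ret && (w[i].flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
			break;		/* enough exclusive waiters woken */
	}
}

int main(void)
{
	struct waiter w[] = {
		{ 0, wake_ok },
		{ 0, wake_fail },
		{ WQ_FLAG_EXCLUSIVE, wake_ok },
	};

	wake_up_common(w, 3, 1);	/* stops after the failing waiter */
	return 0;
}
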
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 65f61077ad50..98b59b5db90b 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -13,7 +13,7 @@
* of Berkeley Packet Filters/Linux Socket Filters.
*/
-#include <linux/atomic.h>
+#include <linux/refcount.h>
#include <linux/audit.h>
#include <linux/compat.h>
#include <linux/coredump.h>
@@ -56,7 +56,7 @@
* to a task_struct (other than @usage).
*/
struct seccomp_filter {
- atomic_t usage;
+ refcount_t usage;
struct seccomp_filter *prev;
struct bpf_prog *prog;
};
@@ -378,7 +378,7 @@ static struct seccomp_filter *seccomp_prepare_filter(struct sock_fprog *fprog)
return ERR_PTR(ret);
}
- atomic_set(&sfilter->usage, 1);
+ refcount_set(&sfilter->usage, 1);
return sfilter;
}
@@ -465,7 +465,7 @@ void get_seccomp_filter(struct task_struct *tsk)
if (!orig)
return;
/* Reference count is bounded by the number of total processes. */
- atomic_inc(&orig->usage);
+ refcount_inc(&orig->usage);
}
static inline void seccomp_filter_free(struct seccomp_filter *filter)
@@ -481,7 +481,7 @@ void put_seccomp_filter(struct task_struct *tsk)
{
struct seccomp_filter *orig = tsk->seccomp.filter;
/* Clean up single-reference branches iteratively. */
- while (orig && atomic_dec_and_test(&orig->usage)) {
+ while (orig && refcount_dec_and_test(&orig->usage)) {
struct seccomp_filter *freeme = orig;
orig = orig->prev;
seccomp_filter_free(freeme);
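
The atomic_t to refcount_t switch keeps the same get/put shape while gaining the saturation and overflow warnings plain atomics lack. The ownership pattern itself, where the last put frees the object and single-reference links are torn down iteratively, looks like this userspace analogy with C11 atomics (the types and helpers are illustrative):

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct filter {
	atomic_int usage;
	struct filter *prev;
};

static struct filter *filter_get_new(struct filter *prev)
{
	struct filter *f = calloc(1, sizeof(*f));

	if (!f)
		exit(1);
	atomic_init(&f->usage, 1);
	f->prev = prev;
	return f;
}

static void filter_put(struct filter *f)
{
	/* like put_seccomp_filter(): free single-reference links iteratively */
	while (f && atomic_fetch_sub(&f->usage, 1) == 1) {
		struct filter *freeme = f;

		f = f->prev;
		printf("freeing filter %p\n", (void *)freeme);
		free(freeme);
	}
}

int main(void)
{
	struct filter *chain = filter_get_new(filter_get_new(NULL));

	filter_put(chain);	/* frees both links */
	return 0;
}
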
@@ -641,11 +641,12 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd,
return 0;
case SECCOMP_RET_KILL:
- default: {
- siginfo_t info;
+ default:
audit_seccomp(this_syscall, SIGSYS, action);
/* Dump core only if this is the last remaining thread. */
if (get_nr_threads(current) == 1) {
+ siginfo_t info;
+
/* Show the original registers in the dump. */
syscall_rollback(current, task_pt_regs(current));
/* Trigger a manual coredump since do_exit skips it. */
@@ -654,7 +655,6 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd,
}
do_exit(SIGSYS);
}
- }
unreachable();
diff --git a/kernel/signal.c b/kernel/signal.c
index 35a570f71f07..ed804a470dcd 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1194,7 +1194,11 @@ force_sig_info(int sig, struct siginfo *info, struct task_struct *t)
recalc_sigpending_and_wake(t);
}
}
- if (action->sa.sa_handler == SIG_DFL)
+ /*
+ * Don't clear SIGNAL_UNKILLABLE for traced tasks, users won't expect
+ * debugging to leave init killable.
+ */
+ if (action->sa.sa_handler == SIG_DFL && !t->ptrace)
t->signal->flags &= ~SIGNAL_UNKILLABLE;
ret = specific_send_sig_info(sig, info, t);
spin_unlock_irqrestore(&t->sighand->siglock, flags);
@@ -1402,6 +1406,10 @@ static int kill_something_info(int sig, struct siginfo *info, pid_t pid)
return ret;
}
+ /* -INT_MIN is undefined. Exclude this case to avoid a UBSAN warning */
+ if (pid == INT_MIN)
+ return -ESRCH;
+
read_lock(&tasklist_lock);
if (pid != -1) {
ret = __kill_pgrp_info(sig, info,
@@ -2776,7 +2784,7 @@ int copy_siginfo_to_user(siginfo_t __user *to, const siginfo_t *from)
* @info: if non-null, the signal's siginfo is returned here
* @ts: upper bound on process time suspension
*/
-int do_sigtimedwait(const sigset_t *which, siginfo_t *info,
+static int do_sigtimedwait(const sigset_t *which, siginfo_t *info,
const struct timespec *ts)
{
ktime_t *to = NULL, timeout = KTIME_MAX;
@@ -2865,6 +2873,40 @@ SYSCALL_DEFINE4(rt_sigtimedwait, const sigset_t __user *, uthese,
return ret;
}
+#ifdef CONFIG_COMPAT
+COMPAT_SYSCALL_DEFINE4(rt_sigtimedwait, compat_sigset_t __user *, uthese,
+ struct compat_siginfo __user *, uinfo,
+ struct compat_timespec __user *, uts, compat_size_t, sigsetsize)
+{
+ compat_sigset_t s32;
+ sigset_t s;
+ struct timespec t;
+ siginfo_t info;
+ long ret;
+
+ if (sigsetsize != sizeof(sigset_t))
+ return -EINVAL;
+
+ if (copy_from_user(&s32, uthese, sizeof(compat_sigset_t)))
+ return -EFAULT;
+ sigset_from_compat(&s, &s32);
+
+ if (uts) {
+ if (compat_get_timespec(&t, uts))
+ return -EFAULT;
+ }
+
+ ret = do_sigtimedwait(&s, &info, uts ? &t : NULL);
+
+ if (ret > 0 && uinfo) {
+ if (copy_siginfo_to_user32(uinfo, &info))
+ ret = -EFAULT;
+ }
+
+ return ret;
+}
+#endif
+
/**
* sys_kill - send a signal to a process
* @pid: the PID of the process
@@ -3121,78 +3163,68 @@ int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact)
}
static int
-do_sigaltstack (const stack_t __user *uss, stack_t __user *uoss, unsigned long sp)
+do_sigaltstack (const stack_t *ss, stack_t *oss, unsigned long sp)
{
- stack_t oss;
- int error;
+ struct task_struct *t = current;
- oss.ss_sp = (void __user *) current->sas_ss_sp;
- oss.ss_size = current->sas_ss_size;
- oss.ss_flags = sas_ss_flags(sp) |
- (current->sas_ss_flags & SS_FLAG_BITS);
+ if (oss) {
+ memset(oss, 0, sizeof(stack_t));
+ oss->ss_sp = (void __user *) t->sas_ss_sp;
+ oss->ss_size = t->sas_ss_size;
+ oss->ss_flags = sas_ss_flags(sp) |
+ (current->sas_ss_flags & SS_FLAG_BITS);
+ }
- if (uss) {
- void __user *ss_sp;
- size_t ss_size;
- unsigned ss_flags;
+ if (ss) {
+ void __user *ss_sp = ss->ss_sp;
+ size_t ss_size = ss->ss_size;
+ unsigned ss_flags = ss->ss_flags;
int ss_mode;
- error = -EFAULT;
- if (!access_ok(VERIFY_READ, uss, sizeof(*uss)))
- goto out;
- error = __get_user(ss_sp, &uss->ss_sp) |
- __get_user(ss_flags, &uss->ss_flags) |
- __get_user(ss_size, &uss->ss_size);
- if (error)
- goto out;
-
- error = -EPERM;
- if (on_sig_stack(sp))
- goto out;
+ if (unlikely(on_sig_stack(sp)))
+ return -EPERM;
ss_mode = ss_flags & ~SS_FLAG_BITS;
- error = -EINVAL;
- if (ss_mode != SS_DISABLE && ss_mode != SS_ONSTACK &&
- ss_mode != 0)
- goto out;
+ if (unlikely(ss_mode != SS_DISABLE && ss_mode != SS_ONSTACK &&
+ ss_mode != 0))
+ return -EINVAL;
if (ss_mode == SS_DISABLE) {
ss_size = 0;
ss_sp = NULL;
} else {
- error = -ENOMEM;
- if (ss_size < MINSIGSTKSZ)
- goto out;
+ if (unlikely(ss_size < MINSIGSTKSZ))
+ return -ENOMEM;
}
- current->sas_ss_sp = (unsigned long) ss_sp;
- current->sas_ss_size = ss_size;
- current->sas_ss_flags = ss_flags;
+ t->sas_ss_sp = (unsigned long) ss_sp;
+ t->sas_ss_size = ss_size;
+ t->sas_ss_flags = ss_flags;
}
-
- error = 0;
- if (uoss) {
- error = -EFAULT;
- if (!access_ok(VERIFY_WRITE, uoss, sizeof(*uoss)))
- goto out;
- error = __put_user(oss.ss_sp, &uoss->ss_sp) |
- __put_user(oss.ss_size, &uoss->ss_size) |
- __put_user(oss.ss_flags, &uoss->ss_flags);
- }
-
-out:
- return error;
+ return 0;
}
+
SYSCALL_DEFINE2(sigaltstack,const stack_t __user *,uss, stack_t __user *,uoss)
{
- return do_sigaltstack(uss, uoss, current_user_stack_pointer());
+ stack_t new, old;
+ int err;
+ if (uss && copy_from_user(&new, uss, sizeof(stack_t)))
+ return -EFAULT;
+ err = do_sigaltstack(uss ? &new : NULL, uoss ? &old : NULL,
+ current_user_stack_pointer());
+ if (!err && uoss && copy_to_user(uoss, &old, sizeof(stack_t)))
+ err = -EFAULT;
+ return err;
}
int restore_altstack(const stack_t __user *uss)
{
- int err = do_sigaltstack(uss, NULL, current_user_stack_pointer());
+ stack_t new;
+ if (copy_from_user(&new, uss, sizeof(stack_t)))
+ return -EFAULT;
+ (void)do_sigaltstack(&new, NULL, current_user_stack_pointer());
/* squash all but EFAULT for now */
- return err == -EFAULT ? err : 0;
+ return 0;
}
int __save_altstack(stack_t __user *uss, unsigned long sp)
@@ -3215,29 +3247,24 @@ COMPAT_SYSCALL_DEFINE2(sigaltstack,
{
stack_t uss, uoss;
int ret;
- mm_segment_t seg;
if (uss_ptr) {
compat_stack_t uss32;
-
- memset(&uss, 0, sizeof(stack_t));
if (copy_from_user(&uss32, uss_ptr, sizeof(compat_stack_t)))
return -EFAULT;
uss.ss_sp = compat_ptr(uss32.ss_sp);
uss.ss_flags = uss32.ss_flags;
uss.ss_size = uss32.ss_size;
}
- seg = get_fs();
- set_fs(KERNEL_DS);
- ret = do_sigaltstack((stack_t __force __user *) (uss_ptr ? &uss : NULL),
- (stack_t __force __user *) &uoss,
+ ret = do_sigaltstack(uss_ptr ? &uss : NULL, &uoss,
compat_user_stack_pointer());
- set_fs(seg);
if (ret >= 0 && uoss_ptr) {
- if (!access_ok(VERIFY_WRITE, uoss_ptr, sizeof(compat_stack_t)) ||
- __put_user(ptr_to_compat(uoss.ss_sp), &uoss_ptr->ss_sp) ||
- __put_user(uoss.ss_flags, &uoss_ptr->ss_flags) ||
- __put_user(uoss.ss_size, &uoss_ptr->ss_size))
+ compat_stack_t old;
+ memset(&old, 0, sizeof(old));
+ old.ss_sp = ptr_to_compat(uoss.ss_sp);
+ old.ss_flags = uoss.ss_flags;
+ old.ss_size = uoss.ss_size;
+ if (copy_to_user(uoss_ptr, &old, sizeof(compat_stack_t)))
ret = -EFAULT;
}
return ret;
@@ -3277,6 +3304,21 @@ SYSCALL_DEFINE1(sigpending, old_sigset_t __user *, set)
return sys_rt_sigpending((sigset_t __user *)set, sizeof(old_sigset_t));
}
+#ifdef CONFIG_COMPAT
+COMPAT_SYSCALL_DEFINE1(sigpending, compat_old_sigset_t __user *, set32)
+{
+#ifdef __BIG_ENDIAN
+ sigset_t set;
+ int err = do_sigpending(&set, sizeof(set.sig[0]));
+ if (!err)
+ err = put_user(set.sig[0], set32);
+ return err;
+#else
+ return sys_rt_sigpending((sigset_t __user *)set32, sizeof(*set32));
+#endif
+}
+#endif
+
#endif
#ifdef __ARCH_WANT_SYS_SIGPROCMASK
diff --git a/kernel/sys.c b/kernel/sys.c
index 8a94b4eabcaa..2855ee73acd0 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -886,7 +886,7 @@ SYSCALL_DEFINE0(getegid)
return from_kgid_munged(current_user_ns(), current_egid());
}
-void do_sys_times(struct tms *tms)
+static void do_sys_times(struct tms *tms)
{
u64 tgutime, tgstime, cutime, cstime;
@@ -912,6 +912,32 @@ SYSCALL_DEFINE1(times, struct tms __user *, tbuf)
return (long) jiffies_64_to_clock_t(get_jiffies_64());
}
+#ifdef CONFIG_COMPAT
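+/* Convert a native clock_t value to the compat ABI by going through jiffies. */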
+static compat_clock_t clock_t_to_compat_clock_t(clock_t x)
+{
+ return compat_jiffies_to_clock_t(clock_t_to_jiffies(x));
+}
+
+COMPAT_SYSCALL_DEFINE1(times, struct compat_tms __user *, tbuf)
+{
+ if (tbuf) {
+ struct tms tms;
+ struct compat_tms tmp;
+
+ do_sys_times(&tms);
+ /* Convert our struct tms to the compat version. */
+ tmp.tms_utime = clock_t_to_compat_clock_t(tms.tms_utime);
+ tmp.tms_stime = clock_t_to_compat_clock_t(tms.tms_stime);
+ tmp.tms_cutime = clock_t_to_compat_clock_t(tms.tms_cutime);
+ tmp.tms_cstime = clock_t_to_compat_clock_t(tms.tms_cstime);
+ if (copy_to_user(tbuf, &tmp, sizeof(tmp)))
+ return -EFAULT;
+ }
+ force_successful_syscall_return();
+ return compat_jiffies_to_clock_t(jiffies);
+}
+#endif
+
/*
* This needs some heavy checking ...
* I just haven't the stomach for it. I also don't fully
@@ -1306,6 +1332,54 @@ SYSCALL_DEFINE2(getrlimit, unsigned int, resource, struct rlimit __user *, rlim)
return ret;
}
+#ifdef CONFIG_COMPAT
+
+COMPAT_SYSCALL_DEFINE2(setrlimit, unsigned int, resource,
+ struct compat_rlimit __user *, rlim)
+{
+ struct rlimit r;
+ struct compat_rlimit r32;
+
+ if (copy_from_user(&r32, rlim, sizeof(struct compat_rlimit)))
+ return -EFAULT;
+
+ if (r32.rlim_cur == COMPAT_RLIM_INFINITY)
+ r.rlim_cur = RLIM_INFINITY;
+ else
+ r.rlim_cur = r32.rlim_cur;
+ if (r32.rlim_max == COMPAT_RLIM_INFINITY)
+ r.rlim_max = RLIM_INFINITY;
+ else
+ r.rlim_max = r32.rlim_max;
+ return do_prlimit(current, resource, &r, NULL);
+}
+
+COMPAT_SYSCALL_DEFINE2(getrlimit, unsigned int, resource,
+ struct compat_rlimit __user *, rlim)
+{
+ struct rlimit r;
+ int ret;
+
+ ret = do_prlimit(current, resource, NULL, &r);
+ if (!ret) {
+ struct compat_rlimit r32;
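+
+		/*
+		 * Clamp values that do not fit the 32-bit ABI to the compat
+		 * infinity marker.
+		 */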
+ if (r.rlim_cur > COMPAT_RLIM_INFINITY)
+ r32.rlim_cur = COMPAT_RLIM_INFINITY;
+ else
+ r32.rlim_cur = r.rlim_cur;
+ if (r.rlim_max > COMPAT_RLIM_INFINITY)
+ r32.rlim_max = COMPAT_RLIM_INFINITY;
+ else
+ r32.rlim_max = r.rlim_max;
+
+ if (copy_to_user(rlim, &r32, sizeof(struct compat_rlimit)))
+ return -EFAULT;
+ }
+ return ret;
+}
+
+#endif
+
#ifdef __ARCH_WANT_SYS_OLD_GETRLIMIT
/*
@@ -1328,6 +1402,30 @@ SYSCALL_DEFINE2(old_getrlimit, unsigned int, resource,
return copy_to_user(rlim, &x, sizeof(x)) ? -EFAULT : 0;
}
+#ifdef CONFIG_COMPAT
+COMPAT_SYSCALL_DEFINE2(old_getrlimit, unsigned int, resource,
+ struct compat_rlimit __user *, rlim)
+{
+ struct rlimit r;
+
+ if (resource >= RLIM_NLIMITS)
+ return -EINVAL;
+
+ task_lock(current->group_leader);
+ r = current->signal->rlim[resource];
+ task_unlock(current->group_leader);
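+	/* The old ABI only has 31 bits per limit; clamp anything larger. */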
+ if (r.rlim_cur > 0x7FFFFFFF)
+ r.rlim_cur = 0x7FFFFFFF;
+ if (r.rlim_max > 0x7FFFFFFF)
+ r.rlim_max = 0x7FFFFFFF;
+
+ if (put_user(r.rlim_cur, &rlim->rlim_cur) ||
+ put_user(r.rlim_max, &rlim->rlim_max))
+ return -EFAULT;
+ return 0;
+}
+#endif
+
#endif
static inline bool rlim64_is_infinity(__u64 rlim64)
@@ -1552,7 +1650,7 @@ static void accumulate_thread_rusage(struct task_struct *t, struct rusage *r)
r->ru_oublock += task_io_get_oublock(t);
}
-static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
+void getrusage(struct task_struct *p, int who, struct rusage *r)
{
struct task_struct *t;
unsigned long flags;
@@ -1626,20 +1724,16 @@ out:
r->ru_maxrss = maxrss * (PAGE_SIZE / 1024); /* convert pages to KBs */
}
-int getrusage(struct task_struct *p, int who, struct rusage __user *ru)
+SYSCALL_DEFINE2(getrusage, int, who, struct rusage __user *, ru)
{
struct rusage r;
- k_getrusage(p, who, &r);
- return copy_to_user(ru, &r, sizeof(r)) ? -EFAULT : 0;
-}
-
-SYSCALL_DEFINE2(getrusage, int, who, struct rusage __user *, ru)
-{
if (who != RUSAGE_SELF && who != RUSAGE_CHILDREN &&
who != RUSAGE_THREAD)
return -EINVAL;
- return getrusage(current, who, ru);
+
+ getrusage(current, who, &r);
+ return copy_to_user(ru, &r, sizeof(r)) ? -EFAULT : 0;
}
#ifdef CONFIG_COMPAT
@@ -1651,7 +1745,7 @@ COMPAT_SYSCALL_DEFINE2(getrusage, int, who, struct compat_rusage __user *, ru)
who != RUSAGE_THREAD)
return -EINVAL;
- k_getrusage(current, who, &r);
+ getrusage(current, who, &r);
return put_compat_rusage(&r, ru);
}
#endif
@@ -2266,7 +2360,7 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
case PR_GET_THP_DISABLE:
if (arg2 || arg3 || arg4 || arg5)
return -EINVAL;
- error = !!(me->mm->def_flags & VM_NOHUGEPAGE);
+ error = !!test_bit(MMF_DISABLE_THP, &me->mm->flags);
break;
case PR_SET_THP_DISABLE:
if (arg3 || arg4 || arg5)
@@ -2274,9 +2368,9 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
if (down_write_killable(&me->mm->mmap_sem))
return -EINTR;
if (arg2)
- me->mm->def_flags |= VM_NOHUGEPAGE;
+ set_bit(MMF_DISABLE_THP, &me->mm->flags);
else
- me->mm->def_flags &= ~VM_NOHUGEPAGE;
+ clear_bit(MMF_DISABLE_THP, &me->mm->flags);
up_write(&me->mm->mmap_sem);
break;
case PR_MPX_ENABLE_MANAGEMENT:
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 4dfba1a76cc3..6648fbbb8157 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -174,11 +174,32 @@ extern int no_unaligned_warning;
#ifdef CONFIG_PROC_SYSCTL
-#define SYSCTL_WRITES_LEGACY -1
-#define SYSCTL_WRITES_WARN 0
-#define SYSCTL_WRITES_STRICT 1
+/**
+ * enum sysctl_writes_mode - supported sysctl write modes
+ *
+ * @SYSCTL_WRITES_LEGACY: each write syscall must fully contain the sysctl value
+ * to be written, and multiple writes on the same sysctl file descriptor
+ * will rewrite the sysctl value, regardless of file position. No warning
+ * is issued when the initial position is not 0.
+ * @SYSCTL_WRITES_WARN: same as above but warn when the initial file position is
+ * not 0.
+ * @SYSCTL_WRITES_STRICT: writes to numeric sysctl entries must always be at
+ * file position 0 and the value must be fully contained in the buffer
+ *	sent to the write syscall. If dealing with strings, respect the file
+ *	position, but restrict this to the max length of the buffer; anything
+ *	past the max length will be ignored. Multiple writes will append
+ * to the buffer.
+ *
+ * These write modes control how current file position affects the behavior of
+ * updating sysctl values through the proc interface on each write.
+ */
+enum sysctl_writes_mode {
+ SYSCTL_WRITES_LEGACY = -1,
+ SYSCTL_WRITES_WARN = 0,
+ SYSCTL_WRITES_STRICT = 1,
+};
-static int sysctl_writes_strict = SYSCTL_WRITES_STRICT;
+static enum sysctl_writes_mode sysctl_writes_strict = SYSCTL_WRITES_STRICT;
static int proc_do_cad_pid(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos);
@@ -880,6 +901,14 @@ static struct ctl_table kern_table[] = {
#endif
},
{
+ .procname = "watchdog_cpumask",
+ .data = &watchdog_cpumask_bits,
+ .maxlen = NR_CPUS,
+ .mode = 0644,
+ .proc_handler = proc_watchdog_cpumask,
+ },
+#ifdef CONFIG_SOFTLOCKUP_DETECTOR
+ {
.procname = "soft_watchdog",
.data = &soft_watchdog_enabled,
.maxlen = sizeof (int),
@@ -889,13 +918,6 @@ static struct ctl_table kern_table[] = {
.extra2 = &one,
},
{
- .procname = "watchdog_cpumask",
- .data = &watchdog_cpumask_bits,
- .maxlen = NR_CPUS,
- .mode = 0644,
- .proc_handler = proc_watchdog_cpumask,
- },
- {
.procname = "softlockup_panic",
.data = &softlockup_panic,
.maxlen = sizeof(int),
@@ -904,27 +926,29 @@ static struct ctl_table kern_table[] = {
.extra1 = &zero,
.extra2 = &one,
},
-#ifdef CONFIG_HARDLOCKUP_DETECTOR
+#ifdef CONFIG_SMP
{
- .procname = "hardlockup_panic",
- .data = &hardlockup_panic,
+ .procname = "softlockup_all_cpu_backtrace",
+ .data = &sysctl_softlockup_all_cpu_backtrace,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
.extra1 = &zero,
.extra2 = &one,
},
+#endif /* CONFIG_SMP */
#endif
-#ifdef CONFIG_SMP
+#ifdef CONFIG_HARDLOCKUP_DETECTOR
{
- .procname = "softlockup_all_cpu_backtrace",
- .data = &sysctl_softlockup_all_cpu_backtrace,
+ .procname = "hardlockup_panic",
+ .data = &hardlockup_panic,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
.extra1 = &zero,
.extra2 = &one,
},
+#ifdef CONFIG_SMP
{
.procname = "hardlockup_all_cpu_backtrace",
.data = &sysctl_hardlockup_all_cpu_backtrace,
@@ -936,6 +960,8 @@ static struct ctl_table kern_table[] = {
},
#endif /* CONFIG_SMP */
#endif
+#endif
+
#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86)
{
.procname = "unknown_nmi_panic",
@@ -1950,6 +1976,32 @@ static void warn_sysctl_write(struct ctl_table *table)
}
/**
+ * proc_first_pos_non_zero_ignore - check if first position is allowed
+ * @ppos: file position
+ * @table: the sysctl table
+ *
+ * Returns true if the first position is non-zero and the sysctl_writes_strict
+ * mode indicates this is not allowed for numeric input types. String proc
+ * handlers can ignore the return value.
+ */
+static bool proc_first_pos_non_zero_ignore(loff_t *ppos,
+ struct ctl_table *table)
+{
+ if (!*ppos)
+ return false;
+
+ switch (sysctl_writes_strict) {
+ case SYSCTL_WRITES_STRICT:
+ return true;
+ case SYSCTL_WRITES_WARN:
+ warn_sysctl_write(table);
+ return false;
+ default:
+ return false;
+ }
+}
+
+/**
* proc_dostring - read a string sysctl
* @table: the sysctl table
* @write: %TRUE if this is a write to the sysctl file
@@ -1969,8 +2021,8 @@ static void warn_sysctl_write(struct ctl_table *table)
int proc_dostring(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos)
{
- if (write && *ppos && sysctl_writes_strict == SYSCTL_WRITES_WARN)
- warn_sysctl_write(table);
+ if (write)
+ proc_first_pos_non_zero_ignore(ppos, table);
return _proc_do_string((char *)(table->data), table->maxlen, write,
(char __user *)buffer, lenp, ppos);
@@ -2128,19 +2180,18 @@ static int do_proc_dointvec_conv(bool *negp, unsigned long *lvalp,
return 0;
}
-static int do_proc_douintvec_conv(bool *negp, unsigned long *lvalp,
- int *valp,
- int write, void *data)
+static int do_proc_douintvec_conv(unsigned long *lvalp,
+ unsigned int *valp,
+ int write, void *data)
{
if (write) {
-		if (*negp)
-			return -EINVAL;
if (*lvalp > UINT_MAX)
return -EINVAL;
*valp = *lvalp;
} else {
unsigned int val = *valp;
- *negp = false;
*lvalp = (unsigned long)val;
}
return 0;
@@ -2172,17 +2223,8 @@ static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table,
conv = do_proc_dointvec_conv;
if (write) {
- if (*ppos) {
- switch (sysctl_writes_strict) {
- case SYSCTL_WRITES_STRICT:
- goto out;
- case SYSCTL_WRITES_WARN:
- warn_sysctl_write(table);
- break;
- default:
- break;
- }
- }
+ if (proc_first_pos_non_zero_ignore(ppos, table))
+ goto out;
if (left > PAGE_SIZE - 1)
left = PAGE_SIZE - 1;
@@ -2249,6 +2291,146 @@ static int do_proc_dointvec(struct ctl_table *table, int write,
buffer, lenp, ppos, conv, data);
}
+static int do_proc_douintvec_w(unsigned int *tbl_data,
+ struct ctl_table *table,
+ void __user *buffer,
+ size_t *lenp, loff_t *ppos,
+ int (*conv)(unsigned long *lvalp,
+ unsigned int *valp,
+ int write, void *data),
+ void *data)
+{
+ unsigned long lval;
+ int err = 0;
+ size_t left;
+ bool neg;
+ char *kbuf = NULL, *p;
+
+ left = *lenp;
+
+ if (proc_first_pos_non_zero_ignore(ppos, table))
+ goto bail_early;
+
+ if (left > PAGE_SIZE - 1)
+ left = PAGE_SIZE - 1;
+
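+	/* Copy at most one page of user input, NUL-terminated for parsing. */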
+ p = kbuf = memdup_user_nul(buffer, left);
+ if (IS_ERR(kbuf))
+ return -EINVAL;
+
+ left -= proc_skip_spaces(&p);
+ if (!left) {
+ err = -EINVAL;
+ goto out_free;
+ }
+
+ err = proc_get_long(&p, &left, &lval, &neg,
+ proc_wspace_sep,
+ sizeof(proc_wspace_sep), NULL);
+ if (err || neg) {
+ err = -EINVAL;
+ goto out_free;
+ }
+
+ if (conv(&lval, tbl_data, 1, data)) {
+ err = -EINVAL;
+ goto out_free;
+ }
+
+ if (!err && left)
+ left -= proc_skip_spaces(&p);
+
+out_free:
+ kfree(kbuf);
+ if (err)
+ return -EINVAL;
+
+ return 0;
+
+ /* This is in keeping with old __do_proc_dointvec() */
+bail_early:
+ *ppos += *lenp;
+ return err;
+}
+
+static int do_proc_douintvec_r(unsigned int *tbl_data, void __user *buffer,
+ size_t *lenp, loff_t *ppos,
+ int (*conv)(unsigned long *lvalp,
+ unsigned int *valp,
+ int write, void *data),
+ void *data)
+{
+ unsigned long lval;
+ int err = 0;
+ size_t left;
+
+ left = *lenp;
+
+ if (conv(&lval, tbl_data, 0, data)) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ err = proc_put_long(&buffer, &left, lval, false);
+ if (err || !left)
+ goto out;
+
+ err = proc_put_char(&buffer, &left, '\n');
+
+out:
+ *lenp -= left;
+ *ppos += *lenp;
+
+ return err;
+}
+
+static int __do_proc_douintvec(void *tbl_data, struct ctl_table *table,
+ int write, void __user *buffer,
+ size_t *lenp, loff_t *ppos,
+ int (*conv)(unsigned long *lvalp,
+ unsigned int *valp,
+ int write, void *data),
+ void *data)
+{
+ unsigned int *i, vleft;
+
+ if (!tbl_data || !table->maxlen || !*lenp || (*ppos && !write)) {
+ *lenp = 0;
+ return 0;
+ }
+
+ i = (unsigned int *) tbl_data;
+ vleft = table->maxlen / sizeof(*i);
+
+ /*
+ * Arrays are not supported, keep this simple. *Do not* add
+ * support for them.
+ */
+ if (vleft != 1) {
+ *lenp = 0;
+ return -EINVAL;
+ }
+
+ if (!conv)
+ conv = do_proc_douintvec_conv;
+
+ if (write)
+ return do_proc_douintvec_w(i, table, buffer, lenp, ppos,
+ conv, data);
+ return do_proc_douintvec_r(i, buffer, lenp, ppos, conv, data);
+}
+
+static int do_proc_douintvec(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos,
+ int (*conv)(unsigned long *lvalp,
+ unsigned int *valp,
+ int write, void *data),
+ void *data)
+{
+ return __do_proc_douintvec(table->data, table, write,
+ buffer, lenp, ppos, conv, data);
+}
+
/**
* proc_dointvec - read a vector of integers
* @table: the sysctl table
@@ -2284,8 +2466,8 @@ int proc_dointvec(struct ctl_table *table, int write,
int proc_douintvec(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos)
{
- return do_proc_dointvec(table, write, buffer, lenp, ppos,
- do_proc_douintvec_conv, NULL);
+ return do_proc_douintvec(table, write, buffer, lenp, ppos,
+ do_proc_douintvec_conv, NULL);
}
/*
@@ -2390,6 +2572,65 @@ int proc_dointvec_minmax(struct ctl_table *table, int write,
do_proc_dointvec_minmax_conv, &param);
}
+struct do_proc_douintvec_minmax_conv_param {
+ unsigned int *min;
+ unsigned int *max;
+};
+
+static int do_proc_douintvec_minmax_conv(unsigned long *lvalp,
+ unsigned int *valp,
+ int write, void *data)
+{
+ struct do_proc_douintvec_minmax_conv_param *param = data;
+
+ if (write) {
+ unsigned int val = *lvalp;
+
+ if ((param->min && *param->min > val) ||
+ (param->max && *param->max < val))
+ return -ERANGE;
+
+ if (*lvalp > UINT_MAX)
+ return -EINVAL;
+ *valp = val;
+ } else {
+ unsigned int val = *valp;
+ *lvalp = (unsigned long) val;
+ }
+
+ return 0;
+}
+
+/**
+ * proc_douintvec_minmax - read a vector of unsigned ints with min/max values
+ * @table: the sysctl table
+ * @write: %TRUE if this is a write to the sysctl file
+ * @buffer: the user buffer
+ * @lenp: the size of the user buffer
+ * @ppos: file position
+ *
+ * Reads/writes up to table->maxlen/sizeof(unsigned int) unsigned integer
+ * values from/to the user buffer, treated as an ASCII string. Negative
+ * strings are not allowed.
+ *
+ * This routine will ensure the values are within the range specified by
+ * table->extra1 (min) and table->extra2 (max). There is a final sanity
+ * check for UINT_MAX to avoid having to support wrap around uses from
+ * userspace.
+ *
+ * Returns 0 on success.
+ */
+int proc_douintvec_minmax(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ struct do_proc_douintvec_minmax_conv_param param = {
+ .min = (unsigned int *) table->extra1,
+ .max = (unsigned int *) table->extra2,
+ };
+ return do_proc_douintvec(table, write, buffer, lenp, ppos,
+ do_proc_douintvec_minmax_conv, &param);
+}
+
static void validate_coredump_safety(void)
{
#ifdef CONFIG_COREDUMP
@@ -2447,17 +2688,8 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int
left = *lenp;
if (write) {
- if (*ppos) {
- switch (sysctl_writes_strict) {
- case SYSCTL_WRITES_STRICT:
- goto out;
- case SYSCTL_WRITES_WARN:
- warn_sysctl_write(table);
- break;
- default:
- break;
- }
- }
+ if (proc_first_pos_non_zero_ignore(ppos, table))
+ goto out;
if (left > PAGE_SIZE - 1)
left = PAGE_SIZE - 1;
@@ -2898,6 +3130,12 @@ int proc_dointvec_minmax(struct ctl_table *table, int write,
return -ENOSYS;
}
+int proc_douintvec_minmax(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ return -ENOSYS;
+}
+
int proc_dointvec_jiffies(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos)
{
@@ -2940,6 +3178,7 @@ EXPORT_SYMBOL(proc_dointvec);
EXPORT_SYMBOL(proc_douintvec);
EXPORT_SYMBOL(proc_dointvec_jiffies);
EXPORT_SYMBOL(proc_dointvec_minmax);
+EXPORT_SYMBOL_GPL(proc_douintvec_minmax);
EXPORT_SYMBOL(proc_dointvec_userhz_jiffies);
EXPORT_SYMBOL(proc_dointvec_ms_jiffies);
EXPORT_SYMBOL(proc_dostring);
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c
index 939a158eab11..02e1859f2ca8 100644
--- a/kernel/sysctl_binary.c
+++ b/kernel/sysctl_binary.c
@@ -1346,7 +1346,7 @@ static void deprecated_sysctl_warning(const int *name, int nlen)
* CTL_KERN/KERN_VERSION is used by older glibc and cannot
* ever go away.
*/
- if (name[0] == CTL_KERN && name[1] == KERN_VERSION)
+ if (nlen >= 2 && name[0] == CTL_KERN && name[1] == KERN_VERSION)
return;
if (printk_ratelimit()) {
diff --git a/kernel/task_work.c b/kernel/task_work.c
index d513051fcca2..836a72a66fba 100644
--- a/kernel/task_work.c
+++ b/kernel/task_work.c
@@ -96,20 +96,16 @@ void task_work_run(void)
* work->func() can do task_work_add(), do not set
* work_exited unless the list is empty.
*/
+ raw_spin_lock_irq(&task->pi_lock);
do {
work = READ_ONCE(task->task_works);
head = !work && (task->flags & PF_EXITING) ?
&work_exited : NULL;
} while (cmpxchg(&task->task_works, work, head) != work);
+ raw_spin_unlock_irq(&task->pi_lock);
if (!work)
break;
- /*
- * Synchronize with task_work_cancel(). It can't remove
- * the first entry == work, cmpxchg(task_works) should
- * fail, but it can play with *work and other entries.
- */
- raw_spin_unlock_wait(&task->pi_lock);
do {
next = work->next;
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index c991cf212c6d..0b8ff7d257ea 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -712,14 +712,14 @@ static int alarmtimer_do_nsleep(struct alarm *alarm, ktime_t absexp,
alarmtimer_freezerset(absexp, type);
restart = &current->restart_block;
if (restart->nanosleep.type != TT_NONE) {
- struct timespec rmt;
+ struct timespec64 rmt;
ktime_t rem;
rem = ktime_sub(absexp, alarm_bases[type].gettime());
if (rem <= 0)
return 0;
- rmt = ktime_to_timespec(rem);
+ rmt = ktime_to_timespec64(rem);
return nanosleep_copyout(restart, &rmt);
}
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 81da124f1115..88f75f92ef36 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -1440,17 +1440,17 @@ void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, struct task_struct *task)
}
EXPORT_SYMBOL_GPL(hrtimer_init_sleeper);
-int nanosleep_copyout(struct restart_block *restart, struct timespec *ts)
+int nanosleep_copyout(struct restart_block *restart, struct timespec64 *ts)
{
switch(restart->nanosleep.type) {
#ifdef CONFIG_COMPAT
case TT_COMPAT:
- if (compat_put_timespec(ts, restart->nanosleep.compat_rmtp))
+ if (compat_put_timespec64(ts, restart->nanosleep.compat_rmtp))
return -EFAULT;
break;
#endif
case TT_NATIVE:
- if (copy_to_user(restart->nanosleep.rmtp, ts, sizeof(struct timespec)))
+ if (put_timespec64(ts, restart->nanosleep.rmtp))
return -EFAULT;
break;
default:
@@ -1485,11 +1485,11 @@ static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mod
restart = &current->restart_block;
if (restart->nanosleep.type != TT_NONE) {
ktime_t rem = hrtimer_expires_remaining(&t->timer);
- struct timespec rmt;
+ struct timespec64 rmt;
if (rem <= 0)
return 0;
- rmt = ktime_to_timespec(rem);
+ rmt = ktime_to_timespec64(rem);
return nanosleep_copyout(restart, &rmt);
}
@@ -1546,19 +1546,17 @@ out:
SYSCALL_DEFINE2(nanosleep, struct timespec __user *, rqtp,
struct timespec __user *, rmtp)
{
- struct timespec64 tu64;
- struct timespec tu;
+ struct timespec64 tu;
- if (copy_from_user(&tu, rqtp, sizeof(tu)))
+ if (get_timespec64(&tu, rqtp))
return -EFAULT;
- tu64 = timespec_to_timespec64(tu);
- if (!timespec64_valid(&tu64))
+ if (!timespec64_valid(&tu))
return -EINVAL;
current->restart_block.nanosleep.type = rmtp ? TT_NATIVE : TT_NONE;
current->restart_block.nanosleep.rmtp = rmtp;
- return hrtimer_nanosleep(&tu64, HRTIMER_MODE_REL, CLOCK_MONOTONIC);
+ return hrtimer_nanosleep(&tu, HRTIMER_MODE_REL, CLOCK_MONOTONIC);
}
#ifdef CONFIG_COMPAT
@@ -1566,19 +1564,17 @@ SYSCALL_DEFINE2(nanosleep, struct timespec __user *, rqtp,
COMPAT_SYSCALL_DEFINE2(nanosleep, struct compat_timespec __user *, rqtp,
struct compat_timespec __user *, rmtp)
{
- struct timespec64 tu64;
- struct timespec tu;
+ struct timespec64 tu;
- if (compat_get_timespec(&tu, rqtp))
+ if (compat_get_timespec64(&tu, rqtp))
return -EFAULT;
- tu64 = timespec_to_timespec64(tu);
- if (!timespec64_valid(&tu64))
+ if (!timespec64_valid(&tu))
return -EINVAL;
current->restart_block.nanosleep.type = rmtp ? TT_COMPAT : TT_NONE;
current->restart_block.nanosleep.compat_rmtp = rmtp;
- return hrtimer_nanosleep(&tu64, HRTIMER_MODE_REL, CLOCK_MONOTONIC);
+ return hrtimer_nanosleep(&tu, HRTIMER_MODE_REL, CLOCK_MONOTONIC);
}
#endif
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index 60cb24ac9ebc..a3bd5dbe0dc4 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -1318,12 +1318,8 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags,
*/
restart = &current->restart_block;
restart->nanosleep.expires = expires;
- if (restart->nanosleep.type != TT_NONE) {
- struct timespec ts;
-
- ts = timespec64_to_timespec(it.it_value);
- error = nanosleep_copyout(restart, &ts);
- }
+ if (restart->nanosleep.type != TT_NONE)
+ error = nanosleep_copyout(restart, &it.it_value);
}
return error;
diff --git a/kernel/time/posix-stubs.c b/kernel/time/posix-stubs.c
index 38f3b20efa29..06f34feb635e 100644
--- a/kernel/time/posix-stubs.c
+++ b/kernel/time/posix-stubs.c
@@ -41,12 +41,6 @@ SYS_NI(setitimer);
#ifdef __ARCH_WANT_SYS_ALARM
SYS_NI(alarm);
#endif
-COMPAT_SYS_NI(timer_create);
-COMPAT_SYS_NI(clock_adjtime);
-COMPAT_SYS_NI(timer_settime);
-COMPAT_SYS_NI(timer_gettime);
-COMPAT_SYS_NI(getitimer);
-COMPAT_SYS_NI(setitimer);
/*
* We preserve minimal support for CLOCK_REALTIME and CLOCK_MONOTONIC
@@ -57,40 +51,52 @@ COMPAT_SYS_NI(setitimer);
SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock,
const struct timespec __user *, tp)
{
- struct timespec64 new_tp64;
- struct timespec new_tp;
+ struct timespec64 new_tp;
if (which_clock != CLOCK_REALTIME)
return -EINVAL;
- if (copy_from_user(&new_tp, tp, sizeof (*tp)))
+ if (get_timespec64(&new_tp, tp))
return -EFAULT;
- new_tp64 = timespec_to_timespec64(new_tp);
- return do_sys_settimeofday64(&new_tp64, NULL);
+ return do_sys_settimeofday64(&new_tp, NULL);
}
-SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock,
- struct timespec __user *,tp)
+int do_clock_gettime(clockid_t which_clock, struct timespec64 *tp)
{
- struct timespec64 kernel_tp64;
- struct timespec kernel_tp;
-
switch (which_clock) {
- case CLOCK_REALTIME: ktime_get_real_ts64(&kernel_tp64); break;
- case CLOCK_MONOTONIC: ktime_get_ts64(&kernel_tp64); break;
- case CLOCK_BOOTTIME: get_monotonic_boottime64(&kernel_tp64); break;
- default: return -EINVAL;
+ case CLOCK_REALTIME:
+ ktime_get_real_ts64(tp);
+ break;
+ case CLOCK_MONOTONIC:
+ ktime_get_ts64(tp);
+ break;
+ case CLOCK_BOOTTIME:
+ get_monotonic_boottime64(tp);
+ break;
+ default:
+ return -EINVAL;
}
- kernel_tp = timespec64_to_timespec(kernel_tp64);
- if (copy_to_user(tp, &kernel_tp, sizeof (kernel_tp)))
+ return 0;
+}
+SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock,
+ struct timespec __user *, tp)
+{
+ int ret;
+ struct timespec64 kernel_tp;
+
+ ret = do_clock_gettime(which_clock, &kernel_tp);
+ if (ret)
+ return ret;
+
+ if (put_timespec64(&kernel_tp, tp))
return -EFAULT;
return 0;
}
SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock, struct timespec __user *, tp)
{
- struct timespec rtn_tp = {
+ struct timespec64 rtn_tp = {
.tv_sec = 0,
.tv_nsec = hrtimer_resolution,
};
@@ -99,7 +105,7 @@ SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock, struct timespec __us
case CLOCK_REALTIME:
case CLOCK_MONOTONIC:
case CLOCK_BOOTTIME:
- if (copy_to_user(tp, &rtn_tp, sizeof(rtn_tp)))
+ if (put_timespec64(&rtn_tp, tp))
return -EFAULT;
return 0;
default:
@@ -138,44 +144,45 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags,
}
#ifdef CONFIG_COMPAT
+COMPAT_SYS_NI(timer_create);
+COMPAT_SYS_NI(clock_adjtime);
+COMPAT_SYS_NI(timer_settime);
+COMPAT_SYS_NI(timer_gettime);
+COMPAT_SYS_NI(getitimer);
+COMPAT_SYS_NI(setitimer);
+
COMPAT_SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock,
struct compat_timespec __user *, tp)
{
- struct timespec64 new_tp64;
- struct timespec new_tp;
+ struct timespec64 new_tp;
if (which_clock != CLOCK_REALTIME)
return -EINVAL;
- if (compat_get_timespec(&new_tp, tp))
+ if (compat_get_timespec64(&new_tp, tp))
return -EFAULT;
- new_tp64 = timespec_to_timespec64(new_tp);
- return do_sys_settimeofday64(&new_tp64, NULL);
+ return do_sys_settimeofday64(&new_tp, NULL);
}
-COMPAT_SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock,
- struct compat_timespec __user *,tp)
+COMPAT_SYSCALL_DEFINE2(clock_gettime, clockid_t, which_clock,
+ struct compat_timespec __user *, tp)
{
- struct timespec64 kernel_tp64;
- struct timespec kernel_tp;
+ int ret;
+ struct timespec64 kernel_tp;
- switch (which_clock) {
- case CLOCK_REALTIME: ktime_get_real_ts64(&kernel_tp64); break;
- case CLOCK_MONOTONIC: ktime_get_ts64(&kernel_tp64); break;
- case CLOCK_BOOTTIME: get_monotonic_boottime64(&kernel_tp64); break;
- default: return -EINVAL;
- }
+ ret = do_clock_gettime(which_clock, &kernel_tp);
+ if (ret)
+ return ret;
- kernel_tp = timespec64_to_timespec(kernel_tp64);
- if (compat_put_timespec(&kernel_tp, tp))
+ if (compat_put_timespec64(&kernel_tp, tp))
return -EFAULT;
return 0;
}
-COMPAT_SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock,
+COMPAT_SYSCALL_DEFINE2(clock_getres, clockid_t, which_clock,
struct compat_timespec __user *, tp)
{
- struct timespec rtn_tp = {
+ struct timespec64 rtn_tp = {
.tv_sec = 0,
.tv_nsec = hrtimer_resolution,
};
@@ -184,13 +191,14 @@ COMPAT_SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock,
case CLOCK_REALTIME:
case CLOCK_MONOTONIC:
case CLOCK_BOOTTIME:
- if (compat_put_timespec(&rtn_tp, tp))
+ if (compat_put_timespec64(&rtn_tp, tp))
return -EFAULT;
return 0;
default:
return -EINVAL;
}
}
+
COMPAT_SYSCALL_DEFINE4(clock_nanosleep, clockid_t, which_clock, int, flags,
struct compat_timespec __user *, rqtp,
struct compat_timespec __user *, rmtp)
diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
index 82d67be7d9d1..13d6881f908b 100644
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -739,13 +739,11 @@ static int do_timer_gettime(timer_t timer_id, struct itimerspec64 *setting)
SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id,
struct itimerspec __user *, setting)
{
- struct itimerspec64 cur_setting64;
+ struct itimerspec64 cur_setting;
- int ret = do_timer_gettime(timer_id, &cur_setting64);
+ int ret = do_timer_gettime(timer_id, &cur_setting);
if (!ret) {
- struct itimerspec cur_setting;
- cur_setting = itimerspec64_to_itimerspec(&cur_setting64);
- if (copy_to_user(setting, &cur_setting, sizeof (cur_setting)))
+ if (put_itimerspec64(&cur_setting, setting))
ret = -EFAULT;
}
return ret;
@@ -755,13 +753,11 @@ SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id,
COMPAT_SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id,
struct compat_itimerspec __user *, setting)
{
- struct itimerspec64 cur_setting64;
+ struct itimerspec64 cur_setting;
- int ret = do_timer_gettime(timer_id, &cur_setting64);
+ int ret = do_timer_gettime(timer_id, &cur_setting);
if (!ret) {
- struct itimerspec cur_setting;
- cur_setting = itimerspec64_to_itimerspec(&cur_setting64);
- if (put_compat_itimerspec(setting, &cur_setting))
+ if (put_compat_itimerspec64(&cur_setting, setting))
ret = -EFAULT;
}
return ret;
@@ -907,23 +903,19 @@ SYSCALL_DEFINE4(timer_settime, timer_t, timer_id, int, flags,
const struct itimerspec __user *, new_setting,
struct itimerspec __user *, old_setting)
{
- struct itimerspec64 new_spec64, old_spec64;
- struct itimerspec64 *rtn = old_setting ? &old_spec64 : NULL;
- struct itimerspec new_spec;
+ struct itimerspec64 new_spec, old_spec;
+ struct itimerspec64 *rtn = old_setting ? &old_spec : NULL;
int error = 0;
if (!new_setting)
return -EINVAL;
- if (copy_from_user(&new_spec, new_setting, sizeof (new_spec)))
+ if (get_itimerspec64(&new_spec, new_setting))
return -EFAULT;
- new_spec64 = itimerspec_to_itimerspec64(&new_spec);
- error = do_timer_settime(timer_id, flags, &new_spec64, rtn);
+ error = do_timer_settime(timer_id, flags, &new_spec, rtn);
if (!error && old_setting) {
- struct itimerspec old_spec;
- old_spec = itimerspec64_to_itimerspec(&old_spec64);
- if (copy_to_user(old_setting, &old_spec, sizeof (old_spec)))
+ if (put_itimerspec64(&old_spec, old_setting))
error = -EFAULT;
}
return error;
@@ -934,22 +926,18 @@ COMPAT_SYSCALL_DEFINE4(timer_settime, timer_t, timer_id, int, flags,
struct compat_itimerspec __user *, new,
struct compat_itimerspec __user *, old)
{
- struct itimerspec64 new_spec64, old_spec64;
- struct itimerspec64 *rtn = old ? &old_spec64 : NULL;
- struct itimerspec new_spec;
+ struct itimerspec64 new_spec, old_spec;
+ struct itimerspec64 *rtn = old ? &old_spec : NULL;
int error = 0;
if (!new)
return -EINVAL;
- if (get_compat_itimerspec(&new_spec, new))
+ if (get_compat_itimerspec64(&new_spec, new))
return -EFAULT;
- new_spec64 = itimerspec_to_itimerspec64(&new_spec);
- error = do_timer_settime(timer_id, flags, &new_spec64, rtn);
+ error = do_timer_settime(timer_id, flags, &new_spec, rtn);
if (!error && old) {
- struct itimerspec old_spec;
- old_spec = itimerspec64_to_itimerspec(&old_spec64);
- if (put_compat_itimerspec(old, &old_spec))
+ if (put_compat_itimerspec64(&old_spec, old))
error = -EFAULT;
}
return error;
@@ -1049,34 +1037,30 @@ SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock,
const struct timespec __user *, tp)
{
const struct k_clock *kc = clockid_to_kclock(which_clock);
- struct timespec64 new_tp64;
- struct timespec new_tp;
+ struct timespec64 new_tp;
if (!kc || !kc->clock_set)
return -EINVAL;
- if (copy_from_user(&new_tp, tp, sizeof (*tp)))
+ if (get_timespec64(&new_tp, tp))
return -EFAULT;
- new_tp64 = timespec_to_timespec64(new_tp);
- return kc->clock_set(which_clock, &new_tp64);
+ return kc->clock_set(which_clock, &new_tp);
}
SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock,
struct timespec __user *,tp)
{
const struct k_clock *kc = clockid_to_kclock(which_clock);
- struct timespec64 kernel_tp64;
- struct timespec kernel_tp;
+ struct timespec64 kernel_tp;
int error;
if (!kc)
return -EINVAL;
- error = kc->clock_get(which_clock, &kernel_tp64);
- kernel_tp = timespec64_to_timespec(kernel_tp64);
+ error = kc->clock_get(which_clock, &kernel_tp);
- if (!error && copy_to_user(tp, &kernel_tp, sizeof (kernel_tp)))
+ if (!error && put_timespec64(&kernel_tp, tp))
error = -EFAULT;
return error;
@@ -1109,17 +1093,15 @@ SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock,
struct timespec __user *, tp)
{
const struct k_clock *kc = clockid_to_kclock(which_clock);
- struct timespec64 rtn_tp64;
- struct timespec rtn_tp;
+ struct timespec64 rtn_tp;
int error;
if (!kc)
return -EINVAL;
- error = kc->clock_getres(which_clock, &rtn_tp64);
- rtn_tp = timespec64_to_timespec(rtn_tp64);
+ error = kc->clock_getres(which_clock, &rtn_tp);
- if (!error && tp && copy_to_user(tp, &rtn_tp, sizeof (rtn_tp)))
+ if (!error && tp && put_timespec64(&rtn_tp, tp))
error = -EFAULT;
return error;
@@ -1131,38 +1113,33 @@ COMPAT_SYSCALL_DEFINE2(clock_settime, clockid_t, which_clock,
struct compat_timespec __user *, tp)
{
const struct k_clock *kc = clockid_to_kclock(which_clock);
- struct timespec64 new_tp64;
- struct timespec new_tp;
+ struct timespec64 ts;
if (!kc || !kc->clock_set)
return -EINVAL;
- if (compat_get_timespec(&new_tp, tp))
+ if (compat_get_timespec64(&ts, tp))
return -EFAULT;
- new_tp64 = timespec_to_timespec64(new_tp);
-
- return kc->clock_set(which_clock, &new_tp64);
+ return kc->clock_set(which_clock, &ts);
}
COMPAT_SYSCALL_DEFINE2(clock_gettime, clockid_t, which_clock,
struct compat_timespec __user *, tp)
{
const struct k_clock *kc = clockid_to_kclock(which_clock);
- struct timespec64 kernel_tp64;
- struct timespec kernel_tp;
- int error;
+ struct timespec64 ts;
+ int err;
if (!kc)
return -EINVAL;
- error = kc->clock_get(which_clock, &kernel_tp64);
- kernel_tp = timespec64_to_timespec(kernel_tp64);
+ err = kc->clock_get(which_clock, &ts);
- if (!error && compat_put_timespec(&kernel_tp, tp))
- error = -EFAULT;
+ if (!err && compat_put_timespec64(&ts, tp))
+ err = -EFAULT;
- return error;
+ return err;
}
COMPAT_SYSCALL_DEFINE2(clock_adjtime, clockid_t, which_clock,
@@ -1193,21 +1170,19 @@ COMPAT_SYSCALL_DEFINE2(clock_getres, clockid_t, which_clock,
struct compat_timespec __user *, tp)
{
const struct k_clock *kc = clockid_to_kclock(which_clock);
- struct timespec64 rtn_tp64;
- struct timespec rtn_tp;
- int error;
+ struct timespec64 ts;
+ int err;
if (!kc)
return -EINVAL;
- error = kc->clock_getres(which_clock, &rtn_tp64);
- rtn_tp = timespec64_to_timespec(rtn_tp64);
-
- if (!error && tp && compat_put_timespec(&rtn_tp, tp))
- error = -EFAULT;
+ err = kc->clock_getres(which_clock, &ts);
+ if (!err && tp && compat_put_timespec64(&ts, tp))
+ return -EFAULT;
- return error;
+ return err;
}
+
#endif
/*
@@ -1226,26 +1201,24 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags,
struct timespec __user *, rmtp)
{
const struct k_clock *kc = clockid_to_kclock(which_clock);
- struct timespec64 t64;
- struct timespec t;
+ struct timespec64 t;
if (!kc)
return -EINVAL;
if (!kc->nsleep)
return -ENANOSLEEP_NOTSUP;
- if (copy_from_user(&t, rqtp, sizeof (struct timespec)))
+ if (get_timespec64(&t, rqtp))
return -EFAULT;
- t64 = timespec_to_timespec64(t);
- if (!timespec64_valid(&t64))
+ if (!timespec64_valid(&t))
return -EINVAL;
if (flags & TIMER_ABSTIME)
rmtp = NULL;
current->restart_block.nanosleep.type = rmtp ? TT_NATIVE : TT_NONE;
current->restart_block.nanosleep.rmtp = rmtp;
- return kc->nsleep(which_clock, flags, &t64);
+ return kc->nsleep(which_clock, flags, &t);
}
#ifdef CONFIG_COMPAT
@@ -1254,26 +1227,24 @@ COMPAT_SYSCALL_DEFINE4(clock_nanosleep, clockid_t, which_clock, int, flags,
struct compat_timespec __user *, rmtp)
{
const struct k_clock *kc = clockid_to_kclock(which_clock);
- struct timespec64 t64;
- struct timespec t;
+ struct timespec64 t;
if (!kc)
return -EINVAL;
if (!kc->nsleep)
return -ENANOSLEEP_NOTSUP;
- if (compat_get_timespec(&t, rqtp))
+ if (compat_get_timespec64(&t, rqtp))
return -EFAULT;
- t64 = timespec_to_timespec64(t);
- if (!timespec64_valid(&t64))
+ if (!timespec64_valid(&t))
return -EINVAL;
if (flags & TIMER_ABSTIME)
rmtp = NULL;
current->restart_block.nanosleep.type = rmtp ? TT_COMPAT : TT_NONE;
current->restart_block.nanosleep.compat_rmtp = rmtp;
- return kc->nsleep(which_clock, flags, &t64);
+ return kc->nsleep(which_clock, flags, &t);
}
#endif
diff --git a/kernel/time/time.c b/kernel/time/time.c
index 7c89e437c4d7..44a8c1402133 100644
--- a/kernel/time/time.c
+++ b/kernel/time/time.c
@@ -890,3 +890,61 @@ struct timespec64 timespec64_add_safe(const struct timespec64 lhs,
return res;
}
+
+int get_timespec64(struct timespec64 *ts,
+ const struct timespec __user *uts)
+{
+ struct timespec kts;
+ int ret;
+
+ ret = copy_from_user(&kts, uts, sizeof(kts));
+ if (ret)
+ return -EFAULT;
+
+ ts->tv_sec = kts.tv_sec;
+ ts->tv_nsec = kts.tv_nsec;
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(get_timespec64);
+
+int put_timespec64(const struct timespec64 *ts,
+ struct timespec __user *uts)
+{
+ struct timespec kts = {
+ .tv_sec = ts->tv_sec,
+ .tv_nsec = ts->tv_nsec
+ };
+ return copy_to_user(uts, &kts, sizeof(kts)) ? -EFAULT : 0;
+}
+EXPORT_SYMBOL_GPL(put_timespec64);
+
+int get_itimerspec64(struct itimerspec64 *it,
+ const struct itimerspec __user *uit)
+{
+ int ret;
+
+ ret = get_timespec64(&it->it_interval, &uit->it_interval);
+ if (ret)
+ return ret;
+
+ ret = get_timespec64(&it->it_value, &uit->it_value);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(get_itimerspec64);
+
+int put_itimerspec64(const struct itimerspec64 *it,
+ struct itimerspec __user *uit)
+{
+ int ret;
+
+ ret = put_timespec64(&it->it_interval, &uit->it_interval);
+ if (ret)
+ return ret;
+
+ ret = put_timespec64(&it->it_value, &uit->it_value);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(put_itimerspec64);
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index cedafa008de5..7e7e61c00d61 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -637,9 +637,7 @@ static inline void tk_update_ktime_data(struct timekeeper *tk)
tk->ktime_sec = seconds;
/* Update the monotonic raw base */
- seconds = tk->raw_sec;
- nsec = (u32)(tk->tkr_raw.xtime_nsec >> tk->tkr_raw.shift);
- tk->tkr_raw.base = ns_to_ktime(seconds * NSEC_PER_SEC + nsec);
+ tk->tkr_raw.base = ns_to_ktime(tk->raw_sec * NSEC_PER_SEC);
}
/* must hold timekeeper_lock */
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 71ce3f4eead3..f2674a056c26 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -203,6 +203,7 @@ struct timer_base {
bool migration_enabled;
bool nohz_active;
bool is_idle;
+ bool must_forward_clk;
DECLARE_BITMAP(pending_map, WHEEL_SIZE);
struct hlist_head vectors[WHEEL_SIZE];
} ____cacheline_aligned;
@@ -856,13 +857,19 @@ get_target_base(struct timer_base *base, unsigned tflags)
static inline void forward_timer_base(struct timer_base *base)
{
- unsigned long jnow = READ_ONCE(jiffies);
+ unsigned long jnow;
/*
- * We only forward the base when it's idle and we have a delta between
- * base clock and jiffies.
+ * We only forward the base when we are idle or have just come out of
+ * idle (must_forward_clk logic), and have a delta between base clock
+ * and jiffies. In the common case, run_timers will take care of it.
*/
- if (!base->is_idle || (long) (jnow - base->clk) < 2)
+ if (likely(!base->must_forward_clk))
+ return;
+
+ jnow = READ_ONCE(jiffies);
+ base->must_forward_clk = base->is_idle;
+ if ((long)(jnow - base->clk) < 2)
return;
/*
@@ -938,6 +945,11 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only)
* same array bucket then just return:
*/
if (timer_pending(timer)) {
+ /*
+ * The downside of this optimization is that it can result in
+ * larger granularity than you would get from adding a new
+ * timer with this expiry.
+ */
if (timer->expires == expires)
return 1;
@@ -948,6 +960,7 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only)
* dequeue/enqueue dance.
*/
base = lock_timer_base(timer, &flags);
+ forward_timer_base(base);
clk = base->clk;
idx = calc_wheel_index(expires, clk);
@@ -964,6 +977,7 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only)
}
} else {
base = lock_timer_base(timer, &flags);
+ forward_timer_base(base);
}
ret = detach_if_pending(timer, base, false);
@@ -991,12 +1005,10 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only)
raw_spin_lock(&base->lock);
WRITE_ONCE(timer->flags,
(timer->flags & ~TIMER_BASEMASK) | base->cpu);
+ forward_timer_base(base);
}
}
- /* Try to forward a stale timer base clock */
- forward_timer_base(base);
-
timer->expires = expires;
/*
* If 'idx' was calculated above and the base time did not advance
@@ -1112,6 +1124,7 @@ void add_timer_on(struct timer_list *timer, int cpu)
WRITE_ONCE(timer->flags,
(timer->flags & ~TIMER_BASEMASK) | cpu);
}
+ forward_timer_base(base);
debug_activate(timer, timer->expires);
internal_add_timer(base, timer);
@@ -1495,12 +1508,18 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem)
base->is_idle = false;
} else {
if (!is_max_delta)
- expires = basem + (nextevt - basej) * TICK_NSEC;
+ expires = basem + (u64)(nextevt - basej) * TICK_NSEC;
/*
- * If we expect to sleep more than a tick, mark the base idle:
+ * If we expect to sleep more than a tick, mark the base idle.
+ * Also the tick is stopped so any added timer must forward
+ * the base clk itself to keep granularity small. This idle
+ * logic is only maintained for the BASE_STD base, deferrable
+ * timers may still see large granularity skew (by design).
*/
- if ((expires - basem) > TICK_NSEC)
+ if ((expires - basem) > TICK_NSEC) {
+ base->must_forward_clk = true;
base->is_idle = true;
+ }
}
raw_spin_unlock(&base->lock);
@@ -1611,6 +1630,19 @@ static __latent_entropy void run_timer_softirq(struct softirq_action *h)
{
struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]);
+ /*
+ * must_forward_clk must be cleared before running timers so that any
+ * timer functions that call mod_timer will not try to forward the
+	 * base. Idle tracking / clock forwarding logic is only used with
+ * BASE_STD timers.
+ *
+ * The deferrable base does not do idle tracking at all, so we do
+ * not forward it. This can result in very large variations in
+ * granularity for deferrable timers, but they can be deferred for
+ * long periods due to idle.
+ */
+ base->must_forward_clk = false;
+
__run_timers(base);
if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && base->nohz_active)
__run_timers(this_cpu_ptr(&timer_bases[BASE_DEF]));
diff --git a/kernel/torture.c b/kernel/torture.c
index 55de96529287..637e172835d8 100644
--- a/kernel/torture.c
+++ b/kernel/torture.c
@@ -117,7 +117,7 @@ bool torture_offline(int cpu, long *n_offl_attempts, long *n_offl_successes,
torture_type, cpu);
(*n_offl_successes)++;
delta = jiffies - starttime;
- sum_offl += delta;
+ *sum_offl += delta;
if (*min_offl < 0) {
*min_offl = delta;
*max_offl = delta;
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 7e06f04e98fe..434c840e2d82 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -667,30 +667,30 @@ config RING_BUFFER_STARTUP_TEST
If unsure, say N
-config TRACE_ENUM_MAP_FILE
- bool "Show enum mappings for trace events"
+config TRACE_EVAL_MAP_FILE
+ bool "Show eval mappings for trace events"
depends on TRACING
help
- The "print fmt" of the trace events will show the enum names instead
- of their values. This can cause problems for user space tools that
- use this string to parse the raw data as user space does not know
+ The "print fmt" of the trace events will show the enum/sizeof names
+ instead of their values. This can cause problems for user space tools
+ that use this string to parse the raw data as user space does not know
how to convert the string to its value.
To fix this, there's a special macro in the kernel that can be used
- to convert the enum into its value. If this macro is used, then the
- print fmt strings will have the enums converted to their values.
+ to convert an enum/sizeof into its value. If this macro is used, then
+ the print fmt strings will be converted to their values.
If something does not get converted properly, this option can be
- used to show what enums the kernel tried to convert.
+ used to show what enums/sizeof the kernel tried to convert.
- This option is for debugging the enum conversions. A file is created
- in the tracing directory called "enum_map" that will show the enum
+ This option is for debugging the conversions. A file is created
+ in the tracing directory called "eval_map" that will show the
names matched with their values and what trace event system they
	  belong to.
Normally, the mapping of the strings to values will be freed after
boot up or module load. With this option, they will not be freed, as
- they are needed for the "enum_map" file. Enabling this option will
+ they are needed for the "eval_map" file. Enabling this option will
increase the memory footprint of the running kernel.
If unsure, say N
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 460a031c77e5..dc498b605d5d 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -122,8 +122,8 @@ static const struct bpf_func_proto *bpf_get_probe_write_proto(void)
}
/*
- * limited trace_printk()
- * only %d %u %x %ld %lu %lx %lld %llu %llx %p %s conversion specifiers allowed
+ * Only limited trace_printk() conversion specifiers allowed:
+ * %d %i %u %x %ld %li %lu %lx %lld %lli %llu %llx %p %s
*/
BPF_CALL_5(bpf_trace_printk, char *, fmt, u32, fmt_size, u64, arg1,
u64, arg2, u64, arg3)
@@ -198,15 +198,42 @@ BPF_CALL_5(bpf_trace_printk, char *, fmt, u32, fmt_size, u64, arg1,
i++;
}
- if (fmt[i] != 'd' && fmt[i] != 'u' && fmt[i] != 'x')
+ if (fmt[i] != 'i' && fmt[i] != 'd' &&
+ fmt[i] != 'u' && fmt[i] != 'x')
return -EINVAL;
fmt_cnt++;
}
- return __trace_printk(1/* fake ip will not be printed */, fmt,
- mod[0] == 2 ? arg1 : mod[0] == 1 ? (long) arg1 : (u32) arg1,
- mod[1] == 2 ? arg2 : mod[1] == 1 ? (long) arg2 : (u32) arg2,
- mod[2] == 2 ? arg3 : mod[2] == 1 ? (long) arg3 : (u32) arg3);
+/* Horrid workaround for getting va_list handling working with different
+ * argument type combinations generically for 32 and 64 bit archs.
+ */
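+/* Each __BPF_ARGn_TP() level passes argN either unmodified (u64), cast to
+ * (long) or cast to (u32) depending on mod[n] and __BITS_PER_LONG, then
+ * expands the level below it, so __BPF_TP_EMIT() collapses into a single
+ * __trace_printk() call with all three arguments correctly typed.
+ */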
+#define __BPF_TP_EMIT() __BPF_ARG3_TP()
+#define __BPF_TP(...) \
+ __trace_printk(1 /* Fake ip will not be printed. */, \
+ fmt, ##__VA_ARGS__)
+
+#define __BPF_ARG1_TP(...) \
+ ((mod[0] == 2 || (mod[0] == 1 && __BITS_PER_LONG == 64)) \
+ ? __BPF_TP(arg1, ##__VA_ARGS__) \
+ : ((mod[0] == 1 || (mod[0] == 0 && __BITS_PER_LONG == 32)) \
+ ? __BPF_TP((long)arg1, ##__VA_ARGS__) \
+ : __BPF_TP((u32)arg1, ##__VA_ARGS__)))
+
+#define __BPF_ARG2_TP(...) \
+ ((mod[1] == 2 || (mod[1] == 1 && __BITS_PER_LONG == 64)) \
+ ? __BPF_ARG1_TP(arg2, ##__VA_ARGS__) \
+ : ((mod[1] == 1 || (mod[1] == 0 && __BITS_PER_LONG == 32)) \
+ ? __BPF_ARG1_TP((long)arg2, ##__VA_ARGS__) \
+ : __BPF_ARG1_TP((u32)arg2, ##__VA_ARGS__)))
+
+#define __BPF_ARG3_TP(...) \
+ ((mod[2] == 2 || (mod[2] == 1 && __BITS_PER_LONG == 64)) \
+ ? __BPF_ARG2_TP(arg3, ##__VA_ARGS__) \
+ : ((mod[2] == 1 || (mod[2] == 0 && __BITS_PER_LONG == 32)) \
+ ? __BPF_ARG2_TP((long)arg3, ##__VA_ARGS__) \
+ : __BPF_ARG2_TP((u32)arg3, ##__VA_ARGS__)))
+
+ return __BPF_TP_EMIT();
}
static const struct bpf_func_proto bpf_trace_printk_proto = {
@@ -234,7 +261,8 @@ BPF_CALL_2(bpf_perf_event_read, struct bpf_map *, map, u64, flags)
unsigned int cpu = smp_processor_id();
u64 index = flags & BPF_F_INDEX_MASK;
struct bpf_event_entry *ee;
- struct perf_event *event;
+ u64 value = 0;
+ int err;
if (unlikely(flags & ~(BPF_F_INDEX_MASK)))
return -EINVAL;
@@ -247,21 +275,14 @@ BPF_CALL_2(bpf_perf_event_read, struct bpf_map *, map, u64, flags)
if (!ee)
return -ENOENT;
- event = ee->event;
- if (unlikely(event->attr.type != PERF_TYPE_HARDWARE &&
- event->attr.type != PERF_TYPE_RAW))
- return -EINVAL;
-
- /* make sure event is local and doesn't have pmu::count */
- if (unlikely(event->oncpu != cpu || event->pmu->count))
- return -EINVAL;
-
+ err = perf_event_read_local(ee->event, &value);
/*
- * we don't know if the function is run successfully by the
- * return value. It can be judged in other places, such as
- * eBPF programs.
+ * this api is ugly since we miss [-22..-2] range of valid
+ * counter values, but that's uapi
*/
- return perf_event_read_local(event);
+ if (err)
+ return err;
+ return value;
}
static const struct bpf_func_proto bpf_perf_event_read_proto = {
@@ -272,14 +293,16 @@ static const struct bpf_func_proto bpf_perf_event_read_proto = {
.arg2_type = ARG_ANYTHING,
};
+static DEFINE_PER_CPU(struct perf_sample_data, bpf_sd);
+
static __always_inline u64
__bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map,
u64 flags, struct perf_raw_record *raw)
{
struct bpf_array *array = container_of(map, struct bpf_array, map);
+ struct perf_sample_data *sd = this_cpu_ptr(&bpf_sd);
unsigned int cpu = smp_processor_id();
u64 index = flags & BPF_F_INDEX_MASK;
- struct perf_sample_data sample_data;
struct bpf_event_entry *ee;
struct perf_event *event;
@@ -300,9 +323,9 @@ __bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map,
if (unlikely(event->oncpu != cpu))
return -EOPNOTSUPP;
- perf_sample_data_init(&sample_data, 0, 0);
- sample_data.raw = raw;
- perf_event_output(event, &sample_data, regs);
+ perf_sample_data_init(sd, 0, 0);
+ sd->raw = raw;
+ perf_event_output(event, sd, regs);
return 0;
}
@@ -483,7 +506,7 @@ static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func
/* bpf+kprobe programs can access fields of 'struct pt_regs' */
static bool kprobe_prog_is_valid_access(int off, int size, enum bpf_access_type type,
- enum bpf_reg_type *reg_type)
+ struct bpf_insn_access_aux *info)
{
if (off < 0 || off >= sizeof(struct pt_regs))
return false;
@@ -566,7 +589,7 @@ static const struct bpf_func_proto *tp_prog_func_proto(enum bpf_func_id func_id)
}
static bool tp_prog_is_valid_access(int off, int size, enum bpf_access_type type,
- enum bpf_reg_type *reg_type)
+ struct bpf_insn_access_aux *info)
{
if (off < sizeof(void *) || off >= PERF_MAX_TRACE_SIZE)
return false;
@@ -585,40 +608,47 @@ const struct bpf_verifier_ops tracepoint_prog_ops = {
};
static bool pe_prog_is_valid_access(int off, int size, enum bpf_access_type type,
- enum bpf_reg_type *reg_type)
+ struct bpf_insn_access_aux *info)
{
+ const int size_sp = FIELD_SIZEOF(struct bpf_perf_event_data,
+ sample_period);
+
if (off < 0 || off >= sizeof(struct bpf_perf_event_data))
return false;
if (type != BPF_READ)
return false;
if (off % size != 0)
return false;
- if (off == offsetof(struct bpf_perf_event_data, sample_period)) {
- if (size != sizeof(u64))
+
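+	/* sample_period may be read with a narrower size; every other field
+	 * must be accessed as a full long.
+	 */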
+ switch (off) {
+ case bpf_ctx_range(struct bpf_perf_event_data, sample_period):
+ bpf_ctx_record_field_size(info, size_sp);
+ if (!bpf_ctx_narrow_access_ok(off, size, size_sp))
return false;
- } else {
+ break;
+ default:
if (size != sizeof(long))
return false;
}
+
return true;
}
static u32 pe_prog_convert_ctx_access(enum bpf_access_type type,
const struct bpf_insn *si,
struct bpf_insn *insn_buf,
- struct bpf_prog *prog)
+ struct bpf_prog *prog, u32 *target_size)
{
struct bpf_insn *insn = insn_buf;
switch (si->off) {
case offsetof(struct bpf_perf_event_data, sample_period):
- BUILD_BUG_ON(FIELD_SIZEOF(struct perf_sample_data, period) != sizeof(u64));
-
*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_perf_event_data_kern,
data), si->dst_reg, si->src_reg,
offsetof(struct bpf_perf_event_data_kern, data));
*insn++ = BPF_LDX_MEM(BPF_DW, si->dst_reg, si->dst_reg,
- offsetof(struct perf_sample_data, period));
+ bpf_target_off(struct perf_sample_data, period, 8,
+ target_size));
break;
default:
*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_perf_event_data_kern,
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index b308be30dfb9..96cea88fa00f 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -113,7 +113,7 @@ static int ftrace_disabled __read_mostly;
static DEFINE_MUTEX(ftrace_lock);
-static struct ftrace_ops *ftrace_ops_list __read_mostly = &ftrace_list_end;
+static struct ftrace_ops __rcu *ftrace_ops_list __read_mostly = &ftrace_list_end;
ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub;
static struct ftrace_ops global_ops;
@@ -169,8 +169,11 @@ int ftrace_nr_registered_ops(void)
mutex_lock(&ftrace_lock);
- for (ops = ftrace_ops_list;
- ops != &ftrace_list_end; ops = ops->next)
+ for (ops = rcu_dereference_protected(ftrace_ops_list,
+ lockdep_is_held(&ftrace_lock));
+ ops != &ftrace_list_end;
+ ops = rcu_dereference_protected(ops->next,
+ lockdep_is_held(&ftrace_lock)))
cnt++;
mutex_unlock(&ftrace_lock);
@@ -275,10 +278,11 @@ static void update_ftrace_function(void)
* If there's only one ftrace_ops registered, the ftrace_ops_list
* will point to the ops we want.
*/
- set_function_trace_op = ftrace_ops_list;
+ set_function_trace_op = rcu_dereference_protected(ftrace_ops_list,
+ lockdep_is_held(&ftrace_lock));
/* If there's no ftrace_ops registered, just call the stub function */
- if (ftrace_ops_list == &ftrace_list_end) {
+ if (set_function_trace_op == &ftrace_list_end) {
func = ftrace_stub;
/*
@@ -286,7 +290,8 @@ static void update_ftrace_function(void)
* recursion safe and not dynamic and the arch supports passing ops,
* then have the mcount trampoline call the function directly.
*/
- } else if (ftrace_ops_list->next == &ftrace_list_end) {
+ } else if (rcu_dereference_protected(ftrace_ops_list->next,
+ lockdep_is_held(&ftrace_lock)) == &ftrace_list_end) {
func = ftrace_ops_get_list_func(ftrace_ops_list);
} else {
@@ -348,9 +353,11 @@ int using_ftrace_ops_list_func(void)
return ftrace_trace_function == ftrace_ops_list_func;
}
-static void add_ftrace_ops(struct ftrace_ops **list, struct ftrace_ops *ops)
+static void add_ftrace_ops(struct ftrace_ops __rcu **list,
+ struct ftrace_ops *ops)
{
- ops->next = *list;
+ rcu_assign_pointer(ops->next, *list);
+
/*
* We are entering ops into the list but another
* CPU might be walking that list. We need to make sure
@@ -360,7 +367,8 @@ static void add_ftrace_ops(struct ftrace_ops **list, struct ftrace_ops *ops)
rcu_assign_pointer(*list, ops);
}
-static int remove_ftrace_ops(struct ftrace_ops **list, struct ftrace_ops *ops)
+static int remove_ftrace_ops(struct ftrace_ops __rcu **list,
+ struct ftrace_ops *ops)
{
struct ftrace_ops **p;
@@ -368,7 +376,10 @@ static int remove_ftrace_ops(struct ftrace_ops **list, struct ftrace_ops *ops)
* If we are removing the last function, then simply point
* to the ftrace_stub.
*/
- if (*list == ops && ops->next == &ftrace_list_end) {
+ if (rcu_dereference_protected(*list,
+ lockdep_is_held(&ftrace_lock)) == ops &&
+ rcu_dereference_protected(ops->next,
+ lockdep_is_held(&ftrace_lock)) == &ftrace_list_end) {
*list = &ftrace_list_end;
return 0;
}
@@ -878,6 +889,10 @@ static int profile_graph_entry(struct ftrace_graph_ent *trace)
function_profile_call(trace->func, 0, NULL, NULL);
+ /* If function graph is shutting down, ret_stack can be NULL */
+ if (!current->ret_stack)
+ return 0;
+
if (index >= 0 && index < FTRACE_RETFUNC_DEPTH)
current->ret_stack[index].subtime = 0;
@@ -1293,6 +1308,28 @@ static void ftrace_hash_clear(struct ftrace_hash *hash)
FTRACE_WARN_ON(hash->count);
}
+static void free_ftrace_mod(struct ftrace_mod_load *ftrace_mod)
+{
+ list_del(&ftrace_mod->list);
+ kfree(ftrace_mod->module);
+ kfree(ftrace_mod->func);
+ kfree(ftrace_mod);
+}
+
+static void clear_ftrace_mod_list(struct list_head *head)
+{
+ struct ftrace_mod_load *p, *n;
+
+ /* stack tracer isn't supported yet */
+ if (!head)
+ return;
+
+ mutex_lock(&ftrace_lock);
+ list_for_each_entry_safe(p, n, head, list)
+ free_ftrace_mod(p);
+ mutex_unlock(&ftrace_lock);
+}
+
static void free_ftrace_hash(struct ftrace_hash *hash)
{
if (!hash || hash == EMPTY_HASH)
@@ -1346,6 +1383,35 @@ static struct ftrace_hash *alloc_ftrace_hash(int size_bits)
return hash;
}
+
+static int ftrace_add_mod(struct trace_array *tr,
+ const char *func, const char *module,
+ int enable)
+{
+ struct ftrace_mod_load *ftrace_mod;
+ struct list_head *mod_head = enable ? &tr->mod_trace : &tr->mod_notrace;
+
+ ftrace_mod = kzalloc(sizeof(*ftrace_mod), GFP_KERNEL);
+ if (!ftrace_mod)
+ return -ENOMEM;
+
+ ftrace_mod->func = kstrdup(func, GFP_KERNEL);
+ ftrace_mod->module = kstrdup(module, GFP_KERNEL);
+ ftrace_mod->enable = enable;
+
+ if (!ftrace_mod->func || !ftrace_mod->module)
+ goto out_free;
+
+ list_add(&ftrace_mod->list, mod_head);
+
+ return 0;
+
+ out_free:
+ free_ftrace_mod(ftrace_mod);
+
+ return -ENOMEM;
+}
+
static struct ftrace_hash *
alloc_and_copy_ftrace_hash(int size_bits, struct ftrace_hash *hash)
{
@@ -1359,6 +1425,9 @@ alloc_and_copy_ftrace_hash(int size_bits, struct ftrace_hash *hash)
if (!new_hash)
return NULL;
+ if (hash)
+ new_hash->flags = hash->flags;
+
/* Empty hash? */
if (ftrace_hash_empty(hash))
return new_hash;
@@ -1403,7 +1472,7 @@ __ftrace_hash_move(struct ftrace_hash *src)
/*
* If the new source is empty, just return the empty_hash.
*/
- if (!src->count)
+ if (ftrace_hash_empty(src))
return EMPTY_HASH;
/*
@@ -1420,6 +1489,8 @@ __ftrace_hash_move(struct ftrace_hash *src)
if (!new_hash)
return NULL;
+ new_hash->flags = src->flags;
+
size = 1 << src->size_bits;
for (i = 0; i < size; i++) {
hhd = &src->buckets[i];
@@ -1513,8 +1584,8 @@ ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip, void *regs)
return 0;
#endif
- hash.filter_hash = rcu_dereference_raw_notrace(ops->func_hash->filter_hash);
- hash.notrace_hash = rcu_dereference_raw_notrace(ops->func_hash->notrace_hash);
+ rcu_assign_pointer(hash.filter_hash, ops->func_hash->filter_hash);
+ rcu_assign_pointer(hash.notrace_hash, ops->func_hash->notrace_hash);
if (hash_contains_ip(ip, &hash))
ret = 1;
@@ -1650,7 +1721,7 @@ static bool __ftrace_hash_rec_update(struct ftrace_ops *ops,
struct dyn_ftrace *rec;
bool update = false;
int count = 0;
- int all = 0;
+ int all = false;
/* Only update if the ops has been registered */
if (!(ops->flags & FTRACE_OPS_FL_ENABLED))
@@ -1671,7 +1742,7 @@ static bool __ftrace_hash_rec_update(struct ftrace_ops *ops,
hash = ops->func_hash->filter_hash;
other_hash = ops->func_hash->notrace_hash;
if (ftrace_hash_empty(hash))
- all = 1;
+ all = true;
} else {
inc = !inc;
hash = ops->func_hash->notrace_hash;
@@ -2784,7 +2855,8 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command)
* If there's no more ops registered with ftrace, run a
* sanity check to make sure all rec flags are cleared.
*/
- if (ftrace_ops_list == &ftrace_list_end) {
+ if (rcu_dereference_protected(ftrace_ops_list,
+ lockdep_is_held(&ftrace_lock)) == &ftrace_list_end) {
struct ftrace_page *pg;
struct dyn_ftrace *rec;
@@ -3061,6 +3133,7 @@ ftrace_allocate_pages(unsigned long num_to_init)
struct ftrace_iterator {
loff_t pos;
loff_t func_pos;
+ loff_t mod_pos;
struct ftrace_page *pg;
struct dyn_ftrace *func;
struct ftrace_func_probe *probe;
@@ -3068,6 +3141,8 @@ struct ftrace_iterator {
struct trace_parser parser;
struct ftrace_hash *hash;
struct ftrace_ops *ops;
+ struct trace_array *tr;
+ struct list_head *mod_list;
int pidx;
int idx;
unsigned flags;
@@ -3152,13 +3227,13 @@ static void *t_probe_start(struct seq_file *m, loff_t *pos)
if (!(iter->flags & FTRACE_ITER_DO_PROBES))
return NULL;
- if (iter->func_pos > *pos)
+ if (iter->mod_pos > *pos)
return NULL;
iter->probe = NULL;
iter->probe_entry = NULL;
iter->pidx = 0;
- for (l = 0; l <= (*pos - iter->func_pos); ) {
+ for (l = 0; l <= (*pos - iter->mod_pos); ) {
p = t_probe_next(m, &l);
if (!p)
break;
@@ -3197,6 +3272,82 @@ t_probe_show(struct seq_file *m, struct ftrace_iterator *iter)
}
static void *
+t_mod_next(struct seq_file *m, loff_t *pos)
+{
+ struct ftrace_iterator *iter = m->private;
+ struct trace_array *tr = iter->tr;
+
+ (*pos)++;
+ iter->pos = *pos;
+
+ iter->mod_list = iter->mod_list->next;
+
+ if (iter->mod_list == &tr->mod_trace ||
+ iter->mod_list == &tr->mod_notrace) {
+ iter->flags &= ~FTRACE_ITER_MOD;
+ return NULL;
+ }
+
+ iter->mod_pos = *pos;
+
+ return iter;
+}
+
+static void *t_mod_start(struct seq_file *m, loff_t *pos)
+{
+ struct ftrace_iterator *iter = m->private;
+ void *p = NULL;
+ loff_t l;
+
+ if (iter->func_pos > *pos)
+ return NULL;
+
+ iter->mod_pos = iter->func_pos;
+
+ /* probes are only available if tr is set */
+ if (!iter->tr)
+ return NULL;
+
+ for (l = 0; l <= (*pos - iter->func_pos); ) {
+ p = t_mod_next(m, &l);
+ if (!p)
+ break;
+ }
+ if (!p) {
+ iter->flags &= ~FTRACE_ITER_MOD;
+ return t_probe_start(m, pos);
+ }
+
+ /* Only set this if we have an item */
+ iter->flags |= FTRACE_ITER_MOD;
+
+ return iter;
+}
+
+static int
+t_mod_show(struct seq_file *m, struct ftrace_iterator *iter)
+{
+ struct ftrace_mod_load *ftrace_mod;
+ struct trace_array *tr = iter->tr;
+
+ if (WARN_ON_ONCE(!iter->mod_list) ||
+ iter->mod_list == &tr->mod_trace ||
+ iter->mod_list == &tr->mod_notrace)
+ return -EIO;
+
+ ftrace_mod = list_entry(iter->mod_list, struct ftrace_mod_load, list);
+
+ if (ftrace_mod->func)
+ seq_printf(m, "%s", ftrace_mod->func);
+ else
+ seq_putc(m, '*');
+
+ seq_printf(m, ":mod:%s\n", ftrace_mod->module);
+
+ return 0;
+}
+
+static void *
t_func_next(struct seq_file *m, loff_t *pos)
{
struct ftrace_iterator *iter = m->private;
@@ -3237,7 +3388,7 @@ static void *
t_next(struct seq_file *m, void *v, loff_t *pos)
{
struct ftrace_iterator *iter = m->private;
- loff_t l = *pos; /* t_hash_start() must use original pos */
+ loff_t l = *pos; /* t_probe_start() must use original pos */
void *ret;
if (unlikely(ftrace_disabled))
@@ -3246,16 +3397,19 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
if (iter->flags & FTRACE_ITER_PROBE)
return t_probe_next(m, pos);
+ if (iter->flags & FTRACE_ITER_MOD)
+ return t_mod_next(m, pos);
+
if (iter->flags & FTRACE_ITER_PRINTALL) {
/* next must increment pos, and t_probe_start does not */
(*pos)++;
- return t_probe_start(m, &l);
+ return t_mod_start(m, &l);
}
ret = t_func_next(m, pos);
if (!ret)
- return t_probe_start(m, &l);
+ return t_mod_start(m, &l);
return ret;
}
@@ -3264,7 +3418,7 @@ static void reset_iter_read(struct ftrace_iterator *iter)
{
iter->pos = 0;
iter->func_pos = 0;
- iter->flags &= ~(FTRACE_ITER_PRINTALL | FTRACE_ITER_PROBE);
+ iter->flags &= ~(FTRACE_ITER_PRINTALL | FTRACE_ITER_PROBE | FTRACE_ITER_MOD);
}
static void *t_start(struct seq_file *m, loff_t *pos)
@@ -3293,15 +3447,15 @@ static void *t_start(struct seq_file *m, loff_t *pos)
ftrace_hash_empty(iter->hash)) {
iter->func_pos = 1; /* Account for the message */
if (*pos > 0)
- return t_probe_start(m, pos);
+ return t_mod_start(m, pos);
iter->flags |= FTRACE_ITER_PRINTALL;
/* reset in case of seek/pread */
iter->flags &= ~FTRACE_ITER_PROBE;
return iter;
}
- if (iter->flags & FTRACE_ITER_PROBE)
- return t_probe_start(m, pos);
+ if (iter->flags & FTRACE_ITER_MOD)
+ return t_mod_start(m, pos);
/*
* Unfortunately, we need to restart at ftrace_pages_start
@@ -3317,7 +3471,7 @@ static void *t_start(struct seq_file *m, loff_t *pos)
}
if (!p)
- return t_probe_start(m, pos);
+ return t_mod_start(m, pos);
return iter;
}
@@ -3351,6 +3505,9 @@ static int t_show(struct seq_file *m, void *v)
if (iter->flags & FTRACE_ITER_PROBE)
return t_probe_show(m, iter);
+ if (iter->flags & FTRACE_ITER_MOD)
+ return t_mod_show(m, iter);
+
if (iter->flags & FTRACE_ITER_PRINTALL) {
if (iter->flags & FTRACE_ITER_NOTRACE)
seq_puts(m, "#### no functions disabled ####\n");
@@ -3457,6 +3614,8 @@ ftrace_regex_open(struct ftrace_ops *ops, int flag,
{
struct ftrace_iterator *iter;
struct ftrace_hash *hash;
+ struct list_head *mod_head;
+ struct trace_array *tr = ops->private;
int ret = 0;
ftrace_ops_init(ops);
@@ -3475,21 +3634,29 @@ ftrace_regex_open(struct ftrace_ops *ops, int flag,
iter->ops = ops;
iter->flags = flag;
+ iter->tr = tr;
mutex_lock(&ops->func_hash->regex_lock);
- if (flag & FTRACE_ITER_NOTRACE)
+ if (flag & FTRACE_ITER_NOTRACE) {
hash = ops->func_hash->notrace_hash;
- else
+ mod_head = tr ? &tr->mod_notrace : NULL;
+ } else {
hash = ops->func_hash->filter_hash;
+ mod_head = tr ? &tr->mod_trace : NULL;
+ }
+
+ iter->mod_list = mod_head;
if (file->f_mode & FMODE_WRITE) {
const int size_bits = FTRACE_HASH_DEFAULT_BITS;
- if (file->f_flags & O_TRUNC)
+ if (file->f_flags & O_TRUNC) {
iter->hash = alloc_ftrace_hash(size_bits);
- else
+ clear_ftrace_mod_list(mod_head);
+ } else {
iter->hash = alloc_and_copy_ftrace_hash(size_bits, hash);
+ }
if (!iter->hash) {
trace_parser_put(&iter->parser);
@@ -3665,7 +3832,7 @@ match_records(struct ftrace_hash *hash, char *func, int len, char *mod)
int exclude_mod = 0;
int found = 0;
int ret;
- int clear_filter;
+ int clear_filter = 0;
if (func) {
func_g.type = filter_parse_regex(func, len, &func_g.search,
@@ -3761,6 +3928,165 @@ static int ftrace_hash_move_and_update_ops(struct ftrace_ops *ops,
return ret;
}
+static bool module_exists(const char *module)
+{
+ /* All modules have the symbol __this_module */
+ const char this_mod[] = "__this_module";
+ const int modname_size = MAX_PARAM_PREFIX_LEN + sizeof(this_mod) + 1;
+ char modname[modname_size + 1];
+ unsigned long val;
+ int n;
+
+ n = snprintf(modname, modname_size + 1, "%s:%s", module, this_mod);
+
+ if (n > modname_size)
+ return false;
+
+ val = module_kallsyms_lookup_name(modname);
+ return val != 0;
+}
+
+static int cache_mod(struct trace_array *tr,
+ const char *func, char *module, int enable)
+{
+ struct ftrace_mod_load *ftrace_mod, *n;
+ struct list_head *head = enable ? &tr->mod_trace : &tr->mod_notrace;
+ int ret;
+
+ mutex_lock(&ftrace_lock);
+
+ /* We do not cache inverse filters */
+ if (func[0] == '!') {
+ func++;
+ ret = -EINVAL;
+
+ /* Look to remove this hash */
+ list_for_each_entry_safe(ftrace_mod, n, head, list) {
+ if (strcmp(ftrace_mod->module, module) != 0)
+ continue;
+
+ /* no func matches all */
+ if (strcmp(func, "*") == 0 ||
+ (ftrace_mod->func &&
+ strcmp(ftrace_mod->func, func) == 0)) {
+ ret = 0;
+ free_ftrace_mod(ftrace_mod);
+ continue;
+ }
+ }
+ goto out;
+ }
+
+ ret = -EINVAL;
+ /* We only care about modules that have not been loaded yet */
+ if (module_exists(module))
+ goto out;
+
+ /* Save this string off, and execute it when the module is loaded */
+ ret = ftrace_add_mod(tr, func, module, enable);
+ out:
+ mutex_unlock(&ftrace_lock);
+
+ return ret;
+}
+
+static int
+ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len,
+ int reset, int enable);
+
+#ifdef CONFIG_MODULES
+static void process_mod_list(struct list_head *head, struct ftrace_ops *ops,
+ char *mod, bool enable)
+{
+ struct ftrace_mod_load *ftrace_mod, *n;
+ struct ftrace_hash **orig_hash, *new_hash;
+ LIST_HEAD(process_mods);
+ char *func;
+ int ret;
+
+ mutex_lock(&ops->func_hash->regex_lock);
+
+ if (enable)
+ orig_hash = &ops->func_hash->filter_hash;
+ else
+ orig_hash = &ops->func_hash->notrace_hash;
+
+ new_hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS,
+ *orig_hash);
+ if (!new_hash)
+ goto out; /* warn? */
+
+ mutex_lock(&ftrace_lock);
+
+ list_for_each_entry_safe(ftrace_mod, n, head, list) {
+
+ if (strcmp(ftrace_mod->module, mod) != 0)
+ continue;
+
+ if (ftrace_mod->func)
+ func = kstrdup(ftrace_mod->func, GFP_KERNEL);
+ else
+ func = kstrdup("*", GFP_KERNEL);
+
+ if (!func) /* warn? */
+ continue;
+
+ list_del(&ftrace_mod->list);
+ list_add(&ftrace_mod->list, &process_mods);
+
+ /* Use the newly allocated func, as it may be "*" */
+ kfree(ftrace_mod->func);
+ ftrace_mod->func = func;
+ }
+
+ mutex_unlock(&ftrace_lock);
+
+ list_for_each_entry_safe(ftrace_mod, n, &process_mods, list) {
+
+ func = ftrace_mod->func;
+
+ /* Grabs ftrace_lock, which is why we have this extra step */
+ match_records(new_hash, func, strlen(func), mod);
+ free_ftrace_mod(ftrace_mod);
+ }
+
+ if (enable && list_empty(head))
+ new_hash->flags &= ~FTRACE_HASH_FL_MOD;
+
+ mutex_lock(&ftrace_lock);
+
+ ret = ftrace_hash_move_and_update_ops(ops, orig_hash,
+ new_hash, enable);
+ mutex_unlock(&ftrace_lock);
+
+ out:
+ mutex_unlock(&ops->func_hash->regex_lock);
+
+ free_ftrace_hash(new_hash);
+}
+
+static void process_cached_mods(const char *mod_name)
+{
+ struct trace_array *tr;
+ char *mod;
+
+ mod = kstrdup(mod_name, GFP_KERNEL);
+ if (!mod)
+ return;
+
+ mutex_lock(&trace_types_lock);
+ list_for_each_entry(tr, &ftrace_trace_arrays, list) {
+ if (!list_empty(&tr->mod_trace))
+ process_mod_list(&tr->mod_trace, tr->ops, mod, true);
+ if (!list_empty(&tr->mod_notrace))
+ process_mod_list(&tr->mod_notrace, tr->ops, mod, false);
+ }
+ mutex_unlock(&trace_types_lock);
+
+ kfree(mod);
+}
+#endif
+
/*
* We register the module command as a template to show others how
* to register a command as well.
@@ -3768,10 +4094,16 @@ static int ftrace_hash_move_and_update_ops(struct ftrace_ops *ops,
static int
ftrace_mod_callback(struct trace_array *tr, struct ftrace_hash *hash,
- char *func, char *cmd, char *module, int enable)
+ char *func_orig, char *cmd, char *module, int enable)
{
+ char *func;
int ret;
+ /* match_records() modifies func, and we need the original */
+ func = kstrdup(func_orig, GFP_KERNEL);
+ if (!func)
+ return -ENOMEM;
+
/*
* cmd == 'mod' because we only registered this func
* for the 'mod' ftrace_func_command.
@@ -3780,8 +4112,10 @@ ftrace_mod_callback(struct trace_array *tr, struct ftrace_hash *hash,
* parameter.
*/
ret = match_records(hash, func, strlen(func), module);
+ kfree(func);
+
if (!ret)
- return -EINVAL;
+ return cache_mod(tr, func_orig, module, enable);
if (ret < 0)
return ret;
return 0;
@@ -4725,9 +5059,11 @@ int ftrace_regex_release(struct inode *inode, struct file *file)
if (file->f_mode & FMODE_WRITE) {
filter_hash = !!(iter->flags & FTRACE_ITER_FILTER);
- if (filter_hash)
+ if (filter_hash) {
orig_hash = &iter->ops->func_hash->filter_hash;
- else
+ if (iter->tr && !list_empty(&iter->tr->mod_trace))
+ iter->hash->flags |= FTRACE_HASH_FL_MOD;
+ } else
orig_hash = &iter->ops->func_hash->notrace_hash;
mutex_lock(&ftrace_lock);
@@ -5385,6 +5721,7 @@ void ftrace_release_mod(struct module *mod)
if (pg == ftrace_pages)
ftrace_pages = next_to_ftrace_page(last_pg);
+ ftrace_update_tot_cnt -= pg->index;
*last_pg = pg->next;
order = get_count_order(pg->size / ENTRIES_PER_PAGE);
free_pages((unsigned long)pg->records, order);
@@ -5463,6 +5800,8 @@ void ftrace_module_enable(struct module *mod)
out_unlock:
mutex_unlock(&ftrace_lock);
+
+ process_cached_mods(mod->name);
}
void ftrace_module_init(struct module *mod)
@@ -5501,6 +5840,7 @@ void __init ftrace_free_init_mem(void)
if (!rec)
continue;
pg->index--;
+ ftrace_update_tot_cnt--;
if (!pg->index) {
*last_pg = pg->next;
order = get_count_order(pg->size / ENTRIES_PER_PAGE);
@@ -5567,6 +5907,8 @@ static void ftrace_update_trampoline(struct ftrace_ops *ops)
void ftrace_init_trace_array(struct trace_array *tr)
{
INIT_LIST_HEAD(&tr->func_probes);
+ INIT_LIST_HEAD(&tr->mod_trace);
+ INIT_LIST_HEAD(&tr->mod_notrace);
}
#else
@@ -6127,7 +6469,8 @@ ftrace_enable_sysctl(struct ctl_table *table, int write,
if (ftrace_enabled) {
/* we are starting ftrace again */
- if (ftrace_ops_list != &ftrace_list_end)
+ if (rcu_dereference_protected(ftrace_ops_list,
+ lockdep_is_held(&ftrace_lock)) != &ftrace_list_end)
update_ftrace_function();
ftrace_startup_sysctl();
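The ftrace_mod_load handling added in this file caches ":mod:" filter strings for modules that are not loaded yet and replays them when the module arrives. A hedged sketch of that list management, with illustrative demo_* names and the hash-matching step reduced to a comment:

#include <linux/list.h>
#include <linux/slab.h>
#include <linux/string.h>

struct demo_mod_load {
	struct list_head	list;
	char			*func;
	char			*module;
	int			enable;
};

static LIST_HEAD(demo_mod_cache);	/* real code: per-instance, under ftrace_lock */

static void demo_free_mod(struct demo_mod_load *entry)
{
	list_del(&entry->list);
	kfree(entry->func);
	kfree(entry->module);
	kfree(entry);
}

/* Park a "func:mod:module" filter until 'module' is actually loaded. */
static int demo_cache_mod(const char *func, const char *module, int enable)
{
	struct demo_mod_load *entry;

	entry = kzalloc(sizeof(*entry), GFP_KERNEL);
	if (!entry)
		return -ENOMEM;

	entry->func = kstrdup(func, GFP_KERNEL);
	entry->module = kstrdup(module, GFP_KERNEL);
	entry->enable = enable;

	if (!entry->func || !entry->module) {
		/* not on the list yet, so free the fields directly */
		kfree(entry->func);
		kfree(entry->module);
		kfree(entry);
		return -ENOMEM;
	}

	list_add(&entry->list, &demo_mod_cache);
	return 0;
}

/* At module-load time, matching entries are removed and their filters applied. */
static void demo_replay_mod_cache(const char *module)
{
	struct demo_mod_load *entry, *n;

	list_for_each_entry_safe(entry, n, &demo_mod_cache, list) {
		if (strcmp(entry->module, module) != 0)
			continue;
		/* real code: merge entry->func into the ops filter hash here */
		demo_free_mod(entry);
	}
}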
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 4ae268e687fe..81279c6602ff 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -1136,12 +1136,12 @@ static int __rb_allocate_pages(long nr_pages, struct list_head *pages, int cpu)
for (i = 0; i < nr_pages; i++) {
struct page *page;
/*
- * __GFP_NORETRY flag makes sure that the allocation fails
- * gracefully without invoking oom-killer and the system is
- * not destabilized.
+ * __GFP_RETRY_MAYFAIL flag makes sure that the allocation fails
+ * gracefully without invoking oom-killer and the system is not
+ * destabilized.
*/
bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()),
- GFP_KERNEL | __GFP_NORETRY,
+ GFP_KERNEL | __GFP_RETRY_MAYFAIL,
cpu_to_node(cpu));
if (!bpage)
goto free_pages;
@@ -1149,7 +1149,7 @@ static int __rb_allocate_pages(long nr_pages, struct list_head *pages, int cpu)
list_add(&bpage->list, pages);
page = alloc_pages_node(cpu_to_node(cpu),
- GFP_KERNEL | __GFP_NORETRY, 0);
+ GFP_KERNEL | __GFP_RETRY_MAYFAIL, 0);
if (!page)
goto free_pages;
bpage->page = page_address(page);
@@ -4386,15 +4386,19 @@ EXPORT_SYMBOL_GPL(ring_buffer_swap_cpu);
* the page that was allocated, with the read page of the buffer.
*
* Returns:
- * The page allocated, or NULL on error.
+ * The page allocated, or ERR_PTR
*/
void *ring_buffer_alloc_read_page(struct ring_buffer *buffer, int cpu)
{
- struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
+ struct ring_buffer_per_cpu *cpu_buffer;
struct buffer_data_page *bpage = NULL;
unsigned long flags;
struct page *page;
+ if (!cpumask_test_cpu(cpu, buffer->cpumask))
+ return ERR_PTR(-ENODEV);
+
+ cpu_buffer = buffer->buffers[cpu];
local_irq_save(flags);
arch_spin_lock(&cpu_buffer->lock);
@@ -4412,7 +4416,7 @@ void *ring_buffer_alloc_read_page(struct ring_buffer *buffer, int cpu)
page = alloc_pages_node(cpu_to_node(cpu),
GFP_KERNEL | __GFP_NORETRY, 0);
if (!page)
- return NULL;
+ return ERR_PTR(-ENOMEM);
bpage = page_address(page);
@@ -4467,8 +4471,8 @@ EXPORT_SYMBOL_GPL(ring_buffer_free_read_page);
*
* for example:
* rpage = ring_buffer_alloc_read_page(buffer, cpu);
- * if (!rpage)
- * return error;
+ * if (IS_ERR(rpage))
+ * return PTR_ERR(rpage);
* ret = ring_buffer_read_page(buffer, &rpage, len, cpu, 0);
* if (ret >= 0)
* process_page(rpage, ret);
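These hunks switch ring_buffer_alloc_read_page() from returning NULL to returning ERR_PTR() codes, so callers can tell an invalid CPU from an allocation failure. A small sketch of the convention, with illustrative demo_* names:

#include <linux/err.h>
#include <linux/slab.h>

/* Allocate a scratch page for 'cpu'; errors are encoded in the pointer. */
static void *demo_alloc_read_page(int cpu, int nr_cpus)
{
	void *page;

	if (cpu < 0 || cpu >= nr_cpus)
		return ERR_PTR(-ENODEV);	/* CPU not part of the buffer */

	page = kzalloc(4096, GFP_KERNEL);
	if (!page)
		return ERR_PTR(-ENOMEM);	/* allocation failed */

	return page;
}

static int demo_use_read_page(int cpu, int nr_cpus)
{
	void *page = demo_alloc_read_page(cpu, nr_cpus);

	if (IS_ERR(page))
		return PTR_ERR(page);		/* propagate -ENODEV or -ENOMEM */

	/* ... consume the page ... */
	kfree(page);
	return 0;
}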
diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c
index 9fbcaf567886..68ee79afe31c 100644
--- a/kernel/trace/ring_buffer_benchmark.c
+++ b/kernel/trace/ring_buffer_benchmark.c
@@ -113,7 +113,7 @@ static enum event_status read_page(int cpu)
int i;
bpage = ring_buffer_alloc_read_page(buffer, cpu);
- if (!bpage)
+ if (IS_ERR(bpage))
return EVENT_DROPPED;
ret = ring_buffer_read_page(buffer, &bpage, PAGE_SIZE, cpu, 1);
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 091e801145c9..44004d8aa3b3 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -87,7 +87,7 @@ dummy_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set)
* tracing is active, only save the comm when a trace event
* occurred.
*/
-static DEFINE_PER_CPU(bool, trace_cmdline_save);
+static DEFINE_PER_CPU(bool, trace_taskinfo_save);
/*
* Kill all tracing for good (never come back).
@@ -120,41 +120,41 @@ enum ftrace_dump_mode ftrace_dump_on_oops;
/* When set, tracing will stop when a WARN*() is hit */
int __disable_trace_on_warning;
-#ifdef CONFIG_TRACE_ENUM_MAP_FILE
-/* Map of enums to their values, for "enum_map" file */
-struct trace_enum_map_head {
+#ifdef CONFIG_TRACE_EVAL_MAP_FILE
+/* Map of enums to their values, for "eval_map" file */
+struct trace_eval_map_head {
struct module *mod;
unsigned long length;
};
-union trace_enum_map_item;
+union trace_eval_map_item;
-struct trace_enum_map_tail {
+struct trace_eval_map_tail {
/*
* "end" is first and points to NULL as it must be different
- * than "mod" or "enum_string"
+ * than "mod" or "eval_string"
*/
- union trace_enum_map_item *next;
+ union trace_eval_map_item *next;
const char *end; /* points to NULL */
};
-static DEFINE_MUTEX(trace_enum_mutex);
+static DEFINE_MUTEX(trace_eval_mutex);
/*
- * The trace_enum_maps are saved in an array with two extra elements,
+ * The trace_eval_maps are saved in an array with two extra elements,
* one at the beginning, and one at the end. The beginning item contains
* the count of the saved maps (head.length), and the module they
* belong to if not built in (head.mod). The ending item contains a
- * pointer to the next array of saved enum_map items.
+ * pointer to the next array of saved eval_map items.
*/
-union trace_enum_map_item {
- struct trace_enum_map map;
- struct trace_enum_map_head head;
- struct trace_enum_map_tail tail;
+union trace_eval_map_item {
+ struct trace_eval_map map;
+ struct trace_eval_map_head head;
+ struct trace_eval_map_tail tail;
};
-static union trace_enum_map_item *trace_enum_maps;
-#endif /* CONFIG_TRACE_ENUM_MAP_FILE */
+static union trace_eval_map_item *trace_eval_maps;
+#endif /* CONFIG_TRACE_EVAL_MAP_FILE */
static int tracing_set_tracer(struct trace_array *tr, const char *buf);
@@ -790,7 +790,7 @@ EXPORT_SYMBOL_GPL(tracing_on);
static __always_inline void
__buffer_unlock_commit(struct ring_buffer *buffer, struct ring_buffer_event *event)
{
- __this_cpu_write(trace_cmdline_save, true);
+ __this_cpu_write(trace_taskinfo_save, true);
/* If this is the temp buffer, we need to commit fully */
if (this_cpu_read(trace_buffered_event) == event) {
@@ -1141,9 +1141,9 @@ unsigned long nsecs_to_usecs(unsigned long nsecs)
/*
* TRACE_FLAGS is defined as a tuple matching bit masks with strings.
- * It uses C(a, b) where 'a' is the enum name and 'b' is the string that
+ * It uses C(a, b) where 'a' is the eval (enum) name and 'b' is the string that
* matches it. By defining "C(a, b) b", TRACE_FLAGS becomes a list
- * of strings in the order that the enums were defined.
+ * of strings in the order that the evals (enum) were defined.
*/
#undef C
#define C(a, b) b
@@ -1709,6 +1709,8 @@ void tracing_reset_all_online_cpus(void)
}
}
+static int *tgid_map;
+
#define SAVED_CMDLINES_DEFAULT 128
#define NO_CMDLINE_MAP UINT_MAX
static arch_spinlock_t trace_cmdline_lock = __ARCH_SPIN_LOCK_UNLOCKED;
@@ -1722,7 +1724,7 @@ struct saved_cmdlines_buffer {
static struct saved_cmdlines_buffer *savedcmd;
/* temporary disable recording */
-static atomic_t trace_record_cmdline_disabled __read_mostly;
+static atomic_t trace_record_taskinfo_disabled __read_mostly;
static inline char *get_saved_cmdlines(int idx)
{
@@ -1910,13 +1912,15 @@ static void tracing_stop_tr(struct trace_array *tr)
raw_spin_unlock_irqrestore(&tr->start_lock, flags);
}
-void trace_stop_cmdline_recording(void);
-
static int trace_save_cmdline(struct task_struct *tsk)
{
unsigned pid, idx;
- if (!tsk->pid || unlikely(tsk->pid > PID_MAX_DEFAULT))
+ /* treat recording of idle task as a success */
+ if (!tsk->pid)
+ return 1;
+
+ if (unlikely(tsk->pid > PID_MAX_DEFAULT))
return 0;
/*
@@ -1992,16 +1996,107 @@ void trace_find_cmdline(int pid, char comm[])
preempt_enable();
}
-void tracing_record_cmdline(struct task_struct *tsk)
+int trace_find_tgid(int pid)
+{
+ if (unlikely(!tgid_map || !pid || pid > PID_MAX_DEFAULT))
+ return 0;
+
+ return tgid_map[pid];
+}
+
+static int trace_save_tgid(struct task_struct *tsk)
{
- if (atomic_read(&trace_record_cmdline_disabled) || !tracing_is_on())
+ /* treat recording of idle task as a success */
+ if (!tsk->pid)
+ return 1;
+
+ if (unlikely(!tgid_map || tsk->pid > PID_MAX_DEFAULT))
+ return 0;
+
+ tgid_map[tsk->pid] = tsk->tgid;
+ return 1;
+}
+
+static bool tracing_record_taskinfo_skip(int flags)
+{
+ if (unlikely(!(flags & (TRACE_RECORD_CMDLINE | TRACE_RECORD_TGID))))
+ return true;
+ if (atomic_read(&trace_record_taskinfo_disabled) || !tracing_is_on())
+ return true;
+ if (!__this_cpu_read(trace_taskinfo_save))
+ return true;
+ return false;
+}
+
+/**
+ * tracing_record_taskinfo - record the task info of a task
+ *
+ * @task - task to record
+ * @flags - TRACE_RECORD_CMDLINE for recording comm
+ * - TRACE_RECORD_TGID for recording tgid
+ */
+void tracing_record_taskinfo(struct task_struct *task, int flags)
+{
+ bool done;
+
+ if (tracing_record_taskinfo_skip(flags))
return;
- if (!__this_cpu_read(trace_cmdline_save))
+ /*
+ * Record as much task information as possible. If some fail, continue
+ * to try to record the others.
+ */
+ done = !(flags & TRACE_RECORD_CMDLINE) || trace_save_cmdline(task);
+ done &= !(flags & TRACE_RECORD_TGID) || trace_save_tgid(task);
+
+ /* If recording any information failed, retry again soon. */
+ if (!done)
return;
- if (trace_save_cmdline(tsk))
- __this_cpu_write(trace_cmdline_save, false);
+ __this_cpu_write(trace_taskinfo_save, false);
+}
+
+/**
+ * tracing_record_taskinfo_sched_switch - record task info for sched_switch
+ *
+ * @prev - previous task during sched_switch
+ * @next - next task during sched_switch
+ * @flags - TRACE_RECORD_CMDLINE for recording comm
+ * TRACE_RECORD_TGID for recording tgid
+ */
+void tracing_record_taskinfo_sched_switch(struct task_struct *prev,
+ struct task_struct *next, int flags)
+{
+ bool done;
+
+ if (tracing_record_taskinfo_skip(flags))
+ return;
+
+ /*
+ * Record as much task information as possible. If some fail, continue
+ * to try to record the others.
+ */
+ done = !(flags & TRACE_RECORD_CMDLINE) || trace_save_cmdline(prev);
+ done &= !(flags & TRACE_RECORD_CMDLINE) || trace_save_cmdline(next);
+ done &= !(flags & TRACE_RECORD_TGID) || trace_save_tgid(prev);
+ done &= !(flags & TRACE_RECORD_TGID) || trace_save_tgid(next);
+
+ /* If recording any information failed, retry again soon. */
+ if (!done)
+ return;
+
+ __this_cpu_write(trace_taskinfo_save, false);
+}
+
+/* Helpers to record a specific task information */
+void tracing_record_cmdline(struct task_struct *task)
+{
+ tracing_record_taskinfo(task, TRACE_RECORD_CMDLINE);
+}
+
+void tracing_record_tgid(struct task_struct *task)
+{
+ tracing_record_taskinfo(task, TRACE_RECORD_TGID);
}
/*
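The tgid recording introduced in this file keeps the pid to tgid mapping in a flat array sized PID_MAX_DEFAULT + 1, allocated when the record-tgid option is first enabled (see the set_tracer_flag() hunk further down). A minimal sketch of that table; the demo_* names are illustrative:

#include <linux/errno.h>
#include <linux/slab.h>
#include <linux/threads.h>

static int *demo_tgid_map;

static int demo_tgid_map_init(void)
{
	if (demo_tgid_map)
		return 0;
	demo_tgid_map = kcalloc(PID_MAX_DEFAULT + 1, sizeof(*demo_tgid_map),
				GFP_KERNEL);
	return demo_tgid_map ? 0 : -ENOMEM;
}

/* Returns 1 on success so callers can clear their per-CPU "needs save" flag. */
static int demo_save_tgid(int pid, int tgid)
{
	if (!pid)				/* idle task: treat as recorded */
		return 1;
	if (!demo_tgid_map || pid > PID_MAX_DEFAULT)
		return 0;
	demo_tgid_map[pid] = tgid;
	return 1;
}

static int demo_find_tgid(int pid)
{
	if (!demo_tgid_map || pid <= 0 || pid > PID_MAX_DEFAULT)
		return 0;
	return demo_tgid_map[pid];
}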
@@ -3146,7 +3241,7 @@ static void *s_start(struct seq_file *m, loff_t *pos)
#endif
if (!iter->snapshot)
- atomic_inc(&trace_record_cmdline_disabled);
+ atomic_inc(&trace_record_taskinfo_disabled);
if (*pos != iter->pos) {
iter->ent = NULL;
@@ -3191,7 +3286,7 @@ static void s_stop(struct seq_file *m, void *p)
#endif
if (!iter->snapshot)
- atomic_dec(&trace_record_cmdline_disabled);
+ atomic_dec(&trace_record_taskinfo_disabled);
trace_access_unlock(iter->cpu_file);
trace_event_read_unlock();
@@ -3248,23 +3343,38 @@ static void print_event_info(struct trace_buffer *buf, struct seq_file *m)
seq_puts(m, "#\n");
}
-static void print_func_help_header(struct trace_buffer *buf, struct seq_file *m)
+static void print_func_help_header(struct trace_buffer *buf, struct seq_file *m,
+ unsigned int flags)
{
+ bool tgid = flags & TRACE_ITER_RECORD_TGID;
+
print_event_info(buf, m);
- seq_puts(m, "# TASK-PID CPU# TIMESTAMP FUNCTION\n"
- "# | | | | |\n");
+
+ seq_printf(m, "# TASK-PID CPU# %s TIMESTAMP FUNCTION\n", tgid ? "TGID " : "");
+ seq_printf(m, "# | | | %s | |\n", tgid ? " | " : "");
}
-static void print_func_help_header_irq(struct trace_buffer *buf, struct seq_file *m)
+static void print_func_help_header_irq(struct trace_buffer *buf, struct seq_file *m,
+ unsigned int flags)
{
- print_event_info(buf, m);
- seq_puts(m, "# _-----=> irqs-off\n"
- "# / _----=> need-resched\n"
- "# | / _---=> hardirq/softirq\n"
- "# || / _--=> preempt-depth\n"
- "# ||| / delay\n"
- "# TASK-PID CPU# |||| TIMESTAMP FUNCTION\n"
- "# | | | |||| | |\n");
+ bool tgid = flags & TRACE_ITER_RECORD_TGID;
+ const char tgid_space[] = " ";
+ const char space[] = " ";
+
+ seq_printf(m, "# %s _-----=> irqs-off\n",
+ tgid ? tgid_space : space);
+ seq_printf(m, "# %s / _----=> need-resched\n",
+ tgid ? tgid_space : space);
+ seq_printf(m, "# %s| / _---=> hardirq/softirq\n",
+ tgid ? tgid_space : space);
+ seq_printf(m, "# %s|| / _--=> preempt-depth\n",
+ tgid ? tgid_space : space);
+ seq_printf(m, "# %s||| / delay\n",
+ tgid ? tgid_space : space);
+ seq_printf(m, "# TASK-PID CPU#%s|||| TIMESTAMP FUNCTION\n",
+ tgid ? " TGID " : space);
+ seq_printf(m, "# | | | %s|||| | |\n",
+ tgid ? " | " : space);
}
void
@@ -3580,9 +3690,11 @@ void trace_default_header(struct seq_file *m)
} else {
if (!(trace_flags & TRACE_ITER_VERBOSE)) {
if (trace_flags & TRACE_ITER_IRQ_INFO)
- print_func_help_header_irq(iter->trace_buffer, m);
+ print_func_help_header_irq(iter->trace_buffer,
+ m, trace_flags);
else
- print_func_help_header(iter->trace_buffer, m);
+ print_func_help_header(iter->trace_buffer, m,
+ trace_flags);
}
}
}
@@ -4238,6 +4350,18 @@ int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled)
if (mask == TRACE_ITER_RECORD_CMD)
trace_event_enable_cmd_record(enabled);
+ if (mask == TRACE_ITER_RECORD_TGID) {
+ if (!tgid_map)
+ tgid_map = kzalloc((PID_MAX_DEFAULT + 1) * sizeof(*tgid_map),
+ GFP_KERNEL);
+ if (!tgid_map) {
+ tr->trace_flags &= ~TRACE_ITER_RECORD_TGID;
+ return -ENOMEM;
+ }
+
+ trace_event_enable_tgid_record(enabled);
+ }
+
if (mask == TRACE_ITER_EVENT_FORK)
trace_event_follow_fork(tr, enabled);
@@ -4473,7 +4597,8 @@ static const char readme_msg[] =
#endif
#if defined(CONFIG_KPROBE_EVENTS) || defined(CONFIG_UPROBE_EVENTS)
"\t accepts: event-definitions (one definition per line)\n"
- "\t Format: p|r[:[<group>/]<event>] <place> [<args>]\n"
+ "\t Format: p[:[<group>/]<event>] <place> [<args>]\n"
+ "\t r[maxactive][:[<group>/]<event>] <place> [<args>]\n"
"\t -:[<group>/]<event>\n"
#ifdef CONFIG_KPROBE_EVENTS
"\t place: [<module>:]<symbol>[+<offset>]|<memaddr>\n"
@@ -4597,6 +4722,76 @@ static const struct file_operations tracing_readme_fops = {
.llseek = generic_file_llseek,
};
+static void *saved_tgids_next(struct seq_file *m, void *v, loff_t *pos)
+{
+ int *ptr = v;
+
+ if (*pos || m->count)
+ ptr++;
+
+ (*pos)++;
+
+ for (; ptr <= &tgid_map[PID_MAX_DEFAULT]; ptr++) {
+ if (trace_find_tgid(*ptr))
+ return ptr;
+ }
+
+ return NULL;
+}
+
+static void *saved_tgids_start(struct seq_file *m, loff_t *pos)
+{
+ void *v;
+ loff_t l = 0;
+
+ if (!tgid_map)
+ return NULL;
+
+ v = &tgid_map[0];
+ while (l <= *pos) {
+ v = saved_tgids_next(m, v, &l);
+ if (!v)
+ return NULL;
+ }
+
+ return v;
+}
+
+static void saved_tgids_stop(struct seq_file *m, void *v)
+{
+}
+
+static int saved_tgids_show(struct seq_file *m, void *v)
+{
+ int pid = (int *)v - tgid_map;
+
+ seq_printf(m, "%d %d\n", pid, trace_find_tgid(pid));
+ return 0;
+}
+
+static const struct seq_operations tracing_saved_tgids_seq_ops = {
+ .start = saved_tgids_start,
+ .stop = saved_tgids_stop,
+ .next = saved_tgids_next,
+ .show = saved_tgids_show,
+};
+
+static int tracing_saved_tgids_open(struct inode *inode, struct file *filp)
+{
+ if (tracing_disabled)
+ return -ENODEV;
+
+ return seq_open(filp, &tracing_saved_tgids_seq_ops);
+}
+
+
+static const struct file_operations tracing_saved_tgids_fops = {
+ .open = tracing_saved_tgids_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
static void *saved_cmdlines_next(struct seq_file *m, void *v, loff_t *pos)
{
unsigned int *ptr = v;
@@ -4746,11 +4941,11 @@ static const struct file_operations tracing_saved_cmdlines_size_fops = {
.write = tracing_saved_cmdlines_size_write,
};
-#ifdef CONFIG_TRACE_ENUM_MAP_FILE
-static union trace_enum_map_item *
-update_enum_map(union trace_enum_map_item *ptr)
+#ifdef CONFIG_TRACE_EVAL_MAP_FILE
+static union trace_eval_map_item *
+update_eval_map(union trace_eval_map_item *ptr)
{
- if (!ptr->map.enum_string) {
+ if (!ptr->map.eval_string) {
if (ptr->tail.next) {
ptr = ptr->tail.next;
/* Set ptr to the next real item (skip head) */
@@ -4761,15 +4956,15 @@ update_enum_map(union trace_enum_map_item *ptr)
return ptr;
}
-static void *enum_map_next(struct seq_file *m, void *v, loff_t *pos)
+static void *eval_map_next(struct seq_file *m, void *v, loff_t *pos)
{
- union trace_enum_map_item *ptr = v;
+ union trace_eval_map_item *ptr = v;
/*
* Paranoid! If ptr points to end, we don't want to increment past it.
* This really should never happen.
*/
- ptr = update_enum_map(ptr);
+ ptr = update_eval_map(ptr);
if (WARN_ON_ONCE(!ptr))
return NULL;
@@ -4777,104 +4972,104 @@ static void *enum_map_next(struct seq_file *m, void *v, loff_t *pos)
(*pos)++;
- ptr = update_enum_map(ptr);
+ ptr = update_eval_map(ptr);
return ptr;
}
-static void *enum_map_start(struct seq_file *m, loff_t *pos)
+static void *eval_map_start(struct seq_file *m, loff_t *pos)
{
- union trace_enum_map_item *v;
+ union trace_eval_map_item *v;
loff_t l = 0;
- mutex_lock(&trace_enum_mutex);
+ mutex_lock(&trace_eval_mutex);
- v = trace_enum_maps;
+ v = trace_eval_maps;
if (v)
v++;
while (v && l < *pos) {
- v = enum_map_next(m, v, &l);
+ v = eval_map_next(m, v, &l);
}
return v;
}
-static void enum_map_stop(struct seq_file *m, void *v)
+static void eval_map_stop(struct seq_file *m, void *v)
{
- mutex_unlock(&trace_enum_mutex);
+ mutex_unlock(&trace_eval_mutex);
}
-static int enum_map_show(struct seq_file *m, void *v)
+static int eval_map_show(struct seq_file *m, void *v)
{
- union trace_enum_map_item *ptr = v;
+ union trace_eval_map_item *ptr = v;
seq_printf(m, "%s %ld (%s)\n",
- ptr->map.enum_string, ptr->map.enum_value,
+ ptr->map.eval_string, ptr->map.eval_value,
ptr->map.system);
return 0;
}
-static const struct seq_operations tracing_enum_map_seq_ops = {
- .start = enum_map_start,
- .next = enum_map_next,
- .stop = enum_map_stop,
- .show = enum_map_show,
+static const struct seq_operations tracing_eval_map_seq_ops = {
+ .start = eval_map_start,
+ .next = eval_map_next,
+ .stop = eval_map_stop,
+ .show = eval_map_show,
};
-static int tracing_enum_map_open(struct inode *inode, struct file *filp)
+static int tracing_eval_map_open(struct inode *inode, struct file *filp)
{
if (tracing_disabled)
return -ENODEV;
- return seq_open(filp, &tracing_enum_map_seq_ops);
+ return seq_open(filp, &tracing_eval_map_seq_ops);
}
-static const struct file_operations tracing_enum_map_fops = {
- .open = tracing_enum_map_open,
+static const struct file_operations tracing_eval_map_fops = {
+ .open = tracing_eval_map_open,
.read = seq_read,
.llseek = seq_lseek,
.release = seq_release,
};
-static inline union trace_enum_map_item *
-trace_enum_jmp_to_tail(union trace_enum_map_item *ptr)
+static inline union trace_eval_map_item *
+trace_eval_jmp_to_tail(union trace_eval_map_item *ptr)
{
/* Return tail of array given the head */
return ptr + ptr->head.length + 1;
}
static void
-trace_insert_enum_map_file(struct module *mod, struct trace_enum_map **start,
+trace_insert_eval_map_file(struct module *mod, struct trace_eval_map **start,
int len)
{
- struct trace_enum_map **stop;
- struct trace_enum_map **map;
- union trace_enum_map_item *map_array;
- union trace_enum_map_item *ptr;
+ struct trace_eval_map **stop;
+ struct trace_eval_map **map;
+ union trace_eval_map_item *map_array;
+ union trace_eval_map_item *ptr;
stop = start + len;
/*
- * The trace_enum_maps contains the map plus a head and tail item,
+ * The trace_eval_maps contains the map plus a head and tail item,
* where the head holds the module and length of array, and the
* tail holds a pointer to the next list.
*/
map_array = kmalloc(sizeof(*map_array) * (len + 2), GFP_KERNEL);
if (!map_array) {
- pr_warn("Unable to allocate trace enum mapping\n");
+ pr_warn("Unable to allocate trace eval mapping\n");
return;
}
- mutex_lock(&trace_enum_mutex);
+ mutex_lock(&trace_eval_mutex);
- if (!trace_enum_maps)
- trace_enum_maps = map_array;
+ if (!trace_eval_maps)
+ trace_eval_maps = map_array;
else {
- ptr = trace_enum_maps;
+ ptr = trace_eval_maps;
for (;;) {
- ptr = trace_enum_jmp_to_tail(ptr);
+ ptr = trace_eval_jmp_to_tail(ptr);
if (!ptr->tail.next)
break;
ptr = ptr->tail.next;
@@ -4892,34 +5087,34 @@ trace_insert_enum_map_file(struct module *mod, struct trace_enum_map **start,
}
memset(map_array, 0, sizeof(*map_array));
- mutex_unlock(&trace_enum_mutex);
+ mutex_unlock(&trace_eval_mutex);
}
-static void trace_create_enum_file(struct dentry *d_tracer)
+static void trace_create_eval_file(struct dentry *d_tracer)
{
- trace_create_file("enum_map", 0444, d_tracer,
- NULL, &tracing_enum_map_fops);
+ trace_create_file("eval_map", 0444, d_tracer,
+ NULL, &tracing_eval_map_fops);
}
-#else /* CONFIG_TRACE_ENUM_MAP_FILE */
-static inline void trace_create_enum_file(struct dentry *d_tracer) { }
-static inline void trace_insert_enum_map_file(struct module *mod,
- struct trace_enum_map **start, int len) { }
-#endif /* !CONFIG_TRACE_ENUM_MAP_FILE */
+#else /* CONFIG_TRACE_EVAL_MAP_FILE */
+static inline void trace_create_eval_file(struct dentry *d_tracer) { }
+static inline void trace_insert_eval_map_file(struct module *mod,
+ struct trace_eval_map **start, int len) { }
+#endif /* !CONFIG_TRACE_EVAL_MAP_FILE */
-static void trace_insert_enum_map(struct module *mod,
- struct trace_enum_map **start, int len)
+static void trace_insert_eval_map(struct module *mod,
+ struct trace_eval_map **start, int len)
{
- struct trace_enum_map **map;
+ struct trace_eval_map **map;
if (len <= 0)
return;
map = start;
- trace_event_enum_update(map, len);
+ trace_event_eval_update(map, len);
- trace_insert_enum_map_file(mod, start, len);
+ trace_insert_eval_map_file(mod, start, len);
}
static ssize_t
@@ -6403,7 +6598,7 @@ tracing_buffers_read(struct file *filp, char __user *ubuf,
{
struct ftrace_buffer_info *info = filp->private_data;
struct trace_iterator *iter = &info->iter;
- ssize_t ret;
+ ssize_t ret = 0;
ssize_t size;
if (!count)
@@ -6417,10 +6612,15 @@ tracing_buffers_read(struct file *filp, char __user *ubuf,
if (!info->spare) {
info->spare = ring_buffer_alloc_read_page(iter->trace_buffer->buffer,
iter->cpu_file);
- info->spare_cpu = iter->cpu_file;
+ if (IS_ERR(info->spare)) {
+ ret = PTR_ERR(info->spare);
+ info->spare = NULL;
+ } else {
+ info->spare_cpu = iter->cpu_file;
+ }
}
if (!info->spare)
- return -ENOMEM;
+ return ret;
/* Do we have previous read data to read? */
if (info->read < PAGE_SIZE)
@@ -6595,8 +6795,9 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
ref->ref = 1;
ref->buffer = iter->trace_buffer->buffer;
ref->page = ring_buffer_alloc_read_page(ref->buffer, iter->cpu_file);
- if (!ref->page) {
- ret = -ENOMEM;
+ if (IS_ERR(ref->page)) {
+ ret = PTR_ERR(ref->page);
+ ref->page = NULL;
kfree(ref);
break;
}
@@ -6739,33 +6940,18 @@ static const struct file_operations tracing_stats_fops = {
#ifdef CONFIG_DYNAMIC_FTRACE
-int __weak ftrace_arch_read_dyn_info(char *buf, int size)
-{
- return 0;
-}
-
static ssize_t
tracing_read_dyn_info(struct file *filp, char __user *ubuf,
size_t cnt, loff_t *ppos)
{
- static char ftrace_dyn_info_buffer[1024];
- static DEFINE_MUTEX(dyn_info_mutex);
unsigned long *p = filp->private_data;
- char *buf = ftrace_dyn_info_buffer;
- int size = ARRAY_SIZE(ftrace_dyn_info_buffer);
+ char buf[64]; /* Not too big for a shallow stack */
int r;
- mutex_lock(&dyn_info_mutex);
- r = sprintf(buf, "%ld ", *p);
-
- r += ftrace_arch_read_dyn_info(buf+r, (size-1)-r);
+ r = scnprintf(buf, 63, "%ld", *p);
buf[r++] = '\n';
- r = simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
-
- mutex_unlock(&dyn_info_mutex);
-
- return r;
+ return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
}
static const struct file_operations tracing_dyn_info_fops = {
@@ -7594,6 +7780,7 @@ static int instance_rmdir(const char *name)
}
kfree(tr->topts);
+ free_cpumask_var(tr->tracing_cpumask);
kfree(tr->name);
kfree(tr);
@@ -7737,21 +7924,21 @@ struct dentry *tracing_init_dentry(void)
return NULL;
}
-extern struct trace_enum_map *__start_ftrace_enum_maps[];
-extern struct trace_enum_map *__stop_ftrace_enum_maps[];
+extern struct trace_eval_map *__start_ftrace_eval_maps[];
+extern struct trace_eval_map *__stop_ftrace_eval_maps[];
-static void __init trace_enum_init(void)
+static void __init trace_eval_init(void)
{
int len;
- len = __stop_ftrace_enum_maps - __start_ftrace_enum_maps;
- trace_insert_enum_map(NULL, __start_ftrace_enum_maps, len);
+ len = __stop_ftrace_eval_maps - __start_ftrace_eval_maps;
+ trace_insert_eval_map(NULL, __start_ftrace_eval_maps, len);
}
#ifdef CONFIG_MODULES
-static void trace_module_add_enums(struct module *mod)
+static void trace_module_add_evals(struct module *mod)
{
- if (!mod->num_trace_enums)
+ if (!mod->num_trace_evals)
return;
/*
@@ -7761,40 +7948,40 @@ static void trace_module_add_enums(struct module *mod)
if (trace_module_has_bad_taint(mod))
return;
- trace_insert_enum_map(mod, mod->trace_enums, mod->num_trace_enums);
+ trace_insert_eval_map(mod, mod->trace_evals, mod->num_trace_evals);
}
-#ifdef CONFIG_TRACE_ENUM_MAP_FILE
-static void trace_module_remove_enums(struct module *mod)
+#ifdef CONFIG_TRACE_EVAL_MAP_FILE
+static void trace_module_remove_evals(struct module *mod)
{
- union trace_enum_map_item *map;
- union trace_enum_map_item **last = &trace_enum_maps;
+ union trace_eval_map_item *map;
+ union trace_eval_map_item **last = &trace_eval_maps;
- if (!mod->num_trace_enums)
+ if (!mod->num_trace_evals)
return;
- mutex_lock(&trace_enum_mutex);
+ mutex_lock(&trace_eval_mutex);
- map = trace_enum_maps;
+ map = trace_eval_maps;
while (map) {
if (map->head.mod == mod)
break;
- map = trace_enum_jmp_to_tail(map);
+ map = trace_eval_jmp_to_tail(map);
last = &map->tail.next;
map = map->tail.next;
}
if (!map)
goto out;
- *last = trace_enum_jmp_to_tail(map)->tail.next;
+ *last = trace_eval_jmp_to_tail(map)->tail.next;
kfree(map);
out:
- mutex_unlock(&trace_enum_mutex);
+ mutex_unlock(&trace_eval_mutex);
}
#else
-static inline void trace_module_remove_enums(struct module *mod) { }
-#endif /* CONFIG_TRACE_ENUM_MAP_FILE */
+static inline void trace_module_remove_evals(struct module *mod) { }
+#endif /* CONFIG_TRACE_EVAL_MAP_FILE */
static int trace_module_notify(struct notifier_block *self,
unsigned long val, void *data)
@@ -7803,10 +7990,10 @@ static int trace_module_notify(struct notifier_block *self,
switch (val) {
case MODULE_STATE_COMING:
- trace_module_add_enums(mod);
+ trace_module_add_evals(mod);
break;
case MODULE_STATE_GOING:
- trace_module_remove_enums(mod);
+ trace_module_remove_evals(mod);
break;
}
@@ -7844,9 +8031,12 @@ static __init int tracer_init_tracefs(void)
trace_create_file("saved_cmdlines_size", 0644, d_tracer,
NULL, &tracing_saved_cmdlines_size_fops);
- trace_enum_init();
+ trace_create_file("saved_tgids", 0444, d_tracer,
+ NULL, &tracing_saved_tgids_fops);
+
+ trace_eval_init();
- trace_create_enum_file(d_tracer);
+ trace_create_eval_file(d_tracer);
#ifdef CONFIG_MODULES
register_module_notifier(&trace_module_nb);
@@ -8109,6 +8299,7 @@ __init static int tracer_alloc_buffers(void)
if (ret < 0)
goto out_free_cpumask;
/* Used for event triggers */
+ ret = -ENOMEM;
temp_buffer = ring_buffer_alloc(PAGE_SIZE, RB_FL_OVERWRITE);
if (!temp_buffer)
goto out_rm_hp_state;
@@ -8223,4 +8414,4 @@ __init static int clear_boot_tracer(void)
}
fs_initcall(tracer_init_tracefs);
-late_initcall(clear_boot_tracer);
+late_initcall_sync(clear_boot_tracer);
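The eval (formerly enum) maps renamed throughout this file are kept in chained blocks: an array of union items with a head element carrying the owning module and count, the map entries in the middle, and a tail element linking to the next block. A simplified sketch of that layout, with illustrative demo_* types:

#include <linux/slab.h>

struct demo_eval_map {
	const char	*eval_string;
	unsigned long	eval_value;
};

union demo_eval_item {
	struct demo_eval_map map;
	struct {				/* first element of a block */
		void		*mod;
		unsigned long	length;
	} head;
	struct {				/* last element of a block */
		union demo_eval_item *next;
	} tail;
};

/* Jump from a block's head to its tail, as trace_eval_jmp_to_tail() does. */
static union demo_eval_item *demo_jmp_to_tail(union demo_eval_item *ptr)
{
	return ptr + ptr->head.length + 1;
}

/* One block holds 'len' maps plus the head and tail bookkeeping elements. */
static union demo_eval_item *demo_alloc_block(unsigned long len)
{
	return kcalloc(len + 2, sizeof(union demo_eval_item), GFP_KERNEL);
}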
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 39fd77330aab..490ba229931d 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -263,7 +263,10 @@ struct trace_array {
struct ftrace_ops *ops;
struct trace_pid_list __rcu *function_pids;
#ifdef CONFIG_DYNAMIC_FTRACE
+ /* All of these are protected by the ftrace_lock */
struct list_head func_probes;
+ struct list_head mod_trace;
+ struct list_head mod_notrace;
#endif
/* function tracing enabled */
int function_enabled;
@@ -637,6 +640,9 @@ void set_graph_array(struct trace_array *tr);
void tracing_start_cmdline_record(void);
void tracing_stop_cmdline_record(void);
+void tracing_start_tgid_record(void);
+void tracing_stop_tgid_record(void);
+
int register_tracer(struct tracer *type);
int is_tracing_stopped(void);
@@ -697,6 +703,7 @@ static inline void __trace_stack(struct trace_array *tr, unsigned long flags,
extern u64 ftrace_now(int cpu);
extern void trace_find_cmdline(int pid, char comm[]);
+extern int trace_find_tgid(int pid);
extern void trace_event_follow_fork(struct trace_array *tr, bool enable);
#ifdef CONFIG_DYNAMIC_FTRACE
@@ -761,10 +768,24 @@ enum print_line_t print_trace_line(struct trace_iterator *iter);
extern char trace_find_mark(unsigned long long duration);
+struct ftrace_hash;
+
+struct ftrace_mod_load {
+ struct list_head list;
+ char *func;
+ char *module;
+ int enable;
+};
+
+enum {
+ FTRACE_HASH_FL_MOD = (1 << 0),
+};
+
struct ftrace_hash {
unsigned long size_bits;
struct hlist_head *buckets;
unsigned long count;
+ unsigned long flags;
struct rcu_head rcu;
};
@@ -773,7 +794,7 @@ ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip);
static __always_inline bool ftrace_hash_empty(struct ftrace_hash *hash)
{
- return !hash || !hash->count;
+ return !hash || !(hash->count || (hash->flags & FTRACE_HASH_FL_MOD));
}
/* Standard output formatting function used for function return traces */
@@ -1107,6 +1128,7 @@ extern int trace_get_user(struct trace_parser *parser, const char __user *ubuf,
C(CONTEXT_INFO, "context-info"), /* Print pid/cpu/time */ \
C(LATENCY_FMT, "latency-format"), \
C(RECORD_CMD, "record-cmd"), \
+ C(RECORD_TGID, "record-tgid"), \
C(OVERWRITE, "overwrite"), \
C(STOP_ON_FREE, "disable_on_free"), \
C(IRQ_INFO, "irq-info"), \
@@ -1188,9 +1210,9 @@ struct ftrace_event_field {
struct event_filter {
int n_preds; /* Number assigned */
int a_preds; /* allocated */
- struct filter_pred *preds;
- struct filter_pred *root;
- char *filter_string;
+ struct filter_pred __rcu *preds;
+ struct filter_pred __rcu *root;
+ char *filter_string;
};
struct event_subsystem {
@@ -1423,6 +1445,8 @@ struct ftrace_event_field *
trace_find_event_field(struct trace_event_call *call, char *name);
extern void trace_event_enable_cmd_record(bool enable);
+extern void trace_event_enable_tgid_record(bool enable);
+
extern int event_trace_add_tracer(struct dentry *parent, struct trace_array *tr);
extern int event_trace_del_tracer(struct trace_array *tr);
@@ -1773,10 +1797,10 @@ static inline const char *get_syscall_name(int syscall)
#ifdef CONFIG_EVENT_TRACING
void trace_event_init(void);
-void trace_event_enum_update(struct trace_enum_map **map, int len);
+void trace_event_eval_update(struct trace_eval_map **map, int len);
#else
static inline void __init trace_event_init(void) { }
-static inline void trace_event_enum_update(struct trace_enum_map **map, int len) { }
+static inline void trace_event_eval_update(struct trace_eval_map **map, int len) { }
#endif
extern struct trace_iterator *tracepoint_print_iter;
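The trace.h change to ftrace_hash_empty() is subtle: a hash with zero entries but with FTRACE_HASH_FL_MOD set (cached ":mod:" filters pending) must not be treated as "match everything". A tiny sketch of the check, with illustrative demo_* names:

#include <linux/types.h>

struct demo_hash {
	unsigned long	count;
	unsigned long	flags;
};

#define DEMO_HASH_FL_MOD	(1UL << 0)

static bool demo_hash_empty(const struct demo_hash *hash)
{
	if (!hash)
		return true;
	/* non-empty if it holds entries or has deferred module filters attached */
	return !(hash->count || (hash->flags & DEMO_HASH_FL_MOD));
}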
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index 562fa69df5d3..13ba2d3f6a91 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -306,6 +306,7 @@ static void
perf_ftrace_function_call(unsigned long ip, unsigned long parent_ip,
struct ftrace_ops *ops, struct pt_regs *pt_regs)
{
+ struct perf_event *event;
struct ftrace_entry *entry;
struct hlist_head *head;
struct pt_regs regs;
@@ -329,8 +330,9 @@ perf_ftrace_function_call(unsigned long ip, unsigned long parent_ip,
entry->ip = ip;
entry->parent_ip = parent_ip;
+ event = container_of(ops, struct perf_event, ftrace_ops);
perf_trace_buf_submit(entry, ENTRY_SIZE, rctx, TRACE_FN,
- 1, &regs, head, NULL);
+ 1, &regs, head, NULL, event);
#undef ENTRY_SIZE
}
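The perf hunk above recovers the owning perf_event from the ftrace_ops embedded inside it before submitting the buffer. A minimal sketch of that container_of() step, with illustrative demo_* types standing in for the real perf structures:

#include <linux/kernel.h>

struct demo_ops {
	void (*func)(unsigned long ip, struct demo_ops *ops);
};

struct demo_event {
	int		id;
	struct demo_ops	ftrace_ops;	/* embedded, as in struct perf_event */
};

static void demo_callback(unsigned long ip, struct demo_ops *ops)
{
	/* recover the enclosing object from the embedded ops pointer */
	struct demo_event *event = container_of(ops, struct demo_event, ftrace_ops);

	(void)event;			/* identifies which event registered this ops */
	(void)ip;
}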
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index e7973e10398c..36132f9280e6 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -343,6 +343,28 @@ void trace_event_enable_cmd_record(bool enable)
mutex_unlock(&event_mutex);
}
+void trace_event_enable_tgid_record(bool enable)
+{
+ struct trace_event_file *file;
+ struct trace_array *tr;
+
+ mutex_lock(&event_mutex);
+ do_for_each_event_file(tr, file) {
+ if (!(file->flags & EVENT_FILE_FL_ENABLED))
+ continue;
+
+ if (enable) {
+ tracing_start_tgid_record();
+ set_bit(EVENT_FILE_FL_RECORDED_TGID_BIT, &file->flags);
+ } else {
+ tracing_stop_tgid_record();
+ clear_bit(EVENT_FILE_FL_RECORDED_TGID_BIT,
+ &file->flags);
+ }
+ } while_for_each_event_file();
+ mutex_unlock(&event_mutex);
+}
+
static int __ftrace_event_enable_disable(struct trace_event_file *file,
int enable, int soft_disable)
{
@@ -381,6 +403,12 @@ static int __ftrace_event_enable_disable(struct trace_event_file *file,
tracing_stop_cmdline_record();
clear_bit(EVENT_FILE_FL_RECORDED_CMD_BIT, &file->flags);
}
+
+ if (file->flags & EVENT_FILE_FL_RECORDED_TGID) {
+ tracing_stop_tgid_record();
+ clear_bit(EVENT_FILE_FL_RECORDED_TGID_BIT, &file->flags);
+ }
+
call->class->reg(call, TRACE_REG_UNREGISTER, file);
}
/* If in SOFT_MODE, just set the SOFT_DISABLE_BIT, else clear it */
@@ -407,18 +435,30 @@ static int __ftrace_event_enable_disable(struct trace_event_file *file,
}
if (!(file->flags & EVENT_FILE_FL_ENABLED)) {
+ bool cmd = false, tgid = false;
/* Keep the event disabled, when going to SOFT_MODE. */
if (soft_disable)
set_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &file->flags);
if (tr->trace_flags & TRACE_ITER_RECORD_CMD) {
+ cmd = true;
tracing_start_cmdline_record();
set_bit(EVENT_FILE_FL_RECORDED_CMD_BIT, &file->flags);
}
+
+ if (tr->trace_flags & TRACE_ITER_RECORD_TGID) {
+ tgid = true;
+ tracing_start_tgid_record();
+ set_bit(EVENT_FILE_FL_RECORDED_TGID_BIT, &file->flags);
+ }
+
ret = call->class->reg(call, TRACE_REG_REGISTER, file);
if (ret) {
- tracing_stop_cmdline_record();
+ if (cmd)
+ tracing_stop_cmdline_record();
+ if (tgid)
+ tracing_stop_tgid_record();
pr_info("event trace: Could not enable event "
"%s\n", trace_event_name(call));
break;
@@ -2067,18 +2107,18 @@ __register_event(struct trace_event_call *call, struct module *mod)
return 0;
}
-static char *enum_replace(char *ptr, struct trace_enum_map *map, int len)
+static char *eval_replace(char *ptr, struct trace_eval_map *map, int len)
{
int rlen;
int elen;
- /* Find the length of the enum value as a string */
- elen = snprintf(ptr, 0, "%ld", map->enum_value);
+ /* Find the length of the eval value as a string */
+ elen = snprintf(ptr, 0, "%ld", map->eval_value);
/* Make sure there's enough room to replace the string with the value */
if (len < elen)
return NULL;
- snprintf(ptr, elen + 1, "%ld", map->enum_value);
+ snprintf(ptr, elen + 1, "%ld", map->eval_value);
/* Get the rest of the string of ptr */
rlen = strlen(ptr + len);
@@ -2090,11 +2130,11 @@ static char *enum_replace(char *ptr, struct trace_enum_map *map, int len)
}
static void update_event_printk(struct trace_event_call *call,
- struct trace_enum_map *map)
+ struct trace_eval_map *map)
{
char *ptr;
int quote = 0;
- int len = strlen(map->enum_string);
+ int len = strlen(map->eval_string);
for (ptr = call->print_fmt; *ptr; ptr++) {
if (*ptr == '\\') {
@@ -2125,16 +2165,16 @@ static void update_event_printk(struct trace_event_call *call,
continue;
}
if (isalpha(*ptr) || *ptr == '_') {
- if (strncmp(map->enum_string, ptr, len) == 0 &&
+ if (strncmp(map->eval_string, ptr, len) == 0 &&
!isalnum(ptr[len]) && ptr[len] != '_') {
- ptr = enum_replace(ptr, map, len);
- /* Hmm, enum string smaller than value */
+ ptr = eval_replace(ptr, map, len);
+ /* enum/sizeof string smaller than value */
if (WARN_ON_ONCE(!ptr))
return;
/*
- * No need to decrement here, as enum_replace()
+ * No need to decrement here, as eval_replace()
* returns the pointer to the character past
- * the enum, and two enums can not be placed
+ * the eval, and two evals can not be placed
* back to back without something in between.
* We can skip that something in between.
*/
@@ -2165,7 +2205,7 @@ static void update_event_printk(struct trace_event_call *call,
}
}
-void trace_event_enum_update(struct trace_enum_map **map, int len)
+void trace_event_eval_update(struct trace_eval_map **map, int len)
{
struct trace_event_call *call, *p;
const char *last_system = NULL;
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 59a411ff60c7..181e139a8057 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -1959,6 +1959,10 @@ static int create_filter(struct trace_event_call *call,
if (err && set_str)
append_filter_err(ps, filter);
}
+ if (err && !set_str) {
+ free_event_filter(filter);
+ filter = NULL;
+ }
create_filter_finish(ps);
*filterp = filter;
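The trace_events_filter.c hunk closes a leak: when filter creation fails and the caller did not ask for an error string, the partially built filter is now freed instead of being handed back. A hedged sketch of the corrected error path; demo_parse() and the other demo_* names are illustrative stand-ins:

#include <linux/errno.h>
#include <linux/slab.h>
#include <linux/types.h>

struct demo_filter {
	char *filter_string;
};

static void demo_free_filter(struct demo_filter *filter)
{
	if (!filter)
		return;
	kfree(filter->filter_string);
	kfree(filter);
}

/* Stand-in for the real predicate parser; fails on an empty string. */
static int demo_parse(struct demo_filter *filter, const char *str)
{
	return (str && *str) ? 0 : -EINVAL;
}

static int demo_create_filter(const char *str, bool set_str,
			      struct demo_filter **filterp)
{
	struct demo_filter *filter;
	int err;

	filter = kzalloc(sizeof(*filter), GFP_KERNEL);
	if (!filter)
		return -ENOMEM;

	err = demo_parse(filter, str);

	/* stand-in for append_filter_err(): keep the filter to carry the text */
	if (err && set_str)
		filter->filter_string = kstrdup(str, GFP_KERNEL);

	if (err && !set_str) {
		/* nothing useful to report back: drop the partial filter */
		demo_free_filter(filter);
		filter = NULL;
	}

	*filterp = filter;
	return err;
}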
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index b53c8d369163..8a907e12b6b9 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -598,6 +598,14 @@ static struct notifier_block trace_kprobe_module_nb = {
.priority = 1 /* Invoked after kprobe module callback */
};
+/* Convert certain expected symbols into '_' when generating event names */
+static inline void sanitize_event_name(char *name)
+{
+ while (*name++ != '\0')
+ if (*name == ':' || *name == '.')
+ *name = '_';
+}
+
static int create_trace_kprobe(int argc, char **argv)
{
/*
@@ -720,7 +728,7 @@ static int create_trace_kprobe(int argc, char **argv)
return ret;
}
if (offset && is_return &&
- !function_offset_within_entry(NULL, symbol, offset)) {
+ !kprobe_on_func_entry(NULL, symbol, offset)) {
pr_info("Given offset is not valid for return probe.\n");
return -EINVAL;
}
@@ -736,6 +744,7 @@ static int create_trace_kprobe(int argc, char **argv)
else
snprintf(buf, MAX_EVENT_NAME_LEN, "%c_0x%p",
is_return ? 'r' : 'p', addr);
+ sanitize_event_name(buf);
event = buf;
}
tk = alloc_trace_kprobe(group, event, addr, symbol, offset, maxactive,
@@ -1191,7 +1200,7 @@ kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs)
memset(&entry[1], 0, dsize);
store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize);
perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs,
- head, NULL);
+ head, NULL, NULL);
}
NOKPROBE_SYMBOL(kprobe_perf_func);
@@ -1227,7 +1236,7 @@ kretprobe_perf_func(struct trace_kprobe *tk, struct kretprobe_instance *ri,
entry->ret_ip = (unsigned long)ri->ret_addr;
store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize);
perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs,
- head, NULL);
+ head, NULL, NULL);
}
NOKPROBE_SYMBOL(kretprobe_perf_func);
#endif /* CONFIG_PERF_EVENTS */
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 08f9bab8089e..bac629af2285 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -340,31 +340,41 @@ static inline const char *kretprobed(const char *name)
static void
seq_print_sym_short(struct trace_seq *s, const char *fmt, unsigned long address)
{
-#ifdef CONFIG_KALLSYMS
char str[KSYM_SYMBOL_LEN];
+#ifdef CONFIG_KALLSYMS
const char *name;
kallsyms_lookup(address, NULL, NULL, NULL, str);
name = kretprobed(str);
- trace_seq_printf(s, fmt, name);
+ if (name && strlen(name)) {
+ trace_seq_printf(s, fmt, name);
+ return;
+ }
#endif
+ snprintf(str, KSYM_SYMBOL_LEN, "0x%08lx", address);
+ trace_seq_printf(s, fmt, str);
}
static void
seq_print_sym_offset(struct trace_seq *s, const char *fmt,
unsigned long address)
{
-#ifdef CONFIG_KALLSYMS
char str[KSYM_SYMBOL_LEN];
+#ifdef CONFIG_KALLSYMS
const char *name;
sprint_symbol(str, address);
name = kretprobed(str);
- trace_seq_printf(s, fmt, name);
+ if (name && strlen(name)) {
+ trace_seq_printf(s, fmt, name);
+ return;
+ }
#endif
+ snprintf(str, KSYM_SYMBOL_LEN, "0x%08lx", address);
+ trace_seq_printf(s, fmt, str);
}
#ifndef CONFIG_64BIT
@@ -587,6 +597,15 @@ int trace_print_context(struct trace_iterator *iter)
trace_seq_printf(s, "%16s-%-5d [%03d] ",
comm, entry->pid, iter->cpu);
+ if (tr->trace_flags & TRACE_ITER_RECORD_TGID) {
+ unsigned int tgid = trace_find_tgid(entry->pid);
+
+ if (!tgid)
+ trace_seq_printf(s, "(-----) ");
+ else
+ trace_seq_printf(s, "(%5d) ", tgid);
+ }
+
if (tr->trace_flags & TRACE_ITER_IRQ_INFO)
trace_print_lat_fmt(s, entry);
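The trace_output.c hunks make the symbol columns fall back to a raw hex address when the kallsyms lookup yields nothing, instead of leaving the field blank. A minimal sketch of that fallback, with an illustrative demo_* name:

#include <linux/kallsyms.h>
#include <linux/trace_seq.h>

static void demo_print_sym(struct trace_seq *s, unsigned long address)
{
	char str[KSYM_SYMBOL_LEN];
	const char *name;

	name = kallsyms_lookup(address, NULL, NULL, NULL, str);
	if (name && name[0]) {
		trace_seq_printf(s, "%s", name);
		return;
	}

	/* lookup failed or returned an empty name: never leave the column blank */
	trace_seq_printf(s, "0x%08lx", address);
}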
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index 4c896a0101bd..b341c02730be 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -12,27 +12,38 @@
#include "trace.h"
-static int sched_ref;
+#define RECORD_CMDLINE 1
+#define RECORD_TGID 2
+
+static int sched_cmdline_ref;
+static int sched_tgid_ref;
static DEFINE_MUTEX(sched_register_mutex);
static void
probe_sched_switch(void *ignore, bool preempt,
struct task_struct *prev, struct task_struct *next)
{
- if (unlikely(!sched_ref))
- return;
+ int flags;
+
+ flags = (RECORD_TGID * !!sched_tgid_ref) +
+ (RECORD_CMDLINE * !!sched_cmdline_ref);
- tracing_record_cmdline(prev);
- tracing_record_cmdline(next);
+ if (!flags)
+ return;
+ tracing_record_taskinfo_sched_switch(prev, next, flags);
}
static void
probe_sched_wakeup(void *ignore, struct task_struct *wakee)
{
- if (unlikely(!sched_ref))
- return;
+ int flags;
+
+ flags = (RECORD_TGID * !!sched_tgid_ref) +
+ (RECORD_CMDLINE * !!sched_cmdline_ref);
- tracing_record_cmdline(current);
+ if (!flags)
+ return;
+ tracing_record_taskinfo(current, flags);
}
static int tracing_sched_register(void)
@@ -75,28 +86,61 @@ static void tracing_sched_unregister(void)
unregister_trace_sched_wakeup(probe_sched_wakeup, NULL);
}
-static void tracing_start_sched_switch(void)
+static void tracing_start_sched_switch(int ops)
{
+ bool sched_register = (!sched_cmdline_ref && !sched_tgid_ref);
mutex_lock(&sched_register_mutex);
- if (!(sched_ref++))
+
+ switch (ops) {
+ case RECORD_CMDLINE:
+ sched_cmdline_ref++;
+ break;
+
+ case RECORD_TGID:
+ sched_tgid_ref++;
+ break;
+ }
+
+ if (sched_register && (sched_cmdline_ref || sched_tgid_ref))
tracing_sched_register();
mutex_unlock(&sched_register_mutex);
}
-static void tracing_stop_sched_switch(void)
+static void tracing_stop_sched_switch(int ops)
{
mutex_lock(&sched_register_mutex);
- if (!(--sched_ref))
+
+ switch (ops) {
+ case RECORD_CMDLINE:
+ sched_cmdline_ref--;
+ break;
+
+ case RECORD_TGID:
+ sched_tgid_ref--;
+ break;
+ }
+
+ if (!sched_cmdline_ref && !sched_tgid_ref)
tracing_sched_unregister();
mutex_unlock(&sched_register_mutex);
}
void tracing_start_cmdline_record(void)
{
- tracing_start_sched_switch();
+ tracing_start_sched_switch(RECORD_CMDLINE);
}
void tracing_stop_cmdline_record(void)
{
- tracing_stop_sched_switch();
+ tracing_stop_sched_switch(RECORD_CMDLINE);
+}
+
+void tracing_start_tgid_record(void)
+{
+ tracing_start_sched_switch(RECORD_TGID);
+}
+
+void tracing_stop_tgid_record(void)
+{
+ tracing_stop_sched_switch(RECORD_TGID);
}
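
The rewrite above replaces the single sched_ref counter with one reference count per record type, so cmdline and TGID recording can be enabled independently while still sharing one tracepoint registration. A minimal userspace sketch of that pattern (compile with -pthread), assuming register_probes()/unregister_probes() as stand-ins for tracing_sched_register()/tracing_sched_unregister() and a pthread mutex in place of sched_register_mutex:

#include <pthread.h>
#include <stdio.h>

#define RECORD_CMDLINE	1
#define RECORD_TGID	2

static int cmdline_ref, tgid_ref;
static pthread_mutex_t reg_lock = PTHREAD_MUTEX_INITIALIZER;

static void register_probes(void)   { puts("probes registered"); }
static void unregister_probes(void) { puts("probes unregistered"); }

static void start_record(int what)
{
	int was_idle;

	pthread_mutex_lock(&reg_lock);
	was_idle = !cmdline_ref && !tgid_ref;

	if (what == RECORD_CMDLINE)
		cmdline_ref++;
	else if (what == RECORD_TGID)
		tgid_ref++;

	/* register only on the 0 -> 1 transition of either counter */
	if (was_idle && (cmdline_ref || tgid_ref))
		register_probes();
	pthread_mutex_unlock(&reg_lock);
}

static void stop_record(int what)
{
	pthread_mutex_lock(&reg_lock);
	if (what == RECORD_CMDLINE)
		cmdline_ref--;
	else if (what == RECORD_TGID)
		tgid_ref--;

	/* drop the registration once no user of either kind remains */
	if (!cmdline_ref && !tgid_ref)
		unregister_probes();
	pthread_mutex_unlock(&reg_lock);
}

int main(void)
{
	start_record(RECORD_CMDLINE);	/* registers */
	start_record(RECORD_TGID);	/* already registered, just counts */
	stop_record(RECORD_CMDLINE);
	stop_record(RECORD_TGID);	/* last user gone, unregisters */
	return 0;
}
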
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index b4a751e8f9d6..a4df67cbc711 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -406,6 +406,8 @@ static const struct file_operations stack_trace_fops = {
.release = seq_release,
};
+#ifdef CONFIG_DYNAMIC_FTRACE
+
static int
stack_trace_filter_open(struct inode *inode, struct file *file)
{
@@ -423,6 +425,8 @@ static const struct file_operations stack_trace_filter_fops = {
.release = ftrace_regex_release,
};
+#endif /* CONFIG_DYNAMIC_FTRACE */
+
int
stack_trace_sysctl(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp,
@@ -477,8 +481,10 @@ static __init int stack_trace_init(void)
trace_create_file("stack_trace", 0444, d_tracer,
NULL, &stack_trace_fops);
+#ifdef CONFIG_DYNAMIC_FTRACE
trace_create_file("stack_trace_filter", 0444, d_tracer,
&trace_ops, &stack_trace_filter_fops);
+#endif
if (stack_trace_filter_buf[0])
ftrace_set_early_filter(&trace_ops, stack_trace_filter_buf, 1);
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 5e10395da88e..74d9a86eccc0 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -596,7 +596,7 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id)
(unsigned long *)&rec->args);
perf_trace_buf_submit(rec, size, rctx,
sys_data->enter_event->event.type, 1, regs,
- head, NULL);
+ head, NULL, NULL);
}
static int perf_sysenter_enable(struct trace_event_call *call)
@@ -667,7 +667,7 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
rec->nr = syscall_nr;
rec->ret = syscall_get_return_value(current, regs);
perf_trace_buf_submit(rec, size, rctx, sys_data->exit_event->event.type,
- 1, regs, head, NULL);
+ 1, regs, head, NULL, NULL);
}
static int perf_sysexit_enable(struct trace_event_call *call)
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index a7581fec9681..4525e0271a53 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -1156,7 +1156,7 @@ static void __uprobe_perf_func(struct trace_uprobe *tu,
}
perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs,
- head, NULL);
+ head, NULL, NULL);
out:
preempt_enable();
}
diff --git a/kernel/trace/tracing_map.c b/kernel/trace/tracing_map.c
index 0a689bbb78ef..305039b122fa 100644
--- a/kernel/trace/tracing_map.c
+++ b/kernel/trace/tracing_map.c
@@ -221,16 +221,19 @@ void tracing_map_array_free(struct tracing_map_array *a)
if (!a)
return;
- if (!a->pages) {
- kfree(a);
- return;
- }
+ if (!a->pages)
+ goto free;
for (i = 0; i < a->n_pages; i++) {
if (!a->pages[i])
break;
free_page((unsigned long)a->pages[i]);
}
+
+ kfree(a->pages);
+
+ free:
+ kfree(a);
}
struct tracing_map_array *tracing_map_array_alloc(unsigned int n_elts,
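
The hunk above fixes a leak in tracing_map_array_free(): the array of page pointers itself was never freed, only the pages it referenced. A minimal userspace sketch of the same goto-based cleanup, assuming malloc()/free() in place of the kernel page allocator and a simplified struct array:

#include <stdlib.h>

struct array {
	unsigned int n_pages;
	void **pages;
};

static void array_free(struct array *a)
{
	unsigned int i;

	if (!a)
		return;

	if (!a->pages)
		goto free;

	for (i = 0; i < a->n_pages; i++) {
		if (!a->pages[i])
			break;
		free(a->pages[i]);
	}

	free(a->pages);	/* the pointer array itself; previously leaked */
 free:
	free(a);
}

int main(void)
{
	struct array *a = calloc(1, sizeof(*a));

	if (!a)
		return 1;
	a->n_pages = 2;
	a->pages = calloc(a->n_pages, sizeof(*a->pages));
	if (a->pages)
		a->pages[0] = malloc(64);
	array_free(a);	/* frees the page, the pointer array and the struct */
	return 0;
}
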
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 03e0b69bb5bf..f5d52024f6b7 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -9,7 +9,7 @@
* to those contributors as well.
*/
-#define pr_fmt(fmt) "NMI watchdog: " fmt
+#define pr_fmt(fmt) "watchdog: " fmt
#include <linux/mm.h>
#include <linux/cpu.h>
@@ -29,15 +29,58 @@
#include <linux/kvm_para.h>
#include <linux/kthread.h>
+/* Watchdog configuration */
static DEFINE_MUTEX(watchdog_proc_mutex);
-#if defined(CONFIG_HAVE_NMI_WATCHDOG) || defined(CONFIG_HARDLOCKUP_DETECTOR)
-unsigned long __read_mostly watchdog_enabled = SOFT_WATCHDOG_ENABLED|NMI_WATCHDOG_ENABLED;
+int __read_mostly nmi_watchdog_enabled;
+
+#if defined(CONFIG_HARDLOCKUP_DETECTOR) || defined(CONFIG_HAVE_NMI_WATCHDOG)
+unsigned long __read_mostly watchdog_enabled = SOFT_WATCHDOG_ENABLED |
+ NMI_WATCHDOG_ENABLED;
#else
unsigned long __read_mostly watchdog_enabled = SOFT_WATCHDOG_ENABLED;
#endif
-int __read_mostly nmi_watchdog_enabled;
+
+#ifdef CONFIG_HARDLOCKUP_DETECTOR
+/* boot commands */
+/*
+ * Should we panic when a soft-lockup or hard-lockup occurs:
+ */
+unsigned int __read_mostly hardlockup_panic =
+ CONFIG_BOOTPARAM_HARDLOCKUP_PANIC_VALUE;
+/*
+ * We may not want to enable hard lockup detection by default in all cases,
+ * for example when running the kernel as a guest on a hypervisor. In these
+ * cases this function can be called to disable hard lockup detection. This
+ * function should only be executed once by the boot processor before the
+ * kernel command line parameters are parsed, because otherwise it is not
+ * possible to override this in hardlockup_panic_setup().
+ */
+void hardlockup_detector_disable(void)
+{
+ watchdog_enabled &= ~NMI_WATCHDOG_ENABLED;
+}
+
+static int __init hardlockup_panic_setup(char *str)
+{
+ if (!strncmp(str, "panic", 5))
+ hardlockup_panic = 1;
+ else if (!strncmp(str, "nopanic", 7))
+ hardlockup_panic = 0;
+ else if (!strncmp(str, "0", 1))
+ watchdog_enabled &= ~NMI_WATCHDOG_ENABLED;
+ else if (!strncmp(str, "1", 1))
+ watchdog_enabled |= NMI_WATCHDOG_ENABLED;
+ return 1;
+}
+__setup("nmi_watchdog=", hardlockup_panic_setup);
+
+#endif
+
+#ifdef CONFIG_SOFTLOCKUP_DETECTOR
int __read_mostly soft_watchdog_enabled;
+#endif
+
int __read_mostly watchdog_user_enabled;
int __read_mostly watchdog_thresh = 10;
@@ -45,15 +88,9 @@ int __read_mostly watchdog_thresh = 10;
int __read_mostly sysctl_softlockup_all_cpu_backtrace;
int __read_mostly sysctl_hardlockup_all_cpu_backtrace;
#endif
-static struct cpumask watchdog_cpumask __read_mostly;
+struct cpumask watchdog_cpumask __read_mostly;
unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask);
-/* Helper for online, unparked cpus. */
-#define for_each_watchdog_cpu(cpu) \
- for_each_cpu_and((cpu), cpu_online_mask, &watchdog_cpumask)
-
-atomic_t watchdog_park_in_progress = ATOMIC_INIT(0);
-
/*
* The 'watchdog_running' variable is set to 1 when the watchdog threads
* are registered/started and is set to 0 when the watchdog threads are
@@ -72,7 +109,47 @@ static int __read_mostly watchdog_running;
* of 'watchdog_running' cannot change while the watchdog is deactivated
* temporarily (see related code in 'proc' handlers).
*/
-static int __read_mostly watchdog_suspended;
+int __read_mostly watchdog_suspended;
+
+/*
+ * These functions can be overridden if an architecture implements its
+ * own hardlockup detector.
+ *
+ * watchdog_nmi_enable/disable can be implemented to start and stop when
+ * softlockup watchdog threads start and stop. The arch must select the
+ * SOFTLOCKUP_DETECTOR Kconfig.
+ */
+int __weak watchdog_nmi_enable(unsigned int cpu)
+{
+ return 0;
+}
+void __weak watchdog_nmi_disable(unsigned int cpu)
+{
+}
+
+/*
+ * watchdog_nmi_reconfigure can be implemented to be notified after any
+ * watchdog configuration change. The arch hardlockup watchdog should
+ * respond to the following variables:
+ * - nmi_watchdog_enabled
+ * - watchdog_thresh
+ * - watchdog_cpumask
+ * - sysctl_hardlockup_all_cpu_backtrace
+ * - hardlockup_panic
+ * - watchdog_suspended
+ */
+void __weak watchdog_nmi_reconfigure(void)
+{
+}
+
+
+#ifdef CONFIG_SOFTLOCKUP_DETECTOR
+
+/* Helper for online, unparked cpus. */
+#define for_each_watchdog_cpu(cpu) \
+ for_each_cpu_and((cpu), cpu_online_mask, &watchdog_cpumask)
+
+atomic_t watchdog_park_in_progress = ATOMIC_INIT(0);
static u64 __read_mostly sample_period;
@@ -120,6 +197,7 @@ static int __init softlockup_all_cpu_backtrace_setup(char *str)
return 1;
}
__setup("softlockup_all_cpu_backtrace=", softlockup_all_cpu_backtrace_setup);
+#ifdef CONFIG_HARDLOCKUP_DETECTOR
static int __init hardlockup_all_cpu_backtrace_setup(char *str)
{
sysctl_hardlockup_all_cpu_backtrace =
@@ -128,6 +206,7 @@ static int __init hardlockup_all_cpu_backtrace_setup(char *str)
}
__setup("hardlockup_all_cpu_backtrace=", hardlockup_all_cpu_backtrace_setup);
#endif
+#endif
/*
* Hard-lockup warnings should be triggered after just a few seconds. Soft-
@@ -161,6 +240,7 @@ static void set_sample_period(void)
* hardlockup detector generates a warning
*/
sample_period = get_softlockup_thresh() * ((u64)NSEC_PER_SEC / 5);
+ watchdog_update_hrtimer_threshold(sample_period);
}
/* Commands for resetting the watchdog */
@@ -213,18 +293,6 @@ void touch_softlockup_watchdog_sync(void)
__this_cpu_write(watchdog_touch_ts, 0);
}
-/* watchdog detector functions */
-bool is_hardlockup(void)
-{
- unsigned long hrint = __this_cpu_read(hrtimer_interrupts);
-
- if (__this_cpu_read(hrtimer_interrupts_saved) == hrint)
- return true;
-
- __this_cpu_write(hrtimer_interrupts_saved, hrint);
- return false;
-}
-
static int is_softlockup(unsigned long touch_ts)
{
unsigned long now = get_timestamp();
@@ -237,21 +305,21 @@ static int is_softlockup(unsigned long touch_ts)
return 0;
}
-static void watchdog_interrupt_count(void)
+/* watchdog detector functions */
+bool is_hardlockup(void)
{
- __this_cpu_inc(hrtimer_interrupts);
-}
+ unsigned long hrint = __this_cpu_read(hrtimer_interrupts);
-/*
- * These two functions are mostly architecture specific
- * defining them as weak here.
- */
-int __weak watchdog_nmi_enable(unsigned int cpu)
-{
- return 0;
+ if (__this_cpu_read(hrtimer_interrupts_saved) == hrint)
+ return true;
+
+ __this_cpu_write(hrtimer_interrupts_saved, hrint);
+ return false;
}
-void __weak watchdog_nmi_disable(unsigned int cpu)
+
+static void watchdog_interrupt_count(void)
{
+ __this_cpu_inc(hrtimer_interrupts);
}
static int watchdog_enable_all_cpus(void);
@@ -502,57 +570,6 @@ static void watchdog_unpark_threads(void)
kthread_unpark(per_cpu(softlockup_watchdog, cpu));
}
-/*
- * Suspend the hard and soft lockup detector by parking the watchdog threads.
- */
-int lockup_detector_suspend(void)
-{
- int ret = 0;
-
- get_online_cpus();
- mutex_lock(&watchdog_proc_mutex);
- /*
- * Multiple suspend requests can be active in parallel (counted by
- * the 'watchdog_suspended' variable). If the watchdog threads are
- * running, the first caller takes care that they will be parked.
- * The state of 'watchdog_running' cannot change while a suspend
- * request is active (see related code in 'proc' handlers).
- */
- if (watchdog_running && !watchdog_suspended)
- ret = watchdog_park_threads();
-
- if (ret == 0)
- watchdog_suspended++;
- else {
- watchdog_disable_all_cpus();
- pr_err("Failed to suspend lockup detectors, disabled\n");
- watchdog_enabled = 0;
- }
-
- mutex_unlock(&watchdog_proc_mutex);
-
- return ret;
-}
-
-/*
- * Resume the hard and soft lockup detector by unparking the watchdog threads.
- */
-void lockup_detector_resume(void)
-{
- mutex_lock(&watchdog_proc_mutex);
-
- watchdog_suspended--;
- /*
- * The watchdog threads are unparked if they were previously running
- * and if there is no more active suspend request.
- */
- if (watchdog_running && !watchdog_suspended)
- watchdog_unpark_threads();
-
- mutex_unlock(&watchdog_proc_mutex);
- put_online_cpus();
-}
-
static int update_watchdog_all_cpus(void)
{
int ret;
@@ -605,6 +622,100 @@ static void watchdog_disable_all_cpus(void)
}
#ifdef CONFIG_SYSCTL
+static int watchdog_update_cpus(void)
+{
+ return smpboot_update_cpumask_percpu_thread(
+ &watchdog_threads, &watchdog_cpumask);
+}
+#endif
+
+#else /* SOFTLOCKUP */
+static int watchdog_park_threads(void)
+{
+ return 0;
+}
+
+static void watchdog_unpark_threads(void)
+{
+}
+
+static int watchdog_enable_all_cpus(void)
+{
+ return 0;
+}
+
+static void watchdog_disable_all_cpus(void)
+{
+}
+
+#ifdef CONFIG_SYSCTL
+static int watchdog_update_cpus(void)
+{
+ return 0;
+}
+#endif
+
+static void set_sample_period(void)
+{
+}
+#endif /* SOFTLOCKUP */
+
+/*
+ * Suspend the hard and soft lockup detector by parking the watchdog threads.
+ */
+int lockup_detector_suspend(void)
+{
+ int ret = 0;
+
+ get_online_cpus();
+ mutex_lock(&watchdog_proc_mutex);
+ /*
+ * Multiple suspend requests can be active in parallel (counted by
+ * the 'watchdog_suspended' variable). If the watchdog threads are
+ * running, the first caller takes care that they will be parked.
+ * The state of 'watchdog_running' cannot change while a suspend
+ * request is active (see related code in 'proc' handlers).
+ */
+ if (watchdog_running && !watchdog_suspended)
+ ret = watchdog_park_threads();
+
+ if (ret == 0)
+ watchdog_suspended++;
+ else {
+ watchdog_disable_all_cpus();
+ pr_err("Failed to suspend lockup detectors, disabled\n");
+ watchdog_enabled = 0;
+ }
+
+ watchdog_nmi_reconfigure();
+
+ mutex_unlock(&watchdog_proc_mutex);
+
+ return ret;
+}
+
+/*
+ * Resume the hard and soft lockup detector by unparking the watchdog threads.
+ */
+void lockup_detector_resume(void)
+{
+ mutex_lock(&watchdog_proc_mutex);
+
+ watchdog_suspended--;
+ /*
+ * The watchdog threads are unparked if they were previously running
+ * and if there is no more active suspend request.
+ */
+ if (watchdog_running && !watchdog_suspended)
+ watchdog_unpark_threads();
+
+ watchdog_nmi_reconfigure();
+
+ mutex_unlock(&watchdog_proc_mutex);
+ put_online_cpus();
+}
+
+#ifdef CONFIG_SYSCTL
/*
* Update the run state of the lockup detectors.
@@ -625,6 +736,8 @@ static int proc_watchdog_update(void)
else
watchdog_disable_all_cpus();
+ watchdog_nmi_reconfigure();
+
return err;
}
@@ -810,10 +923,11 @@ int proc_watchdog_cpumask(struct ctl_table *table, int write,
* a temporary cpumask, so we are likely not in a
* position to do much else to make things better.
*/
- if (smpboot_update_cpumask_percpu_thread(
- &watchdog_threads, &watchdog_cpumask) != 0)
+ if (watchdog_update_cpus() != 0)
pr_err("cpumask update failed\n");
}
+
+ watchdog_nmi_reconfigure();
}
out:
mutex_unlock(&watchdog_proc_mutex);
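
The watchdog.c changes above move the hardlockup boot handling out of watchdog_hld.c, wrap the soft-lockup machinery in CONFIG_SOFTLOCKUP_DETECTOR with empty stubs, and add __weak watchdog_nmi_enable()/watchdog_nmi_disable()/watchdog_nmi_reconfigure() hooks that an architecture-specific hardlockup detector can override. A minimal single-file sketch of the weak-symbol pattern (GCC/Clang attribute; only the weak defaults exist here, whereas in a real build a strong arch definition in another object file would win at link time):

#include <stdio.h>

/* weak defaults: generic code keeps working even if no arch hook exists */
__attribute__((weak)) int watchdog_nmi_enable(unsigned int cpu)
{
	printf("default nmi enable, cpu %u\n", cpu);
	return 0;
}

__attribute__((weak)) void watchdog_nmi_reconfigure(void)
{
	printf("default reconfigure (no-op)\n");
}

int main(void)
{
	watchdog_nmi_enable(0);
	watchdog_nmi_reconfigure();	/* called after every configuration change */
	return 0;
}
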
diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c
index 54a427d1f344..3a09ea1b1d3d 100644
--- a/kernel/watchdog_hld.c
+++ b/kernel/watchdog_hld.c
@@ -22,41 +22,9 @@ static DEFINE_PER_CPU(bool, hard_watchdog_warn);
static DEFINE_PER_CPU(bool, watchdog_nmi_touch);
static DEFINE_PER_CPU(struct perf_event *, watchdog_ev);
-/* boot commands */
-/*
- * Should we panic when a soft-lockup or hard-lockup occurs:
- */
-unsigned int __read_mostly hardlockup_panic =
- CONFIG_BOOTPARAM_HARDLOCKUP_PANIC_VALUE;
static unsigned long hardlockup_allcpu_dumped;
-/*
- * We may not want to enable hard lockup detection by default in all cases,
- * for example when running the kernel as a guest on a hypervisor. In these
- * cases this function can be called to disable hard lockup detection. This
- * function should only be executed once by the boot processor before the
- * kernel command line parameters are parsed, because otherwise it is not
- * possible to override this in hardlockup_panic_setup().
- */
-void hardlockup_detector_disable(void)
-{
- watchdog_enabled &= ~NMI_WATCHDOG_ENABLED;
-}
-static int __init hardlockup_panic_setup(char *str)
-{
- if (!strncmp(str, "panic", 5))
- hardlockup_panic = 1;
- else if (!strncmp(str, "nopanic", 7))
- hardlockup_panic = 0;
- else if (!strncmp(str, "0", 1))
- watchdog_enabled &= ~NMI_WATCHDOG_ENABLED;
- else if (!strncmp(str, "1", 1))
- watchdog_enabled |= NMI_WATCHDOG_ENABLED;
- return 1;
-}
-__setup("nmi_watchdog=", hardlockup_panic_setup);
-
-void touch_nmi_watchdog(void)
+void arch_touch_nmi_watchdog(void)
{
/*
* Using __raw here because some code paths have
@@ -66,9 +34,64 @@ void touch_nmi_watchdog(void)
* going off.
*/
raw_cpu_write(watchdog_nmi_touch, true);
- touch_softlockup_watchdog();
}
-EXPORT_SYMBOL(touch_nmi_watchdog);
+EXPORT_SYMBOL(arch_touch_nmi_watchdog);
+
+#ifdef CONFIG_HARDLOCKUP_CHECK_TIMESTAMP
+static DEFINE_PER_CPU(ktime_t, last_timestamp);
+static DEFINE_PER_CPU(unsigned int, nmi_rearmed);
+static ktime_t watchdog_hrtimer_sample_threshold __read_mostly;
+
+void watchdog_update_hrtimer_threshold(u64 period)
+{
+ /*
+ * The hrtimer runs with a period of (watchdog_threshold * 2) / 5
+ *
+ * So it runs effectively with 2.5 times the rate of the NMI
+ * watchdog. That means the hrtimer should fire 2-3 times before
+ * the NMI watchdog expires. The NMI watchdog on x86 is based on
+ * unhalted CPU cycles, so if Turbo-Mode is enabled the CPU cycles
+ * might run way faster than expected and the NMI fires in a
+ * smaller period than the one deduced from the nominal CPU
+ * frequency. Depending on the Turbo-Mode factor this might be fast
+ * enough to get the NMI period smaller than the hrtimer watchdog
+ * period and trigger false positives.
+ *
+ * The sample threshold is used to check in the NMI handler whether
+ * the minimum time between two NMI samples has elapsed. That
+ * prevents false positives.
+ *
+ * Set this to 4/5 of the actual watchdog threshold period so the
+ * hrtimer is guaranteed to fire at least once within the real
+ * watchdog threshold.
+ */
+ watchdog_hrtimer_sample_threshold = period * 2;
+}
+
+static bool watchdog_check_timestamp(void)
+{
+ ktime_t delta, now = ktime_get_mono_fast_ns();
+
+ delta = now - __this_cpu_read(last_timestamp);
+ if (delta < watchdog_hrtimer_sample_threshold) {
+ /*
+ * If ktime is jiffies based, a stalled timer would prevent
+ * jiffies from being incremented and the filter would look
+ * at a stale timestamp and never trigger.
+ */
+ if (__this_cpu_inc_return(nmi_rearmed) < 10)
+ return false;
+ }
+ __this_cpu_write(nmi_rearmed, 0);
+ __this_cpu_write(last_timestamp, now);
+ return true;
+}
+#else
+static inline bool watchdog_check_timestamp(void)
+{
+ return true;
+}
+#endif
static struct perf_event_attr wd_hw_attr = {
.type = PERF_TYPE_HARDWARE,
@@ -94,6 +117,9 @@ static void watchdog_overflow_callback(struct perf_event *event,
return;
}
+ if (!watchdog_check_timestamp())
+ return;
+
/* check for a hardlockup
* This is done by making sure our timer interrupt
* is incrementing. The timer interrupt should have
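
watchdog_check_timestamp() above drops NMI samples that arrive sooner than the hrtimer-derived threshold, so a Turbo-Mode CPU whose cycle-based NMI fires early does not raise false hardlockup reports, while the rearm counter keeps a stalled clocksource from suppressing detection forever. A minimal userspace sketch of that filter, assuming clock_gettime(CLOCK_MONOTONIC) in place of ktime_get_mono_fast_ns() and plain globals in place of the per-CPU variables:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <time.h>

static uint64_t sample_threshold_ns;
static uint64_t last_timestamp_ns;
static unsigned int nmi_rearmed;

static uint64_t now_ns(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return (uint64_t)ts.tv_sec * 1000000000ull + ts.tv_nsec;
}

static void update_threshold(uint64_t hrtimer_period_ns)
{
	/* the hrtimer runs at 2.5x the NMI rate; require samples this far apart */
	sample_threshold_ns = hrtimer_period_ns * 2;
}

static bool check_timestamp(void)
{
	uint64_t now = now_ns();

	if (now - last_timestamp_ns < sample_threshold_ns) {
		/* too early -- but stop filtering after 10 consecutive rearms */
		if (++nmi_rearmed < 10)
			return false;
	}
	nmi_rearmed = 0;
	last_timestamp_ns = now;
	return true;
}

int main(void)
{
	update_threshold(4ull * 1000000000);	/* e.g. a 4s hrtimer period */
	printf("first sample accepted: %d\n", check_timestamp());
	printf("immediate resample accepted: %d\n", check_timestamp());
	return 0;
}
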
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index a86688fabc55..ca937b0c3a96 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -3577,6 +3577,13 @@ static bool wq_calc_node_cpumask(const struct workqueue_attrs *attrs, int node,
/* yeap, return possible CPUs in @node that @attrs wants */
cpumask_and(cpumask, attrs->cpumask, wq_numa_possible_cpumask[node]);
+
+ if (cpumask_empty(cpumask)) {
+ pr_warn_once("WARNING: workqueue cpumask: online intersect > "
+ "possible intersect\n");
+ return false;
+ }
+
return !cpumask_equal(cpumask, attrs->cpumask);
use_dfl:
@@ -3744,8 +3751,12 @@ static int apply_workqueue_attrs_locked(struct workqueue_struct *wq,
return -EINVAL;
/* creating multiple pwqs breaks ordering guarantee */
- if (WARN_ON((wq->flags & __WQ_ORDERED) && !list_empty(&wq->pwqs)))
- return -EINVAL;
+ if (!list_empty(&wq->pwqs)) {
+ if (WARN_ON(wq->flags & __WQ_ORDERED_EXPLICIT))
+ return -EINVAL;
+
+ wq->flags &= ~__WQ_ORDERED;
+ }
ctx = apply_wqattrs_prepare(wq, attrs);
if (!ctx)
@@ -3929,6 +3940,16 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt,
struct workqueue_struct *wq;
struct pool_workqueue *pwq;
+ /*
+ * Unbound && max_active == 1 used to imply ordered, which is no
+ * longer the case on NUMA machines due to per-node pools. While
+ * alloc_ordered_workqueue() is the right way to create an ordered
+ * workqueue, keep the previous behavior to avoid subtle breakages
+ * on NUMA.
+ */
+ if ((flags & WQ_UNBOUND) && max_active == 1)
+ flags |= __WQ_ORDERED;
+
/* see the comment above the definition of WQ_POWER_EFFICIENT */
if ((flags & WQ_POWER_EFFICIENT) && wq_power_efficient)
flags |= WQ_UNBOUND;
@@ -4119,13 +4140,14 @@ void workqueue_set_max_active(struct workqueue_struct *wq, int max_active)
struct pool_workqueue *pwq;
/* disallow meddling with max_active for ordered workqueues */
- if (WARN_ON(wq->flags & __WQ_ORDERED))
+ if (WARN_ON(wq->flags & __WQ_ORDERED_EXPLICIT))
return;
max_active = wq_clamp_max_active(max_active, wq->flags, wq->name);
mutex_lock(&wq->mutex);
+ wq->flags &= ~__WQ_ORDERED;
wq->saved_max_active = max_active;
for_each_pwq(pwq, wq)
@@ -5253,7 +5275,7 @@ int workqueue_sysfs_register(struct workqueue_struct *wq)
* attributes breaks ordering guarantee. Disallow exposing ordered
* workqueues.
*/
- if (WARN_ON(wq->flags & __WQ_ORDERED))
+ if (WARN_ON(wq->flags & __WQ_ORDERED_EXPLICIT))
return -EINVAL;
wq->wq_dev = wq_dev = kzalloc(sizeof(*wq_dev), GFP_KERNEL);
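
The workqueue changes above separate implicitly ordered workqueues (WQ_UNBOUND with max_active == 1, kept only for backward compatibility) from explicitly ordered ones (__WQ_ORDERED_EXPLICIT): only the explicit kind rejects later max_active changes, attribute updates or sysfs exposure, while the implicit kind simply gives up its ordering guarantee. A minimal sketch of that flag logic, with made-up flag values that only model the relationship:

#include <stdio.h>

#define WQ_UNBOUND		0x1
#define WQ_ORDERED		0x2	/* implicit, may be dropped again */
#define WQ_ORDERED_EXPLICIT	0x4	/* set by alloc_ordered_workqueue() */

struct wq {
	unsigned int flags;
	int max_active;
};

static void wq_init(struct wq *wq, unsigned int flags, int max_active)
{
	/* unbound && max_active == 1 used to imply ordered; keep that behavior */
	if ((flags & WQ_UNBOUND) && max_active == 1)
		flags |= WQ_ORDERED;
	wq->flags = flags;
	wq->max_active = max_active;
}

static int wq_set_max_active(struct wq *wq, int max_active)
{
	/* only an explicitly ordered workqueue rejects the change */
	if (wq->flags & WQ_ORDERED_EXPLICIT)
		return -1;
	wq->flags &= ~WQ_ORDERED;	/* implicit ordering is given up */
	wq->max_active = max_active;
	return 0;
}

int main(void)
{
	struct wq implicit, explicit_wq;

	wq_init(&implicit, WQ_UNBOUND, 1);
	wq_init(&explicit_wq, WQ_UNBOUND | WQ_ORDERED_EXPLICIT, 1);

	printf("implicit: set_max_active -> %d\n",
	       wq_set_max_active(&implicit, 4));	/* allowed, loses ordering */
	printf("explicit: set_max_active -> %d\n",
	       wq_set_max_active(&explicit_wq, 4));	/* rejected */
	return 0;
}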