summary refs log tree commit diff stats
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r-- kernel/bpf/hashtab.c 15
-rw-r--r-- kernel/bpf/inode.c 4
-rw-r--r-- kernel/bpf/syscall.c 2
-rw-r--r-- kernel/bpf/sysfs_btf.c 6
-rw-r--r-- kernel/bpf/verifier.c 8
-rw-r--r-- kernel/entry/common.c 43
-rw-r--r-- kernel/events/core.c 5
-rw-r--r-- kernel/fork.c 5
-rw-r--r-- kernel/gcov/gcc_4_7.c 4
-rw-r--r-- kernel/irq/chip.c 36
-rw-r--r-- kernel/irq/debugfs.c 4
-rw-r--r-- kernel/irq/internals.h 9
-rw-r--r-- kernel/irq/irqdomain.c 99
-rw-r--r-- kernel/irq/msi.c 83
-rw-r--r-- kernel/irq/pm.c 34
-rw-r--r-- kernel/irq/proc.c 2
-rw-r--r-- kernel/irq/resend.c 15
-rw-r--r-- kernel/irq/settings.h 7
-rw-r--r-- kernel/kcsan/core.c 210
-rw-r--r-- kernel/kcsan/debugfs.c 130
-rw-r--r-- kernel/kcsan/kcsan-test.c 128
-rw-r--r-- kernel/kcsan/kcsan.h 12
-rw-r--r-- kernel/kcsan/report.c 10
-rw-r--r-- kernel/kcsan/selftest.c 8
-rw-r--r-- kernel/kprobes.c 36
-rw-r--r-- kernel/locking/lockdep.c 990
-rw-r--r-- kernel/locking/lockdep_internals.h 9
-rw-r--r-- kernel/locking/percpu-rwsem.c 4
-rw-r--r-- kernel/padata.c 5
-rw-r--r-- kernel/rcu/rcu.h 2
-rw-r--r-- kernel/rcu/tasks.h 2
-rw-r--r-- kernel/rcu/tree.c 2
-rw-r--r-- kernel/rcu/update.c 2
-rw-r--r-- kernel/sched/core.c 13
-rw-r--r-- kernel/sched/deadline.c 34
-rw-r--r-- kernel/sched/debug.c 56
-rw-r--r-- kernel/sched/fair.c 103
-rw-r--r-- kernel/sched/features.h 2
-rw-r--r-- kernel/sched/membarrier.c 136
-rw-r--r-- kernel/sched/topology.c 69
-rw-r--r-- kernel/seccomp.c 24
-rw-r--r-- kernel/softirq.c 1
-rw-r--r-- kernel/stackleak.c 2
-rw-r--r-- kernel/stacktrace.c 8
-rw-r--r-- kernel/sysctl.c 11
-rw-r--r-- kernel/time/alarmtimer.c 2
-rw-r--r-- kernel/time/hrtimer.c 4
-rw-r--r-- kernel/time/sched_clock.c 6
-rw-r--r-- kernel/time/timekeeping.c 119
-rw-r--r-- kernel/time/timer.c 6
-rw-r--r-- kernel/trace/ftrace.c 12
-rw-r--r-- kernel/trace/trace.c 48
-rw-r--r-- kernel/trace/trace_events_hist.c 1
-rw-r--r-- kernel/trace/trace_output.c 12
-rw-r--r-- kernel/trace/trace_preemptirq.c 4
-rw-r--r-- kernel/umh.c 9
-rw-r--r-- kernel/workqueue.c 4
57 files changed, 1899 insertions, 718 deletions
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 78dfff6a501b..7df28a45c66b 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -1622,7 +1622,6 @@ struct bpf_iter_seq_hash_map_info {
struct bpf_map *map;
struct bpf_htab *htab;
void *percpu_value_buf; // non-zero means percpu hash
- unsigned long flags;
u32 bucket_id;
u32 skip_elems;
};
@@ -1632,7 +1631,6 @@ bpf_hash_map_seq_find_next(struct bpf_iter_seq_hash_map_info *info,
struct htab_elem *prev_elem)
{
const struct bpf_htab *htab = info->htab;
- unsigned long flags = info->flags;
u32 skip_elems = info->skip_elems;
u32 bucket_id = info->bucket_id;
struct hlist_nulls_head *head;
@@ -1656,19 +1654,18 @@ bpf_hash_map_seq_find_next(struct bpf_iter_seq_hash_map_info *info,
/* not found, unlock and go to the next bucket */
b = &htab->buckets[bucket_id++];
- htab_unlock_bucket(htab, b, flags);
+ rcu_read_unlock();
skip_elems = 0;
}
for (i = bucket_id; i < htab->n_buckets; i++) {
b = &htab->buckets[i];
- flags = htab_lock_bucket(htab, b);
+ rcu_read_lock();
count = 0;
head = &b->head;
hlist_nulls_for_each_entry_rcu(elem, n, head, hash_node) {
if (count >= skip_elems) {
- info->flags = flags;
info->bucket_id = i;
info->skip_elems = count;
return elem;
@@ -1676,7 +1673,7 @@ bpf_hash_map_seq_find_next(struct bpf_iter_seq_hash_map_info *info,
count++;
}
- htab_unlock_bucket(htab, b, flags);
+ rcu_read_unlock();
skip_elems = 0;
}
@@ -1754,14 +1751,10 @@ static int bpf_hash_map_seq_show(struct seq_file *seq, void *v)
static void bpf_hash_map_seq_stop(struct seq_file *seq, void *v)
{
- struct bpf_iter_seq_hash_map_info *info = seq->private;
-
if (!v)
(void)__bpf_hash_map_seq_show(seq, NULL);
else
- htab_unlock_bucket(info->htab,
- &info->htab->buckets[info->bucket_id],
- info->flags);
+ rcu_read_unlock();
}
static int bpf_iter_init_hash_map(void *priv_data,
diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index fb878ba3f22f..18f4969552ac 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -226,10 +226,12 @@ static void *map_seq_next(struct seq_file *m, void *v, loff_t *pos)
else
prev_key = key;
+ rcu_read_lock();
if (map->ops->map_get_next_key(map, prev_key, key)) {
map_iter(m)->done = true;
- return NULL;
+ key = NULL;
}
+ rcu_read_unlock();
return key;
}
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 1bf960aa615c..b999e7ff2583 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -2634,7 +2634,7 @@ static int bpf_raw_tp_link_fill_link_info(const struct bpf_link *link,
u32 ulen = info->raw_tracepoint.tp_name_len;
size_t tp_len = strlen(tp_name);
- if (ulen && !ubuf)
+ if (!ulen ^ !ubuf)
return -EINVAL;
info->raw_tracepoint.tp_name_len = tp_len + 1;
diff --git a/kernel/bpf/sysfs_btf.c b/kernel/bpf/sysfs_btf.c
index 3b495773de5a..11b3380887fa 100644
--- a/kernel/bpf/sysfs_btf.c
+++ b/kernel/bpf/sysfs_btf.c
@@ -30,15 +30,15 @@ static struct kobject *btf_kobj;
static int __init btf_vmlinux_init(void)
{
- if (!__start_BTF)
+ bin_attr_btf_vmlinux.size = __stop_BTF - __start_BTF;
+
+ if (!__start_BTF || bin_attr_btf_vmlinux.size == 0)
return 0;
btf_kobj = kobject_create_and_add("btf", kernel_kobj);
if (!btf_kobj)
return -ENOMEM;
- bin_attr_btf_vmlinux.size = __stop_BTF - __start_BTF;
-
return sysfs_create_bin_file(btf_kobj, &bin_attr_btf_vmlinux);
}
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 47e74f09fa37..fba52d9ec8fc 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -5667,8 +5667,8 @@ static void scalar32_min_max_or(struct bpf_reg_state *dst_reg,
bool src_known = tnum_subreg_is_const(src_reg->var_off);
bool dst_known = tnum_subreg_is_const(dst_reg->var_off);
struct tnum var32_off = tnum_subreg(dst_reg->var_off);
- s32 smin_val = src_reg->smin_value;
- u32 umin_val = src_reg->umin_value;
+ s32 smin_val = src_reg->s32_min_value;
+ u32 umin_val = src_reg->u32_min_value;
/* Assuming scalar64_min_max_or will be called so it is safe
* to skip updating register for known case.
@@ -5691,8 +5691,8 @@ static void scalar32_min_max_or(struct bpf_reg_state *dst_reg,
/* ORing two positives gives a positive, so safe to
* cast result into s64.
*/
- dst_reg->s32_min_value = dst_reg->umin_value;
- dst_reg->s32_max_value = dst_reg->umax_value;
+ dst_reg->s32_min_value = dst_reg->u32_min_value;
+ dst_reg->s32_max_value = dst_reg->u32_max_value;
}
}
diff --git a/kernel/entry/common.c b/kernel/entry/common.c
index fcae019158ca..145ab11b8318 100644
--- a/kernel/entry/common.c
+++ b/kernel/entry/common.c
@@ -60,31 +60,56 @@ static long syscall_trace_enter(struct pt_regs *regs, long syscall,
return ret;
}
+ /* Either of the above might have changed the syscall number */
+ syscall = syscall_get_nr(current, regs);
+
if (unlikely(ti_work & _TIF_SYSCALL_TRACEPOINT))
trace_sys_enter(regs, syscall);
syscall_enter_audit(regs, syscall);
- /* The above might have changed the syscall number */
- return ret ? : syscall_get_nr(current, regs);
+ return ret ? : syscall;
}
-noinstr long syscall_enter_from_user_mode(struct pt_regs *regs, long syscall)
+static __always_inline long
+__syscall_enter_from_user_work(struct pt_regs *regs, long syscall)
{
unsigned long ti_work;
- enter_from_user_mode(regs);
- instrumentation_begin();
-
- local_irq_enable();
ti_work = READ_ONCE(current_thread_info()->flags);
if (ti_work & SYSCALL_ENTER_WORK)
syscall = syscall_trace_enter(regs, syscall, ti_work);
- instrumentation_end();
return syscall;
}
+long syscall_enter_from_user_mode_work(struct pt_regs *regs, long syscall)
+{
+ return __syscall_enter_from_user_work(regs, syscall);
+}
+
+noinstr long syscall_enter_from_user_mode(struct pt_regs *regs, long syscall)
+{
+ long ret;
+
+ enter_from_user_mode(regs);
+
+ instrumentation_begin();
+ local_irq_enable();
+ ret = __syscall_enter_from_user_work(regs, syscall);
+ instrumentation_end();
+
+ return ret;
+}
+
+noinstr void syscall_enter_from_user_mode_prepare(struct pt_regs *regs)
+{
+ enter_from_user_mode(regs);
+ instrumentation_begin();
+ local_irq_enable();
+ instrumentation_end();
+}
+
/**
* exit_to_user_mode - Fixup state when exiting to user mode
*
@@ -183,7 +208,7 @@ static inline bool report_single_step(unsigned long ti_work)
/*
* If TIF_SYSCALL_EMU is set, then the only reason to report is when
* TIF_SINGLESTEP is set (i.e. PTRACE_SYSEMU_SINGLESTEP). This syscall
- * instruction has been already reported in syscall_enter_from_usermode().
+ * instruction has been already reported in syscall_enter_from_user_mode().
*/
#define SYSEMU_STEP (_TIF_SINGLESTEP | _TIF_SYSCALL_EMU)
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 7ed5248f0445..e8bf92202542 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -99,7 +99,7 @@ static void remote_function(void *data)
* retry due to any failures in smp_call_function_single(), such as if the
* task_cpu() goes offline concurrently.
*
- * returns @func return value or -ESRCH when the process isn't running
+ * returns @func return value or -ESRCH or -ENXIO when the process isn't running
*/
static int
task_function_call(struct task_struct *p, remote_function_f func, void *info)
@@ -115,7 +115,8 @@ task_function_call(struct task_struct *p, remote_function_f func, void *info)
for (;;) {
ret = smp_call_function_single(task_cpu(p), remote_function,
&data, 1);
- ret = !ret ? data.ret : -EAGAIN;
+ if (!ret)
+ ret = data.ret;
if (ret != -EAGAIN)
break;
diff --git a/kernel/fork.c b/kernel/fork.c
index 4d32190861bd..da8d360fb032 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -589,7 +589,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
mm->map_count++;
if (!(tmp->vm_flags & VM_WIPEONFORK))
- retval = copy_page_range(mm, oldmm, mpnt);
+ retval = copy_page_range(mm, oldmm, mpnt, tmp);
if (tmp->vm_ops && tmp->vm_ops->open)
tmp->vm_ops->open(tmp);
@@ -1011,6 +1011,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
mm_pgtables_bytes_init(mm);
mm->map_count = 0;
mm->locked_vm = 0;
+ atomic_set(&mm->has_pinned, 0);
atomic64_set(&mm->pinned_vm, 0);
memset(&mm->rss_stat, 0, sizeof(mm->rss_stat));
spin_lock_init(&mm->page_table_lock);
@@ -3014,7 +3015,7 @@ int unshare_files(struct files_struct **displaced)
}
int sysctl_max_threads(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
struct ctl_table t;
int ret;
diff --git a/kernel/gcov/gcc_4_7.c b/kernel/gcov/gcc_4_7.c
index 908fdf5098c3..53c67c87f141 100644
--- a/kernel/gcov/gcc_4_7.c
+++ b/kernel/gcov/gcc_4_7.c
@@ -19,7 +19,9 @@
#include <linux/vmalloc.h>
#include "gcov.h"
-#if (__GNUC__ >= 7)
+#if (__GNUC__ >= 10)
+#define GCOV_COUNTERS 8
+#elif (__GNUC__ >= 7)
#define GCOV_COUNTERS 9
#elif (__GNUC__ > 5) || (__GNUC__ == 5 && __GNUC_MINOR__ >= 1)
#define GCOV_COUNTERS 10
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 857f5f4c8098..b9b9618e1aca 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -945,6 +945,33 @@ void handle_percpu_devid_irq(struct irq_desc *desc)
}
/**
+ * handle_percpu_devid_fasteoi_ipi - Per CPU local IPI handler with per cpu
+ * dev ids
+ * @desc: the interrupt description structure for this irq
+ *
+ * The biggest difference with the IRQ version is that the interrupt is
+ * EOIed early, as the IPI could result in a context switch, and we need to
+ * make sure the IPI can fire again. We also assume that the arch code has
+ * registered an action. If not, we are positively doomed.
+ */
+void handle_percpu_devid_fasteoi_ipi(struct irq_desc *desc)
+{
+ struct irq_chip *chip = irq_desc_get_chip(desc);
+ struct irqaction *action = desc->action;
+ unsigned int irq = irq_desc_get_irq(desc);
+ irqreturn_t res;
+
+ __kstat_incr_irqs_this_cpu(desc);
+
+ if (chip->irq_eoi)
+ chip->irq_eoi(&desc->irq_data);
+
+ trace_irq_handler_entry(irq, action);
+ res = action->handler(irq, raw_cpu_ptr(action->percpu_dev_id));
+ trace_irq_handler_exit(irq, action, res);
+}
+
+/**
* handle_percpu_devid_fasteoi_nmi - Per CPU local NMI handler with per cpu
* dev ids
* @desc: the interrupt description structure for this irq
@@ -1541,18 +1568,17 @@ EXPORT_SYMBOL_GPL(irq_chip_release_resources_parent);
*/
int irq_chip_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)
{
- struct irq_data *pos = NULL;
+ struct irq_data *pos;
-#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY
- for (; data; data = data->parent_data)
-#endif
+ for (pos = NULL; !pos && data; data = irqd_get_parent_data(data)) {
if (data->chip && data->chip->irq_compose_msi_msg)
pos = data;
+ }
+
if (!pos)
return -ENOSYS;
pos->chip->irq_compose_msi_msg(pos, msg);
-
return 0;
}
diff --git a/kernel/irq/debugfs.c b/kernel/irq/debugfs.c
index b95ff5d5f4bd..e4cff358b437 100644
--- a/kernel/irq/debugfs.c
+++ b/kernel/irq/debugfs.c
@@ -57,6 +57,7 @@ static const struct irq_bit_descr irqchip_flags[] = {
BIT_MASK_DESCR(IRQCHIP_EOI_THREADED),
BIT_MASK_DESCR(IRQCHIP_SUPPORTS_LEVEL_MSI),
BIT_MASK_DESCR(IRQCHIP_SUPPORTS_NMI),
+ BIT_MASK_DESCR(IRQCHIP_ENABLE_WAKEUP_ON_SUSPEND),
};
static void
@@ -125,6 +126,8 @@ static const struct irq_bit_descr irqdata_states[] = {
BIT_MASK_DESCR(IRQD_DEFAULT_TRIGGER_SET),
BIT_MASK_DESCR(IRQD_HANDLE_ENFORCE_IRQCTX),
+
+ BIT_MASK_DESCR(IRQD_IRQ_ENABLED_ON_SUSPEND),
};
static const struct irq_bit_descr irqdesc_states[] = {
@@ -136,6 +139,7 @@ static const struct irq_bit_descr irqdesc_states[] = {
BIT_MASK_DESCR(_IRQ_PER_CPU_DEVID),
BIT_MASK_DESCR(_IRQ_IS_POLLED),
BIT_MASK_DESCR(_IRQ_DISABLE_UNLAZY),
+ BIT_MASK_DESCR(_IRQ_HIDDEN),
};
static const struct irq_bit_descr irqdesc_istates[] = {
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index 7db284b10ac9..54363527feea 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -473,6 +473,15 @@ static inline void irq_domain_deactivate_irq(struct irq_data *data)
}
#endif
+static inline struct irq_data *irqd_get_parent_data(struct irq_data *irqd)
+{
+#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY
+ return irqd->parent_data;
+#else
+ return NULL;
+#endif
+}
+
#ifdef CONFIG_GENERIC_IRQ_DEBUGFS
#include <linux/debugfs.h>
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index 76cd7ebd1178..cf8b374b892d 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -1136,6 +1136,17 @@ static struct irq_data *irq_domain_insert_irq_data(struct irq_domain *domain,
return irq_data;
}
+static void __irq_domain_free_hierarchy(struct irq_data *irq_data)
+{
+ struct irq_data *tmp;
+
+ while (irq_data) {
+ tmp = irq_data;
+ irq_data = irq_data->parent_data;
+ kfree(tmp);
+ }
+}
+
static void irq_domain_free_irq_data(unsigned int virq, unsigned int nr_irqs)
{
struct irq_data *irq_data, *tmp;
@@ -1147,12 +1158,83 @@ static void irq_domain_free_irq_data(unsigned int virq, unsigned int nr_irqs)
irq_data->parent_data = NULL;
irq_data->domain = NULL;
- while (tmp) {
- irq_data = tmp;
- tmp = tmp->parent_data;
- kfree(irq_data);
+ __irq_domain_free_hierarchy(tmp);
+ }
+}
+
+/**
+ * irq_domain_disconnect_hierarchy - Mark the first unused level of a hierarchy
+ * @domain: IRQ domain from which the hierarchy is to be disconnected
+ * @virq: IRQ number where the hierarchy is to be trimmed
+ *
+ * Marks the @virq level belonging to @domain as disconnected.
+ * Returns -EINVAL if @virq doesn't have a valid irq_data pointing
+ * to @domain.
+ *
+ * Its only use is to be able to trim levels of hierarchy that do not
+ * have any real meaning for this interrupt, and that the driver marks
+ * as such from its .alloc() callback.
+ */
+int irq_domain_disconnect_hierarchy(struct irq_domain *domain,
+ unsigned int virq)
+{
+ struct irq_data *irqd;
+
+ irqd = irq_domain_get_irq_data(domain, virq);
+ if (!irqd)
+ return -EINVAL;
+
+ irqd->chip = ERR_PTR(-ENOTCONN);
+ return 0;
+}
+
+static int irq_domain_trim_hierarchy(unsigned int virq)
+{
+ struct irq_data *tail, *irqd, *irq_data;
+
+ irq_data = irq_get_irq_data(virq);
+ tail = NULL;
+
+ /* The first entry must have a valid irqchip */
+ if (!irq_data->chip || IS_ERR(irq_data->chip))
+ return -EINVAL;
+
+ /*
+ * Validate that the irq_data chain is sane in the presence of
+ * a hierarchy trimming marker.
+ */
+ for (irqd = irq_data->parent_data; irqd; irq_data = irqd, irqd = irqd->parent_data) {
+ /* Can't have a valid irqchip after a trim marker */
+ if (irqd->chip && tail)
+ return -EINVAL;
+
+ /* Can't have an empty irqchip before a trim marker */
+ if (!irqd->chip && !tail)
+ return -EINVAL;
+
+ if (IS_ERR(irqd->chip)) {
+ /* Only -ENOTCONN is a valid trim marker */
+ if (PTR_ERR(irqd->chip) != -ENOTCONN)
+ return -EINVAL;
+
+ tail = irq_data;
}
}
+
+ /* No trim marker, nothing to do */
+ if (!tail)
+ return 0;
+
+ pr_info("IRQ%d: trimming hierarchy from %s\n",
+ virq, tail->parent_data->domain->name);
+
+ /* Sever the inner part of the hierarchy... */
+ irqd = tail;
+ tail = tail->parent_data;
+ irqd->parent_data = NULL;
+ __irq_domain_free_hierarchy(tail);
+
+ return 0;
}
static int irq_domain_alloc_irq_data(struct irq_domain *domain,
@@ -1362,6 +1444,15 @@ int __irq_domain_alloc_irqs(struct irq_domain *domain, int irq_base,
mutex_unlock(&irq_domain_mutex);
goto out_free_irq_data;
}
+
+ for (i = 0; i < nr_irqs; i++) {
+ ret = irq_domain_trim_hierarchy(virq + i);
+ if (ret) {
+ mutex_unlock(&irq_domain_mutex);
+ goto out_free_irq_data;
+ }
+ }
+
for (i = 0; i < nr_irqs; i++)
irq_domain_insert_irq(virq + i);
mutex_unlock(&irq_domain_mutex);
diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c
index eb95f6106a1e..2c0c4d6d0f83 100644
--- a/kernel/irq/msi.c
+++ b/kernel/irq/msi.c
@@ -187,7 +187,6 @@ static const struct irq_domain_ops msi_domain_ops = {
.deactivate = msi_domain_deactivate,
};
-#ifdef GENERIC_MSI_DOMAIN_OPS
static irq_hw_number_t msi_domain_ops_get_hwirq(struct msi_domain_info *info,
msi_alloc_info_t *arg)
{
@@ -206,11 +205,6 @@ static void msi_domain_ops_set_desc(msi_alloc_info_t *arg,
{
arg->desc = desc;
}
-#else
-#define msi_domain_ops_get_hwirq NULL
-#define msi_domain_ops_prepare NULL
-#define msi_domain_ops_set_desc NULL
-#endif /* !GENERIC_MSI_DOMAIN_OPS */
static int msi_domain_ops_init(struct irq_domain *domain,
struct msi_domain_info *info,
@@ -235,11 +229,13 @@ static int msi_domain_ops_check(struct irq_domain *domain,
}
static struct msi_domain_ops msi_domain_ops_default = {
- .get_hwirq = msi_domain_ops_get_hwirq,
- .msi_init = msi_domain_ops_init,
- .msi_check = msi_domain_ops_check,
- .msi_prepare = msi_domain_ops_prepare,
- .set_desc = msi_domain_ops_set_desc,
+ .get_hwirq = msi_domain_ops_get_hwirq,
+ .msi_init = msi_domain_ops_init,
+ .msi_check = msi_domain_ops_check,
+ .msi_prepare = msi_domain_ops_prepare,
+ .set_desc = msi_domain_ops_set_desc,
+ .domain_alloc_irqs = __msi_domain_alloc_irqs,
+ .domain_free_irqs = __msi_domain_free_irqs,
};
static void msi_domain_update_dom_ops(struct msi_domain_info *info)
@@ -251,6 +247,14 @@ static void msi_domain_update_dom_ops(struct msi_domain_info *info)
return;
}
+ if (ops->domain_alloc_irqs == NULL)
+ ops->domain_alloc_irqs = msi_domain_ops_default.domain_alloc_irqs;
+ if (ops->domain_free_irqs == NULL)
+ ops->domain_free_irqs = msi_domain_ops_default.domain_free_irqs;
+
+ if (!(info->flags & MSI_FLAG_USE_DEF_DOM_OPS))
+ return;
+
if (ops->get_hwirq == NULL)
ops->get_hwirq = msi_domain_ops_default.get_hwirq;
if (ops->msi_init == NULL)
@@ -284,8 +288,7 @@ struct irq_domain *msi_create_irq_domain(struct fwnode_handle *fwnode,
{
struct irq_domain *domain;
- if (info->flags & MSI_FLAG_USE_DEF_DOM_OPS)
- msi_domain_update_dom_ops(info);
+ msi_domain_update_dom_ops(info);
if (info->flags & MSI_FLAG_USE_DEF_CHIP_OPS)
msi_domain_update_chip_ops(info);
@@ -370,8 +373,13 @@ static bool msi_check_reservation_mode(struct irq_domain *domain,
{
struct msi_desc *desc;
- if (domain->bus_token != DOMAIN_BUS_PCI_MSI)
+ switch(domain->bus_token) {
+ case DOMAIN_BUS_PCI_MSI:
+ case DOMAIN_BUS_VMD_MSI:
+ break;
+ default:
return false;
+ }
if (!(info->flags & MSI_FLAG_MUST_REACTIVATE))
return false;
@@ -387,17 +395,8 @@ static bool msi_check_reservation_mode(struct irq_domain *domain,
return desc->msi_attrib.is_msix || desc->msi_attrib.maskbit;
}
-/**
- * msi_domain_alloc_irqs - Allocate interrupts from a MSI interrupt domain
- * @domain: The domain to allocate from
- * @dev: Pointer to device struct of the device for which the interrupts
- * are allocated
- * @nvec: The number of interrupts to allocate
- *
- * Returns 0 on success or an error code.
- */
-int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev,
- int nvec)
+int __msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev,
+ int nvec)
{
struct msi_domain_info *info = domain->host_data;
struct msi_domain_ops *ops = info->ops;
@@ -491,12 +490,24 @@ cleanup:
}
/**
- * msi_domain_free_irqs - Free interrupts from a MSI interrupt @domain associated tp @dev
- * @domain: The domain to managing the interrupts
+ * msi_domain_alloc_irqs - Allocate interrupts from a MSI interrupt domain
+ * @domain: The domain to allocate from
* @dev: Pointer to device struct of the device for which the interrupts
- * are free
+ * are allocated
+ * @nvec: The number of interrupts to allocate
+ *
+ * Returns 0 on success or an error code.
*/
-void msi_domain_free_irqs(struct irq_domain *domain, struct device *dev)
+int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev,
+ int nvec)
+{
+ struct msi_domain_info *info = domain->host_data;
+ struct msi_domain_ops *ops = info->ops;
+
+ return ops->domain_alloc_irqs(domain, dev, nvec);
+}
+
+void __msi_domain_free_irqs(struct irq_domain *domain, struct device *dev)
{
struct msi_desc *desc;
@@ -514,6 +525,20 @@ void msi_domain_free_irqs(struct irq_domain *domain, struct device *dev)
}
/**
+ * msi_domain_free_irqs - Free interrupts from a MSI interrupt @domain associated to @dev
+ * @domain: The domain managing the interrupts
+ * @dev: Pointer to device struct of the device for which the interrupts
+ * are freed
+ */
+void msi_domain_free_irqs(struct irq_domain *domain, struct device *dev)
+{
+ struct msi_domain_info *info = domain->host_data;
+ struct msi_domain_ops *ops = info->ops;
+
+ return ops->domain_free_irqs(domain, dev);
+}
+
+/**
* msi_get_domain_info - Get the MSI interrupt domain info for @domain
* @domain: The interrupt domain to retrieve data from
*
diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c
index c6c7e187ae74..ce0adb22ee96 100644
--- a/kernel/irq/pm.c
+++ b/kernel/irq/pm.c
@@ -69,12 +69,26 @@ void irq_pm_remove_action(struct irq_desc *desc, struct irqaction *action)
static bool suspend_device_irq(struct irq_desc *desc)
{
+ unsigned long chipflags = irq_desc_get_chip(desc)->flags;
+ struct irq_data *irqd = &desc->irq_data;
+
if (!desc->action || irq_desc_is_chained(desc) ||
desc->no_suspend_depth)
return false;
- if (irqd_is_wakeup_set(&desc->irq_data)) {
- irqd_set(&desc->irq_data, IRQD_WAKEUP_ARMED);
+ if (irqd_is_wakeup_set(irqd)) {
+ irqd_set(irqd, IRQD_WAKEUP_ARMED);
+
+ if ((chipflags & IRQCHIP_ENABLE_WAKEUP_ON_SUSPEND) &&
+ irqd_irq_disabled(irqd)) {
+ /*
+ * Interrupt marked for wakeup is in disabled state.
+ * Enable interrupt here to unmask/enable in irqchip
+ * to be able to resume with such interrupts.
+ */
+ __enable_irq(desc);
+ irqd_set(irqd, IRQD_IRQ_ENABLED_ON_SUSPEND);
+ }
/*
* We return true here to force the caller to issue
* synchronize_irq(). We need to make sure that the
@@ -93,7 +107,7 @@ static bool suspend_device_irq(struct irq_desc *desc)
* chip level. The chip implementation indicates that with
* IRQCHIP_MASK_ON_SUSPEND.
*/
- if (irq_desc_get_chip(desc)->flags & IRQCHIP_MASK_ON_SUSPEND)
+ if (chipflags & IRQCHIP_MASK_ON_SUSPEND)
mask_irq(desc);
return true;
}
@@ -137,7 +151,19 @@ EXPORT_SYMBOL_GPL(suspend_device_irqs);
static void resume_irq(struct irq_desc *desc)
{
- irqd_clear(&desc->irq_data, IRQD_WAKEUP_ARMED);
+ struct irq_data *irqd = &desc->irq_data;
+
+ irqd_clear(irqd, IRQD_WAKEUP_ARMED);
+
+ if (irqd_is_enabled_on_suspend(irqd)) {
+ /*
+ * Interrupt marked for wakeup was enabled during suspend
+ * entry. Disable such interrupts to restore them back to
+ * original state.
+ */
+ __disable_irq(desc);
+ irqd_clear(irqd, IRQD_IRQ_ENABLED_ON_SUSPEND);
+ }
if (desc->istate & IRQS_SUSPENDED)
goto resume;
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index 32c071d7bc03..72513ed2a5fc 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -485,7 +485,7 @@ int show_interrupts(struct seq_file *p, void *v)
rcu_read_lock();
desc = irq_to_desc(i);
- if (!desc)
+ if (!desc || irq_settings_is_hidden(desc))
goto outsparse;
if (desc->kstat_irqs)
diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c
index c48ce19a257f..8ccd32a0cc80 100644
--- a/kernel/irq/resend.c
+++ b/kernel/irq/resend.c
@@ -86,6 +86,18 @@ static int irq_sw_resend(struct irq_desc *desc)
}
#endif
+static int try_retrigger(struct irq_desc *desc)
+{
+ if (desc->irq_data.chip->irq_retrigger)
+ return desc->irq_data.chip->irq_retrigger(&desc->irq_data);
+
+#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY
+ return irq_chip_retrigger_hierarchy(&desc->irq_data);
+#else
+ return 0;
+#endif
+}
+
/*
* IRQ resend
*
@@ -113,8 +125,7 @@ int check_irq_resend(struct irq_desc *desc, bool inject)
desc->istate &= ~IRQS_PENDING;
- if (!desc->irq_data.chip->irq_retrigger ||
- !desc->irq_data.chip->irq_retrigger(&desc->irq_data))
+ if (!try_retrigger(desc))
err = irq_sw_resend(desc);
/* If the retrigger was successfull, mark it with the REPLAY bit */
diff --git a/kernel/irq/settings.h b/kernel/irq/settings.h
index e43795cd2ccf..403378b9947b 100644
--- a/kernel/irq/settings.h
+++ b/kernel/irq/settings.h
@@ -17,6 +17,7 @@ enum {
_IRQ_PER_CPU_DEVID = IRQ_PER_CPU_DEVID,
_IRQ_IS_POLLED = IRQ_IS_POLLED,
_IRQ_DISABLE_UNLAZY = IRQ_DISABLE_UNLAZY,
+ _IRQ_HIDDEN = IRQ_HIDDEN,
_IRQF_MODIFY_MASK = IRQF_MODIFY_MASK,
};
@@ -31,6 +32,7 @@ enum {
#define IRQ_PER_CPU_DEVID GOT_YOU_MORON
#define IRQ_IS_POLLED GOT_YOU_MORON
#define IRQ_DISABLE_UNLAZY GOT_YOU_MORON
+#define IRQ_HIDDEN GOT_YOU_MORON
#undef IRQF_MODIFY_MASK
#define IRQF_MODIFY_MASK GOT_YOU_MORON
@@ -167,3 +169,8 @@ static inline void irq_settings_clr_disable_unlazy(struct irq_desc *desc)
{
desc->status_use_accessors &= ~_IRQ_DISABLE_UNLAZY;
}
+
+static inline bool irq_settings_is_hidden(struct irq_desc *desc)
+{
+ return desc->status_use_accessors & _IRQ_HIDDEN;
+}
diff --git a/kernel/kcsan/core.c b/kernel/kcsan/core.c
index 9147ff6a12e5..3994a217bde7 100644
--- a/kernel/kcsan/core.c
+++ b/kernel/kcsan/core.c
@@ -1,5 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
+#define pr_fmt(fmt) "kcsan: " fmt
+
#include <linux/atomic.h>
#include <linux/bug.h>
#include <linux/delay.h>
@@ -98,6 +100,9 @@ static atomic_long_t watchpoints[CONFIG_KCSAN_NUM_WATCHPOINTS + NUM_SLOTS-1];
*/
static DEFINE_PER_CPU(long, kcsan_skip);
+/* For kcsan_prandom_u32_max(). */
+static DEFINE_PER_CPU(struct rnd_state, kcsan_rand_state);
+
static __always_inline atomic_long_t *find_watchpoint(unsigned long addr,
size_t size,
bool expect_write,
@@ -223,7 +228,7 @@ is_atomic(const volatile void *ptr, size_t size, int type, struct kcsan_ctx *ctx
if (IS_ENABLED(CONFIG_KCSAN_ASSUME_PLAIN_WRITES_ATOMIC) &&
(type & KCSAN_ACCESS_WRITE) && size <= sizeof(long) &&
- IS_ALIGNED((unsigned long)ptr, size))
+ !(type & KCSAN_ACCESS_COMPOUND) && IS_ALIGNED((unsigned long)ptr, size))
return true; /* Assume aligned writes up to word size are atomic. */
if (ctx->atomic_next > 0) {
@@ -269,11 +274,28 @@ should_watch(const volatile void *ptr, size_t size, int type, struct kcsan_ctx *
return true;
}
+/*
+ * Returns a pseudo-random number in interval [0, ep_ro). See prandom_u32_max()
+ * for more details.
+ *
+ * The open-coded version here is using only safe primitives for all contexts
+ * where we can have KCSAN instrumentation. In particular, we cannot use
+ * prandom_u32() directly, as its tracepoint could cause recursion.
+ */
+static u32 kcsan_prandom_u32_max(u32 ep_ro)
+{
+ struct rnd_state *state = &get_cpu_var(kcsan_rand_state);
+ const u32 res = prandom_u32_state(state);
+
+ put_cpu_var(kcsan_rand_state);
+ return (u32)(((u64) res * ep_ro) >> 32);
+}
+
static inline void reset_kcsan_skip(void)
{
long skip_count = kcsan_skip_watch -
(IS_ENABLED(CONFIG_KCSAN_SKIP_WATCH_RANDOMIZE) ?
- prandom_u32_max(kcsan_skip_watch) :
+ kcsan_prandom_u32_max(kcsan_skip_watch) :
0);
this_cpu_write(kcsan_skip, skip_count);
}
@@ -283,12 +305,18 @@ static __always_inline bool kcsan_is_enabled(void)
return READ_ONCE(kcsan_enabled) && get_ctx()->disable_count == 0;
}
-static inline unsigned int get_delay(void)
+/* Introduce delay depending on context and configuration. */
+static void delay_access(int type)
{
unsigned int delay = in_task() ? kcsan_udelay_task : kcsan_udelay_interrupt;
- return delay - (IS_ENABLED(CONFIG_KCSAN_DELAY_RANDOMIZE) ?
- prandom_u32_max(delay) :
- 0);
+ /* For certain access types, skew the random delay to be longer. */
+ unsigned int skew_delay_order =
+ (type & (KCSAN_ACCESS_COMPOUND | KCSAN_ACCESS_ASSERT)) ? 1 : 0;
+
+ delay -= IS_ENABLED(CONFIG_KCSAN_DELAY_RANDOMIZE) ?
+ kcsan_prandom_u32_max(delay >> skew_delay_order) :
+ 0;
+ udelay(delay);
}
void kcsan_save_irqtrace(struct task_struct *task)
@@ -361,13 +389,13 @@ static noinline void kcsan_found_watchpoint(const volatile void *ptr,
* already removed the watchpoint, or another thread consumed
* the watchpoint before this thread.
*/
- kcsan_counter_inc(KCSAN_COUNTER_REPORT_RACES);
+ atomic_long_inc(&kcsan_counters[KCSAN_COUNTER_REPORT_RACES]);
}
if ((type & KCSAN_ACCESS_ASSERT) != 0)
- kcsan_counter_inc(KCSAN_COUNTER_ASSERT_FAILURES);
+ atomic_long_inc(&kcsan_counters[KCSAN_COUNTER_ASSERT_FAILURES]);
else
- kcsan_counter_inc(KCSAN_COUNTER_DATA_RACES);
+ atomic_long_inc(&kcsan_counters[KCSAN_COUNTER_DATA_RACES]);
user_access_restore(flags);
}
@@ -408,7 +436,7 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type)
goto out;
if (!check_encodable((unsigned long)ptr, size)) {
- kcsan_counter_inc(KCSAN_COUNTER_UNENCODABLE_ACCESSES);
+ atomic_long_inc(&kcsan_counters[KCSAN_COUNTER_UNENCODABLE_ACCESSES]);
goto out;
}
@@ -428,12 +456,12 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type)
* with which should_watch() returns true should be tweaked so
* that this case happens very rarely.
*/
- kcsan_counter_inc(KCSAN_COUNTER_NO_CAPACITY);
+ atomic_long_inc(&kcsan_counters[KCSAN_COUNTER_NO_CAPACITY]);
goto out_unlock;
}
- kcsan_counter_inc(KCSAN_COUNTER_SETUP_WATCHPOINTS);
- kcsan_counter_inc(KCSAN_COUNTER_USED_WATCHPOINTS);
+ atomic_long_inc(&kcsan_counters[KCSAN_COUNTER_SETUP_WATCHPOINTS]);
+ atomic_long_inc(&kcsan_counters[KCSAN_COUNTER_USED_WATCHPOINTS]);
/*
* Read the current value, to later check and infer a race if the data
@@ -459,7 +487,7 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type)
if (IS_ENABLED(CONFIG_KCSAN_DEBUG)) {
kcsan_disable_current();
- pr_err("KCSAN: watching %s, size: %zu, addr: %px [slot: %d, encoded: %lx]\n",
+ pr_err("watching %s, size: %zu, addr: %px [slot: %d, encoded: %lx]\n",
is_write ? "write" : "read", size, ptr,
watchpoint_slot((unsigned long)ptr),
encode_watchpoint((unsigned long)ptr, size, is_write));
@@ -470,7 +498,7 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type)
* Delay this thread, to increase probability of observing a racy
* conflicting access.
*/
- udelay(get_delay());
+ delay_access(type);
/*
* Re-read value, and check if it is as expected; if not, we infer a
@@ -535,16 +563,16 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type)
* increment this counter.
*/
if (is_assert && value_change == KCSAN_VALUE_CHANGE_TRUE)
- kcsan_counter_inc(KCSAN_COUNTER_ASSERT_FAILURES);
+ atomic_long_inc(&kcsan_counters[KCSAN_COUNTER_ASSERT_FAILURES]);
kcsan_report(ptr, size, type, value_change, KCSAN_REPORT_RACE_SIGNAL,
watchpoint - watchpoints);
} else if (value_change == KCSAN_VALUE_CHANGE_TRUE) {
/* Inferring a race, since the value should not have changed. */
- kcsan_counter_inc(KCSAN_COUNTER_RACES_UNKNOWN_ORIGIN);
+ atomic_long_inc(&kcsan_counters[KCSAN_COUNTER_RACES_UNKNOWN_ORIGIN]);
if (is_assert)
- kcsan_counter_inc(KCSAN_COUNTER_ASSERT_FAILURES);
+ atomic_long_inc(&kcsan_counters[KCSAN_COUNTER_ASSERT_FAILURES]);
if (IS_ENABLED(CONFIG_KCSAN_REPORT_RACE_UNKNOWN_ORIGIN) || is_assert)
kcsan_report(ptr, size, type, KCSAN_VALUE_CHANGE_TRUE,
@@ -557,7 +585,7 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type)
* reused after this point.
*/
remove_watchpoint(watchpoint);
- kcsan_counter_dec(KCSAN_COUNTER_USED_WATCHPOINTS);
+ atomic_long_dec(&kcsan_counters[KCSAN_COUNTER_USED_WATCHPOINTS]);
out_unlock:
if (!kcsan_interrupt_watcher)
local_irq_restore(irq_flags);
@@ -614,13 +642,16 @@ void __init kcsan_init(void)
BUG_ON(!in_task());
kcsan_debugfs_init();
+ prandom_seed_full_state(&kcsan_rand_state);
/*
* We are in the init task, and no other tasks should be running;
* WRITE_ONCE without memory barrier is sufficient.
*/
- if (kcsan_early_enable)
+ if (kcsan_early_enable) {
+ pr_info("enabled early\n");
WRITE_ONCE(kcsan_enabled, true);
+ }
}
/* === Exported interface =================================================== */
@@ -793,7 +824,17 @@ EXPORT_SYMBOL(__kcsan_check_access);
EXPORT_SYMBOL(__tsan_write##size); \
void __tsan_unaligned_write##size(void *ptr) \
__alias(__tsan_write##size); \
- EXPORT_SYMBOL(__tsan_unaligned_write##size)
+ EXPORT_SYMBOL(__tsan_unaligned_write##size); \
+ void __tsan_read_write##size(void *ptr); \
+ void __tsan_read_write##size(void *ptr) \
+ { \
+ check_access(ptr, size, \
+ KCSAN_ACCESS_COMPOUND | KCSAN_ACCESS_WRITE); \
+ } \
+ EXPORT_SYMBOL(__tsan_read_write##size); \
+ void __tsan_unaligned_read_write##size(void *ptr) \
+ __alias(__tsan_read_write##size); \
+ EXPORT_SYMBOL(__tsan_unaligned_read_write##size)
DEFINE_TSAN_READ_WRITE(1);
DEFINE_TSAN_READ_WRITE(2);
@@ -879,3 +920,130 @@ void __tsan_init(void)
{
}
EXPORT_SYMBOL(__tsan_init);
+
+/*
+ * Instrumentation for atomic builtins (__atomic_*, __sync_*).
+ *
+ * Normal kernel code _should not_ be using them directly, but some
+ * architectures may implement some or all atomics using the compilers'
+ * builtins.
+ *
+ * Note: If an architecture decides to fully implement atomics using the
+ * builtins, because they are implicitly instrumented by KCSAN (and KASAN,
+ * etc.), implementing the ARCH_ATOMIC interface (to get instrumentation via
+ * atomic-instrumented) is no longer necessary.
+ *
+ * TSAN instrumentation replaces atomic accesses with calls to any of the below
+ * functions, whose job is to also execute the operation itself.
+ */
+
+#define DEFINE_TSAN_ATOMIC_LOAD_STORE(bits) \
+ u##bits __tsan_atomic##bits##_load(const u##bits *ptr, int memorder); \
+ u##bits __tsan_atomic##bits##_load(const u##bits *ptr, int memorder) \
+ { \
+ if (!IS_ENABLED(CONFIG_KCSAN_IGNORE_ATOMICS)) { \
+ check_access(ptr, bits / BITS_PER_BYTE, KCSAN_ACCESS_ATOMIC); \
+ } \
+ return __atomic_load_n(ptr, memorder); \
+ } \
+ EXPORT_SYMBOL(__tsan_atomic##bits##_load); \
+ void __tsan_atomic##bits##_store(u##bits *ptr, u##bits v, int memorder); \
+ void __tsan_atomic##bits##_store(u##bits *ptr, u##bits v, int memorder) \
+ { \
+ if (!IS_ENABLED(CONFIG_KCSAN_IGNORE_ATOMICS)) { \
+ check_access(ptr, bits / BITS_PER_BYTE, \
+ KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ATOMIC); \
+ } \
+ __atomic_store_n(ptr, v, memorder); \
+ } \
+ EXPORT_SYMBOL(__tsan_atomic##bits##_store)
+
+#define DEFINE_TSAN_ATOMIC_RMW(op, bits, suffix) \
+ u##bits __tsan_atomic##bits##_##op(u##bits *ptr, u##bits v, int memorder); \
+ u##bits __tsan_atomic##bits##_##op(u##bits *ptr, u##bits v, int memorder) \
+ { \
+ if (!IS_ENABLED(CONFIG_KCSAN_IGNORE_ATOMICS)) { \
+ check_access(ptr, bits / BITS_PER_BYTE, \
+ KCSAN_ACCESS_COMPOUND | KCSAN_ACCESS_WRITE | \
+ KCSAN_ACCESS_ATOMIC); \
+ } \
+ return __atomic_##op##suffix(ptr, v, memorder); \
+ } \
+ EXPORT_SYMBOL(__tsan_atomic##bits##_##op)
+
+/*
+ * Note: CAS operations are always classified as write, even in case they
+ * fail. We cannot perform check_access() after a write, as it might lead to
+ * false positives, in cases such as:
+ *
+ * T0: __atomic_compare_exchange_n(&p->flag, &old, 1, ...)
+ *
+ * T1: if (__atomic_load_n(&p->flag, ...)) {
+ * modify *p;
+ * p->flag = 0;
+ * }
+ *
+ * The only downside is that, if there are 3 threads, with one CAS that
+ * succeeds, another CAS that fails, and an unmarked racing operation, we may
+ * point at the wrong CAS as the source of the race. However, if we assume that
+ * all CAS can succeed in some other execution, the data race is still valid.
+ */
+#define DEFINE_TSAN_ATOMIC_CMPXCHG(bits, strength, weak) \
+ int __tsan_atomic##bits##_compare_exchange_##strength(u##bits *ptr, u##bits *exp, \
+ u##bits val, int mo, int fail_mo); \
+ int __tsan_atomic##bits##_compare_exchange_##strength(u##bits *ptr, u##bits *exp, \
+ u##bits val, int mo, int fail_mo) \
+ { \
+ if (!IS_ENABLED(CONFIG_KCSAN_IGNORE_ATOMICS)) { \
+ check_access(ptr, bits / BITS_PER_BYTE, \
+ KCSAN_ACCESS_COMPOUND | KCSAN_ACCESS_WRITE | \
+ KCSAN_ACCESS_ATOMIC); \
+ } \
+ return __atomic_compare_exchange_n(ptr, exp, val, weak, mo, fail_mo); \
+ } \
+ EXPORT_SYMBOL(__tsan_atomic##bits##_compare_exchange_##strength)
+
+#define DEFINE_TSAN_ATOMIC_CMPXCHG_VAL(bits) \
+ u##bits __tsan_atomic##bits##_compare_exchange_val(u##bits *ptr, u##bits exp, u##bits val, \
+ int mo, int fail_mo); \
+ u##bits __tsan_atomic##bits##_compare_exchange_val(u##bits *ptr, u##bits exp, u##bits val, \
+ int mo, int fail_mo) \
+ { \
+ if (!IS_ENABLED(CONFIG_KCSAN_IGNORE_ATOMICS)) { \
+ check_access(ptr, bits / BITS_PER_BYTE, \
+ KCSAN_ACCESS_COMPOUND | KCSAN_ACCESS_WRITE | \
+ KCSAN_ACCESS_ATOMIC); \
+ } \
+ __atomic_compare_exchange_n(ptr, &exp, val, 0, mo, fail_mo); \
+ return exp; \
+ } \
+ EXPORT_SYMBOL(__tsan_atomic##bits##_compare_exchange_val)
+
+#define DEFINE_TSAN_ATOMIC_OPS(bits) \
+ DEFINE_TSAN_ATOMIC_LOAD_STORE(bits); \
+ DEFINE_TSAN_ATOMIC_RMW(exchange, bits, _n); \
+ DEFINE_TSAN_ATOMIC_RMW(fetch_add, bits, ); \
+ DEFINE_TSAN_ATOMIC_RMW(fetch_sub, bits, ); \
+ DEFINE_TSAN_ATOMIC_RMW(fetch_and, bits, ); \
+ DEFINE_TSAN_ATOMIC_RMW(fetch_or, bits, ); \
+ DEFINE_TSAN_ATOMIC_RMW(fetch_xor, bits, ); \
+ DEFINE_TSAN_ATOMIC_RMW(fetch_nand, bits, ); \
+ DEFINE_TSAN_ATOMIC_CMPXCHG(bits, strong, 0); \
+ DEFINE_TSAN_ATOMIC_CMPXCHG(bits, weak, 1); \
+ DEFINE_TSAN_ATOMIC_CMPXCHG_VAL(bits)
+
+DEFINE_TSAN_ATOMIC_OPS(8);
+DEFINE_TSAN_ATOMIC_OPS(16);
+DEFINE_TSAN_ATOMIC_OPS(32);
+DEFINE_TSAN_ATOMIC_OPS(64);
+
+void __tsan_atomic_thread_fence(int memorder);
+void __tsan_atomic_thread_fence(int memorder)
+{
+ __atomic_thread_fence(memorder);
+}
+EXPORT_SYMBOL(__tsan_atomic_thread_fence);
+
+void __tsan_atomic_signal_fence(int memorder);
+void __tsan_atomic_signal_fence(int memorder) { }
+EXPORT_SYMBOL(__tsan_atomic_signal_fence);
diff --git a/kernel/kcsan/debugfs.c b/kernel/kcsan/debugfs.c
index 023e49c58d55..3c8093a371b1 100644
--- a/kernel/kcsan/debugfs.c
+++ b/kernel/kcsan/debugfs.c
@@ -1,5 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
+#define pr_fmt(fmt) "kcsan: " fmt
+
#include <linux/atomic.h>
#include <linux/bsearch.h>
#include <linux/bug.h>
@@ -15,10 +17,19 @@
#include "kcsan.h"
-/*
- * Statistics counters.
- */
-static atomic_long_t counters[KCSAN_COUNTER_COUNT];
+atomic_long_t kcsan_counters[KCSAN_COUNTER_COUNT];
+static const char *const counter_names[] = {
+ [KCSAN_COUNTER_USED_WATCHPOINTS] = "used_watchpoints",
+ [KCSAN_COUNTER_SETUP_WATCHPOINTS] = "setup_watchpoints",
+ [KCSAN_COUNTER_DATA_RACES] = "data_races",
+ [KCSAN_COUNTER_ASSERT_FAILURES] = "assert_failures",
+ [KCSAN_COUNTER_NO_CAPACITY] = "no_capacity",
+ [KCSAN_COUNTER_REPORT_RACES] = "report_races",
+ [KCSAN_COUNTER_RACES_UNKNOWN_ORIGIN] = "races_unknown_origin",
+ [KCSAN_COUNTER_UNENCODABLE_ACCESSES] = "unencodable_accesses",
+ [KCSAN_COUNTER_ENCODING_FALSE_POSITIVES] = "encoding_false_positives",
+};
+static_assert(ARRAY_SIZE(counter_names) == KCSAN_COUNTER_COUNT);
/*
* Addresses for filtering functions from reporting. This list can be used as a
@@ -39,34 +50,6 @@ static struct {
};
static DEFINE_SPINLOCK(report_filterlist_lock);
-static const char *counter_to_name(enum kcsan_counter_id id)
-{
- switch (id) {
- case KCSAN_COUNTER_USED_WATCHPOINTS: return "used_watchpoints";
- case KCSAN_COUNTER_SETUP_WATCHPOINTS: return "setup_watchpoints";
- case KCSAN_COUNTER_DATA_RACES: return "data_races";
- case KCSAN_COUNTER_ASSERT_FAILURES: return "assert_failures";
- case KCSAN_COUNTER_NO_CAPACITY: return "no_capacity";
- case KCSAN_COUNTER_REPORT_RACES: return "report_races";
- case KCSAN_COUNTER_RACES_UNKNOWN_ORIGIN: return "races_unknown_origin";
- case KCSAN_COUNTER_UNENCODABLE_ACCESSES: return "unencodable_accesses";
- case KCSAN_COUNTER_ENCODING_FALSE_POSITIVES: return "encoding_false_positives";
- case KCSAN_COUNTER_COUNT:
- BUG();
- }
- return NULL;
-}
-
-void kcsan_counter_inc(enum kcsan_counter_id id)
-{
- atomic_long_inc(&counters[id]);
-}
-
-void kcsan_counter_dec(enum kcsan_counter_id id)
-{
- atomic_long_dec(&counters[id]);
-}
-
/*
* The microbenchmark allows benchmarking KCSAN core runtime only. To run
* multiple threads, pipe 'microbench=<iters>' from multiple tasks into the
@@ -86,7 +69,7 @@ static noinline void microbenchmark(unsigned long iters)
*/
WRITE_ONCE(kcsan_enabled, false);
- pr_info("KCSAN: %s begin | iters: %lu\n", __func__, iters);
+ pr_info("%s begin | iters: %lu\n", __func__, iters);
cycles = get_cycles();
while (iters--) {
@@ -97,73 +80,13 @@ static noinline void microbenchmark(unsigned long iters)
}
cycles = get_cycles() - cycles;
- pr_info("KCSAN: %s end | cycles: %llu\n", __func__, cycles);
+ pr_info("%s end | cycles: %llu\n", __func__, cycles);
WRITE_ONCE(kcsan_enabled, was_enabled);
/* restore context */
current->kcsan_ctx = ctx_save;
}
-/*
- * Simple test to create conflicting accesses. Write 'test=<iters>' to KCSAN's
- * debugfs file from multiple tasks to generate real conflicts and show reports.
- */
-static long test_dummy;
-static long test_flags;
-static long test_scoped;
-static noinline void test_thread(unsigned long iters)
-{
- const long CHANGE_BITS = 0xff00ff00ff00ff00L;
- const struct kcsan_ctx ctx_save = current->kcsan_ctx;
- cycles_t cycles;
-
- /* We may have been called from an atomic region; reset context. */
- memset(&current->kcsan_ctx, 0, sizeof(current->kcsan_ctx));
-
- pr_info("KCSAN: %s begin | iters: %lu\n", __func__, iters);
- pr_info("test_dummy@%px, test_flags@%px, test_scoped@%px,\n",
- &test_dummy, &test_flags, &test_scoped);
-
- cycles = get_cycles();
- while (iters--) {
- /* These all should generate reports. */
- __kcsan_check_read(&test_dummy, sizeof(test_dummy));
- ASSERT_EXCLUSIVE_WRITER(test_dummy);
- ASSERT_EXCLUSIVE_ACCESS(test_dummy);
-
- ASSERT_EXCLUSIVE_BITS(test_flags, ~CHANGE_BITS); /* no report */
- __kcsan_check_read(&test_flags, sizeof(test_flags)); /* no report */
-
- ASSERT_EXCLUSIVE_BITS(test_flags, CHANGE_BITS); /* report */
- __kcsan_check_read(&test_flags, sizeof(test_flags)); /* no report */
-
- /* not actually instrumented */
- WRITE_ONCE(test_dummy, iters); /* to observe value-change */
- __kcsan_check_write(&test_dummy, sizeof(test_dummy));
-
- test_flags ^= CHANGE_BITS; /* generate value-change */
- __kcsan_check_write(&test_flags, sizeof(test_flags));
-
- BUG_ON(current->kcsan_ctx.scoped_accesses.prev);
- {
- /* Should generate reports anywhere in this block. */
- ASSERT_EXCLUSIVE_WRITER_SCOPED(test_scoped);
- ASSERT_EXCLUSIVE_ACCESS_SCOPED(test_scoped);
- BUG_ON(!current->kcsan_ctx.scoped_accesses.prev);
- /* Unrelated accesses. */
- __kcsan_check_access(&cycles, sizeof(cycles), 0);
- __kcsan_check_access(&cycles, sizeof(cycles), KCSAN_ACCESS_ATOMIC);
- }
- BUG_ON(current->kcsan_ctx.scoped_accesses.prev);
- }
- cycles = get_cycles() - cycles;
-
- pr_info("KCSAN: %s end | cycles: %llu\n", __func__, cycles);
-
- /* restore context */
- current->kcsan_ctx = ctx_save;
-}
-
static int cmp_filterlist_addrs(const void *rhs, const void *lhs)
{
const unsigned long a = *(const unsigned long *)rhs;
@@ -220,7 +143,7 @@ static ssize_t insert_report_filterlist(const char *func)
ssize_t ret = 0;
if (!addr) {
- pr_err("KCSAN: could not find function: '%s'\n", func);
+ pr_err("could not find function: '%s'\n", func);
return -ENOENT;
}
@@ -270,9 +193,10 @@ static int show_info(struct seq_file *file, void *v)
/* show stats */
seq_printf(file, "enabled: %i\n", READ_ONCE(kcsan_enabled));
- for (i = 0; i < KCSAN_COUNTER_COUNT; ++i)
- seq_printf(file, "%s: %ld\n", counter_to_name(i),
- atomic_long_read(&counters[i]));
+ for (i = 0; i < KCSAN_COUNTER_COUNT; ++i) {
+ seq_printf(file, "%s: %ld\n", counter_names[i],
+ atomic_long_read(&kcsan_counters[i]));
+ }
/* show filter functions, and filter type */
spin_lock_irqsave(&report_filterlist_lock, flags);
@@ -307,18 +231,12 @@ debugfs_write(struct file *file, const char __user *buf, size_t count, loff_t *o
WRITE_ONCE(kcsan_enabled, true);
} else if (!strcmp(arg, "off")) {
WRITE_ONCE(kcsan_enabled, false);
- } else if (!strncmp(arg, "microbench=", sizeof("microbench=") - 1)) {
+ } else if (str_has_prefix(arg, "microbench=")) {
unsigned long iters;
- if (kstrtoul(&arg[sizeof("microbench=") - 1], 0, &iters))
+ if (kstrtoul(&arg[strlen("microbench=")], 0, &iters))
return -EINVAL;
microbenchmark(iters);
- } else if (!strncmp(arg, "test=", sizeof("test=") - 1)) {
- unsigned long iters;
-
- if (kstrtoul(&arg[sizeof("test=") - 1], 0, &iters))
- return -EINVAL;
- test_thread(iters);
} else if (!strcmp(arg, "whitelist")) {
set_report_filterlist_whitelist(true);
} else if (!strcmp(arg, "blacklist")) {
diff --git a/kernel/kcsan/kcsan-test.c b/kernel/kcsan/kcsan-test.c
index fed6fcb5768c..ebe7fd245104 100644
--- a/kernel/kcsan/kcsan-test.c
+++ b/kernel/kcsan/kcsan-test.c
@@ -27,6 +27,12 @@
#include <linux/types.h>
#include <trace/events/printk.h>
+#ifdef CONFIG_CC_HAS_TSAN_COMPOUND_READ_BEFORE_WRITE
+#define __KCSAN_ACCESS_RW(alt) (KCSAN_ACCESS_COMPOUND | KCSAN_ACCESS_WRITE)
+#else
+#define __KCSAN_ACCESS_RW(alt) (alt)
+#endif
+
/* Points to current test-case memory access "kernels". */
static void (*access_kernels[2])(void);
@@ -186,20 +192,21 @@ static bool report_matches(const struct expect_report *r)
/* Access 1 & 2 */
for (i = 0; i < 2; ++i) {
+ const int ty = r->access[i].type;
const char *const access_type =
- (r->access[i].type & KCSAN_ACCESS_ASSERT) ?
- ((r->access[i].type & KCSAN_ACCESS_WRITE) ?
- "assert no accesses" :
- "assert no writes") :
- ((r->access[i].type & KCSAN_ACCESS_WRITE) ?
- "write" :
- "read");
+ (ty & KCSAN_ACCESS_ASSERT) ?
+ ((ty & KCSAN_ACCESS_WRITE) ?
+ "assert no accesses" :
+ "assert no writes") :
+ ((ty & KCSAN_ACCESS_WRITE) ?
+ ((ty & KCSAN_ACCESS_COMPOUND) ?
+ "read-write" :
+ "write") :
+ "read");
const char *const access_type_aux =
- (r->access[i].type & KCSAN_ACCESS_ATOMIC) ?
- " (marked)" :
- ((r->access[i].type & KCSAN_ACCESS_SCOPED) ?
- " (scoped)" :
- "");
+ (ty & KCSAN_ACCESS_ATOMIC) ?
+ " (marked)" :
+ ((ty & KCSAN_ACCESS_SCOPED) ? " (scoped)" : "");
if (i == 1) {
/* Access 2 */
@@ -277,6 +284,12 @@ static noinline void test_kernel_write_atomic(void)
WRITE_ONCE(test_var, READ_ONCE_NOCHECK(test_sink) + 1);
}
+static noinline void test_kernel_atomic_rmw(void)
+{
+ /* Use builtin, so we can set up the "bad" atomic/non-atomic scenario. */
+ __atomic_fetch_add(&test_var, 1, __ATOMIC_RELAXED);
+}
+
__no_kcsan
static noinline void test_kernel_write_uninstrumented(void) { test_var++; }
@@ -390,6 +403,15 @@ static noinline void test_kernel_seqlock_writer(void)
write_sequnlock_irqrestore(&test_seqlock, flags);
}
+static noinline void test_kernel_atomic_builtins(void)
+{
+ /*
+ * Generate concurrent accesses, expecting no reports, ensuring KCSAN
+ * treats builtin atomics as actually atomic.
+ */
+ __atomic_load_n(&test_var, __ATOMIC_RELAXED);
+}
+
/* ===== Test cases ===== */
/* Simple test with normal data race. */
@@ -430,8 +452,8 @@ static void test_concurrent_races(struct kunit *test)
const struct expect_report expect = {
.access = {
/* NULL will match any address. */
- { test_kernel_rmw_array, NULL, 0, KCSAN_ACCESS_WRITE },
- { test_kernel_rmw_array, NULL, 0, 0 },
+ { test_kernel_rmw_array, NULL, 0, __KCSAN_ACCESS_RW(KCSAN_ACCESS_WRITE) },
+ { test_kernel_rmw_array, NULL, 0, __KCSAN_ACCESS_RW(0) },
},
};
static const struct expect_report never = {
@@ -620,6 +642,29 @@ static void test_read_plain_atomic_write(struct kunit *test)
KUNIT_EXPECT_TRUE(test, match_expect);
}
+/* Test that atomic RMWs generate correct report. */
+__no_kcsan
+static void test_read_plain_atomic_rmw(struct kunit *test)
+{
+ const struct expect_report expect = {
+ .access = {
+ { test_kernel_read, &test_var, sizeof(test_var), 0 },
+ { test_kernel_atomic_rmw, &test_var, sizeof(test_var),
+ KCSAN_ACCESS_COMPOUND | KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ATOMIC },
+ },
+ };
+ bool match_expect = false;
+
+ if (IS_ENABLED(CONFIG_KCSAN_IGNORE_ATOMICS))
+ return;
+
+ begin_test_checks(test_kernel_read, test_kernel_atomic_rmw);
+ do {
+ match_expect = report_matches(&expect);
+ } while (!end_test_checks(match_expect));
+ KUNIT_EXPECT_TRUE(test, match_expect);
+}
+
/* Zero-sized accesses should never cause data race reports. */
__no_kcsan
static void test_zero_size_access(struct kunit *test)
@@ -853,6 +898,59 @@ static void test_seqlock_noreport(struct kunit *test)
}
/*
+ * Test atomic builtins work and required instrumentation functions exist. We
+ * also test that KCSAN understands they're atomic by racing with them via
+ * test_kernel_atomic_builtins(), and expect no reports.
+ *
+ * The atomic builtins _SHOULD NOT_ be used in normal kernel code!
+ */
+static void test_atomic_builtins(struct kunit *test)
+{
+ bool match_never = false;
+
+ begin_test_checks(test_kernel_atomic_builtins, test_kernel_atomic_builtins);
+ do {
+ long tmp;
+
+ kcsan_enable_current();
+
+ __atomic_store_n(&test_var, 42L, __ATOMIC_RELAXED);
+ KUNIT_EXPECT_EQ(test, 42L, __atomic_load_n(&test_var, __ATOMIC_RELAXED));
+
+ KUNIT_EXPECT_EQ(test, 42L, __atomic_exchange_n(&test_var, 20, __ATOMIC_RELAXED));
+ KUNIT_EXPECT_EQ(test, 20L, test_var);
+
+ tmp = 20L;
+ KUNIT_EXPECT_TRUE(test, __atomic_compare_exchange_n(&test_var, &tmp, 30L,
+ 0, __ATOMIC_RELAXED,
+ __ATOMIC_RELAXED));
+ KUNIT_EXPECT_EQ(test, tmp, 20L);
+ KUNIT_EXPECT_EQ(test, test_var, 30L);
+ KUNIT_EXPECT_FALSE(test, __atomic_compare_exchange_n(&test_var, &tmp, 40L,
+ 1, __ATOMIC_RELAXED,
+ __ATOMIC_RELAXED));
+ KUNIT_EXPECT_EQ(test, tmp, 30L);
+ KUNIT_EXPECT_EQ(test, test_var, 30L);
+
+ KUNIT_EXPECT_EQ(test, 30L, __atomic_fetch_add(&test_var, 1, __ATOMIC_RELAXED));
+ KUNIT_EXPECT_EQ(test, 31L, __atomic_fetch_sub(&test_var, 1, __ATOMIC_RELAXED));
+ KUNIT_EXPECT_EQ(test, 30L, __atomic_fetch_and(&test_var, 0xf, __ATOMIC_RELAXED));
+ KUNIT_EXPECT_EQ(test, 14L, __atomic_fetch_xor(&test_var, 0xf, __ATOMIC_RELAXED));
+ KUNIT_EXPECT_EQ(test, 1L, __atomic_fetch_or(&test_var, 0xf0, __ATOMIC_RELAXED));
+ KUNIT_EXPECT_EQ(test, 241L, __atomic_fetch_nand(&test_var, 0xf, __ATOMIC_RELAXED));
+ KUNIT_EXPECT_EQ(test, -2L, test_var);
+
+ __atomic_thread_fence(__ATOMIC_SEQ_CST);
+ __atomic_signal_fence(__ATOMIC_SEQ_CST);
+
+ kcsan_disable_current();
+
+ match_never = report_available();
+ } while (!end_test_checks(match_never));
+ KUNIT_EXPECT_FALSE(test, match_never);
+}
+
+/*
* Each test case is run with different numbers of threads. Until KUnit supports
* passing arguments for each test case, we encode #threads in the test case
* name (read by get_num_threads()). [The '-' was chosen as a stylistic
@@ -880,6 +978,7 @@ static struct kunit_case kcsan_test_cases[] = {
KCSAN_KUNIT_CASE(test_write_write_struct_part),
KCSAN_KUNIT_CASE(test_read_atomic_write_atomic),
KCSAN_KUNIT_CASE(test_read_plain_atomic_write),
+ KCSAN_KUNIT_CASE(test_read_plain_atomic_rmw),
KCSAN_KUNIT_CASE(test_zero_size_access),
KCSAN_KUNIT_CASE(test_data_race),
KCSAN_KUNIT_CASE(test_assert_exclusive_writer),
@@ -891,6 +990,7 @@ static struct kunit_case kcsan_test_cases[] = {
KCSAN_KUNIT_CASE(test_assert_exclusive_access_scoped),
KCSAN_KUNIT_CASE(test_jiffies_noreport),
KCSAN_KUNIT_CASE(test_seqlock_noreport),
+ KCSAN_KUNIT_CASE(test_atomic_builtins),
{},
};
diff --git a/kernel/kcsan/kcsan.h b/kernel/kcsan/kcsan.h
index 29480010dc30..8d4bf3431b3c 100644
--- a/kernel/kcsan/kcsan.h
+++ b/kernel/kcsan/kcsan.h
@@ -8,6 +8,7 @@
#ifndef _KERNEL_KCSAN_KCSAN_H
#define _KERNEL_KCSAN_KCSAN_H
+#include <linux/atomic.h>
#include <linux/kcsan.h>
#include <linux/sched.h>
@@ -34,6 +35,10 @@ void kcsan_restore_irqtrace(struct task_struct *task);
*/
void kcsan_debugfs_init(void);
+/*
+ * Statistics counters displayed via debugfs; should only be modified in
+ * slow-paths.
+ */
enum kcsan_counter_id {
/*
* Number of watchpoints currently in use.
@@ -86,12 +91,7 @@ enum kcsan_counter_id {
KCSAN_COUNTER_COUNT, /* number of counters */
};
-
-/*
- * Increment/decrement counter with given id; avoid calling these in fast-path.
- */
-extern void kcsan_counter_inc(enum kcsan_counter_id id);
-extern void kcsan_counter_dec(enum kcsan_counter_id id);
+extern atomic_long_t kcsan_counters[KCSAN_COUNTER_COUNT];
/*
* Returns true if data races in the function symbol that maps to func_addr
diff --git a/kernel/kcsan/report.c b/kernel/kcsan/report.c
index 9d07e175de0f..d3bf87e6007c 100644
--- a/kernel/kcsan/report.c
+++ b/kernel/kcsan/report.c
@@ -228,6 +228,10 @@ static const char *get_access_type(int type)
return "write";
case KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ATOMIC:
return "write (marked)";
+ case KCSAN_ACCESS_COMPOUND | KCSAN_ACCESS_WRITE:
+ return "read-write";
+ case KCSAN_ACCESS_COMPOUND | KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ATOMIC:
+ return "read-write (marked)";
case KCSAN_ACCESS_SCOPED:
return "read (scoped)";
case KCSAN_ACCESS_SCOPED | KCSAN_ACCESS_ATOMIC:
@@ -275,8 +279,8 @@ static int get_stack_skipnr(const unsigned long stack_entries[], int num_entries
cur = strnstr(buf, "kcsan_", len);
if (cur) {
- cur += sizeof("kcsan_") - 1;
- if (strncmp(cur, "test", sizeof("test") - 1))
+ cur += strlen("kcsan_");
+ if (!str_has_prefix(cur, "test"))
continue; /* KCSAN runtime function. */
/* KCSAN related test. */
}
@@ -555,7 +559,7 @@ static bool prepare_report_consumer(unsigned long *flags,
* If the actual accesses do not match, this was a false
* positive due to watchpoint encoding.
*/
- kcsan_counter_inc(KCSAN_COUNTER_ENCODING_FALSE_POSITIVES);
+ atomic_long_inc(&kcsan_counters[KCSAN_COUNTER_ENCODING_FALSE_POSITIVES]);
goto discard;
}
diff --git a/kernel/kcsan/selftest.c b/kernel/kcsan/selftest.c
index d26a052d3383..d98bc208d06d 100644
--- a/kernel/kcsan/selftest.c
+++ b/kernel/kcsan/selftest.c
@@ -1,5 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
+#define pr_fmt(fmt) "kcsan: " fmt
+
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/printk.h>
@@ -116,16 +118,16 @@ static int __init kcsan_selftest(void)
if (do_test()) \
++passed; \
else \
- pr_err("KCSAN selftest: " #do_test " failed"); \
+ pr_err("selftest: " #do_test " failed"); \
} while (0)
RUN_TEST(test_requires);
RUN_TEST(test_encode_decode);
RUN_TEST(test_matching_access);
- pr_info("KCSAN selftest: %d/%d tests passed\n", passed, total);
+ pr_info("selftest: %d/%d tests passed\n", passed, total);
if (passed != total)
- panic("KCSAN selftests failed");
+ panic("selftests failed");
return 0;
}
postcore_initcall(kcsan_selftest);
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 67e6a8c18007..c16c3236f6cf 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -2142,6 +2142,9 @@ static void kill_kprobe(struct kprobe *p)
lockdep_assert_held(&kprobe_mutex);
+ if (WARN_ON_ONCE(kprobe_gone(p)))
+ return;
+
p->flags |= KPROBE_FLAG_GONE;
if (kprobe_aggrprobe(p)) {
/*
@@ -2161,9 +2164,10 @@ static void kill_kprobe(struct kprobe *p)
/*
* The module is going away. We should disarm the kprobe which
- * is using ftrace.
+ * is using ftrace, because the ftrace framework is still available at
+ * the MODULE_STATE_GOING notification.
*/
- if (kprobe_ftrace(p))
+ if (kprobe_ftrace(p) && !kprobe_disabled(p) && !kprobes_all_disarmed)
disarm_kprobe_ftrace(p);
}
@@ -2421,7 +2425,10 @@ static int kprobes_module_callback(struct notifier_block *nb,
mutex_lock(&kprobe_mutex);
for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
head = &kprobe_table[i];
- hlist_for_each_entry(p, head, hlist)
+ hlist_for_each_entry(p, head, hlist) {
+ if (kprobe_gone(p))
+ continue;
+
if (within_module_init((unsigned long)p->addr, mod) ||
(checkcore &&
within_module_core((unsigned long)p->addr, mod))) {
@@ -2438,6 +2445,7 @@ static int kprobes_module_callback(struct notifier_block *nb,
*/
kill_kprobe(p);
}
+ }
}
if (val == MODULE_STATE_GOING)
remove_module_kprobe_blacklist(mod);
@@ -2454,6 +2462,28 @@ static struct notifier_block kprobe_module_nb = {
extern unsigned long __start_kprobe_blacklist[];
extern unsigned long __stop_kprobe_blacklist[];
+void kprobe_free_init_mem(void)
+{
+ void *start = (void *)(&__init_begin);
+ void *end = (void *)(&__init_end);
+ struct hlist_head *head;
+ struct kprobe *p;
+ int i;
+
+ mutex_lock(&kprobe_mutex);
+
+ /* Kill all kprobes on initmem */
+ for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
+ head = &kprobe_table[i];
+ hlist_for_each_entry(p, head, hlist) {
+ if (start <= (void *)p->addr && (void *)p->addr < end)
+ kill_kprobe(p);
+ }
+ }
+
+ mutex_unlock(&kprobe_mutex);
+}
+
static int __init init_kprobes(void)
{
int i, err = 0;
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index 54b74fabf40c..3e99dfef8408 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -76,6 +76,23 @@ module_param(lock_stat, int, 0644);
#define lock_stat 0
#endif
+DEFINE_PER_CPU(unsigned int, lockdep_recursion);
+EXPORT_PER_CPU_SYMBOL_GPL(lockdep_recursion);
+
+static inline bool lockdep_enabled(void)
+{
+ if (!debug_locks)
+ return false;
+
+ if (raw_cpu_read(lockdep_recursion))
+ return false;
+
+ if (current->lockdep_recursion)
+ return false;
+
+ return true;
+}
+
/*
* lockdep_lock: protects the lockdep graph, the hashes and the
* class/list/hash allocators.
@@ -93,7 +110,7 @@ static inline void lockdep_lock(void)
arch_spin_lock(&__lock);
__owner = current;
- current->lockdep_recursion++;
+ __this_cpu_inc(lockdep_recursion);
}
static inline void lockdep_unlock(void)
@@ -101,7 +118,7 @@ static inline void lockdep_unlock(void)
if (debug_locks && DEBUG_LOCKS_WARN_ON(__owner != current))
return;
- current->lockdep_recursion--;
+ __this_cpu_dec(lockdep_recursion);
__owner = NULL;
arch_spin_unlock(&__lock);
}
@@ -372,6 +389,21 @@ static struct hlist_head classhash_table[CLASSHASH_SIZE];
static struct hlist_head chainhash_table[CHAINHASH_SIZE];
/*
+ * the id of held_lock
+ */
+static inline u16 hlock_id(struct held_lock *hlock)
+{
+ BUILD_BUG_ON(MAX_LOCKDEP_KEYS_BITS + 2 > 16);
+
+ return (hlock->class_idx | (hlock->read << MAX_LOCKDEP_KEYS_BITS));
+}
+
+static inline unsigned int chain_hlock_class_idx(u16 hlock_id)
+{
+ return hlock_id & (MAX_LOCKDEP_KEYS - 1);
+}
+
+/*
* The hash key of the lock dependency chains is a hash itself too:
* it's a hash of all locks taken up to that lock, including that lock.
* It's a 64-bit hash, because it's important for the keys to be
@@ -393,10 +425,15 @@ void lockdep_init_task(struct task_struct *task)
task->lockdep_recursion = 0;
}
+static __always_inline void lockdep_recursion_inc(void)
+{
+ __this_cpu_inc(lockdep_recursion);
+}
+
static __always_inline void lockdep_recursion_finish(void)
{
- if (WARN_ON_ONCE((--current->lockdep_recursion) & LOCKDEP_RECURSION_MASK))
- current->lockdep_recursion = 0;
+ if (WARN_ON_ONCE(__this_cpu_dec_return(lockdep_recursion)))
+ __this_cpu_write(lockdep_recursion, 0);
}
void lockdep_set_selftest_task(struct task_struct *task)
@@ -585,6 +622,8 @@ static const char *usage_str[] =
#include "lockdep_states.h"
#undef LOCKDEP_STATE
[LOCK_USED] = "INITIAL USE",
+ [LOCK_USED_READ] = "INITIAL READ USE",
+ /* abused as string storage for verify_lock_unused() */
[LOCK_USAGE_STATES] = "IN-NMI",
};
#endif
@@ -1320,7 +1359,7 @@ static struct lock_list *alloc_list_entry(void)
*/
static int add_lock_to_list(struct lock_class *this,
struct lock_class *links_to, struct list_head *head,
- unsigned long ip, int distance,
+ unsigned long ip, u16 distance, u8 dep,
const struct lock_trace *trace)
{
struct lock_list *entry;
@@ -1334,6 +1373,7 @@ static int add_lock_to_list(struct lock_class *this,
entry->class = this;
entry->links_to = links_to;
+ entry->dep = dep;
entry->distance = distance;
entry->trace = trace;
/*
@@ -1421,23 +1461,19 @@ static inline unsigned int __cq_get_elem_count(struct circular_queue *cq)
return (cq->rear - cq->front) & CQ_MASK;
}
-static inline void mark_lock_accessed(struct lock_list *lock,
- struct lock_list *parent)
+static inline void mark_lock_accessed(struct lock_list *lock)
{
- unsigned long nr;
+ lock->class->dep_gen_id = lockdep_dependency_gen_id;
+}
- nr = lock - list_entries;
- WARN_ON(nr >= ARRAY_SIZE(list_entries)); /* Out-of-bounds, input fail */
+static inline void visit_lock_entry(struct lock_list *lock,
+ struct lock_list *parent)
+{
lock->parent = parent;
- lock->class->dep_gen_id = lockdep_dependency_gen_id;
}
static inline unsigned long lock_accessed(struct lock_list *lock)
{
- unsigned long nr;
-
- nr = lock - list_entries;
- WARN_ON(nr >= ARRAY_SIZE(list_entries)); /* Out-of-bounds, input fail */
return lock->class->dep_gen_id == lockdep_dependency_gen_id;
}
@@ -1471,85 +1507,283 @@ static inline struct list_head *get_dep_list(struct lock_list *lock, int offset)
return lock_class + offset;
}
+/*
+ * Return values of a bfs search:
+ *
+ * BFS_E* indicates an error
+ * BFS_R* indicates a result (match or not)
+ *
+ * BFS_EINVALIDNODE: Find an invalid node in the graph.
+ *
+ * BFS_EQUEUEFULL: The queue is full while doing the bfs.
+ *
+ * BFS_RMATCH: Find the matched node in the graph, and put that node into
+ * *@target_entry.
+ *
+ * BFS_RNOMATCH: Haven't found the matched node and keep *@target_entry
+ * _unchanged_.
+ */
+enum bfs_result {
+ BFS_EINVALIDNODE = -2,
+ BFS_EQUEUEFULL = -1,
+ BFS_RMATCH = 0,
+ BFS_RNOMATCH = 1,
+};
/*
- * Forward- or backward-dependency search, used for both circular dependency
- * checking and hardirq-unsafe/softirq-unsafe checking.
+ * bfs_result < 0 means error
+ */
+static inline bool bfs_error(enum bfs_result res)
+{
+ return res < 0;
+}
+
+/*
+ * DEP_*_BIT in lock_list::dep
+ *
+ * For dependency @prev -> @next:
+ *
+ * SR: @prev is shared reader (->read != 0) and @next is recursive reader
+ * (->read == 2)
+ * ER: @prev is exclusive locker (->read == 0) and @next is recursive reader
+ * SN: @prev is shared reader and @next is non-recursive locker (->read != 2)
+ * EN: @prev is exclusive locker and @next is non-recursive locker
+ *
+ * Note that we define the value of DEP_*_BITs so that:
+ * bit0 is prev->read == 0
+ * bit1 is next->read != 2
*/
-static int __bfs(struct lock_list *source_entry,
- void *data,
- int (*match)(struct lock_list *entry, void *data),
- struct lock_list **target_entry,
- int offset)
+#define DEP_SR_BIT (0 + (0 << 1)) /* 0 */
+#define DEP_ER_BIT (1 + (0 << 1)) /* 1 */
+#define DEP_SN_BIT (0 + (1 << 1)) /* 2 */
+#define DEP_EN_BIT (1 + (1 << 1)) /* 3 */
+
+#define DEP_SR_MASK (1U << (DEP_SR_BIT))
+#define DEP_ER_MASK (1U << (DEP_ER_BIT))
+#define DEP_SN_MASK (1U << (DEP_SN_BIT))
+#define DEP_EN_MASK (1U << (DEP_EN_BIT))
+
+static inline unsigned int
+__calc_dep_bit(struct held_lock *prev, struct held_lock *next)
+{
+ return (prev->read == 0) + ((next->read != 2) << 1);
+}
+
+static inline u8 calc_dep(struct held_lock *prev, struct held_lock *next)
{
+ return 1U << __calc_dep_bit(prev, next);
+}
+
+/*
+ * calculate the dep_bit for backwards edges. We care about whether @prev is
+ * shared and whether @next is recursive.
+ */
+static inline unsigned int
+__calc_dep_bitb(struct held_lock *prev, struct held_lock *next)
+{
+ return (next->read != 2) + ((prev->read == 0) << 1);
+}
+
+static inline u8 calc_depb(struct held_lock *prev, struct held_lock *next)
+{
+ return 1U << __calc_dep_bitb(prev, next);
+}
+
+/*
+ * Initialize a lock_list entry @lock belonging to @class as the root for a BFS
+ * search.
+ */
+static inline void __bfs_init_root(struct lock_list *lock,
+ struct lock_class *class)
+{
+ lock->class = class;
+ lock->parent = NULL;
+ lock->only_xr = 0;
+}
+
+/*
+ * Initialize a lock_list entry @lock based on a lock acquisition @hlock as the
+ * root for a BFS search.
+ *
+ * ->only_xr of the initial lock node is set to @hlock->read == 2, to make sure
+ * that <prev> -> @hlock and @hlock -> <whatever __bfs() found> is not -(*R)->
+ * and -(S*)->.
+ */
+static inline void bfs_init_root(struct lock_list *lock,
+ struct held_lock *hlock)
+{
+ __bfs_init_root(lock, hlock_class(hlock));
+ lock->only_xr = (hlock->read == 2);
+}
+
+/*
+ * Similar to bfs_init_root() but initialize the root for backwards BFS.
+ *
+ * ->only_xr of the initial lock node is set to @hlock->read != 0, to make sure
+ * that <next> -> @hlock and @hlock -> <whatever backwards BFS found> is not
+ * -(*S)-> and -(R*)-> (reverse order of -(*R)-> and -(S*)->).
+ */
+static inline void bfs_init_rootb(struct lock_list *lock,
+ struct held_lock *hlock)
+{
+ __bfs_init_root(lock, hlock_class(hlock));
+ lock->only_xr = (hlock->read != 0);
+}
+
+static inline struct lock_list *__bfs_next(struct lock_list *lock, int offset)
+{
+ if (!lock || !lock->parent)
+ return NULL;
+
+ return list_next_or_null_rcu(get_dep_list(lock->parent, offset),
+ &lock->entry, struct lock_list, entry);
+}
+
+/*
+ * Breadth-First Search to find a strong path in the dependency graph.
+ *
+ * @source_entry: the source of the path we are searching for.
+ * @data: data used for the second parameter of @match function
+ * @match: match function for the search
+ * @target_entry: pointer to the target of a matched path
+ * @offset: the offset to struct lock_class to determine whether it is
+ * locks_after or locks_before
+ *
+ * We may have multiple edges (considering different kinds of dependencies,
+ * e.g. ER and SN) between two nodes in the dependency graph. But
+ * only the strong dependency path in the graph is relevant to deadlocks. A
+ * strong dependency path is a dependency path that doesn't have two adjacent
+ * dependencies as -(*R)-> -(S*)->, please see:
+ *
+ * Documentation/locking/lockdep-design.rst
+ *
+ * for more explanation of the definition of strong dependency paths
+ *
+ * In __bfs(), we only traverse in the strong dependency path:
+ *
+ * In lock_list::only_xr, we record whether the previous dependency only
+ * has -(*R)-> in the search, and if it does (prev only has -(*R)->), we
+ * filter out any -(S*)-> in the current dependency and after that, the
+ * ->only_xr is set according to whether we only have -(*R)-> left.
+ */
+static enum bfs_result __bfs(struct lock_list *source_entry,
+ void *data,
+ bool (*match)(struct lock_list *entry, void *data),
+ struct lock_list **target_entry,
+ int offset)
+{
+ struct circular_queue *cq = &lock_cq;
+ struct lock_list *lock = NULL;
struct lock_list *entry;
- struct lock_list *lock;
struct list_head *head;
- struct circular_queue *cq = &lock_cq;
- int ret = 1;
+ unsigned int cq_depth;
+ bool first;
lockdep_assert_locked();
- if (match(source_entry, data)) {
- *target_entry = source_entry;
- ret = 0;
- goto exit;
- }
-
- head = get_dep_list(source_entry, offset);
- if (list_empty(head))
- goto exit;
-
__cq_init(cq);
__cq_enqueue(cq, source_entry);
- while ((lock = __cq_dequeue(cq))) {
+ while ((lock = __bfs_next(lock, offset)) || (lock = __cq_dequeue(cq))) {
+ if (!lock->class)
+ return BFS_EINVALIDNODE;
+
+ /*
+ * Step 1: check whether we have already finished with this one.
+ *
+ * If we have visited all the dependencies from this @lock to
+ * others (iow, if we have visited all lock_list entries in
+ * @lock->class->locks_{after,before}) we skip, otherwise go
+ * and visit all the dependencies in the list and mark this
+ * list accessed.
+ */
+ if (lock_accessed(lock))
+ continue;
+ else
+ mark_lock_accessed(lock);
+
+ /*
+ * Step 2: check whether prev dependency and this form a strong
+ * dependency path.
+ */
+ if (lock->parent) { /* Parent exists, check prev dependency */
+ u8 dep = lock->dep;
+ bool prev_only_xr = lock->parent->only_xr;
+
+ /*
+ * Mask out all -(S*)-> if we only have *R in previous
+ * step, because -(*R)-> -(S*)-> don't make up a strong
+ * dependency.
+ */
+ if (prev_only_xr)
+ dep &= ~(DEP_SR_MASK | DEP_SN_MASK);
- if (!lock->class) {
- ret = -2;
- goto exit;
+ /* If nothing left, we skip */
+ if (!dep)
+ continue;
+
+ /* If there are only -(*R)-> left, set that for the next step */
+ lock->only_xr = !(dep & (DEP_SN_MASK | DEP_EN_MASK));
}
- head = get_dep_list(lock, offset);
+ /*
+ * Step 3: we haven't visited this and there is a strong
+ * dependency path to this, so check with @match.
+ */
+ if (match(lock, data)) {
+ *target_entry = lock;
+ return BFS_RMATCH;
+ }
+ /*
+ * Step 4: if not match, expand the path by adding the
+ * forward or backwards dependencies in the search
+ *
+ */
+ first = true;
+ head = get_dep_list(lock, offset);
list_for_each_entry_rcu(entry, head, entry) {
- if (!lock_accessed(entry)) {
- unsigned int cq_depth;
- mark_lock_accessed(entry, lock);
- if (match(entry, data)) {
- *target_entry = entry;
- ret = 0;
- goto exit;
- }
+ visit_lock_entry(entry, lock);
- if (__cq_enqueue(cq, entry)) {
- ret = -1;
- goto exit;
- }
- cq_depth = __cq_get_elem_count(cq);
- if (max_bfs_queue_depth < cq_depth)
- max_bfs_queue_depth = cq_depth;
- }
+ /*
+ * Note we only enqueue the first of the list into the
+ * queue, because we can always find a sibling
+ * dependency from one (see __bfs_next()), as a result
+ * the space of queue is saved.
+ */
+ if (!first)
+ continue;
+
+ first = false;
+
+ if (__cq_enqueue(cq, entry))
+ return BFS_EQUEUEFULL;
+
+ cq_depth = __cq_get_elem_count(cq);
+ if (max_bfs_queue_depth < cq_depth)
+ max_bfs_queue_depth = cq_depth;
}
}
-exit:
- return ret;
+
+ return BFS_RNOMATCH;
}
-static inline int __bfs_forwards(struct lock_list *src_entry,
- void *data,
- int (*match)(struct lock_list *entry, void *data),
- struct lock_list **target_entry)
+static inline enum bfs_result
+__bfs_forwards(struct lock_list *src_entry,
+ void *data,
+ bool (*match)(struct lock_list *entry, void *data),
+ struct lock_list **target_entry)
{
return __bfs(src_entry, data, match, target_entry,
offsetof(struct lock_class, locks_after));
}
-static inline int __bfs_backwards(struct lock_list *src_entry,
- void *data,
- int (*match)(struct lock_list *entry, void *data),
- struct lock_list **target_entry)
+static inline enum bfs_result
+__bfs_backwards(struct lock_list *src_entry,
+ void *data,
+ bool (*match)(struct lock_list *entry, void *data),
+ struct lock_list **target_entry)
{
return __bfs(src_entry, data, match, target_entry,
offsetof(struct lock_class, locks_before));
@@ -1659,15 +1893,72 @@ print_circular_bug_header(struct lock_list *entry, unsigned int depth,
print_circular_bug_entry(entry, depth);
}
-static inline int class_equal(struct lock_list *entry, void *data)
+/*
+ * We are about to add A -> B into the dependency graph, and in __bfs() a
+ * strong dependency path A -> .. -> B is found: hlock_class equals
+ * entry->class.
+ *
+ * If A -> .. -> B can replace A -> B in any __bfs() search (means the former
+ * is _stronger_ than or equal to the latter), we consider A -> B as redundant.
+ * For example if A -> .. -> B is -(EN)-> (i.e. A -(E*)-> .. -(*N)-> B), and A
+ * -> B is -(ER)-> or -(EN)->, then we don't need to add A -> B into the
+ * dependency graph, as any strong path ..-> A -> B ->.. we can get with
+ * having dependency A -> B, we could already get an equivalent path ..-> A ->
+ * .. -> B -> .. with A -> .. -> B. Therefore A -> B is redundant.
+ *
+ * We need to make sure both the start and the end of A -> .. -> B are not
+ * weaker than A -> B. For the start part, please see the comment in
+ * check_redundant(). For the end part, we need:
+ *
+ * Either
+ *
+ * a) A -> B is -(*R)-> (everything is not weaker than that)
+ *
+ * or
+ *
+ * b) A -> .. -> B is -(*N)-> (nothing is stronger than this)
+ *
+ */
+static inline bool hlock_equal(struct lock_list *entry, void *data)
+{
+ struct held_lock *hlock = (struct held_lock *)data;
+
+ return hlock_class(hlock) == entry->class && /* Found A -> .. -> B */
+ (hlock->read == 2 || /* A -> B is -(*R)-> */
+ !entry->only_xr); /* A -> .. -> B is -(*N)-> */
+}
+
+/*
+ * We are about to add B -> A into the dependency graph, and in __bfs() a
+ * strong dependency path A -> .. -> B is found: hlock_class equals
+ * entry->class.
+ *
+ * We will have a deadlock case (conflict) if A -> .. -> B -> A is a strong
+ * dependency cycle, that means:
+ *
+ * Either
+ *
+ * a) B -> A is -(E*)->
+ *
+ * or
+ *
+ * b) A -> .. -> B is -(*N)-> (i.e. A -> .. -(*N)-> B)
+ *
+ * as then we don't have -(*R)-> -(S*)-> in the cycle.
+ */
+static inline bool hlock_conflict(struct lock_list *entry, void *data)
{
- return entry->class == data;
+ struct held_lock *hlock = (struct held_lock *)data;
+
+ return hlock_class(hlock) == entry->class && /* Found A -> .. -> B */
+ (hlock->read == 0 || /* B -> A is -(E*)-> */
+ !entry->only_xr); /* A -> .. -> B is -(*N)-> */
}
static noinline void print_circular_bug(struct lock_list *this,
- struct lock_list *target,
- struct held_lock *check_src,
- struct held_lock *check_tgt)
+ struct lock_list *target,
+ struct held_lock *check_src,
+ struct held_lock *check_tgt)
{
struct task_struct *curr = current;
struct lock_list *parent;
@@ -1714,10 +2005,10 @@ static noinline void print_bfs_bug(int ret)
WARN(1, "lockdep bfs error:%d\n", ret);
}
-static int noop_count(struct lock_list *entry, void *data)
+static bool noop_count(struct lock_list *entry, void *data)
{
(*(unsigned long *)data)++;
- return 0;
+ return false;
}
static unsigned long __lockdep_count_forward_deps(struct lock_list *this)
@@ -1734,8 +2025,7 @@ unsigned long lockdep_count_forward_deps(struct lock_class *class)
unsigned long ret, flags;
struct lock_list this;
- this.parent = NULL;
- this.class = class;
+ __bfs_init_root(&this, class);
raw_local_irq_save(flags);
lockdep_lock();
@@ -1761,8 +2051,7 @@ unsigned long lockdep_count_backward_deps(struct lock_class *class)
unsigned long ret, flags;
struct lock_list this;
- this.parent = NULL;
- this.class = class;
+ __bfs_init_root(&this, class);
raw_local_irq_save(flags);
lockdep_lock();
@@ -1775,18 +2064,18 @@ unsigned long lockdep_count_backward_deps(struct lock_class *class)
/*
* Check that the dependency graph starting at <src> can lead to
- * <target> or not. Print an error and return 0 if it does.
+ * <target> or not.
*/
-static noinline int
-check_path(struct lock_class *target, struct lock_list *src_entry,
+static noinline enum bfs_result
+check_path(struct held_lock *target, struct lock_list *src_entry,
+ bool (*match)(struct lock_list *entry, void *data),
struct lock_list **target_entry)
{
- int ret;
+ enum bfs_result ret;
- ret = __bfs_forwards(src_entry, (void *)target, class_equal,
- target_entry);
+ ret = __bfs_forwards(src_entry, target, match, target_entry);
- if (unlikely(ret < 0))
+ if (unlikely(bfs_error(ret)))
print_bfs_bug(ret);
return ret;
@@ -1797,24 +2086,23 @@ check_path(struct lock_class *target, struct lock_list *src_entry,
* lead to <target>. If it can, there is a circle when adding
* <target> -> <src> dependency.
*
- * Print an error and return 0 if it does.
+ * Print an error and return BFS_RMATCH if it does.
*/
-static noinline int
+static noinline enum bfs_result
check_noncircular(struct held_lock *src, struct held_lock *target,
struct lock_trace **const trace)
{
- int ret;
+ enum bfs_result ret;
struct lock_list *target_entry;
- struct lock_list src_entry = {
- .class = hlock_class(src),
- .parent = NULL,
- };
+ struct lock_list src_entry;
+
+ bfs_init_root(&src_entry, src);
debug_atomic_inc(nr_cyclic_checks);
- ret = check_path(hlock_class(target), &src_entry, &target_entry);
+ ret = check_path(target, &src_entry, hlock_conflict, &target_entry);
- if (unlikely(!ret)) {
+ if (unlikely(ret == BFS_RMATCH)) {
if (!*trace) {
/*
* If save_trace fails here, the printing might
@@ -1836,27 +2124,35 @@ check_noncircular(struct held_lock *src, struct held_lock *target,
* <target> or not. If it can, <src> -> <target> dependency is already
* in the graph.
*
- * Print an error and return 2 if it does or 1 if it does not.
+ * Return BFS_RMATCH if it does, or BFS_RNOMATCH if it does not; return BFS_E* if
+ * any error appears in the bfs search.
*/
-static noinline int
+static noinline enum bfs_result
check_redundant(struct held_lock *src, struct held_lock *target)
{
- int ret;
+ enum bfs_result ret;
struct lock_list *target_entry;
- struct lock_list src_entry = {
- .class = hlock_class(src),
- .parent = NULL,
- };
+ struct lock_list src_entry;
+
+ bfs_init_root(&src_entry, src);
+ /*
+ * Special setup for check_redundant().
+ *
+ * To report redundant, we need to find a strong dependency path that
+ * is equal to or stronger than <src> -> <target>. So if <src> is E,
+ * we need to let __bfs() only search for a path starting at a -(E*)->,
+ * we achieve this by setting the initial node's ->only_xr to true in
+ * that case. And if <src> is S, we set initial ->only_xr to false
+ * because both -(S*)-> (equal) and -(E*)-> (stronger) are redundant.
+ */
+ src_entry.only_xr = src->read == 0;
debug_atomic_inc(nr_redundant_checks);
- ret = check_path(hlock_class(target), &src_entry, &target_entry);
+ ret = check_path(target, &src_entry, hlock_equal, &target_entry);
- if (!ret) {
+ if (ret == BFS_RMATCH)
debug_atomic_inc(nr_redundant);
- ret = 2;
- } else if (ret < 0)
- ret = 0;
return ret;
}
@@ -1864,39 +2160,86 @@ check_redundant(struct held_lock *src, struct held_lock *target)
#ifdef CONFIG_TRACE_IRQFLAGS
-static inline int usage_accumulate(struct lock_list *entry, void *mask)
-{
- *(unsigned long *)mask |= entry->class->usage_mask;
-
- return 0;
-}
-
/*
* Forwards and backwards subgraph searching, for the purposes of
* proving that two subgraphs can be connected by a new dependency
* without creating any illegal irq-safe -> irq-unsafe lock dependency.
+ *
+ * An irq safe->unsafe deadlock happens with the following conditions:
+ *
+ * 1) We have a strong dependency path A -> ... -> B
+ *
+ * 2) and we have ENABLED_IRQ usage of B and USED_IN_IRQ usage of A, therefore
+ * irq can create a new dependency B -> A (consider the case that a holder
+ * of B gets interrupted by an irq whose handler will try to acquire A).
+ *
+ * 3) the dependency circle A -> ... -> B -> A we get from 1) and 2) is a
+ * strong circle:
+ *
+ * For the usage bits of B:
+ * a) if A -> B is -(*N)->, then B -> A could be any type, so any
+ * ENABLED_IRQ usage suffices.
+ * b) if A -> B is -(*R)->, then B -> A must be -(E*)->, so only
+ * non-_READ ENABLED_IRQ_* usage suffices.
+ *
+ * For the usage bits of A:
+ * c) if A -> B is -(E*)->, then B -> A could be any type, so any
+ * USED_IN_IRQ usage suffices.
+ * d) if A -> B is -(S*)->, then B -> A must be -(*N)->, so only
+ * non-_READ USED_IN_IRQ_* usage suffices.
*/
-static inline int usage_match(struct lock_list *entry, void *mask)
+/*
+ * There is a strong dependency path in the dependency graph: A -> B, and now
+ * we need to decide which usage bit of A should be accumulated to detect
+ * safe->unsafe bugs.
+ *
+ * Note that usage_accumulate() is used in backwards search, so ->only_xr
+ * stands for whether A -> B only has -(S*)-> (in this case ->only_xr is true).
+ *
+ * As above, if only_xr is false, which means A -> B has -(E*)-> dependency
+ * path, any usage of A should be considered. Otherwise, we should only
+ * consider non-_READ usage (the _READ usage bits are masked out).
+ */
+static inline bool usage_accumulate(struct lock_list *entry, void *mask)
{
- return entry->class->usage_mask & *(unsigned long *)mask;
+ if (!entry->only_xr)
+ *(unsigned long *)mask |= entry->class->usage_mask;
+ else /* Mask out _READ usage bits */
+ *(unsigned long *)mask |= (entry->class->usage_mask & LOCKF_IRQ);
+
+ return false;
+}
+
+/*
+ * There is a strong dependency path in the dependency graph: A -> B, and now
+ * we need to decide which usage bit of B conflicts with the usage bits of A,
+ * i.e. which usage bit of B may introduce safe->unsafe deadlocks.
+ *
+ * As above, if only_xr is false, which means A -> B has -(*N)-> dependency
+ * path, any usage of B should be considered. Otherwise, we should only
+ * consider non-_READ usage (the _READ usage bits are masked out).
+ */
+static inline bool usage_match(struct lock_list *entry, void *mask)
+{
+ if (!entry->only_xr)
+ return !!(entry->class->usage_mask & *(unsigned long *)mask);
+ else /* Mask out _READ usage bits */
+ return !!((entry->class->usage_mask & LOCKF_IRQ) & *(unsigned long *)mask);
}
/*
* Find a node in the forwards-direction dependency sub-graph starting
* at @root->class that matches @bit.
*
- * Return 0 if such a node exists in the subgraph, and put that node
+ * Return BFS_RMATCH if such a node exists in the subgraph, and put that node
* into *@target_entry.
- *
- * Return 1 otherwise and keep *@target_entry unchanged.
- * Return <0 on error.
*/
-static int
+static enum bfs_result
find_usage_forwards(struct lock_list *root, unsigned long usage_mask,
struct lock_list **target_entry)
{
- int result;
+ enum bfs_result result;
debug_atomic_inc(nr_find_usage_forwards_checks);
@@ -1908,18 +2251,12 @@ find_usage_forwards(struct lock_list *root, unsigned long usage_mask,
/*
* Find a node in the backwards-direction dependency sub-graph starting
* at @root->class that matches @bit.
- *
- * Return 0 if such a node exists in the subgraph, and put that node
- * into *@target_entry.
- *
- * Return 1 otherwise and keep *@target_entry unchanged.
- * Return <0 on error.
*/
-static int
+static enum bfs_result
find_usage_backwards(struct lock_list *root, unsigned long usage_mask,
struct lock_list **target_entry)
{
- int result;
+ enum bfs_result result;
debug_atomic_inc(nr_find_usage_backwards_checks);
@@ -1939,7 +2276,7 @@ static void print_lock_class_header(struct lock_class *class, int depth)
#endif
printk(KERN_CONT " {\n");
- for (bit = 0; bit < LOCK_USAGE_STATES; bit++) {
+ for (bit = 0; bit < LOCK_TRACE_STATES; bit++) {
if (class->usage_mask & (1 << bit)) {
int len = depth;
@@ -2179,17 +2516,39 @@ static unsigned long invert_dir_mask(unsigned long mask)
}
/*
- * As above, we clear bitnr0 (LOCK_*_READ off) with bitmask ops. First, for all
- * bits with bitnr0 set (LOCK_*_READ), add those with bitnr0 cleared (LOCK_*).
- * And then mask out all bitnr0.
+ * Note that a LOCK_ENABLED_IRQ_*_READ usage and a LOCK_USED_IN_IRQ_*_READ
+ * usage may cause deadlock too, for example:
+ *
+ * P1 P2
+ * <irq disabled>
+ * write_lock(l1); <irq enabled>
+ * read_lock(l2);
+ * write_lock(l2);
+ * <in irq>
+ * read_lock(l1);
+ *
+ * In the above case, l1 will be marked as LOCK_USED_IN_IRQ_HARDIRQ_READ and l2
+ * will be marked as LOCK_ENABLED_IRQ_HARDIRQ_READ, and this is a possible
+ * deadlock.
+ *
+ * In fact, all of the following cases may cause deadlocks:
+ *
+ * LOCK_USED_IN_IRQ_* -> LOCK_ENABLED_IRQ_*
+ * LOCK_USED_IN_IRQ_*_READ -> LOCK_ENABLED_IRQ_*
+ * LOCK_USED_IN_IRQ_* -> LOCK_ENABLED_IRQ_*_READ
+ * LOCK_USED_IN_IRQ_*_READ -> LOCK_ENABLED_IRQ_*_READ
+ *
+ * As a result, to calculate the "exclusive mask", first we invert the
+ * direction (USED_IN/ENABLED) of the original mask, and 1) for all bits with
+ * bitnr0 set (LOCK_*_READ), add those with bitnr0 cleared (LOCK_*). 2) for all
+ * bits with bitnr0 cleared (LOCK_*), add those with bitnr0 set (LOCK_*_READ).
*/
static unsigned long exclusive_mask(unsigned long mask)
{
unsigned long excl = invert_dir_mask(mask);
- /* Strip read */
excl |= (excl & LOCKF_IRQ_READ) >> LOCK_USAGE_READ_MASK;
- excl &= ~LOCKF_IRQ_READ;
+ excl |= (excl & LOCKF_IRQ) << LOCK_USAGE_READ_MASK;
return excl;
}
@@ -2206,6 +2565,7 @@ static unsigned long original_mask(unsigned long mask)
unsigned long excl = invert_dir_mask(mask);
/* Include read in existing usages */
+ excl |= (excl & LOCKF_IRQ_READ) >> LOCK_USAGE_READ_MASK;
excl |= (excl & LOCKF_IRQ) << LOCK_USAGE_READ_MASK;
return excl;
@@ -2220,14 +2580,24 @@ static int find_exclusive_match(unsigned long mask,
enum lock_usage_bit *bitp,
enum lock_usage_bit *excl_bitp)
{
- int bit, excl;
+ int bit, excl, excl_read;
for_each_set_bit(bit, &mask, LOCK_USED) {
+ /*
+ * exclusive_bit() strips the read bit, however,
+ * LOCK_ENABLED_IRQ_*_READ may cause deadlocks too, so we need
+ * to search excl | LOCK_USAGE_READ_MASK as well.
+ */
excl = exclusive_bit(bit);
+ excl_read = excl | LOCK_USAGE_READ_MASK;
if (excl_mask & lock_flag(excl)) {
*bitp = bit;
*excl_bitp = excl;
return 0;
+ } else if (excl_mask & lock_flag(excl_read)) {
+ *bitp = bit;
+ *excl_bitp = excl_read;
+ return 0;
}
}
return -1;
@@ -2247,17 +2617,16 @@ static int check_irq_usage(struct task_struct *curr, struct held_lock *prev,
struct lock_list *target_entry1;
struct lock_list *target_entry;
struct lock_list this, that;
- int ret;
+ enum bfs_result ret;
/*
* Step 1: gather all hard/soft IRQs usages backward in an
* accumulated usage mask.
*/
- this.parent = NULL;
- this.class = hlock_class(prev);
+ bfs_init_rootb(&this, prev);
ret = __bfs_backwards(&this, &usage_mask, usage_accumulate, NULL);
- if (ret < 0) {
+ if (bfs_error(ret)) {
print_bfs_bug(ret);
return 0;
}
@@ -2272,16 +2641,15 @@ static int check_irq_usage(struct task_struct *curr, struct held_lock *prev,
*/
forward_mask = exclusive_mask(usage_mask);
- that.parent = NULL;
- that.class = hlock_class(next);
+ bfs_init_root(&that, next);
ret = find_usage_forwards(&that, forward_mask, &target_entry1);
- if (ret < 0) {
+ if (bfs_error(ret)) {
print_bfs_bug(ret);
return 0;
}
- if (ret == 1)
- return ret;
+ if (ret == BFS_RNOMATCH)
+ return 1;
/*
* Step 3: we found a bad match! Now retrieve a lock from the backward
@@ -2291,11 +2659,11 @@ static int check_irq_usage(struct task_struct *curr, struct held_lock *prev,
backward_mask = original_mask(target_entry1->class->usage_mask);
ret = find_usage_backwards(&this, backward_mask, &target_entry);
- if (ret < 0) {
+ if (bfs_error(ret)) {
print_bfs_bug(ret);
return 0;
}
- if (DEBUG_LOCKS_WARN_ON(ret == 1))
+ if (DEBUG_LOCKS_WARN_ON(ret == BFS_RNOMATCH))
return 1;
/*
@@ -2459,11 +2827,11 @@ check_deadlock(struct task_struct *curr, struct held_lock *next)
*/
static int
check_prev_add(struct task_struct *curr, struct held_lock *prev,
- struct held_lock *next, int distance,
+ struct held_lock *next, u16 distance,
struct lock_trace **const trace)
{
struct lock_list *entry;
- int ret;
+ enum bfs_result ret;
if (!hlock_class(prev)->key || !hlock_class(next)->key) {
/*
@@ -2494,23 +2862,13 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
* in the graph whose neighbours are to be checked.
*/
ret = check_noncircular(next, prev, trace);
- if (unlikely(ret <= 0))
+ if (unlikely(bfs_error(ret) || ret == BFS_RMATCH))
return 0;
if (!check_irq_usage(curr, prev, next))
return 0;
/*
- * For recursive read-locks we do all the dependency checks,
- * but we dont store read-triggered dependencies (only
- * write-triggered dependencies). This ensures that only the
- * write-side dependencies matter, and that if for example a
- * write-lock never takes any other locks, then the reads are
- * equivalent to a NOP.
- */
- if (next->read == 2 || prev->read == 2)
- return 1;
- /*
* Is the <prev> -> <next> dependency already present?
*
* (this may occur even though this is a new chain: consider
@@ -2522,7 +2880,35 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
if (entry->class == hlock_class(next)) {
if (distance == 1)
entry->distance = 1;
- return 1;
+ entry->dep |= calc_dep(prev, next);
+
+ /*
+ * Also, update the reverse dependency in @next's
+ * ->locks_before list.
+ *
+ * Here we reuse @entry as the cursor, which is fine
+ * because we won't go to the next iteration of the
+ * outer loop:
+ *
+ * For normal cases, we return in the inner loop.
+ *
+ * If we fail to return, we have inconsistency, i.e.
+ * <prev>::locks_after contains <next> while
+ * <next>::locks_before doesn't contain <prev>. In
+ * that case, we return after the inner and indicate
+ * something is wrong.
+ */
+ list_for_each_entry(entry, &hlock_class(next)->locks_before, entry) {
+ if (entry->class == hlock_class(prev)) {
+ if (distance == 1)
+ entry->distance = 1;
+ entry->dep |= calc_depb(prev, next);
+ return 1;
+ }
+ }
+
+ /* <prev> is not found in <next>::locks_before */
+ return 0;
}
}
@@ -2531,8 +2917,10 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
* Is the <prev> -> <next> link redundant?
*/
ret = check_redundant(prev, next);
- if (ret != 1)
- return ret;
+ if (bfs_error(ret))
+ return 0;
+ else if (ret == BFS_RMATCH)
+ return 2;
#endif
if (!*trace) {
@@ -2547,14 +2935,18 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
*/
ret = add_lock_to_list(hlock_class(next), hlock_class(prev),
&hlock_class(prev)->locks_after,
- next->acquire_ip, distance, *trace);
+ next->acquire_ip, distance,
+ calc_dep(prev, next),
+ *trace);
if (!ret)
return 0;
ret = add_lock_to_list(hlock_class(prev), hlock_class(next),
&hlock_class(next)->locks_before,
- next->acquire_ip, distance, *trace);
+ next->acquire_ip, distance,
+ calc_depb(prev, next),
+ *trace);
if (!ret)
return 0;
@@ -2590,16 +2982,11 @@ check_prevs_add(struct task_struct *curr, struct held_lock *next)
goto out_bug;
for (;;) {
- int distance = curr->lockdep_depth - depth + 1;
+ u16 distance = curr->lockdep_depth - depth + 1;
hlock = curr->held_locks + depth - 1;
- /*
- * Only non-recursive-read entries get new dependencies
- * added:
- */
- if (hlock->read != 2 && hlock->check) {
- int ret = check_prev_add(curr, hlock, next, distance,
- &trace);
+ if (hlock->check) {
+ int ret = check_prev_add(curr, hlock, next, distance, &trace);
if (!ret)
return 0;
@@ -2875,7 +3262,10 @@ static inline void free_chain_hlocks(int base, int size)
struct lock_class *lock_chain_get_class(struct lock_chain *chain, int i)
{
- return lock_classes + chain_hlocks[chain->base + i];
+ u16 chain_hlock = chain_hlocks[chain->base + i];
+ unsigned int class_idx = chain_hlock_class_idx(chain_hlock);
+
+ return lock_classes + class_idx - 1;
}
/*
@@ -2901,12 +3291,12 @@ static inline int get_first_held_lock(struct task_struct *curr,
/*
* Returns the next chain_key iteration
*/
-static u64 print_chain_key_iteration(int class_idx, u64 chain_key)
+static u64 print_chain_key_iteration(u16 hlock_id, u64 chain_key)
{
- u64 new_chain_key = iterate_chain_key(chain_key, class_idx);
+ u64 new_chain_key = iterate_chain_key(chain_key, hlock_id);
- printk(" class_idx:%d -> chain_key:%016Lx",
- class_idx,
+ printk(" hlock_id:%d -> chain_key:%016Lx",
+ (unsigned int)hlock_id,
(unsigned long long)new_chain_key);
return new_chain_key;
}
@@ -2923,12 +3313,12 @@ print_chain_keys_held_locks(struct task_struct *curr, struct held_lock *hlock_ne
hlock_next->irq_context);
for (; i < depth; i++) {
hlock = curr->held_locks + i;
- chain_key = print_chain_key_iteration(hlock->class_idx, chain_key);
+ chain_key = print_chain_key_iteration(hlock_id(hlock), chain_key);
print_lock(hlock);
}
- print_chain_key_iteration(hlock_next->class_idx, chain_key);
+ print_chain_key_iteration(hlock_id(hlock_next), chain_key);
print_lock(hlock_next);
}
@@ -2936,14 +3326,14 @@ static void print_chain_keys_chain(struct lock_chain *chain)
{
int i;
u64 chain_key = INITIAL_CHAIN_KEY;
- int class_id;
+ u16 hlock_id;
printk("depth: %u\n", chain->depth);
for (i = 0; i < chain->depth; i++) {
- class_id = chain_hlocks[chain->base + i];
- chain_key = print_chain_key_iteration(class_id, chain_key);
+ hlock_id = chain_hlocks[chain->base + i];
+ chain_key = print_chain_key_iteration(hlock_id, chain_key);
- print_lock_name(lock_classes + class_id);
+ print_lock_name(lock_classes + chain_hlock_class_idx(hlock_id) - 1);
printk("\n");
}
}
@@ -2992,7 +3382,7 @@ static int check_no_collision(struct task_struct *curr,
}
for (j = 0; j < chain->depth - 1; j++, i++) {
- id = curr->held_locks[i].class_idx;
+ id = hlock_id(&curr->held_locks[i]);
if (DEBUG_LOCKS_WARN_ON(chain_hlocks[chain->base + j] != id)) {
print_collision(curr, hlock, chain);
@@ -3041,7 +3431,6 @@ static inline int add_chain_cache(struct task_struct *curr,
struct held_lock *hlock,
u64 chain_key)
{
- struct lock_class *class = hlock_class(hlock);
struct hlist_head *hash_head = chainhashentry(chain_key);
struct lock_chain *chain;
int i, j;
@@ -3084,11 +3473,11 @@ static inline int add_chain_cache(struct task_struct *curr,
chain->base = j;
for (j = 0; j < chain->depth - 1; j++, i++) {
- int lock_id = curr->held_locks[i].class_idx;
+ int lock_id = hlock_id(curr->held_locks + i);
chain_hlocks[chain->base + j] = lock_id;
}
- chain_hlocks[chain->base + j] = class - lock_classes;
+ chain_hlocks[chain->base + j] = hlock_id(hlock);
hlist_add_head_rcu(&chain->entry, hash_head);
debug_atomic_inc(chain_lookup_misses);
inc_chains(chain->irq_context);
@@ -3275,7 +3664,7 @@ static void check_chain_key(struct task_struct *curr)
if (prev_hlock && (prev_hlock->irq_context !=
hlock->irq_context))
chain_key = INITIAL_CHAIN_KEY;
- chain_key = iterate_chain_key(chain_key, hlock->class_idx);
+ chain_key = iterate_chain_key(chain_key, hlock_id(hlock));
prev_hlock = hlock;
}
if (chain_key != curr->curr_chain_key) {
@@ -3434,24 +3823,32 @@ print_irq_inversion_bug(struct task_struct *curr,
*/
static int
check_usage_forwards(struct task_struct *curr, struct held_lock *this,
- enum lock_usage_bit bit, const char *irqclass)
+ enum lock_usage_bit bit)
{
- int ret;
+ enum bfs_result ret;
struct lock_list root;
struct lock_list *target_entry;
+ enum lock_usage_bit read_bit = bit + LOCK_USAGE_READ_MASK;
+ unsigned usage_mask = lock_flag(bit) | lock_flag(read_bit);
- root.parent = NULL;
- root.class = hlock_class(this);
- ret = find_usage_forwards(&root, lock_flag(bit), &target_entry);
- if (ret < 0) {
+ bfs_init_root(&root, this);
+ ret = find_usage_forwards(&root, usage_mask, &target_entry);
+ if (bfs_error(ret)) {
print_bfs_bug(ret);
return 0;
}
- if (ret == 1)
- return ret;
+ if (ret == BFS_RNOMATCH)
+ return 1;
+
+ /* Check whether write or read usage is the match */
+ if (target_entry->class->usage_mask & lock_flag(bit)) {
+ print_irq_inversion_bug(curr, &root, target_entry,
+ this, 1, state_name(bit));
+ } else {
+ print_irq_inversion_bug(curr, &root, target_entry,
+ this, 1, state_name(read_bit));
+ }
- print_irq_inversion_bug(curr, &root, target_entry,
- this, 1, irqclass);
return 0;
}
@@ -3461,24 +3858,32 @@ check_usage_forwards(struct task_struct *curr, struct held_lock *this,
*/
static int
check_usage_backwards(struct task_struct *curr, struct held_lock *this,
- enum lock_usage_bit bit, const char *irqclass)
+ enum lock_usage_bit bit)
{
- int ret;
+ enum bfs_result ret;
struct lock_list root;
struct lock_list *target_entry;
+ enum lock_usage_bit read_bit = bit + LOCK_USAGE_READ_MASK;
+ unsigned usage_mask = lock_flag(bit) | lock_flag(read_bit);
- root.parent = NULL;
- root.class = hlock_class(this);
- ret = find_usage_backwards(&root, lock_flag(bit), &target_entry);
- if (ret < 0) {
+ bfs_init_rootb(&root, this);
+ ret = find_usage_backwards(&root, usage_mask, &target_entry);
+ if (bfs_error(ret)) {
print_bfs_bug(ret);
return 0;
}
- if (ret == 1)
- return ret;
+ if (ret == BFS_RNOMATCH)
+ return 1;
+
+ /* Check whether write or read usage is the match */
+ if (target_entry->class->usage_mask & lock_flag(bit)) {
+ print_irq_inversion_bug(curr, &root, target_entry,
+ this, 0, state_name(bit));
+ } else {
+ print_irq_inversion_bug(curr, &root, target_entry,
+ this, 0, state_name(read_bit));
+ }
- print_irq_inversion_bug(curr, &root, target_entry,
- this, 0, irqclass);
return 0;
}
@@ -3517,8 +3922,6 @@ static int SOFTIRQ_verbose(struct lock_class *class)
return 0;
}
-#define STRICT_READ_CHECKS 1
-
static int (*state_verbose_f[])(struct lock_class *class) = {
#define LOCKDEP_STATE(__STATE) \
__STATE##_verbose,
@@ -3544,16 +3947,6 @@ mark_lock_irq(struct task_struct *curr, struct held_lock *this,
int dir = new_bit & LOCK_USAGE_DIR_MASK;
/*
- * mark USED_IN has to look forwards -- to ensure no dependency
- * has ENABLED state, which would allow recursion deadlocks.
- *
- * mark ENABLED has to look backwards -- to ensure no dependee
- * has USED_IN state, which, again, would allow recursion deadlocks.
- */
- check_usage_f usage = dir ?
- check_usage_backwards : check_usage_forwards;
-
- /*
* Validate that this particular lock does not have conflicting
* usage states.
*/
@@ -3561,23 +3954,30 @@ mark_lock_irq(struct task_struct *curr, struct held_lock *this,
return 0;
/*
- * Validate that the lock dependencies don't have conflicting usage
- * states.
+ * Check for read in write conflicts
*/
- if ((!read || STRICT_READ_CHECKS) &&
- !usage(curr, this, excl_bit, state_name(new_bit & ~LOCK_USAGE_READ_MASK)))
+ if (!read && !valid_state(curr, this, new_bit,
+ excl_bit + LOCK_USAGE_READ_MASK))
return 0;
+
/*
- * Check for read in write conflicts
+ * Validate that the lock dependencies don't have conflicting usage
+ * states.
*/
- if (!read) {
- if (!valid_state(curr, this, new_bit, excl_bit + LOCK_USAGE_READ_MASK))
+ if (dir) {
+ /*
+ * mark ENABLED has to look backwards -- to ensure no dependee
+ * has USED_IN state, which, again, would allow recursion deadlocks.
+ */
+ if (!check_usage_backwards(curr, this, excl_bit))
return 0;
-
- if (STRICT_READ_CHECKS &&
- !usage(curr, this, excl_bit + LOCK_USAGE_READ_MASK,
- state_name(new_bit + LOCK_USAGE_READ_MASK)))
+ } else {
+ /*
+ * mark USED_IN has to look forwards -- to ensure no dependency
+ * has ENABLED state, which would allow recursion deadlocks.
+ */
+ if (!check_usage_forwards(curr, this, excl_bit))
return 0;
}
@@ -3657,7 +4057,7 @@ void lockdep_hardirqs_on_prepare(unsigned long ip)
if (unlikely(in_nmi()))
return;
- if (unlikely(current->lockdep_recursion & LOCKDEP_RECURSION_MASK))
+ if (unlikely(__this_cpu_read(lockdep_recursion)))
return;
if (unlikely(lockdep_hardirqs_enabled())) {
@@ -3693,7 +4093,7 @@ void lockdep_hardirqs_on_prepare(unsigned long ip)
current->hardirq_chain_key = current->curr_chain_key;
- current->lockdep_recursion++;
+ lockdep_recursion_inc();
__trace_hardirqs_on_caller();
lockdep_recursion_finish();
}
@@ -3726,7 +4126,7 @@ void noinstr lockdep_hardirqs_on(unsigned long ip)
goto skip_checks;
}
- if (unlikely(current->lockdep_recursion & LOCKDEP_RECURSION_MASK))
+ if (unlikely(__this_cpu_read(lockdep_recursion)))
return;
if (lockdep_hardirqs_enabled()) {
@@ -3779,7 +4179,7 @@ void noinstr lockdep_hardirqs_off(unsigned long ip)
if (in_nmi()) {
if (!IS_ENABLED(CONFIG_TRACE_IRQFLAGS_NMI))
return;
- } else if (current->lockdep_recursion & LOCKDEP_RECURSION_MASK)
+ } else if (__this_cpu_read(lockdep_recursion))
return;
/*
@@ -3812,7 +4212,7 @@ void lockdep_softirqs_on(unsigned long ip)
{
struct irqtrace_events *trace = &current->irqtrace;
- if (unlikely(!debug_locks || current->lockdep_recursion))
+ if (unlikely(!lockdep_enabled()))
return;
/*
@@ -3827,7 +4227,7 @@ void lockdep_softirqs_on(unsigned long ip)
return;
}
- current->lockdep_recursion++;
+ lockdep_recursion_inc();
/*
* We'll do an OFF -> ON transition:
*/
@@ -3850,7 +4250,7 @@ void lockdep_softirqs_on(unsigned long ip)
*/
void lockdep_softirqs_off(unsigned long ip)
{
- if (unlikely(!debug_locks || current->lockdep_recursion))
+ if (unlikely(!lockdep_enabled()))
return;
/*
@@ -3969,13 +4369,18 @@ static int separate_irq_context(struct task_struct *curr,
static int mark_lock(struct task_struct *curr, struct held_lock *this,
enum lock_usage_bit new_bit)
{
- unsigned int new_mask = 1 << new_bit, ret = 1;
+ unsigned int new_mask, ret = 1;
if (new_bit >= LOCK_USAGE_STATES) {
DEBUG_LOCKS_WARN_ON(1);
return 0;
}
+ if (new_bit == LOCK_USED && this->read)
+ new_bit = LOCK_USED_READ;
+
+ new_mask = 1 << new_bit;
+
/*
* If already set then do not dirty the cacheline,
* nor do any checks:
@@ -3988,26 +4393,32 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this,
/*
* Make sure we didn't race:
*/
- if (unlikely(hlock_class(this)->usage_mask & new_mask)) {
- graph_unlock();
- return 1;
- }
+ if (unlikely(hlock_class(this)->usage_mask & new_mask))
+ goto unlock;
hlock_class(this)->usage_mask |= new_mask;
- if (!(hlock_class(this)->usage_traces[new_bit] = save_trace()))
- return 0;
+ if (new_bit < LOCK_TRACE_STATES) {
+ if (!(hlock_class(this)->usage_traces[new_bit] = save_trace()))
+ return 0;
+ }
switch (new_bit) {
+ case 0 ... LOCK_USED-1:
+ ret = mark_lock_irq(curr, this, new_bit);
+ if (!ret)
+ return 0;
+ break;
+
case LOCK_USED:
debug_atomic_dec(nr_unused_locks);
break;
+
default:
- ret = mark_lock_irq(curr, this, new_bit);
- if (!ret)
- return 0;
+ break;
}
+unlock:
graph_unlock();
/*
@@ -4220,11 +4631,11 @@ void lockdep_init_map_waits(struct lockdep_map *lock, const char *name,
if (subclass) {
unsigned long flags;
- if (DEBUG_LOCKS_WARN_ON(current->lockdep_recursion))
+ if (DEBUG_LOCKS_WARN_ON(!lockdep_enabled()))
return;
raw_local_irq_save(flags);
- current->lockdep_recursion++;
+ lockdep_recursion_inc();
register_lock_class(lock, subclass, 1);
lockdep_recursion_finish();
raw_local_irq_restore(flags);
@@ -4411,7 +4822,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
chain_key = INITIAL_CHAIN_KEY;
chain_head = 1;
}
- chain_key = iterate_chain_key(chain_key, class_idx);
+ chain_key = iterate_chain_key(chain_key, hlock_id(hlock));
if (nest_lock && !__lock_is_held(nest_lock, -1)) {
print_lock_nested_lock_not_held(curr, hlock, ip);
@@ -4907,11 +5318,11 @@ void lock_set_class(struct lockdep_map *lock, const char *name,
{
unsigned long flags;
- if (unlikely(current->lockdep_recursion))
+ if (unlikely(!lockdep_enabled()))
return;
raw_local_irq_save(flags);
- current->lockdep_recursion++;
+ lockdep_recursion_inc();
check_flags(flags);
if (__lock_set_class(lock, name, key, subclass, ip))
check_chain_key(current);
@@ -4924,11 +5335,11 @@ void lock_downgrade(struct lockdep_map *lock, unsigned long ip)
{
unsigned long flags;
- if (unlikely(current->lockdep_recursion))
+ if (unlikely(!lockdep_enabled()))
return;
raw_local_irq_save(flags);
- current->lockdep_recursion++;
+ lockdep_recursion_inc();
check_flags(flags);
if (__lock_downgrade(lock, ip))
check_chain_key(current);
@@ -4942,12 +5353,20 @@ static void verify_lock_unused(struct lockdep_map *lock, struct held_lock *hlock
{
#ifdef CONFIG_PROVE_LOCKING
struct lock_class *class = look_up_lock_class(lock, subclass);
+ unsigned long mask = LOCKF_USED;
/* if it doesn't have a class (yet), it certainly hasn't been used yet */
if (!class)
return;
- if (!(class->usage_mask & LOCK_USED))
+ /*
+ * READ locks only conflict with USED, such that if we only ever use
+ * READ locks, there is no deadlock possible -- RCU.
+ */
+ if (!hlock->read)
+ mask |= LOCKF_USED_READ;
+
+ if (!(class->usage_mask & mask))
return;
hlock->class_idx = class - lock_classes;
@@ -4958,7 +5377,7 @@ static void verify_lock_unused(struct lockdep_map *lock, struct held_lock *hlock
static bool lockdep_nmi(void)
{
- if (current->lockdep_recursion & LOCKDEP_RECURSION_MASK)
+ if (raw_cpu_read(lockdep_recursion))
return false;
if (!in_nmi())
@@ -4968,6 +5387,20 @@ static bool lockdep_nmi(void)
}
/*
+ * A read_lock() acquisition counts as recursive when any of these holds:
+ * 1. The selftests force lockdep to treat it that way, or
+ * 2. rwlocks are not the queued implementation (CONFIG_QUEUED_RWLOCKS), or
+ * 3. The locker is running in in_interrupt() context.
+ */
+bool read_lock_is_recursive(void)
+{
+	if (force_read_lock_recursive)
+		return true;
+
+	if (!IS_ENABLED(CONFIG_QUEUED_RWLOCKS))
+		return true;
+
+	return in_interrupt();
+}
+EXPORT_SYMBOL_GPL(read_lock_is_recursive);
+
+/*
* We are not always called with irqs disabled - do that here,
* and also avoid lockdep recursion:
*/
@@ -4979,7 +5412,10 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass,
trace_lock_acquire(lock, subclass, trylock, read, check, nest_lock, ip);
- if (unlikely(current->lockdep_recursion)) {
+ if (!debug_locks)
+ return;
+
+ if (unlikely(!lockdep_enabled())) {
/* XXX allow trylock from NMI ?!? */
if (lockdep_nmi() && !trylock) {
struct held_lock hlock;
@@ -5002,7 +5438,7 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass,
raw_local_irq_save(flags);
check_flags(flags);
- current->lockdep_recursion++;
+ lockdep_recursion_inc();
__lock_acquire(lock, subclass, trylock, read, check,
irqs_disabled_flags(flags), nest_lock, ip, 0, 0);
lockdep_recursion_finish();
@@ -5016,13 +5452,13 @@ void lock_release(struct lockdep_map *lock, unsigned long ip)
trace_lock_release(lock, ip);
- if (unlikely(current->lockdep_recursion))
+ if (unlikely(!lockdep_enabled()))
return;
raw_local_irq_save(flags);
check_flags(flags);
- current->lockdep_recursion++;
+ lockdep_recursion_inc();
if (__lock_release(lock, ip))
check_chain_key(current);
lockdep_recursion_finish();
@@ -5035,13 +5471,13 @@ noinstr int lock_is_held_type(const struct lockdep_map *lock, int read)
unsigned long flags;
int ret = 0;
- if (unlikely(current->lockdep_recursion))
+ if (unlikely(!lockdep_enabled()))
return 1; /* avoid false negative lockdep_assert_held() */
raw_local_irq_save(flags);
check_flags(flags);
- current->lockdep_recursion++;
+ lockdep_recursion_inc();
ret = __lock_is_held(lock, read);
lockdep_recursion_finish();
raw_local_irq_restore(flags);
@@ -5056,13 +5492,13 @@ struct pin_cookie lock_pin_lock(struct lockdep_map *lock)
struct pin_cookie cookie = NIL_COOKIE;
unsigned long flags;
- if (unlikely(current->lockdep_recursion))
+ if (unlikely(!lockdep_enabled()))
return cookie;
raw_local_irq_save(flags);
check_flags(flags);
- current->lockdep_recursion++;
+ lockdep_recursion_inc();
cookie = __lock_pin_lock(lock);
lockdep_recursion_finish();
raw_local_irq_restore(flags);
@@ -5075,13 +5511,13 @@ void lock_repin_lock(struct lockdep_map *lock, struct pin_cookie cookie)
{
unsigned long flags;
- if (unlikely(current->lockdep_recursion))
+ if (unlikely(!lockdep_enabled()))
return;
raw_local_irq_save(flags);
check_flags(flags);
- current->lockdep_recursion++;
+ lockdep_recursion_inc();
__lock_repin_lock(lock, cookie);
lockdep_recursion_finish();
raw_local_irq_restore(flags);
@@ -5092,13 +5528,13 @@ void lock_unpin_lock(struct lockdep_map *lock, struct pin_cookie cookie)
{
unsigned long flags;
- if (unlikely(current->lockdep_recursion))
+ if (unlikely(!lockdep_enabled()))
return;
raw_local_irq_save(flags);
check_flags(flags);
- current->lockdep_recursion++;
+ lockdep_recursion_inc();
__lock_unpin_lock(lock, cookie);
lockdep_recursion_finish();
raw_local_irq_restore(flags);
@@ -5228,15 +5664,12 @@ void lock_contended(struct lockdep_map *lock, unsigned long ip)
trace_lock_acquired(lock, ip);
- if (unlikely(!lock_stat || !debug_locks))
- return;
-
- if (unlikely(current->lockdep_recursion))
+ if (unlikely(!lock_stat || !lockdep_enabled()))
return;
raw_local_irq_save(flags);
check_flags(flags);
- current->lockdep_recursion++;
+ lockdep_recursion_inc();
__lock_contended(lock, ip);
lockdep_recursion_finish();
raw_local_irq_restore(flags);
@@ -5249,15 +5682,12 @@ void lock_acquired(struct lockdep_map *lock, unsigned long ip)
trace_lock_contended(lock, ip);
- if (unlikely(!lock_stat || !debug_locks))
- return;
-
- if (unlikely(current->lockdep_recursion))
+ if (unlikely(!lock_stat || !lockdep_enabled()))
return;
raw_local_irq_save(flags);
check_flags(flags);
- current->lockdep_recursion++;
+ lockdep_recursion_inc();
__lock_acquired(lock, ip);
lockdep_recursion_finish();
raw_local_irq_restore(flags);
@@ -5296,7 +5726,7 @@ static void remove_class_from_lock_chain(struct pending_free *pf,
int i;
for (i = chain->base; i < chain->base + chain->depth; i++) {
- if (chain_hlocks[i] != class - lock_classes)
+ if (chain_hlock_class_idx(chain_hlocks[i]) != class - lock_classes)
continue;
/*
* Each lock class occurs at most once in a lock chain so once
diff --git a/kernel/locking/lockdep_internals.h b/kernel/locking/lockdep_internals.h
index baca699b94e9..de49f9e1c11b 100644
--- a/kernel/locking/lockdep_internals.h
+++ b/kernel/locking/lockdep_internals.h
@@ -19,9 +19,13 @@ enum lock_usage_bit {
#include "lockdep_states.h"
#undef LOCKDEP_STATE
LOCK_USED,
- LOCK_USAGE_STATES
+ LOCK_USED_READ,
+ LOCK_USAGE_STATES,
};
+/*
+ * mark_lock() only saves a usage trace for bits below LOCK_TRACE_STATES, so
+ * that bound has to sit exactly at LOCK_USED_READ: LOCK_USED_READ and any
+ * state after it is not traced and printed.
+ */
+static_assert(LOCK_TRACE_STATES == LOCK_USED_READ);
+
#define LOCK_USAGE_READ_MASK 1
#define LOCK_USAGE_DIR_MASK 2
#define LOCK_USAGE_STATE_MASK (~(LOCK_USAGE_READ_MASK | LOCK_USAGE_DIR_MASK))
@@ -40,6 +44,7 @@ enum {
#include "lockdep_states.h"
#undef LOCKDEP_STATE
__LOCKF(USED)
+ __LOCKF(USED_READ)
};
#define LOCKDEP_STATE(__STATE) LOCKF_ENABLED_##__STATE |
@@ -119,7 +124,7 @@ static const unsigned long LOCKF_USED_IN_IRQ_READ =
extern struct list_head all_lock_classes;
extern struct lock_chain lock_chains[];
-#define LOCK_USAGE_CHARS (1+LOCK_USAGE_STATES/2)
+#define LOCK_USAGE_CHARS (2*XXX_LOCK_USAGE_STATES + 1)
extern void get_usage_chars(struct lock_class *class,
char usage[LOCK_USAGE_CHARS]);
diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c
index 8bbafe3e5203..70a32a576f3f 100644
--- a/kernel/locking/percpu-rwsem.c
+++ b/kernel/locking/percpu-rwsem.c
@@ -45,7 +45,7 @@ EXPORT_SYMBOL_GPL(percpu_free_rwsem);
static bool __percpu_down_read_trylock(struct percpu_rw_semaphore *sem)
{
- __this_cpu_inc(*sem->read_count);
+ this_cpu_inc(*sem->read_count);
/*
* Due to having preemption disabled the decrement happens on
@@ -71,7 +71,7 @@ static bool __percpu_down_read_trylock(struct percpu_rw_semaphore *sem)
if (likely(!atomic_read_acquire(&sem->block)))
return true;
- __this_cpu_dec(*sem->read_count);
+ this_cpu_dec(*sem->read_count);
/* Prod writer to re-evaluate readers_active_check() */
rcuwait_wake_up(&sem->writer);
diff --git a/kernel/padata.c b/kernel/padata.c
index 16cb894dc272..d4d3ba6e1728 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -215,12 +215,13 @@ int padata_do_parallel(struct padata_shell *ps,
padata->pd = pd;
padata->cb_cpu = *cb_cpu;
- rcu_read_unlock_bh();
-
spin_lock(&padata_works_lock);
padata->seq_nr = ++pd->seq_nr;
pw = padata_work_alloc();
spin_unlock(&padata_works_lock);
+
+ rcu_read_unlock_bh();
+
if (pw) {
padata_work_init(pw, padata_parallel_worker, padata, 0);
queue_work(pinst->parallel_wq, &pw->pw_work);
diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index cf66a3ccd757..e01cba5e4b52 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -167,7 +167,7 @@ static inline unsigned long rcu_seq_diff(unsigned long new, unsigned long old)
# define STATE_RCU_HEAD_READY 0
# define STATE_RCU_HEAD_QUEUED 1
-extern struct debug_obj_descr rcuhead_debug_descr;
+extern const struct debug_obj_descr rcuhead_debug_descr;
static inline int debug_rcu_head_queue(struct rcu_head *head)
{
diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h
index 835e2df8590a..05d3e1375e4c 100644
--- a/kernel/rcu/tasks.h
+++ b/kernel/rcu/tasks.h
@@ -590,7 +590,7 @@ void exit_tasks_rcu_finish(void) __releases(&tasks_rcu_exit_srcu)
}
#else /* #ifdef CONFIG_TASKS_RCU */
-static void show_rcu_tasks_classic_gp_kthread(void) { }
+static inline void show_rcu_tasks_classic_gp_kthread(void) { }
void exit_tasks_rcu_start(void) { }
void exit_tasks_rcu_finish(void) { exit_tasks_rcu_finish_trace(current); }
#endif /* #else #ifdef CONFIG_TASKS_RCU */
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 8ce77d9ac716..f78ee759af9c 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -673,6 +673,7 @@ void rcu_idle_enter(void)
lockdep_assert_irqs_disabled();
rcu_eqs_enter(false);
}
+EXPORT_SYMBOL_GPL(rcu_idle_enter);
#ifdef CONFIG_NO_HZ_FULL
/**
@@ -886,6 +887,7 @@ void rcu_idle_exit(void)
rcu_eqs_exit(false);
local_irq_restore(flags);
}
+EXPORT_SYMBOL_GPL(rcu_idle_exit);
#ifdef CONFIG_NO_HZ_FULL
/**
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index 2de49b5d8dd2..3e0f4bcb558f 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -469,7 +469,7 @@ void destroy_rcu_head_on_stack(struct rcu_head *head)
}
EXPORT_SYMBOL_GPL(destroy_rcu_head_on_stack);
-struct debug_obj_descr rcuhead_debug_descr = {
+const struct debug_obj_descr rcuhead_debug_descr = {
.name = "rcu_head",
.is_static_object = rcuhead_is_static_object,
};
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 2d95dc3f4644..8160ab5263f8 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -36,6 +36,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_rt_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_dl_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_irq_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_se_tp);
+EXPORT_TRACEPOINT_SYMBOL_GPL(sched_cpu_capacity_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(sched_overutilized_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_cfs_tp);
EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_se_tp);
@@ -940,11 +941,6 @@ static inline unsigned int uclamp_bucket_id(unsigned int clamp_value)
return clamp_value / UCLAMP_BUCKET_DELTA;
}
-static inline unsigned int uclamp_bucket_base_value(unsigned int clamp_value)
-{
- return UCLAMP_BUCKET_DELTA * uclamp_bucket_id(clamp_value);
-}
-
static inline unsigned int uclamp_none(enum uclamp_id clamp_id)
{
if (clamp_id == UCLAMP_MIN)
@@ -4551,9 +4547,12 @@ void __noreturn do_task_dead(void)
static inline void sched_submit_work(struct task_struct *tsk)
{
+ unsigned int task_flags;
+
if (!tsk->state)
return;
+ task_flags = tsk->flags;
/*
* If a worker went to sleep, notify and ask workqueue whether
* it wants to wake up a task to maintain concurrency.
@@ -4562,9 +4561,9 @@ static inline void sched_submit_work(struct task_struct *tsk)
* in the possible wakeup of a kworker and because wq_worker_sleeping()
* requires it.
*/
- if (tsk->flags & (PF_WQ_WORKER | PF_IO_WORKER)) {
+ if (task_flags & (PF_WQ_WORKER | PF_IO_WORKER)) {
preempt_disable();
- if (tsk->flags & PF_WQ_WORKER)
+ if (task_flags & PF_WQ_WORKER)
wq_worker_sleeping(tsk);
else
io_wq_worker_sleeping(tsk);
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 3862a28cd05d..6d93f4518734 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -1525,14 +1525,38 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
*/
if (pi_task && dl_prio(pi_task->normal_prio) && p->dl.dl_boosted) {
pi_se = &pi_task->dl;
+ /*
+ * Because of delays in the detection of the overrun of a
+ * thread's runtime, it might be the case that a thread
+ * goes to sleep in a rt mutex with negative runtime. As
+ * a consequence, the thread will be throttled.
+ *
+ * While waiting for the mutex, this thread can also be
+ * boosted via PI, resulting in a thread that is throttled
+ * and boosted at the same time.
+ *
+ * In this case, the boost overrides the throttle.
+ */
+ if (p->dl.dl_throttled) {
+ /*
+ * The replenish timer needs to be canceled. No
+ * problem if it fires concurrently: boosted threads
+ * are ignored in dl_task_timer().
+ */
+ hrtimer_try_to_cancel(&p->dl.dl_timer);
+ p->dl.dl_throttled = 0;
+ }
} else if (!dl_prio(p->normal_prio)) {
/*
- * Special case in which we have a !SCHED_DEADLINE task
- * that is going to be deboosted, but exceeds its
- * runtime while doing so. No point in replenishing
- * it, as it's going to return back to its original
- * scheduling class after this.
+ * Special case in which we have a !SCHED_DEADLINE task that is going
+ * to be deboosted, but exceeds its runtime while doing so. No point in
+ * replenishing it, as it's going to return back to its original
+ * scheduling class after this. If it has been throttled, we need to
+ * clear the flag, otherwise the task may wake up as throttled after
+ * being boosted again with no means to replenish the runtime and clear
+ * the throttle.
*/
+ p->dl.dl_throttled = 0;
BUG_ON(!p->dl.dl_boosted || flags != ENQUEUE_REPLENISH);
return;
}
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 36c54265bb2b..0655524700d2 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -245,6 +245,60 @@ set_table_entry(struct ctl_table *entry,
entry->proc_handler = proc_handler;
}
+/*
+ * sd_ctl_doflags - read-only sysctl handler that prints domain flags by name
+ * @table:  sysctl entry; ->data points at the int-sized flags word to decode
+ * @write:  non-zero for a write request, which is ignored
+ * @buffer: destination the generated text is copied into
+ * @lenp:   in: capacity of @buffer; out: number of bytes produced
+ * @ppos:   read offset into the generated text, advanced by the bytes produced
+ *
+ * Emits the name of every set flag followed by a space, and appends a newline
+ * when there is room left in @buffer after the copy.
+ *
+ * Return: 0 on success (including ignored writes), -ENOMEM if the temporary
+ * buffer cannot be allocated.
+ */
+static int sd_ctl_doflags(struct ctl_table *table, int write,
+			  void *buffer, size_t *lenp, loff_t *ppos)
+{
+	/* The entry is registered with sizeof(int): do not load a full long. */
+	unsigned long flags = *(unsigned int *)table->data;
+	size_t data_size = 0;
+	size_t len = 0;
+	char *tmp, *buf;
+	int idx;
+
+	if (write)
+		return 0;
+
+	for_each_set_bit(idx, &flags, __SD_FLAG_CNT) {
+		char *name = sd_flag_debug[idx].name;
+
+		/* Name plus whitespace */
+		data_size += strlen(name) + 1;
+	}
+
+	if (*ppos > data_size) {
+		*lenp = 0;
+		return 0;
+	}
+
+	buf = kcalloc(data_size + 1, sizeof(*buf), GFP_KERNEL);
+	if (!buf)
+		return -ENOMEM;
+
+	for_each_set_bit(idx, &flags, __SD_FLAG_CNT) {
+		char *name = sd_flag_debug[idx].name;
+
+		len += snprintf(buf + len, strlen(name) + 2, "%s ", name);
+	}
+
+	/*
+	 * tmp is only a read cursor; buf keeps the address returned by
+	 * kcalloc() so that kfree() below gets the start of the allocation
+	 * even when *ppos is non-zero.
+	 */
+	tmp = buf + *ppos;
+	len -= *ppos;
+
+	if (len > *lenp)
+		len = *lenp;
+	if (len)
+		memcpy(buffer, tmp, len);
+	if (len < *lenp) {
+		((char *)buffer)[len] = '\n';
+		len++;
+	}
+
+	*lenp = len;
+	*ppos += len;
+
+	kfree(buf);
+
+	return 0;
+}
+
static struct ctl_table *
sd_alloc_ctl_domain_table(struct sched_domain *sd)
{
@@ -258,7 +312,7 @@ sd_alloc_ctl_domain_table(struct sched_domain *sd)
set_table_entry(&table[2], "busy_factor", &sd->busy_factor, sizeof(int), 0644, proc_dointvec_minmax);
set_table_entry(&table[3], "imbalance_pct", &sd->imbalance_pct, sizeof(int), 0644, proc_dointvec_minmax);
set_table_entry(&table[4], "cache_nice_tries", &sd->cache_nice_tries, sizeof(int), 0644, proc_dointvec_minmax);
- set_table_entry(&table[5], "flags", &sd->flags, sizeof(int), 0444, proc_dointvec_minmax);
+ set_table_entry(&table[5], "flags", &sd->flags, sizeof(int), 0444, sd_ctl_doflags);
set_table_entry(&table[6], "max_newidle_lb_cost", &sd->max_newidle_lb_cost, sizeof(long), 0644, proc_doulongvec_minmax);
set_table_entry(&table[7], "name", sd->name, CORENAME_MAX_SIZE, 0444, proc_dostring);
/* &table[8] is terminator */
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 1a68a0536add..aa4c6227cd6d 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -831,7 +831,7 @@ void init_entity_runnable_average(struct sched_entity *se)
void post_init_entity_util_avg(struct task_struct *p)
{
}
-static void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
+static void update_tg_load_avg(struct cfs_rq *cfs_rq)
{
}
#endif /* CONFIG_SMP */
@@ -1504,6 +1504,7 @@ enum numa_type {
/* Cached statistics for all CPUs within a node */
struct numa_stats {
unsigned long load;
+ unsigned long runnable;
unsigned long util;
/* Total compute capacity of CPUs on a node */
unsigned long compute_capacity;
@@ -1547,19 +1548,22 @@ struct task_numa_env {
};
static unsigned long cpu_load(struct rq *rq);
+static unsigned long cpu_runnable(struct rq *rq);
static unsigned long cpu_util(int cpu);
-static inline long adjust_numa_imbalance(int imbalance, int src_nr_running);
+static inline long adjust_numa_imbalance(int imbalance, int nr_running);
static inline enum
numa_type numa_classify(unsigned int imbalance_pct,
struct numa_stats *ns)
{
if ((ns->nr_running > ns->weight) &&
- ((ns->compute_capacity * 100) < (ns->util * imbalance_pct)))
+ (((ns->compute_capacity * 100) < (ns->util * imbalance_pct)) ||
+ ((ns->compute_capacity * imbalance_pct) < (ns->runnable * 100))))
return node_overloaded;
if ((ns->nr_running < ns->weight) ||
- ((ns->compute_capacity * 100) > (ns->util * imbalance_pct)))
+ (((ns->compute_capacity * 100) > (ns->util * imbalance_pct)) &&
+ ((ns->compute_capacity * imbalance_pct) > (ns->runnable * 100))))
return node_has_spare;
return node_fully_busy;
@@ -1610,6 +1614,7 @@ static void update_numa_stats(struct task_numa_env *env,
struct rq *rq = cpu_rq(cpu);
ns->load += cpu_load(rq);
+ ns->runnable += cpu_runnable(rq);
ns->util += cpu_util(cpu);
ns->nr_running += rq->cfs.h_nr_running;
ns->compute_capacity += capacity_of(cpu);
@@ -1925,7 +1930,7 @@ static void task_numa_find_cpu(struct task_numa_env *env,
src_running = env->src_stats.nr_running - 1;
dst_running = env->dst_stats.nr_running + 1;
imbalance = max(0, dst_running - src_running);
- imbalance = adjust_numa_imbalance(imbalance, src_running);
+ imbalance = adjust_numa_imbalance(imbalance, dst_running);
/* Use idle CPU if there is no imbalance */
if (!imbalance) {
@@ -3084,7 +3089,7 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
/* commit outstanding execution time */
if (cfs_rq->curr == se)
update_curr(cfs_rq);
- account_entity_dequeue(cfs_rq, se);
+ update_load_sub(&cfs_rq->load, se->load.weight);
}
dequeue_load_avg(cfs_rq, se);
@@ -3100,7 +3105,7 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
enqueue_load_avg(cfs_rq, se);
if (se->on_rq)
- account_entity_enqueue(cfs_rq, se);
+ update_load_add(&cfs_rq->load, se->load.weight);
}
@@ -3288,7 +3293,6 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq, int flags)
/**
* update_tg_load_avg - update the tg's load avg
* @cfs_rq: the cfs_rq whose avg changed
- * @force: update regardless of how small the difference
*
* This function 'ensures': tg->load_avg := \Sum tg->cfs_rq[]->avg.load.
* However, because tg->load_avg is a global value there are performance
@@ -3300,7 +3304,7 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq, int flags)
*
* Updating tg's load_avg is necessary before update_cfs_share().
*/
-static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
+static inline void update_tg_load_avg(struct cfs_rq *cfs_rq)
{
long delta = cfs_rq->avg.load_avg - cfs_rq->tg_load_avg_contrib;
@@ -3310,7 +3314,7 @@ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
if (cfs_rq->tg == &root_task_group)
return;
- if (force || abs(delta) > cfs_rq->tg_load_avg_contrib / 64) {
+ if (abs(delta) > cfs_rq->tg_load_avg_contrib / 64) {
atomic_long_add(delta, &cfs_rq->tg->load_avg);
cfs_rq->tg_load_avg_contrib = cfs_rq->avg.load_avg;
}
@@ -3612,7 +3616,7 @@ static inline bool skip_blocked_update(struct sched_entity *se)
#else /* CONFIG_FAIR_GROUP_SCHED */
-static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {}
+static inline void update_tg_load_avg(struct cfs_rq *cfs_rq) {}
static inline int propagate_entity_load_avg(struct sched_entity *se)
{
@@ -3800,13 +3804,13 @@ static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
* IOW we're enqueueing a task on a new CPU.
*/
attach_entity_load_avg(cfs_rq, se);
- update_tg_load_avg(cfs_rq, 0);
+ update_tg_load_avg(cfs_rq);
} else if (decayed) {
cfs_rq_util_change(cfs_rq, 0);
if (flags & UPDATE_TG)
- update_tg_load_avg(cfs_rq, 0);
+ update_tg_load_avg(cfs_rq);
}
}
@@ -4461,17 +4465,17 @@ pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr)
se = second;
}
- /*
- * Prefer last buddy, try to return the CPU to a preempted task.
- */
- if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1)
- se = cfs_rq->last;
-
- /*
- * Someone really wants this to run. If it's not unfair, run it.
- */
- if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1)
+ if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1) {
+ /*
+ * Someone really wants this to run. If it's not unfair, run it.
+ */
se = cfs_rq->next;
+ } else if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1) {
+ /*
+ * Prefer last buddy, try to return the CPU to a preempted task.
+ */
+ se = cfs_rq->last;
+ }
clear_buddies(cfs_rq, se);
@@ -6075,7 +6079,7 @@ static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int
/*
* Scan the local SMT mask for idle CPUs.
*/
-static int select_idle_smt(struct task_struct *p, int target)
+static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
{
int cpu;
@@ -6083,7 +6087,8 @@ static int select_idle_smt(struct task_struct *p, int target)
return -1;
for_each_cpu(cpu, cpu_smt_mask(target)) {
- if (!cpumask_test_cpu(cpu, p->cpus_ptr))
+ if (!cpumask_test_cpu(cpu, p->cpus_ptr) ||
+ !cpumask_test_cpu(cpu, sched_domain_span(sd)))
continue;
if (available_idle_cpu(cpu) || sched_idle_cpu(cpu))
return cpu;
@@ -6099,7 +6104,7 @@ static inline int select_idle_core(struct task_struct *p, struct sched_domain *s
return -1;
}
-static inline int select_idle_smt(struct task_struct *p, int target)
+static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
{
return -1;
}
@@ -6274,7 +6279,7 @@ symmetric:
if ((unsigned)i < nr_cpumask_bits)
return i;
- i = select_idle_smt(p, target);
+ i = select_idle_smt(p, sd, target);
if ((unsigned)i < nr_cpumask_bits)
return i;
@@ -6594,7 +6599,8 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
util = cpu_util_next(cpu, p, cpu);
cpu_cap = capacity_of(cpu);
- spare_cap = cpu_cap - util;
+ spare_cap = cpu_cap;
+ lsub_positive(&spare_cap, util);
/*
* Skip CPUs that cannot satisfy the capacity request.
@@ -7402,6 +7408,10 @@ static int task_hot(struct task_struct *p, struct lb_env *env)
if (unlikely(task_has_idle_policy(p)))
return 0;
+ /* SMT siblings share cache */
+ if (env->sd->flags & SD_SHARE_CPUCAPACITY)
+ return 0;
+
/*
* Buddy candidates are cache hot:
*/
@@ -7669,8 +7679,8 @@ static int detach_tasks(struct lb_env *env)
* scheduler fails to find a good waiting task to
* migrate.
*/
- if (load/2 > env->imbalance &&
- env->sd->nr_balance_failed <= env->sd->cache_nice_tries)
+
+ if ((load >> env->sd->nr_balance_failed) > env->imbalance)
goto next;
env->imbalance -= load;
@@ -7887,7 +7897,7 @@ static bool __update_blocked_fair(struct rq *rq, bool *done)
struct sched_entity *se;
if (update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq)) {
- update_tg_load_avg(cfs_rq, 0);
+ update_tg_load_avg(cfs_rq);
if (cfs_rq == &rq->cfs)
decayed = true;
@@ -8098,6 +8108,8 @@ static void update_cpu_capacity(struct sched_domain *sd, int cpu)
capacity = 1;
cpu_rq(cpu)->cpu_capacity = capacity;
+ trace_sched_cpu_capacity_tp(cpu_rq(cpu));
+
sdg->sgc->capacity = capacity;
sdg->sgc->min_capacity = capacity;
sdg->sgc->max_capacity = capacity;
@@ -8957,7 +8969,7 @@ next_group:
}
}
-static inline long adjust_numa_imbalance(int imbalance, int src_nr_running)
+static inline long adjust_numa_imbalance(int imbalance, int nr_running)
{
unsigned int imbalance_min;
@@ -8966,7 +8978,7 @@ static inline long adjust_numa_imbalance(int imbalance, int src_nr_running)
* tasks that remain local when the source domain is almost idle.
*/
imbalance_min = 2;
- if (src_nr_running <= imbalance_min)
+ if (nr_running <= imbalance_min)
return 0;
return imbalance;
@@ -9780,6 +9792,15 @@ get_sd_balance_interval(struct sched_domain *sd, int cpu_busy)
/* scale ms to jiffies */
interval = msecs_to_jiffies(interval);
+
+ /*
+ * Reduce likelihood of busy balancing at higher domains racing with
+ * balancing at lower domains by preventing their balancing periods
+ * from being multiples of each other.
+ */
+ if (cpu_busy)
+ interval -= 1;
+
interval = clamp(interval, 1UL, max_load_balance_interval);
return interval;
@@ -10786,7 +10807,7 @@ static void detach_entity_cfs_rq(struct sched_entity *se)
/* Catch up with the cfs_rq and remove our load when we leave */
update_load_avg(cfs_rq, se, 0);
detach_entity_load_avg(cfs_rq, se);
- update_tg_load_avg(cfs_rq, false);
+ update_tg_load_avg(cfs_rq);
propagate_entity_cfs_rq(se);
}
@@ -10805,7 +10826,7 @@ static void attach_entity_cfs_rq(struct sched_entity *se)
/* Synchronize entity with its cfs_rq */
update_load_avg(cfs_rq, se, sched_feat(ATTACH_AGE_LOAD) ? 0 : SKIP_AGE_LOAD);
attach_entity_load_avg(cfs_rq, se);
- update_tg_load_avg(cfs_rq, false);
+ update_tg_load_avg(cfs_rq);
propagate_entity_cfs_rq(se);
}
@@ -11302,6 +11323,18 @@ int sched_trace_rq_cpu(struct rq *rq)
}
EXPORT_SYMBOL_GPL(sched_trace_rq_cpu);
+/*
+ * Accessor for a runqueue's CPU capacity: -1 when @rq is NULL, otherwise
+ * rq->cpu_capacity on SMP builds and SCHED_CAPACITY_SCALE on !SMP builds.
+ */
+int sched_trace_rq_cpu_capacity(struct rq *rq)
+{
+	if (!rq)
+		return -1;
+
+#ifdef CONFIG_SMP
+	return rq->cpu_capacity;
+#else
+	return SCHED_CAPACITY_SCALE;
+#endif
+}
+EXPORT_SYMBOL_GPL(sched_trace_rq_cpu_capacity);
+
const struct cpumask *sched_trace_rd_span(struct root_domain *rd)
{
#ifdef CONFIG_SMP
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 7481cd96f391..68d369cba9e4 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -77,7 +77,7 @@ SCHED_FEAT(WARN_DOUBLE_CLOCK, false)
SCHED_FEAT(RT_PUSH_IPI, true)
#endif
-SCHED_FEAT(RT_RUNTIME_SHARE, true)
+SCHED_FEAT(RT_RUNTIME_SHARE, false)
SCHED_FEAT(LB_MIN, false)
SCHED_FEAT(ATTACH_AGE_LOAD, true)
diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c
index 168479a7d61b..e23e74d52db5 100644
--- a/kernel/sched/membarrier.c
+++ b/kernel/sched/membarrier.c
@@ -18,6 +18,14 @@
#define MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK 0
#endif
+/*
+ * rseq-fence command and its registration counterpart; the mask is empty
+ * when CONFIG_RSEQ is not set.
+ */
+#ifdef CONFIG_RSEQ
+#define MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ_BITMASK		\
+	(MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ			\
+	| MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ)
+#else
+#define MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ_BITMASK	0
+#endif
+
#define MEMBARRIER_CMD_BITMASK \
(MEMBARRIER_CMD_GLOBAL | MEMBARRIER_CMD_GLOBAL_EXPEDITED \
| MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED \
@@ -30,6 +38,11 @@ static void ipi_mb(void *info)
smp_mb(); /* IPIs should be serializing but paranoid. */
}
+/*
+ * IPI handler selected for MEMBARRIER_FLAG_RSEQ requests: calls
+ * rseq_preempt() on whichever task is current on the receiving CPU.
+ * @info is unused.
+ */
+static void ipi_rseq(void *info)
+{
+ rseq_preempt(current);
+}
+
static void ipi_sync_rq_state(void *info)
{
struct mm_struct *mm = (struct mm_struct *) info;
@@ -129,19 +142,27 @@ static int membarrier_global_expedited(void)
return 0;
}
-static int membarrier_private_expedited(int flags)
+static int membarrier_private_expedited(int flags, int cpu_id)
{
- int cpu;
cpumask_var_t tmpmask;
struct mm_struct *mm = current->mm;
+ smp_call_func_t ipi_func = ipi_mb;
- if (flags & MEMBARRIER_FLAG_SYNC_CORE) {
+ if (flags == MEMBARRIER_FLAG_SYNC_CORE) {
if (!IS_ENABLED(CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE))
return -EINVAL;
if (!(atomic_read(&mm->membarrier_state) &
MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY))
return -EPERM;
+ } else if (flags == MEMBARRIER_FLAG_RSEQ) {
+ if (!IS_ENABLED(CONFIG_RSEQ))
+ return -EINVAL;
+ if (!(atomic_read(&mm->membarrier_state) &
+ MEMBARRIER_STATE_PRIVATE_EXPEDITED_RSEQ_READY))
+ return -EPERM;
+ ipi_func = ipi_rseq;
} else {
+ WARN_ON_ONCE(flags);
if (!(atomic_read(&mm->membarrier_state) &
MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY))
return -EPERM;
@@ -156,35 +177,59 @@ static int membarrier_private_expedited(int flags)
*/
smp_mb(); /* system call entry is not a mb. */
- if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
+ if (cpu_id < 0 && !zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
return -ENOMEM;
cpus_read_lock();
- rcu_read_lock();
- for_each_online_cpu(cpu) {
+
+ if (cpu_id >= 0) {
struct task_struct *p;
- /*
- * Skipping the current CPU is OK even through we can be
- * migrated at any point. The current CPU, at the point
- * where we read raw_smp_processor_id(), is ensured to
- * be in program order with respect to the caller
- * thread. Therefore, we can skip this CPU from the
- * iteration.
- */
- if (cpu == raw_smp_processor_id())
- continue;
- p = rcu_dereference(cpu_rq(cpu)->curr);
- if (p && p->mm == mm)
- __cpumask_set_cpu(cpu, tmpmask);
+ if (cpu_id >= nr_cpu_ids || !cpu_online(cpu_id))
+ goto out;
+ if (cpu_id == raw_smp_processor_id())
+ goto out;
+ rcu_read_lock();
+ p = rcu_dereference(cpu_rq(cpu_id)->curr);
+ if (!p || p->mm != mm) {
+ rcu_read_unlock();
+ goto out;
+ }
+ rcu_read_unlock();
+ } else {
+ int cpu;
+
+ rcu_read_lock();
+ for_each_online_cpu(cpu) {
+ struct task_struct *p;
+
+ /*
+ * Skipping the current CPU is OK even though we can be
+ * migrated at any point. The current CPU, at the point
+ * where we read raw_smp_processor_id(), is ensured to
+ * be in program order with respect to the caller
+ * thread. Therefore, we can skip this CPU from the
+ * iteration.
+ */
+ if (cpu == raw_smp_processor_id())
+ continue;
+ p = rcu_dereference(cpu_rq(cpu)->curr);
+ if (p && p->mm == mm)
+ __cpumask_set_cpu(cpu, tmpmask);
+ }
+ rcu_read_unlock();
}
- rcu_read_unlock();
preempt_disable();
- smp_call_function_many(tmpmask, ipi_mb, NULL, 1);
+ if (cpu_id >= 0)
+ smp_call_function_single(cpu_id, ipi_func, NULL, 1);
+ else
+ smp_call_function_many(tmpmask, ipi_func, NULL, 1);
preempt_enable();
- free_cpumask_var(tmpmask);
+out:
+ if (cpu_id < 0)
+ free_cpumask_var(tmpmask);
cpus_read_unlock();
/*
@@ -283,11 +328,18 @@ static int membarrier_register_private_expedited(int flags)
set_state = MEMBARRIER_STATE_PRIVATE_EXPEDITED,
ret;
- if (flags & MEMBARRIER_FLAG_SYNC_CORE) {
+ if (flags == MEMBARRIER_FLAG_SYNC_CORE) {
if (!IS_ENABLED(CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE))
return -EINVAL;
ready_state =
MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY;
+ } else if (flags == MEMBARRIER_FLAG_RSEQ) {
+ if (!IS_ENABLED(CONFIG_RSEQ))
+ return -EINVAL;
+ ready_state =
+ MEMBARRIER_STATE_PRIVATE_EXPEDITED_RSEQ_READY;
+ } else {
+ WARN_ON_ONCE(flags);
}
/*
@@ -299,6 +351,8 @@ static int membarrier_register_private_expedited(int flags)
return 0;
if (flags & MEMBARRIER_FLAG_SYNC_CORE)
set_state |= MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE;
+ if (flags & MEMBARRIER_FLAG_RSEQ)
+ set_state |= MEMBARRIER_STATE_PRIVATE_EXPEDITED_RSEQ;
atomic_or(set_state, &mm->membarrier_state);
ret = sync_runqueues_membarrier_state(mm);
if (ret)
@@ -310,8 +364,15 @@ static int membarrier_register_private_expedited(int flags)
/**
* sys_membarrier - issue memory barriers on a set of threads
- * @cmd: Takes command values defined in enum membarrier_cmd.
- * @flags: Currently needs to be 0. For future extensions.
+ * @cmd: Takes command values defined in enum membarrier_cmd.
+ * @flags: Currently needs to be 0 for all commands other than
+ * MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ: in the latter
+ * case it can be MEMBARRIER_CMD_FLAG_CPU, indicating that @cpu_id
+ * contains the CPU on which to interrupt (= restart)
+ * the RSEQ critical section.
+ * @cpu_id: if @flags == MEMBARRIER_CMD_FLAG_CPU, indicates the cpu on which
+ * RSEQ CS should be interrupted (@cmd must be
+ * MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ).
*
* If this system call is not implemented, -ENOSYS is returned. If the
* command specified does not exist, not available on the running
@@ -337,10 +398,21 @@ static int membarrier_register_private_expedited(int flags)
* smp_mb() X O O
* sys_membarrier() O O O
*/
-SYSCALL_DEFINE2(membarrier, int, cmd, int, flags)
+SYSCALL_DEFINE3(membarrier, int, cmd, unsigned int, flags, int, cpu_id)
{
- if (unlikely(flags))
- return -EINVAL;
+ switch (cmd) {
+ case MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ:
+ if (unlikely(flags && flags != MEMBARRIER_CMD_FLAG_CPU))
+ return -EINVAL;
+ break;
+ default:
+ if (unlikely(flags))
+ return -EINVAL;
+ }
+
+ if (!(flags & MEMBARRIER_CMD_FLAG_CPU))
+ cpu_id = -1;
+
switch (cmd) {
case MEMBARRIER_CMD_QUERY:
{
@@ -362,13 +434,17 @@ SYSCALL_DEFINE2(membarrier, int, cmd, int, flags)
case MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED:
return membarrier_register_global_expedited();
case MEMBARRIER_CMD_PRIVATE_EXPEDITED:
- return membarrier_private_expedited(0);
+ return membarrier_private_expedited(0, cpu_id);
case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED:
return membarrier_register_private_expedited(0);
case MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE:
- return membarrier_private_expedited(MEMBARRIER_FLAG_SYNC_CORE);
+ return membarrier_private_expedited(MEMBARRIER_FLAG_SYNC_CORE, cpu_id);
case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE:
return membarrier_register_private_expedited(MEMBARRIER_FLAG_SYNC_CORE);
+ case MEMBARRIER_CMD_PRIVATE_EXPEDITED_RSEQ:
+ return membarrier_private_expedited(MEMBARRIER_FLAG_RSEQ, cpu_id);
+ case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_RSEQ:
+ return membarrier_register_private_expedited(MEMBARRIER_FLAG_RSEQ);
default:
return -EINVAL;
}
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 1bd7e3af904f..dd7770226086 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -25,10 +25,18 @@ static inline bool sched_debug(void)
return sched_debug_enabled;
}
+#define SD_FLAG(_name, mflags) [__##_name] = { .meta_flags = mflags, .name = #_name },
+const struct sd_flag_debug sd_flag_debug[] = {
+#include <linux/sched/sd_flags.h>
+};
+#undef SD_FLAG
+
static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
struct cpumask *groupmask)
{
struct sched_group *group = sd->groups;
+ unsigned long flags = sd->flags;
+ unsigned int idx;
cpumask_clear(groupmask);
@@ -43,6 +51,21 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
printk(KERN_ERR "ERROR: domain->groups does not contain CPU%d\n", cpu);
}
+ for_each_set_bit(idx, &flags, __SD_FLAG_CNT) {
+ unsigned int flag = BIT(idx);
+ unsigned int meta_flags = sd_flag_debug[idx].meta_flags;
+
+ if ((meta_flags & SDF_SHARED_CHILD) && sd->child &&
+ !(sd->child->flags & flag))
+ printk(KERN_ERR "ERROR: flag %s set here but not in child\n",
+ sd_flag_debug[idx].name);
+
+ if ((meta_flags & SDF_SHARED_PARENT) && sd->parent &&
+ !(sd->parent->flags & flag))
+ printk(KERN_ERR "ERROR: flag %s set here but not in parent\n",
+ sd_flag_debug[idx].name);
+ }
+
printk(KERN_DEBUG "%*s groups:", level + 1, "");
do {
if (!group) {
@@ -137,22 +160,22 @@ static inline bool sched_debug(void)
}
#endif /* CONFIG_SCHED_DEBUG */
+/* Generate a mask of SD flags with the SDF_NEEDS_GROUPS metaflag */
+#define SD_FLAG(name, mflags) (name * !!((mflags) & SDF_NEEDS_GROUPS)) |
+static const unsigned int SD_DEGENERATE_GROUPS_MASK =
+#include <linux/sched/sd_flags.h>
+0;
+#undef SD_FLAG
+
static int sd_degenerate(struct sched_domain *sd)
{
if (cpumask_weight(sched_domain_span(sd)) == 1)
return 1;
/* Following flags need at least 2 groups */
- if (sd->flags & (SD_BALANCE_NEWIDLE |
- SD_BALANCE_FORK |
- SD_BALANCE_EXEC |
- SD_SHARE_CPUCAPACITY |
- SD_ASYM_CPUCAPACITY |
- SD_SHARE_PKG_RESOURCES |
- SD_SHARE_POWERDOMAIN)) {
- if (sd->groups != sd->groups->next)
- return 0;
- }
+ if ((sd->flags & SD_DEGENERATE_GROUPS_MASK) &&
+ (sd->groups != sd->groups->next))
+ return 0;
/* Following flags don't use groups */
if (sd->flags & (SD_WAKE_AFFINE))
@@ -173,18 +196,9 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
return 0;
/* Flags needing groups don't count if only 1 group in parent */
- if (parent->groups == parent->groups->next) {
- pflags &= ~(SD_BALANCE_NEWIDLE |
- SD_BALANCE_FORK |
- SD_BALANCE_EXEC |
- SD_ASYM_CPUCAPACITY |
- SD_SHARE_CPUCAPACITY |
- SD_SHARE_PKG_RESOURCES |
- SD_PREFER_SIBLING |
- SD_SHARE_POWERDOMAIN);
- if (nr_node_ids == 1)
- pflags &= ~SD_SERIALIZE;
- }
+ if (parent->groups == parent->groups->next)
+ pflags &= ~SD_DEGENERATE_GROUPS_MASK;
+
if (~cflags & pflags)
return 0;
@@ -1292,7 +1306,6 @@ int __read_mostly node_reclaim_distance = RECLAIM_DISTANCE;
* SD_SHARE_CPUCAPACITY - describes SMT topologies
* SD_SHARE_PKG_RESOURCES - describes shared caches
* SD_NUMA - describes NUMA topologies
- * SD_SHARE_POWERDOMAIN - describes shared power domain
*
* Odd one out, which beside describing the topology has a quirk also
* prescribes the desired behaviour that goes along with it:
@@ -1303,8 +1316,7 @@ int __read_mostly node_reclaim_distance = RECLAIM_DISTANCE;
(SD_SHARE_CPUCAPACITY | \
SD_SHARE_PKG_RESOURCES | \
SD_NUMA | \
- SD_ASYM_PACKING | \
- SD_SHARE_POWERDOMAIN)
+ SD_ASYM_PACKING)
static struct sched_domain *
sd_init(struct sched_domain_topology_level *tl,
@@ -1336,8 +1348,8 @@ sd_init(struct sched_domain_topology_level *tl,
*sd = (struct sched_domain){
.min_interval = sd_weight,
.max_interval = 2*sd_weight,
- .busy_factor = 32,
- .imbalance_pct = 125,
+ .busy_factor = 16,
+ .imbalance_pct = 117,
.cache_nice_tries = 0,
@@ -1989,11 +2001,10 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
/* Set up domains for CPUs specified by the cpu_map: */
for_each_cpu(i, cpu_map) {
struct sched_domain_topology_level *tl;
+ int dflags = 0;
sd = NULL;
for_each_sd_topology(tl) {
- int dflags = 0;
-
if (tl == tl_asym) {
dflags |= SD_ASYM_CPUCAPACITY;
has_asym = true;
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 3ee59ce0a323..676d4af62103 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -1109,13 +1109,18 @@ out:
}
#ifdef CONFIG_SECCOMP_FILTER
-static int seccomp_notify_release(struct inode *inode, struct file *file)
+static void seccomp_notify_free(struct seccomp_filter *filter)
+{
+ kfree(filter->notif);
+ filter->notif = NULL;
+}
+
+static void seccomp_notify_detach(struct seccomp_filter *filter)
{
- struct seccomp_filter *filter = file->private_data;
struct seccomp_knotif *knotif;
if (!filter)
- return 0;
+ return;
mutex_lock(&filter->notify_lock);
@@ -1139,9 +1144,15 @@ static int seccomp_notify_release(struct inode *inode, struct file *file)
complete(&knotif->ready);
}
- kfree(filter->notif);
- filter->notif = NULL;
+ seccomp_notify_free(filter);
mutex_unlock(&filter->notify_lock);
+}
+
+static int seccomp_notify_release(struct inode *inode, struct file *file)
+{
+ struct seccomp_filter *filter = file->private_data;
+
+ seccomp_notify_detach(filter);
__put_seccomp_filter(filter);
return 0;
}
@@ -1488,7 +1499,7 @@ static struct file *init_listener(struct seccomp_filter *filter)
out_notif:
if (IS_ERR(ret))
- kfree(filter->notif);
+ seccomp_notify_free(filter);
out:
return ret;
}
@@ -1581,6 +1592,7 @@ out_put_fd:
listener_f->private_data = NULL;
fput(listener_f);
put_unused_fd(listener);
+ seccomp_notify_detach(prepared);
} else {
fd_install(listener, listener_f);
ret = listener;
diff --git a/kernel/softirq.c b/kernel/softirq.c
index bf88d7f62433..09229ad82209 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -481,6 +481,7 @@ void raise_softirq(unsigned int nr)
void __raise_softirq_irqoff(unsigned int nr)
{
+ lockdep_assert_irqs_disabled();
trace_softirq_raise(nr);
or_softirq_pending(1UL << nr);
}
diff --git a/kernel/stackleak.c b/kernel/stackleak.c
index a8fc9ae1d03d..ce161a8e8d97 100644
--- a/kernel/stackleak.c
+++ b/kernel/stackleak.c
@@ -20,7 +20,7 @@
static DEFINE_STATIC_KEY_FALSE(stack_erasing_bypass);
int stack_erasing_sysctl(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
int ret = 0;
int state = !static_branch_unlikely(&stack_erasing_bypass);
diff --git a/kernel/stacktrace.c b/kernel/stacktrace.c
index 946f44a9e86a..9f8117c7cfdd 100644
--- a/kernel/stacktrace.c
+++ b/kernel/stacktrace.c
@@ -78,8 +78,7 @@ struct stacktrace_cookie {
unsigned int len;
};
-static bool stack_trace_consume_entry(void *cookie, unsigned long addr,
- bool reliable)
+static bool stack_trace_consume_entry(void *cookie, unsigned long addr)
{
struct stacktrace_cookie *c = cookie;
@@ -94,12 +93,11 @@ static bool stack_trace_consume_entry(void *cookie, unsigned long addr,
return c->len < c->size;
}
-static bool stack_trace_consume_entry_nosched(void *cookie, unsigned long addr,
- bool reliable)
+static bool stack_trace_consume_entry_nosched(void *cookie, unsigned long addr)
{
if (in_sched_functions(addr))
return true;
- return stack_trace_consume_entry(cookie, addr, reliable);
+ return stack_trace_consume_entry(cookie, addr);
}
/**
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 287862f91717..afad085960b8 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -204,8 +204,7 @@ static int max_extfrag_threshold = 1000;
#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_SYSCTL)
static int bpf_stats_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp,
- loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
struct static_key *key = (struct static_key *)table->data;
static int saved_val;
@@ -2913,6 +2912,14 @@ static struct ctl_table vm_table[] = {
.proc_handler = percpu_pagelist_fraction_sysctl_handler,
.extra1 = SYSCTL_ZERO,
},
+ {
+ .procname = "page_lock_unfairness",
+ .data = &sysctl_page_lock_unfairness,
+ .maxlen = sizeof(sysctl_page_lock_unfairness),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ },
#ifdef CONFIG_MMU
{
.procname = "max_map_count",
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index ca223a89530a..f4ace1bf8382 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -908,7 +908,7 @@ static int __init alarmtimer_init(void)
/* Initialize alarm bases */
alarm_bases[ALARM_REALTIME].base_clockid = CLOCK_REALTIME;
alarm_bases[ALARM_REALTIME].get_ktime = &ktime_get_real;
- alarm_bases[ALARM_REALTIME].get_timespec = ktime_get_real_ts64,
+ alarm_bases[ALARM_REALTIME].get_timespec = ktime_get_real_ts64;
alarm_bases[ALARM_BOOTTIME].base_clockid = CLOCK_BOOTTIME;
alarm_bases[ALARM_BOOTTIME].get_ktime = &ktime_get_boottime;
alarm_bases[ALARM_BOOTTIME].get_timespec = get_boottime_timespec;
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 95b6a708b040..3624b9b5835d 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -342,7 +342,7 @@ EXPORT_SYMBOL_GPL(ktime_add_safe);
#ifdef CONFIG_DEBUG_OBJECTS_TIMERS
-static struct debug_obj_descr hrtimer_debug_descr;
+static const struct debug_obj_descr hrtimer_debug_descr;
static void *hrtimer_debug_hint(void *addr)
{
@@ -401,7 +401,7 @@ static bool hrtimer_fixup_free(void *addr, enum debug_obj_state state)
}
}
-static struct debug_obj_descr hrtimer_debug_descr = {
+static const struct debug_obj_descr hrtimer_debug_descr = {
.name = "hrtimer",
.debug_hint = hrtimer_debug_hint,
.fixup_init = hrtimer_fixup_init,
diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c
index 1c03eec6ca9b..0642013dace4 100644
--- a/kernel/time/sched_clock.c
+++ b/kernel/time/sched_clock.c
@@ -35,7 +35,7 @@
* into a single 64-byte cache line.
*/
struct clock_data {
- seqcount_t seq;
+ seqcount_latch_t seq;
struct clock_read_data read_data[2];
ktime_t wrap_kt;
unsigned long rate;
@@ -76,7 +76,7 @@ struct clock_read_data *sched_clock_read_begin(unsigned int *seq)
int sched_clock_read_retry(unsigned int seq)
{
- return read_seqcount_retry(&cd.seq, seq);
+ return read_seqcount_latch_retry(&cd.seq, seq);
}
unsigned long long notrace sched_clock(void)
@@ -258,7 +258,7 @@ void __init generic_sched_clock_init(void)
*/
static u64 notrace suspended_sched_clock_read(void)
{
- unsigned int seq = raw_read_seqcount(&cd.seq);
+ unsigned int seq = raw_read_seqcount_latch(&cd.seq);
return cd.read_data[seq & 1].epoch_cyc;
}
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 4c47f388a83f..6858a31364b6 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -54,6 +54,9 @@ static struct {
static struct timekeeper shadow_timekeeper;
+/* flag for if timekeeping is suspended */
+int __read_mostly timekeeping_suspended;
+
/**
* struct tk_fast - NMI safe timekeeper
* @seq: Sequence counter for protecting updates. The lowest bit
@@ -64,7 +67,7 @@ static struct timekeeper shadow_timekeeper;
* See @update_fast_timekeeper() below.
*/
struct tk_fast {
- seqcount_raw_spinlock_t seq;
+ seqcount_latch_t seq;
struct tk_read_base base[2];
};
@@ -73,28 +76,42 @@ static u64 cycles_at_suspend;
static u64 dummy_clock_read(struct clocksource *cs)
{
- return cycles_at_suspend;
+ if (timekeeping_suspended)
+ return cycles_at_suspend;
+ return local_clock();
}
static struct clocksource dummy_clock = {
.read = dummy_clock_read,
};
+/*
+ * Boot time initialization which allows local_clock() to be utilized
+ * during early boot when clocksources are not available. local_clock()
+ * returns nanoseconds already so no conversion is required, hence mult=1
+ * and shift=0. When the first proper clocksource is installed then
+ * the fast time keepers are updated with the correct values.
+ */
+#define FAST_TK_INIT \
+ { \
+ .clock = &dummy_clock, \
+ .mask = CLOCKSOURCE_MASK(64), \
+ .mult = 1, \
+ .shift = 0, \
+ }
+
static struct tk_fast tk_fast_mono ____cacheline_aligned = {
- .seq = SEQCNT_RAW_SPINLOCK_ZERO(tk_fast_mono.seq, &timekeeper_lock),
- .base[0] = { .clock = &dummy_clock, },
- .base[1] = { .clock = &dummy_clock, },
+ .seq = SEQCNT_LATCH_ZERO(tk_fast_mono.seq),
+ .base[0] = FAST_TK_INIT,
+ .base[1] = FAST_TK_INIT,
};
static struct tk_fast tk_fast_raw ____cacheline_aligned = {
- .seq = SEQCNT_RAW_SPINLOCK_ZERO(tk_fast_raw.seq, &timekeeper_lock),
- .base[0] = { .clock = &dummy_clock, },
- .base[1] = { .clock = &dummy_clock, },
+ .seq = SEQCNT_LATCH_ZERO(tk_fast_raw.seq),
+ .base[0] = FAST_TK_INIT,
+ .base[1] = FAST_TK_INIT,
};
-/* flag for if timekeeping is suspended */
-int __read_mostly timekeeping_suspended;
-
static inline void tk_normalize_xtime(struct timekeeper *tk)
{
while (tk->tkr_mono.xtime_nsec >= ((u64)NSEC_PER_SEC << tk->tkr_mono.shift)) {
@@ -467,7 +484,7 @@ static __always_inline u64 __ktime_get_fast_ns(struct tk_fast *tkf)
tk_clock_read(tkr),
tkr->cycle_last,
tkr->mask));
- } while (read_seqcount_retry(&tkf->seq, seq));
+ } while (read_seqcount_latch_retry(&tkf->seq, seq));
return now;
}
@@ -513,29 +530,29 @@ u64 notrace ktime_get_boot_fast_ns(void)
}
EXPORT_SYMBOL_GPL(ktime_get_boot_fast_ns);
-
/*
* See comment for __ktime_get_fast_ns() vs. timestamp ordering
*/
-static __always_inline u64 __ktime_get_real_fast_ns(struct tk_fast *tkf)
+static __always_inline u64 __ktime_get_real_fast(struct tk_fast *tkf, u64 *mono)
{
struct tk_read_base *tkr;
+ u64 basem, baser, delta;
unsigned int seq;
- u64 now;
do {
seq = raw_read_seqcount_latch(&tkf->seq);
tkr = tkf->base + (seq & 0x01);
- now = ktime_to_ns(tkr->base_real);
+ basem = ktime_to_ns(tkr->base);
+ baser = ktime_to_ns(tkr->base_real);
- now += timekeeping_delta_to_ns(tkr,
- clocksource_delta(
- tk_clock_read(tkr),
- tkr->cycle_last,
- tkr->mask));
- } while (read_seqcount_retry(&tkf->seq, seq));
+ delta = timekeeping_delta_to_ns(tkr,
+ clocksource_delta(tk_clock_read(tkr),
+ tkr->cycle_last, tkr->mask));
+ } while (read_seqcount_latch_retry(&tkf->seq, seq));
- return now;
+ if (mono)
+ *mono = basem + delta;
+ return baser + delta;
}
/**
@@ -543,11 +560,65 @@ static __always_inline u64 __ktime_get_real_fast_ns(struct tk_fast *tkf)
*/
u64 ktime_get_real_fast_ns(void)
{
- return __ktime_get_real_fast_ns(&tk_fast_mono);
+ return __ktime_get_real_fast(&tk_fast_mono, NULL);
}
EXPORT_SYMBOL_GPL(ktime_get_real_fast_ns);
/**
+ * ktime_get_fast_timestamps - NMI safe timestamps
+ * @snapshot: Pointer to timestamp storage
+ *
+ * Stores clock monotonic, boottime and realtime timestamps.
+ *
+ * Boot time is a racy access on 32bit systems if the sleep time injection
+ * happens late during resume and not in timekeeping_resume(). That could
+ * be avoided by expanding struct tk_read_base with boot offset for 32bit
+ * and adding more overhead to the update. As this is a hard to observe
+ * once per resume event which can be filtered with reasonable effort using
+ * the accurate mono/real timestamps, it's probably not worth the trouble.
+ *
+ * Aside of that it might be possible on 32 and 64 bit to observe the
+ * following when the sleep time injection happens late:
+ *
+ * CPU 0 CPU 1
+ * timekeeping_resume()
+ * ktime_get_fast_timestamps()
+ * mono, real = __ktime_get_real_fast()
+ * inject_sleep_time()
+ * update boot offset
+ * boot = mono + bootoffset;
+ *
+ * That means that boot time already has the sleep time adjustment, but
+ * real time does not. On the next readout both are in sync again.
+ *
+ * Preventing this for 64bit is not really feasible without destroying the
+ * careful cache layout of the timekeeper because the sequence count and
+ * struct tk_read_base would then need two cache lines instead of one.
+ *
+ * Access to the time keeper clock source is disabled across the innermost
+ * steps of suspend/resume. The accessors still work, but the timestamps
+ * are frozen until time keeping is resumed which happens very early.
+ *
+ * For regular suspend/resume there is no observable difference vs. sched
+ * clock, but it might affect some of the nasty low level debug printks.
+ *
+ * OTOH, access to sched clock is not guaranteed across suspend/resume on
+ * all systems either so it depends on the hardware in use.
+ *
+ * If that turns out to be a real problem then this could be mitigated by
+ * using sched clock in a similar way as during early boot. But it's not as
+ * trivial as on early boot because it needs some careful protection
+ * against the clock monotonic timestamp jumping backwards on resume.
+ */
+void ktime_get_fast_timestamps(struct ktime_timestamps *snapshot)
+{
+ struct timekeeper *tk = &tk_core.timekeeper;
+
+ snapshot->real = __ktime_get_real_fast(&tk_fast_mono, &snapshot->mono);
+ snapshot->boot = snapshot->mono + ktime_to_ns(data_race(tk->offs_boot));
+}
+
+/**
* halt_fast_timekeeper - Prevent fast timekeeper from accessing clocksource.
* @tk: Timekeeper to snapshot.
*
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index a50364df1054..dda05f4b7a1f 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -611,7 +611,7 @@ static void internal_add_timer(struct timer_base *base, struct timer_list *timer
#ifdef CONFIG_DEBUG_OBJECTS_TIMERS
-static struct debug_obj_descr timer_debug_descr;
+static const struct debug_obj_descr timer_debug_descr;
static void *timer_debug_hint(void *addr)
{
@@ -707,7 +707,7 @@ static bool timer_fixup_assert_init(void *addr, enum debug_obj_state state)
}
}
-static struct debug_obj_descr timer_debug_descr = {
+static const struct debug_obj_descr timer_debug_descr = {
.name = "timer_list",
.debug_hint = timer_debug_hint,
.is_static_object = timer_is_static_object,
@@ -794,6 +794,8 @@ static void do_init_timer(struct timer_list *timer,
{
timer->entry.pprev = NULL;
timer->function = func;
+ if (WARN_ON_ONCE(flags & ~TIMER_INIT_FLAGS))
+ flags &= TIMER_INIT_FLAGS;
timer->flags = flags | raw_smp_processor_id();
lockdep_init_map(&timer->lockdep_map, name, key, 0);
}
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 275441254bb5..541453927c82 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -2782,6 +2782,7 @@ static void ftrace_remove_trampoline_from_kallsyms(struct ftrace_ops *ops)
{
lockdep_assert_held(&ftrace_lock);
list_del_rcu(&ops->list);
+ synchronize_rcu();
}
/*
@@ -2862,6 +2863,8 @@ int ftrace_startup(struct ftrace_ops *ops, int command)
__unregister_ftrace_function(ops);
ftrace_start_up--;
ops->flags &= ~FTRACE_OPS_FL_ENABLED;
+ if (ops->flags & FTRACE_OPS_FL_DYNAMIC)
+ ftrace_trampoline_free(ops);
return ret;
}
@@ -6990,16 +6993,14 @@ static void ftrace_ops_assist_func(unsigned long ip, unsigned long parent_ip,
{
int bit;
- if ((op->flags & FTRACE_OPS_FL_RCU) && !rcu_is_watching())
- return;
-
bit = trace_test_and_set_recursion(TRACE_LIST_START, TRACE_LIST_MAX);
if (bit < 0)
return;
preempt_disable_notrace();
- op->func(ip, parent_ip, op, regs);
+ if (!(op->flags & FTRACE_OPS_FL_RCU) || rcu_is_watching())
+ op->func(ip, parent_ip, op, regs);
preempt_enable_notrace();
trace_clear_recursion(bit);
@@ -7531,8 +7532,7 @@ static bool is_permanent_ops_registered(void)
int
ftrace_enable_sysctl(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp,
- loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
int ret = -ENODEV;
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index df499922b6a4..25b72a73608a 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -3546,13 +3546,15 @@ struct trace_entry *trace_find_next_entry(struct trace_iterator *iter,
if (iter->ent && iter->ent != iter->temp) {
if ((!iter->temp || iter->temp_size < iter->ent_size) &&
!WARN_ON_ONCE(iter->temp == static_temp_buf)) {
- kfree(iter->temp);
- iter->temp = kmalloc(iter->ent_size, GFP_KERNEL);
- if (!iter->temp)
+ void *temp;
+ temp = kmalloc(iter->ent_size, GFP_KERNEL);
+ if (!temp)
return NULL;
+ kfree(iter->temp);
+ iter->temp = temp;
+ iter->temp_size = iter->ent_size;
}
memcpy(iter->temp, iter->ent, iter->ent_size);
- iter->temp_size = iter->ent_size;
iter->ent = iter->temp;
}
entry = __find_next_entry(iter, ent_cpu, NULL, ent_ts);
@@ -3782,14 +3784,14 @@ unsigned long trace_total_entries(struct trace_array *tr)
static void print_lat_help_header(struct seq_file *m)
{
- seq_puts(m, "# _------=> CPU# \n"
- "# / _-----=> irqs-off \n"
- "# | / _----=> need-resched \n"
- "# || / _---=> hardirq/softirq \n"
- "# ||| / _--=> preempt-depth \n"
- "# |||| / delay \n"
- "# cmd pid ||||| time | caller \n"
- "# \\ / ||||| \\ | / \n");
+ seq_puts(m, "# _------=> CPU# \n"
+ "# / _-----=> irqs-off \n"
+ "# | / _----=> need-resched \n"
+ "# || / _---=> hardirq/softirq \n"
+ "# ||| / _--=> preempt-depth \n"
+ "# |||| / delay \n"
+ "# cmd pid ||||| time | caller \n"
+ "# \\ / ||||| \\ | / \n");
}
static void print_event_info(struct array_buffer *buf, struct seq_file *m)
@@ -3810,26 +3812,26 @@ static void print_func_help_header(struct array_buffer *buf, struct seq_file *m,
print_event_info(buf, m);
- seq_printf(m, "# TASK-PID %s CPU# TIMESTAMP FUNCTION\n", tgid ? "TGID " : "");
- seq_printf(m, "# | | %s | | |\n", tgid ? " | " : "");
+ seq_printf(m, "# TASK-PID %s CPU# TIMESTAMP FUNCTION\n", tgid ? " TGID " : "");
+ seq_printf(m, "# | | %s | | |\n", tgid ? " | " : "");
}
static void print_func_help_header_irq(struct array_buffer *buf, struct seq_file *m,
unsigned int flags)
{
bool tgid = flags & TRACE_ITER_RECORD_TGID;
- const char *space = " ";
- int prec = tgid ? 10 : 2;
+ const char *space = " ";
+ int prec = tgid ? 12 : 2;
print_event_info(buf, m);
- seq_printf(m, "# %.*s _-----=> irqs-off\n", prec, space);
- seq_printf(m, "# %.*s / _----=> need-resched\n", prec, space);
- seq_printf(m, "# %.*s| / _---=> hardirq/softirq\n", prec, space);
- seq_printf(m, "# %.*s|| / _--=> preempt-depth\n", prec, space);
- seq_printf(m, "# %.*s||| / delay\n", prec, space);
- seq_printf(m, "# TASK-PID %.*sCPU# |||| TIMESTAMP FUNCTION\n", prec, " TGID ");
- seq_printf(m, "# | | %.*s | |||| | |\n", prec, " | ");
+ seq_printf(m, "# %.*s _-----=> irqs-off\n", prec, space);
+ seq_printf(m, "# %.*s / _----=> need-resched\n", prec, space);
+ seq_printf(m, "# %.*s| / _---=> hardirq/softirq\n", prec, space);
+ seq_printf(m, "# %.*s|| / _--=> preempt-depth\n", prec, space);
+ seq_printf(m, "# %.*s||| / delay\n", prec, space);
+ seq_printf(m, "# TASK-PID %.*s CPU# |||| TIMESTAMP FUNCTION\n", prec, " TGID ");
+ seq_printf(m, "# | | %.*s | |||| | |\n", prec, " | ");
}
void
diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index 0b933546142e..1b2ef6490229 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -3865,7 +3865,6 @@ static int parse_var_defs(struct hist_trigger_data *hist_data)
s = kstrdup(field_str, GFP_KERNEL);
if (!s) {
- kfree(hist_data->attrs->var_defs.name[n_vars]);
ret = -ENOMEM;
goto free;
}
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 4d1893564912..000e9dc224c6 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -497,7 +497,7 @@ lat_print_generic(struct trace_seq *s, struct trace_entry *entry, int cpu)
trace_find_cmdline(entry->pid, comm);
- trace_seq_printf(s, "%8.8s-%-5d %3d",
+ trace_seq_printf(s, "%8.8s-%-7d %3d",
comm, entry->pid, cpu);
return trace_print_lat_fmt(s, entry);
@@ -588,15 +588,15 @@ int trace_print_context(struct trace_iterator *iter)
trace_find_cmdline(entry->pid, comm);
- trace_seq_printf(s, "%16s-%-5d ", comm, entry->pid);
+ trace_seq_printf(s, "%16s-%-7d ", comm, entry->pid);
if (tr->trace_flags & TRACE_ITER_RECORD_TGID) {
unsigned int tgid = trace_find_tgid(entry->pid);
if (!tgid)
- trace_seq_printf(s, "(-----) ");
+ trace_seq_printf(s, "(-------) ");
else
- trace_seq_printf(s, "(%5d) ", tgid);
+ trace_seq_printf(s, "(%7d) ", tgid);
}
trace_seq_printf(s, "[%03d] ", iter->cpu);
@@ -636,7 +636,7 @@ int trace_print_lat_context(struct trace_iterator *iter)
trace_find_cmdline(entry->pid, comm);
trace_seq_printf(
- s, "%16s %5d %3d %d %08x %08lx ",
+ s, "%16s %7d %3d %d %08x %08lx ",
comm, entry->pid, iter->cpu, entry->flags,
entry->preempt_count, iter->idx);
} else {
@@ -917,7 +917,7 @@ static enum print_line_t trace_ctxwake_print(struct trace_iterator *iter,
S = task_index_to_char(field->prev_state);
trace_find_cmdline(field->next_pid, comm);
trace_seq_printf(&iter->seq,
- " %5d:%3d:%c %s [%03d] %5d:%3d:%c %s\n",
+ " %7d:%3d:%c %s [%03d] %7d:%3d:%c %s\n",
field->prev_pid,
field->prev_prio,
S, delim,
diff --git a/kernel/trace/trace_preemptirq.c b/kernel/trace/trace_preemptirq.c
index f10073e62603..f4938040c228 100644
--- a/kernel/trace/trace_preemptirq.c
+++ b/kernel/trace/trace_preemptirq.c
@@ -102,14 +102,14 @@ NOKPROBE_SYMBOL(trace_hardirqs_on_caller);
__visible void trace_hardirqs_off_caller(unsigned long caller_addr)
{
+ lockdep_hardirqs_off(CALLER_ADDR0);
+
if (!this_cpu_read(tracing_irq_cpu)) {
this_cpu_write(tracing_irq_cpu, 1);
tracer_hardirqs_off(CALLER_ADDR0, caller_addr);
if (!in_nmi())
trace_irq_disable_rcuidle(CALLER_ADDR0, caller_addr);
}
-
- lockdep_hardirqs_off(CALLER_ADDR0);
}
EXPORT_SYMBOL(trace_hardirqs_off_caller);
NOKPROBE_SYMBOL(trace_hardirqs_off_caller);
diff --git a/kernel/umh.c b/kernel/umh.c
index fcf3ee803630..3f646613a9d3 100644
--- a/kernel/umh.c
+++ b/kernel/umh.c
@@ -14,6 +14,7 @@
#include <linux/cred.h>
#include <linux/file.h>
#include <linux/fdtable.h>
+#include <linux/fs_struct.h>
#include <linux/workqueue.h>
#include <linux/security.h>
#include <linux/mount.h>
@@ -72,6 +73,14 @@ static int call_usermodehelper_exec_async(void *data)
spin_unlock_irq(&current->sighand->siglock);
/*
+ * Initial kernel threads share their FS with init, in order to
+ * get the init root directory. But we've now created a new
+ * thread that is going to execve a user process and has its own
+ * 'struct fs_struct'. Reset umask to the default.
+ */
+ current->fs->umask = 0022;
+
+ /*
* Our parent (unbound workqueue) runs with elevated scheduling
* priority. Avoid propagating that into the userspace child.
*/
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index c41c3c17b86a..ac088ce6059b 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -427,7 +427,7 @@ static void show_pwq(struct pool_workqueue *pwq);
#ifdef CONFIG_DEBUG_OBJECTS_WORK
-static struct debug_obj_descr work_debug_descr;
+static const struct debug_obj_descr work_debug_descr;
static void *work_debug_hint(void *addr)
{
@@ -477,7 +477,7 @@ static bool work_fixup_free(void *addr, enum debug_obj_state state)
}
}
-static struct debug_obj_descr work_debug_descr = {
+static const struct debug_obj_descr work_debug_descr = {
.name = "work_struct",
.debug_hint = work_debug_hint,
.is_static_object = work_is_static_object,