diff options
Diffstat (limited to 'kernel')
136 files changed, 7155 insertions, 3785 deletions
diff --git a/kernel/Makefile b/kernel/Makefile index 172d151d429c..f85ae5dfa474 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -81,6 +81,7 @@ obj-$(CONFIG_AUDIT_TREE) += audit_tree.o obj-$(CONFIG_GCOV_KERNEL) += gcov/ obj-$(CONFIG_KCOV) += kcov.o obj-$(CONFIG_KPROBES) += kprobes.o +obj-$(CONFIG_FAIL_FUNCTION) += fail_function.o obj-$(CONFIG_KGDB) += debug/ obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o obj-$(CONFIG_LOCKUP_DETECTOR) += watchdog.o diff --git a/kernel/acct.c b/kernel/acct.c index d15c0ee4d955..addf7732fb56 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -102,7 +102,7 @@ static int check_free_space(struct bsd_acct_struct *acct) { struct kstatfs sbuf; - if (time_is_before_jiffies(acct->needcheck)) + if (time_is_after_jiffies(acct->needcheck)) goto out; /* May block */ diff --git a/kernel/async.c b/kernel/async.c index 2cbd3dd5940d..a893d6170944 100644 --- a/kernel/async.c +++ b/kernel/async.c @@ -84,20 +84,24 @@ static atomic_t entry_count; static async_cookie_t lowest_in_progress(struct async_domain *domain) { - struct list_head *pending; + struct async_entry *first = NULL; async_cookie_t ret = ASYNC_COOKIE_MAX; unsigned long flags; spin_lock_irqsave(&async_lock, flags); - if (domain) - pending = &domain->pending; - else - pending = &async_global_pending; + if (domain) { + if (!list_empty(&domain->pending)) + first = list_first_entry(&domain->pending, + struct async_entry, domain_list); + } else { + if (!list_empty(&async_global_pending)) + first = list_first_entry(&async_global_pending, + struct async_entry, global_list); + } - if (!list_empty(pending)) - ret = list_first_entry(pending, struct async_entry, - domain_list)->cookie; + if (first) + ret = first->cookie; spin_unlock_irqrestore(&async_lock, flags); return ret; diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index e691da0b3bab..a713fd23ec88 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -9,9 +9,11 @@ obj-$(CONFIG_BPF_SYSCALL) += devmap.o obj-$(CONFIG_BPF_SYSCALL) += cpumap.o obj-$(CONFIG_BPF_SYSCALL) += offload.o ifeq ($(CONFIG_STREAM_PARSER),y) +ifeq ($(CONFIG_INET),y) obj-$(CONFIG_BPF_SYSCALL) += sockmap.o endif endif +endif ifeq ($(CONFIG_PERF_EVENTS),y) obj-$(CONFIG_BPF_SYSCALL) += stackmap.o endif diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 7c25426d3cf5..b1f66480135b 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -49,34 +49,64 @@ static int bpf_array_alloc_percpu(struct bpf_array *array) } /* Called from syscall */ -static struct bpf_map *array_map_alloc(union bpf_attr *attr) +static int array_map_alloc_check(union bpf_attr *attr) { bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY; int numa_node = bpf_map_attr_numa_node(attr); - struct bpf_array *array; - u64 array_size; - u32 elem_size; /* check sanity of attributes */ if (attr->max_entries == 0 || attr->key_size != 4 || attr->value_size == 0 || attr->map_flags & ~ARRAY_CREATE_FLAG_MASK || (percpu && numa_node != NUMA_NO_NODE)) - return ERR_PTR(-EINVAL); + return -EINVAL; if (attr->value_size > KMALLOC_MAX_SIZE) /* if value_size is bigger, the user space won't be able to * access the elements. */ - return ERR_PTR(-E2BIG); + return -E2BIG; + + return 0; +} + +static struct bpf_map *array_map_alloc(union bpf_attr *attr) +{ + bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY; + int numa_node = bpf_map_attr_numa_node(attr); + u32 elem_size, index_mask, max_entries; + bool unpriv = !capable(CAP_SYS_ADMIN); + struct bpf_array *array; + u64 array_size, mask64; elem_size = round_up(attr->value_size, 8); + max_entries = attr->max_entries; + + /* On 32 bit archs roundup_pow_of_two() with max_entries that has + * upper most bit set in u32 space is undefined behavior due to + * resulting 1U << 32, so do it manually here in u64 space. + */ + mask64 = fls_long(max_entries - 1); + mask64 = 1ULL << mask64; + mask64 -= 1; + + index_mask = mask64; + if (unpriv) { + /* round up array size to nearest power of 2, + * since cpu will speculate within index_mask limits + */ + max_entries = index_mask + 1; + /* Check for overflows. */ + if (max_entries < attr->max_entries) + return ERR_PTR(-E2BIG); + } + array_size = sizeof(*array); if (percpu) - array_size += (u64) attr->max_entries * sizeof(void *); + array_size += (u64) max_entries * sizeof(void *); else - array_size += (u64) attr->max_entries * elem_size; + array_size += (u64) max_entries * elem_size; /* make sure there is no u32 overflow later in round_up() */ if (array_size >= U32_MAX - PAGE_SIZE) @@ -86,14 +116,11 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr) array = bpf_map_area_alloc(array_size, numa_node); if (!array) return ERR_PTR(-ENOMEM); + array->index_mask = index_mask; + array->map.unpriv_array = unpriv; /* copy mandatory map attributes */ - array->map.map_type = attr->map_type; - array->map.key_size = attr->key_size; - array->map.value_size = attr->value_size; - array->map.max_entries = attr->max_entries; - array->map.map_flags = attr->map_flags; - array->map.numa_node = numa_node; + bpf_map_init_from_attr(&array->map, attr); array->elem_size = elem_size; if (!percpu) @@ -121,12 +148,13 @@ static void *array_map_lookup_elem(struct bpf_map *map, void *key) if (unlikely(index >= array->map.max_entries)) return NULL; - return array->value + array->elem_size * index; + return array->value + array->elem_size * (index & array->index_mask); } /* emit BPF instructions equivalent to C code of array_map_lookup_elem() */ static u32 array_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) { + struct bpf_array *array = container_of(map, struct bpf_array, map); struct bpf_insn *insn = insn_buf; u32 elem_size = round_up(map->value_size, 8); const int ret = BPF_REG_0; @@ -135,7 +163,12 @@ static u32 array_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) *insn++ = BPF_ALU64_IMM(BPF_ADD, map_ptr, offsetof(struct bpf_array, value)); *insn++ = BPF_LDX_MEM(BPF_W, ret, index, 0); - *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 3); + if (map->unpriv_array) { + *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 4); + *insn++ = BPF_ALU32_IMM(BPF_AND, ret, array->index_mask); + } else { + *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 3); + } if (is_power_of_2(elem_size)) { *insn++ = BPF_ALU64_IMM(BPF_LSH, ret, ilog2(elem_size)); @@ -157,7 +190,7 @@ static void *percpu_array_map_lookup_elem(struct bpf_map *map, void *key) if (unlikely(index >= array->map.max_entries)) return NULL; - return this_cpu_ptr(array->pptrs[index]); + return this_cpu_ptr(array->pptrs[index & array->index_mask]); } int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value) @@ -177,7 +210,7 @@ int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value) */ size = round_up(map->value_size, 8); rcu_read_lock(); - pptr = array->pptrs[index]; + pptr = array->pptrs[index & array->index_mask]; for_each_possible_cpu(cpu) { bpf_long_memcpy(value + off, per_cpu_ptr(pptr, cpu), size); off += size; @@ -225,10 +258,11 @@ static int array_map_update_elem(struct bpf_map *map, void *key, void *value, return -EEXIST; if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY) - memcpy(this_cpu_ptr(array->pptrs[index]), + memcpy(this_cpu_ptr(array->pptrs[index & array->index_mask]), value, map->value_size); else - memcpy(array->value + array->elem_size * index, + memcpy(array->value + + array->elem_size * (index & array->index_mask), value, map->value_size); return 0; } @@ -262,7 +296,7 @@ int bpf_percpu_array_update(struct bpf_map *map, void *key, void *value, */ size = round_up(map->value_size, 8); rcu_read_lock(); - pptr = array->pptrs[index]; + pptr = array->pptrs[index & array->index_mask]; for_each_possible_cpu(cpu) { bpf_long_memcpy(per_cpu_ptr(pptr, cpu), value + off, size); off += size; @@ -296,6 +330,7 @@ static void array_map_free(struct bpf_map *map) } const struct bpf_map_ops array_map_ops = { + .map_alloc_check = array_map_alloc_check, .map_alloc = array_map_alloc, .map_free = array_map_free, .map_get_next_key = array_map_get_next_key, @@ -306,6 +341,7 @@ const struct bpf_map_ops array_map_ops = { }; const struct bpf_map_ops percpu_array_map_ops = { + .map_alloc_check = array_map_alloc_check, .map_alloc = array_map_alloc, .map_free = array_map_free, .map_get_next_key = array_map_get_next_key, @@ -314,12 +350,12 @@ const struct bpf_map_ops percpu_array_map_ops = { .map_delete_elem = array_map_delete_elem, }; -static struct bpf_map *fd_array_map_alloc(union bpf_attr *attr) +static int fd_array_map_alloc_check(union bpf_attr *attr) { /* only file descriptors can be stored in this type of map */ if (attr->value_size != sizeof(u32)) - return ERR_PTR(-EINVAL); - return array_map_alloc(attr); + return -EINVAL; + return array_map_alloc_check(attr); } static void fd_array_map_free(struct bpf_map *map) @@ -443,7 +479,8 @@ void bpf_fd_array_map_clear(struct bpf_map *map) } const struct bpf_map_ops prog_array_map_ops = { - .map_alloc = fd_array_map_alloc, + .map_alloc_check = fd_array_map_alloc_check, + .map_alloc = array_map_alloc, .map_free = fd_array_map_free, .map_get_next_key = array_map_get_next_key, .map_lookup_elem = fd_array_map_lookup_elem, @@ -530,7 +567,8 @@ static void perf_event_fd_array_release(struct bpf_map *map, } const struct bpf_map_ops perf_event_array_map_ops = { - .map_alloc = fd_array_map_alloc, + .map_alloc_check = fd_array_map_alloc_check, + .map_alloc = array_map_alloc, .map_free = fd_array_map_free, .map_get_next_key = array_map_get_next_key, .map_lookup_elem = fd_array_map_lookup_elem, @@ -561,7 +599,8 @@ static void cgroup_fd_array_free(struct bpf_map *map) } const struct bpf_map_ops cgroup_array_map_ops = { - .map_alloc = fd_array_map_alloc, + .map_alloc_check = fd_array_map_alloc_check, + .map_alloc = array_map_alloc, .map_free = cgroup_fd_array_free, .map_get_next_key = array_map_get_next_key, .map_lookup_elem = fd_array_map_lookup_elem, @@ -579,7 +618,7 @@ static struct bpf_map *array_of_map_alloc(union bpf_attr *attr) if (IS_ERR(inner_map_meta)) return inner_map_meta; - map = fd_array_map_alloc(attr); + map = array_map_alloc(attr); if (IS_ERR(map)) { bpf_map_meta_free(inner_map_meta); return map; @@ -613,6 +652,7 @@ static void *array_of_map_lookup_elem(struct bpf_map *map, void *key) static u32 array_of_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) { + struct bpf_array *array = container_of(map, struct bpf_array, map); u32 elem_size = round_up(map->value_size, 8); struct bpf_insn *insn = insn_buf; const int ret = BPF_REG_0; @@ -621,7 +661,12 @@ static u32 array_of_map_gen_lookup(struct bpf_map *map, *insn++ = BPF_ALU64_IMM(BPF_ADD, map_ptr, offsetof(struct bpf_array, value)); *insn++ = BPF_LDX_MEM(BPF_W, ret, index, 0); - *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 5); + if (map->unpriv_array) { + *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 6); + *insn++ = BPF_ALU32_IMM(BPF_AND, ret, array->index_mask); + } else { + *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 5); + } if (is_power_of_2(elem_size)) *insn++ = BPF_ALU64_IMM(BPF_LSH, ret, ilog2(elem_size)); else @@ -636,6 +681,7 @@ static u32 array_of_map_gen_lookup(struct bpf_map *map, } const struct bpf_map_ops array_of_maps_map_ops = { + .map_alloc_check = fd_array_map_alloc_check, .map_alloc = array_of_map_alloc, .map_free = array_of_map_free, .map_get_next_key = array_map_get_next_key, diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index b789ab78d28f..c1c0b60d3f2f 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -568,6 +568,8 @@ static bool cgroup_dev_is_valid_access(int off, int size, enum bpf_access_type type, struct bpf_insn_access_aux *info) { + const int size_default = sizeof(__u32); + if (type == BPF_WRITE) return false; @@ -576,8 +578,17 @@ static bool cgroup_dev_is_valid_access(int off, int size, /* The verifier guarantees that size > 0. */ if (off % size != 0) return false; - if (size != sizeof(__u32)) - return false; + + switch (off) { + case bpf_ctx_range(struct bpf_cgroup_dev_ctx, access_type): + bpf_ctx_record_field_size(info, size_default); + if (!bpf_ctx_narrow_access_ok(off, size, size_default)) + return false; + break; + default: + if (size != size_default) + return false; + } return true; } diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index b9f8686a84cf..29ca9208dcfa 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -94,6 +94,7 @@ struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags) fp->pages = size / PAGE_SIZE; fp->aux = aux; fp->aux->prog = fp; + fp->jit_requested = ebpf_jit_enabled(); INIT_LIST_HEAD_RCU(&fp->aux->ksym_lnode); @@ -217,30 +218,40 @@ int bpf_prog_calc_tag(struct bpf_prog *fp) return 0; } -static bool bpf_is_jmp_and_has_target(const struct bpf_insn *insn) -{ - return BPF_CLASS(insn->code) == BPF_JMP && - /* Call and Exit are both special jumps with no - * target inside the BPF instruction image. - */ - BPF_OP(insn->code) != BPF_CALL && - BPF_OP(insn->code) != BPF_EXIT; -} - static void bpf_adj_branches(struct bpf_prog *prog, u32 pos, u32 delta) { struct bpf_insn *insn = prog->insnsi; u32 i, insn_cnt = prog->len; + bool pseudo_call; + u8 code; + int off; for (i = 0; i < insn_cnt; i++, insn++) { - if (!bpf_is_jmp_and_has_target(insn)) + code = insn->code; + if (BPF_CLASS(code) != BPF_JMP) + continue; + if (BPF_OP(code) == BPF_EXIT) continue; + if (BPF_OP(code) == BPF_CALL) { + if (insn->src_reg == BPF_PSEUDO_CALL) + pseudo_call = true; + else + continue; + } else { + pseudo_call = false; + } + off = pseudo_call ? insn->imm : insn->off; /* Adjust offset of jmps if we cross boundaries. */ - if (i < pos && i + insn->off + 1 > pos) - insn->off += delta; - else if (i > pos + delta && i + insn->off + 1 <= pos + delta) - insn->off -= delta; + if (i < pos && i + off + 1 > pos) + off += delta; + else if (i > pos + delta && i + off + 1 <= pos + delta) + off -= delta; + + if (pseudo_call) + insn->imm = off; + else + insn->off = off; } } @@ -289,6 +300,11 @@ struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off, } #ifdef CONFIG_BPF_JIT +/* All BPF JIT sysctl knobs here. */ +int bpf_jit_enable __read_mostly = IS_BUILTIN(CONFIG_BPF_JIT_ALWAYS_ON); +int bpf_jit_harden __read_mostly; +int bpf_jit_kallsyms __read_mostly; + static __always_inline void bpf_get_prog_addr_region(const struct bpf_prog *prog, unsigned long *symbol_start, @@ -370,8 +386,6 @@ static DEFINE_SPINLOCK(bpf_lock); static LIST_HEAD(bpf_kallsyms); static struct latch_tree_root bpf_tree __cacheline_aligned; -int bpf_jit_kallsyms __read_mostly; - static void bpf_prog_ksym_node_add(struct bpf_prog_aux *aux) { WARN_ON_ONCE(!list_empty(&aux->ksym_lnode)); @@ -552,8 +566,6 @@ void __weak bpf_jit_free(struct bpf_prog *fp) bpf_prog_unlock_free(fp); } -int bpf_jit_harden __read_mostly; - static int bpf_jit_blind_insn(const struct bpf_insn *from, const struct bpf_insn *aux, struct bpf_insn *to_buff) @@ -711,7 +723,7 @@ struct bpf_prog *bpf_jit_blind_constants(struct bpf_prog *prog) struct bpf_insn *insn; int i, rewritten; - if (!bpf_jit_blinding_enabled()) + if (!bpf_jit_blinding_enabled(prog) || prog->blinded) return prog; clone = bpf_prog_clone_create(prog, GFP_USER); @@ -753,13 +765,16 @@ struct bpf_prog *bpf_jit_blind_constants(struct bpf_prog *prog) i += insn_delta; } + clone->blinded = 1; return clone; } #endif /* CONFIG_BPF_JIT */ /* Base function for offset calculation. Needs to go into .text section, * therefore keeping it non-static as well; will also be used by JITs - * anyway later on, so do not let the compiler omit it. + * anyway later on, so do not let the compiler omit it. This also needs + * to go into kallsyms for correlation from e.g. bpftool, so naming + * must not change. */ noinline u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) { @@ -767,6 +782,138 @@ noinline u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) } EXPORT_SYMBOL_GPL(__bpf_call_base); +/* All UAPI available opcodes. */ +#define BPF_INSN_MAP(INSN_2, INSN_3) \ + /* 32 bit ALU operations. */ \ + /* Register based. */ \ + INSN_3(ALU, ADD, X), \ + INSN_3(ALU, SUB, X), \ + INSN_3(ALU, AND, X), \ + INSN_3(ALU, OR, X), \ + INSN_3(ALU, LSH, X), \ + INSN_3(ALU, RSH, X), \ + INSN_3(ALU, XOR, X), \ + INSN_3(ALU, MUL, X), \ + INSN_3(ALU, MOV, X), \ + INSN_3(ALU, DIV, X), \ + INSN_3(ALU, MOD, X), \ + INSN_2(ALU, NEG), \ + INSN_3(ALU, END, TO_BE), \ + INSN_3(ALU, END, TO_LE), \ + /* Immediate based. */ \ + INSN_3(ALU, ADD, K), \ + INSN_3(ALU, SUB, K), \ + INSN_3(ALU, AND, K), \ + INSN_3(ALU, OR, K), \ + INSN_3(ALU, LSH, K), \ + INSN_3(ALU, RSH, K), \ + INSN_3(ALU, XOR, K), \ + INSN_3(ALU, MUL, K), \ + INSN_3(ALU, MOV, K), \ + INSN_3(ALU, DIV, K), \ + INSN_3(ALU, MOD, K), \ + /* 64 bit ALU operations. */ \ + /* Register based. */ \ + INSN_3(ALU64, ADD, X), \ + INSN_3(ALU64, SUB, X), \ + INSN_3(ALU64, AND, X), \ + INSN_3(ALU64, OR, X), \ + INSN_3(ALU64, LSH, X), \ + INSN_3(ALU64, RSH, X), \ + INSN_3(ALU64, XOR, X), \ + INSN_3(ALU64, MUL, X), \ + INSN_3(ALU64, MOV, X), \ + INSN_3(ALU64, ARSH, X), \ + INSN_3(ALU64, DIV, X), \ + INSN_3(ALU64, MOD, X), \ + INSN_2(ALU64, NEG), \ + /* Immediate based. */ \ + INSN_3(ALU64, ADD, K), \ + INSN_3(ALU64, SUB, K), \ + INSN_3(ALU64, AND, K), \ + INSN_3(ALU64, OR, K), \ + INSN_3(ALU64, LSH, K), \ + INSN_3(ALU64, RSH, K), \ + INSN_3(ALU64, XOR, K), \ + INSN_3(ALU64, MUL, K), \ + INSN_3(ALU64, MOV, K), \ + INSN_3(ALU64, ARSH, K), \ + INSN_3(ALU64, DIV, K), \ + INSN_3(ALU64, MOD, K), \ + /* Call instruction. */ \ + INSN_2(JMP, CALL), \ + /* Exit instruction. */ \ + INSN_2(JMP, EXIT), \ + /* Jump instructions. */ \ + /* Register based. */ \ + INSN_3(JMP, JEQ, X), \ + INSN_3(JMP, JNE, X), \ + INSN_3(JMP, JGT, X), \ + INSN_3(JMP, JLT, X), \ + INSN_3(JMP, JGE, X), \ + INSN_3(JMP, JLE, X), \ + INSN_3(JMP, JSGT, X), \ + INSN_3(JMP, JSLT, X), \ + INSN_3(JMP, JSGE, X), \ + INSN_3(JMP, JSLE, X), \ + INSN_3(JMP, JSET, X), \ + /* Immediate based. */ \ + INSN_3(JMP, JEQ, K), \ + INSN_3(JMP, JNE, K), \ + INSN_3(JMP, JGT, K), \ + INSN_3(JMP, JLT, K), \ + INSN_3(JMP, JGE, K), \ + INSN_3(JMP, JLE, K), \ + INSN_3(JMP, JSGT, K), \ + INSN_3(JMP, JSLT, K), \ + INSN_3(JMP, JSGE, K), \ + INSN_3(JMP, JSLE, K), \ + INSN_3(JMP, JSET, K), \ + INSN_2(JMP, JA), \ + /* Store instructions. */ \ + /* Register based. */ \ + INSN_3(STX, MEM, B), \ + INSN_3(STX, MEM, H), \ + INSN_3(STX, MEM, W), \ + INSN_3(STX, MEM, DW), \ + INSN_3(STX, XADD, W), \ + INSN_3(STX, XADD, DW), \ + /* Immediate based. */ \ + INSN_3(ST, MEM, B), \ + INSN_3(ST, MEM, H), \ + INSN_3(ST, MEM, W), \ + INSN_3(ST, MEM, DW), \ + /* Load instructions. */ \ + /* Register based. */ \ + INSN_3(LDX, MEM, B), \ + INSN_3(LDX, MEM, H), \ + INSN_3(LDX, MEM, W), \ + INSN_3(LDX, MEM, DW), \ + /* Immediate based. */ \ + INSN_3(LD, IMM, DW), \ + /* Misc (old cBPF carry-over). */ \ + INSN_3(LD, ABS, B), \ + INSN_3(LD, ABS, H), \ + INSN_3(LD, ABS, W), \ + INSN_3(LD, IND, B), \ + INSN_3(LD, IND, H), \ + INSN_3(LD, IND, W) + +bool bpf_opcode_in_insntable(u8 code) +{ +#define BPF_INSN_2_TBL(x, y) [BPF_##x | BPF_##y] = true +#define BPF_INSN_3_TBL(x, y, z) [BPF_##x | BPF_##y | BPF_##z] = true + static const bool public_insntable[256] = { + [0 ... 255] = false, + /* Now overwrite non-defaults ... */ + BPF_INSN_MAP(BPF_INSN_2_TBL, BPF_INSN_3_TBL), + }; +#undef BPF_INSN_3_TBL +#undef BPF_INSN_2_TBL + return public_insntable[code]; +} + +#ifndef CONFIG_BPF_JIT_ALWAYS_ON /** * __bpf_prog_run - run eBPF program on a given context * @ctx: is the data we are operating on @@ -774,118 +921,21 @@ EXPORT_SYMBOL_GPL(__bpf_call_base); * * Decode and execute eBPF instructions. */ -static unsigned int ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn, - u64 *stack) +static u64 ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn, u64 *stack) { u64 tmp; +#define BPF_INSN_2_LBL(x, y) [BPF_##x | BPF_##y] = &&x##_##y +#define BPF_INSN_3_LBL(x, y, z) [BPF_##x | BPF_##y | BPF_##z] = &&x##_##y##_##z static const void *jumptable[256] = { [0 ... 255] = &&default_label, /* Now overwrite non-defaults ... */ - /* 32 bit ALU operations */ - [BPF_ALU | BPF_ADD | BPF_X] = &&ALU_ADD_X, - [BPF_ALU | BPF_ADD | BPF_K] = &&ALU_ADD_K, - [BPF_ALU | BPF_SUB | BPF_X] = &&ALU_SUB_X, - [BPF_ALU | BPF_SUB | BPF_K] = &&ALU_SUB_K, - [BPF_ALU | BPF_AND | BPF_X] = &&ALU_AND_X, - [BPF_ALU | BPF_AND | BPF_K] = &&ALU_AND_K, - [BPF_ALU | BPF_OR | BPF_X] = &&ALU_OR_X, - [BPF_ALU | BPF_OR | BPF_K] = &&ALU_OR_K, - [BPF_ALU | BPF_LSH | BPF_X] = &&ALU_LSH_X, - [BPF_ALU | BPF_LSH | BPF_K] = &&ALU_LSH_K, - [BPF_ALU | BPF_RSH | BPF_X] = &&ALU_RSH_X, - [BPF_ALU | BPF_RSH | BPF_K] = &&ALU_RSH_K, - [BPF_ALU | BPF_XOR | BPF_X] = &&ALU_XOR_X, - [BPF_ALU | BPF_XOR | BPF_K] = &&ALU_XOR_K, - [BPF_ALU | BPF_MUL | BPF_X] = &&ALU_MUL_X, - [BPF_ALU | BPF_MUL | BPF_K] = &&ALU_MUL_K, - [BPF_ALU | BPF_MOV | BPF_X] = &&ALU_MOV_X, - [BPF_ALU | BPF_MOV | BPF_K] = &&ALU_MOV_K, - [BPF_ALU | BPF_DIV | BPF_X] = &&ALU_DIV_X, - [BPF_ALU | BPF_DIV | BPF_K] = &&ALU_DIV_K, - [BPF_ALU | BPF_MOD | BPF_X] = &&ALU_MOD_X, - [BPF_ALU | BPF_MOD | BPF_K] = &&ALU_MOD_K, - [BPF_ALU | BPF_NEG] = &&ALU_NEG, - [BPF_ALU | BPF_END | BPF_TO_BE] = &&ALU_END_TO_BE, - [BPF_ALU | BPF_END | BPF_TO_LE] = &&ALU_END_TO_LE, - /* 64 bit ALU operations */ - [BPF_ALU64 | BPF_ADD | BPF_X] = &&ALU64_ADD_X, - [BPF_ALU64 | BPF_ADD | BPF_K] = &&ALU64_ADD_K, - [BPF_ALU64 | BPF_SUB | BPF_X] = &&ALU64_SUB_X, - [BPF_ALU64 | BPF_SUB | BPF_K] = &&ALU64_SUB_K, - [BPF_ALU64 | BPF_AND | BPF_X] = &&ALU64_AND_X, - [BPF_ALU64 | BPF_AND | BPF_K] = &&ALU64_AND_K, - [BPF_ALU64 | BPF_OR | BPF_X] = &&ALU64_OR_X, - [BPF_ALU64 | BPF_OR | BPF_K] = &&ALU64_OR_K, - [BPF_ALU64 | BPF_LSH | BPF_X] = &&ALU64_LSH_X, - [BPF_ALU64 | BPF_LSH | BPF_K] = &&ALU64_LSH_K, - [BPF_ALU64 | BPF_RSH | BPF_X] = &&ALU64_RSH_X, - [BPF_ALU64 | BPF_RSH | BPF_K] = &&ALU64_RSH_K, - [BPF_ALU64 | BPF_XOR | BPF_X] = &&ALU64_XOR_X, - [BPF_ALU64 | BPF_XOR | BPF_K] = &&ALU64_XOR_K, - [BPF_ALU64 | BPF_MUL | BPF_X] = &&ALU64_MUL_X, - [BPF_ALU64 | BPF_MUL | BPF_K] = &&ALU64_MUL_K, - [BPF_ALU64 | BPF_MOV | BPF_X] = &&ALU64_MOV_X, - [BPF_ALU64 | BPF_MOV | BPF_K] = &&ALU64_MOV_K, - [BPF_ALU64 | BPF_ARSH | BPF_X] = &&ALU64_ARSH_X, - [BPF_ALU64 | BPF_ARSH | BPF_K] = &&ALU64_ARSH_K, - [BPF_ALU64 | BPF_DIV | BPF_X] = &&ALU64_DIV_X, - [BPF_ALU64 | BPF_DIV | BPF_K] = &&ALU64_DIV_K, - [BPF_ALU64 | BPF_MOD | BPF_X] = &&ALU64_MOD_X, - [BPF_ALU64 | BPF_MOD | BPF_K] = &&ALU64_MOD_K, - [BPF_ALU64 | BPF_NEG] = &&ALU64_NEG, - /* Call instruction */ - [BPF_JMP | BPF_CALL] = &&JMP_CALL, + BPF_INSN_MAP(BPF_INSN_2_LBL, BPF_INSN_3_LBL), + /* Non-UAPI available opcodes. */ + [BPF_JMP | BPF_CALL_ARGS] = &&JMP_CALL_ARGS, [BPF_JMP | BPF_TAIL_CALL] = &&JMP_TAIL_CALL, - /* Jumps */ - [BPF_JMP | BPF_JA] = &&JMP_JA, - [BPF_JMP | BPF_JEQ | BPF_X] = &&JMP_JEQ_X, - [BPF_JMP | BPF_JEQ | BPF_K] = &&JMP_JEQ_K, - [BPF_JMP | BPF_JNE | BPF_X] = &&JMP_JNE_X, - [BPF_JMP | BPF_JNE | BPF_K] = &&JMP_JNE_K, - [BPF_JMP | BPF_JGT | BPF_X] = &&JMP_JGT_X, - [BPF_JMP | BPF_JGT | BPF_K] = &&JMP_JGT_K, - [BPF_JMP | BPF_JLT | BPF_X] = &&JMP_JLT_X, - [BPF_JMP | BPF_JLT | BPF_K] = &&JMP_JLT_K, - [BPF_JMP | BPF_JGE | BPF_X] = &&JMP_JGE_X, - [BPF_JMP | BPF_JGE | BPF_K] = &&JMP_JGE_K, - [BPF_JMP | BPF_JLE | BPF_X] = &&JMP_JLE_X, - [BPF_JMP | BPF_JLE | BPF_K] = &&JMP_JLE_K, - [BPF_JMP | BPF_JSGT | BPF_X] = &&JMP_JSGT_X, - [BPF_JMP | BPF_JSGT | BPF_K] = &&JMP_JSGT_K, - [BPF_JMP | BPF_JSLT | BPF_X] = &&JMP_JSLT_X, - [BPF_JMP | BPF_JSLT | BPF_K] = &&JMP_JSLT_K, - [BPF_JMP | BPF_JSGE | BPF_X] = &&JMP_JSGE_X, - [BPF_JMP | BPF_JSGE | BPF_K] = &&JMP_JSGE_K, - [BPF_JMP | BPF_JSLE | BPF_X] = &&JMP_JSLE_X, - [BPF_JMP | BPF_JSLE | BPF_K] = &&JMP_JSLE_K, - [BPF_JMP | BPF_JSET | BPF_X] = &&JMP_JSET_X, - [BPF_JMP | BPF_JSET | BPF_K] = &&JMP_JSET_K, - /* Program return */ - [BPF_JMP | BPF_EXIT] = &&JMP_EXIT, - /* Store instructions */ - [BPF_STX | BPF_MEM | BPF_B] = &&STX_MEM_B, - [BPF_STX | BPF_MEM | BPF_H] = &&STX_MEM_H, - [BPF_STX | BPF_MEM | BPF_W] = &&STX_MEM_W, - [BPF_STX | BPF_MEM | BPF_DW] = &&STX_MEM_DW, - [BPF_STX | BPF_XADD | BPF_W] = &&STX_XADD_W, - [BPF_STX | BPF_XADD | BPF_DW] = &&STX_XADD_DW, - [BPF_ST | BPF_MEM | BPF_B] = &&ST_MEM_B, - [BPF_ST | BPF_MEM | BPF_H] = &&ST_MEM_H, - [BPF_ST | BPF_MEM | BPF_W] = &&ST_MEM_W, - [BPF_ST | BPF_MEM | BPF_DW] = &&ST_MEM_DW, - /* Load instructions */ - [BPF_LDX | BPF_MEM | BPF_B] = &&LDX_MEM_B, - [BPF_LDX | BPF_MEM | BPF_H] = &&LDX_MEM_H, - [BPF_LDX | BPF_MEM | BPF_W] = &&LDX_MEM_W, - [BPF_LDX | BPF_MEM | BPF_DW] = &&LDX_MEM_DW, - [BPF_LD | BPF_ABS | BPF_W] = &&LD_ABS_W, - [BPF_LD | BPF_ABS | BPF_H] = &&LD_ABS_H, - [BPF_LD | BPF_ABS | BPF_B] = &&LD_ABS_B, - [BPF_LD | BPF_IND | BPF_W] = &&LD_IND_W, - [BPF_LD | BPF_IND | BPF_H] = &&LD_IND_H, - [BPF_LD | BPF_IND | BPF_B] = &&LD_IND_B, - [BPF_LD | BPF_IMM | BPF_DW] = &&LD_IMM_DW, }; +#undef BPF_INSN_3_LBL +#undef BPF_INSN_2_LBL u32 tail_call_cnt = 0; void *ptr; int off; @@ -949,14 +999,10 @@ select_insn: (*(s64 *) &DST) >>= IMM; CONT; ALU64_MOD_X: - if (unlikely(SRC == 0)) - return 0; div64_u64_rem(DST, SRC, &tmp); DST = tmp; CONT; ALU_MOD_X: - if (unlikely(SRC == 0)) - return 0; tmp = (u32) DST; DST = do_div(tmp, (u32) SRC); CONT; @@ -969,13 +1015,9 @@ select_insn: DST = do_div(tmp, (u32) IMM); CONT; ALU64_DIV_X: - if (unlikely(SRC == 0)) - return 0; DST = div64_u64(DST, SRC); CONT; ALU_DIV_X: - if (unlikely(SRC == 0)) - return 0; tmp = (u32) DST; do_div(tmp, (u32) SRC); DST = (u32) tmp; @@ -1025,6 +1067,13 @@ select_insn: BPF_R4, BPF_R5); CONT; + JMP_CALL_ARGS: + BPF_R0 = (__bpf_call_base_args + insn->imm)(BPF_R1, BPF_R2, + BPF_R3, BPF_R4, + BPF_R5, + insn + insn->off + 1); + CONT; + JMP_TAIL_CALL: { struct bpf_map *map = (struct bpf_map *) (unsigned long) BPF_R2; struct bpf_array *array = container_of(map, struct bpf_array, map); @@ -1279,8 +1328,14 @@ load_byte: goto load_byte; default_label: - /* If we ever reach this, we have a bug somewhere. */ - WARN_RATELIMIT(1, "unknown opcode %02x\n", insn->code); + /* If we ever reach this, we have a bug somewhere. Die hard here + * instead of just returning 0; we could be somewhere in a subprog, + * so execution could continue otherwise which we do /not/ want. + * + * Note, verifier whitelists all opcodes in bpf_opcode_in_insntable(). + */ + pr_warn("BPF interpreter: unknown opcode %02x\n", insn->code); + BUG_ON(1); return 0; } STACK_FRAME_NON_STANDARD(___bpf_prog_run); /* jump table */ @@ -1297,6 +1352,23 @@ static unsigned int PROG_NAME(stack_size)(const void *ctx, const struct bpf_insn return ___bpf_prog_run(regs, insn, stack); \ } +#define PROG_NAME_ARGS(stack_size) __bpf_prog_run_args##stack_size +#define DEFINE_BPF_PROG_RUN_ARGS(stack_size) \ +static u64 PROG_NAME_ARGS(stack_size)(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5, \ + const struct bpf_insn *insn) \ +{ \ + u64 stack[stack_size / sizeof(u64)]; \ + u64 regs[MAX_BPF_REG]; \ +\ + FP = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)]; \ + BPF_R1 = r1; \ + BPF_R2 = r2; \ + BPF_R3 = r3; \ + BPF_R4 = r4; \ + BPF_R5 = r5; \ + return ___bpf_prog_run(regs, insn, stack); \ +} + #define EVAL1(FN, X) FN(X) #define EVAL2(FN, X, Y...) FN(X) EVAL1(FN, Y) #define EVAL3(FN, X, Y...) FN(X) EVAL2(FN, Y) @@ -1308,6 +1380,10 @@ EVAL6(DEFINE_BPF_PROG_RUN, 32, 64, 96, 128, 160, 192); EVAL6(DEFINE_BPF_PROG_RUN, 224, 256, 288, 320, 352, 384); EVAL4(DEFINE_BPF_PROG_RUN, 416, 448, 480, 512); +EVAL6(DEFINE_BPF_PROG_RUN_ARGS, 32, 64, 96, 128, 160, 192); +EVAL6(DEFINE_BPF_PROG_RUN_ARGS, 224, 256, 288, 320, 352, 384); +EVAL4(DEFINE_BPF_PROG_RUN_ARGS, 416, 448, 480, 512); + #define PROG_NAME_LIST(stack_size) PROG_NAME(stack_size), static unsigned int (*interpreters[])(const void *ctx, @@ -1316,10 +1392,43 @@ EVAL6(PROG_NAME_LIST, 32, 64, 96, 128, 160, 192) EVAL6(PROG_NAME_LIST, 224, 256, 288, 320, 352, 384) EVAL4(PROG_NAME_LIST, 416, 448, 480, 512) }; +#undef PROG_NAME_LIST +#define PROG_NAME_LIST(stack_size) PROG_NAME_ARGS(stack_size), +static u64 (*interpreters_args[])(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5, + const struct bpf_insn *insn) = { +EVAL6(PROG_NAME_LIST, 32, 64, 96, 128, 160, 192) +EVAL6(PROG_NAME_LIST, 224, 256, 288, 320, 352, 384) +EVAL4(PROG_NAME_LIST, 416, 448, 480, 512) +}; +#undef PROG_NAME_LIST + +void bpf_patch_call_args(struct bpf_insn *insn, u32 stack_depth) +{ + stack_depth = max_t(u32, stack_depth, 1); + insn->off = (s16) insn->imm; + insn->imm = interpreters_args[(round_up(stack_depth, 32) / 32) - 1] - + __bpf_call_base_args; + insn->code = BPF_JMP | BPF_CALL_ARGS; +} + +#else +static unsigned int __bpf_prog_ret0_warn(const void *ctx, + const struct bpf_insn *insn) +{ + /* If this handler ever gets executed, then BPF_JIT_ALWAYS_ON + * is not working properly, so warn about it! + */ + WARN_ON_ONCE(1); + return 0; +} +#endif bool bpf_prog_array_compatible(struct bpf_array *array, const struct bpf_prog *fp) { + if (fp->kprobe_override) + return false; + if (!array->owner_prog_type) { /* There's no owner yet where we could check for * compatibility. @@ -1364,9 +1473,13 @@ static int bpf_check_tail_call(const struct bpf_prog *fp) */ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err) { +#ifndef CONFIG_BPF_JIT_ALWAYS_ON u32 stack_depth = max_t(u32, fp->aux->stack_depth, 1); fp->bpf_func = interpreters[(round_up(stack_depth, 32) / 32) - 1]; +#else + fp->bpf_func = __bpf_prog_ret0_warn; +#endif /* eBPF JITs can rewrite the program in case constant * blinding is active. However, in case of error during @@ -1376,6 +1489,12 @@ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err) */ if (!bpf_prog_is_dev_bound(fp->aux)) { fp = bpf_int_jit_compile(fp); +#ifdef CONFIG_BPF_JIT_ALWAYS_ON + if (!fp->jited) { + *err = -ENOTSUPP; + return fp; + } +#endif } else { *err = bpf_prog_offload_compile(fp); if (*err) @@ -1447,7 +1566,8 @@ int bpf_prog_array_length(struct bpf_prog_array __rcu *progs) rcu_read_lock(); prog = rcu_dereference(progs)->progs; for (; *prog; prog++) - cnt++; + if (*prog != &dummy_bpf_prog.prog) + cnt++; rcu_read_unlock(); return cnt; } @@ -1456,23 +1576,41 @@ int bpf_prog_array_copy_to_user(struct bpf_prog_array __rcu *progs, __u32 __user *prog_ids, u32 cnt) { struct bpf_prog **prog; - u32 i = 0, id; - + unsigned long err = 0; + u32 i = 0, *ids; + bool nospc; + + /* users of this function are doing: + * cnt = bpf_prog_array_length(); + * if (cnt > 0) + * bpf_prog_array_copy_to_user(..., cnt); + * so below kcalloc doesn't need extra cnt > 0 check, but + * bpf_prog_array_length() releases rcu lock and + * prog array could have been swapped with empty or larger array, + * so always copy 'cnt' prog_ids to the user. + * In a rare race the user will see zero prog_ids + */ + ids = kcalloc(cnt, sizeof(u32), GFP_USER); + if (!ids) + return -ENOMEM; rcu_read_lock(); prog = rcu_dereference(progs)->progs; for (; *prog; prog++) { - id = (*prog)->aux->id; - if (copy_to_user(prog_ids + i, &id, sizeof(id))) { - rcu_read_unlock(); - return -EFAULT; - } + if (*prog == &dummy_bpf_prog.prog) + continue; + ids[i] = (*prog)->aux->id; if (++i == cnt) { prog++; break; } } + nospc = !!(*prog); rcu_read_unlock(); - if (*prog) + err = copy_to_user(prog_ids, ids, cnt * sizeof(u32)); + kfree(ids); + if (err) + return -EFAULT; + if (nospc) return -ENOSPC; return 0; } @@ -1544,14 +1682,41 @@ int bpf_prog_array_copy(struct bpf_prog_array __rcu *old_array, return 0; } +int bpf_prog_array_copy_info(struct bpf_prog_array __rcu *array, + __u32 __user *prog_ids, u32 request_cnt, + __u32 __user *prog_cnt) +{ + u32 cnt = 0; + + if (array) + cnt = bpf_prog_array_length(array); + + if (copy_to_user(prog_cnt, &cnt, sizeof(cnt))) + return -EFAULT; + + /* return early if user requested only program count or nothing to copy */ + if (!request_cnt || !cnt) + return 0; + + return bpf_prog_array_copy_to_user(array, prog_ids, request_cnt); +} + static void bpf_prog_free_deferred(struct work_struct *work) { struct bpf_prog_aux *aux; + int i; aux = container_of(work, struct bpf_prog_aux, work); if (bpf_prog_is_dev_bound(aux)) bpf_prog_offload_destroy(aux->prog); - bpf_jit_free(aux->prog); + for (i = 0; i < aux->func_cnt; i++) + bpf_jit_free(aux->func[i]); + if (aux->func_cnt) { + kfree(aux->func); + bpf_prog_unlock_free(aux->prog); + } else { + bpf_jit_free(aux->prog); + } } /* Free internal BPF program */ diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index ce5b669003b2..fbfdada6caee 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -94,13 +94,7 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr) if (!cmap) return ERR_PTR(-ENOMEM); - /* mandatory map attributes */ - cmap->map.map_type = attr->map_type; - cmap->map.key_size = attr->key_size; - cmap->map.value_size = attr->value_size; - cmap->map.max_entries = attr->max_entries; - cmap->map.map_flags = attr->map_flags; - cmap->map.numa_node = bpf_map_attr_numa_node(attr); + bpf_map_init_from_attr(&cmap->map, attr); /* Pre-limit array size based on NR_CPUS, not final CPU check */ if (cmap->map.max_entries > NR_CPUS) { @@ -143,7 +137,7 @@ free_cmap: return ERR_PTR(err); } -void __cpu_map_queue_destructor(void *ptr) +static void __cpu_map_queue_destructor(void *ptr) { /* The tear-down procedure should have made sure that queue is * empty. See __cpu_map_entry_replace() and work-queue @@ -222,8 +216,8 @@ static struct xdp_pkt *convert_to_xdp_pkt(struct xdp_buff *xdp) return xdp_pkt; } -struct sk_buff *cpu_map_build_skb(struct bpf_cpu_map_entry *rcpu, - struct xdp_pkt *xdp_pkt) +static struct sk_buff *cpu_map_build_skb(struct bpf_cpu_map_entry *rcpu, + struct xdp_pkt *xdp_pkt) { unsigned int frame_size; void *pkt_data_start; @@ -337,7 +331,8 @@ static int cpu_map_kthread_run(void *data) return 0; } -struct bpf_cpu_map_entry *__cpu_map_entry_alloc(u32 qsize, u32 cpu, int map_id) +static struct bpf_cpu_map_entry *__cpu_map_entry_alloc(u32 qsize, u32 cpu, + int map_id) { gfp_t gfp = GFP_ATOMIC|__GFP_NOWARN; struct bpf_cpu_map_entry *rcpu; @@ -395,7 +390,7 @@ free_rcu: return NULL; } -void __cpu_map_entry_free(struct rcu_head *rcu) +static void __cpu_map_entry_free(struct rcu_head *rcu) { struct bpf_cpu_map_entry *rcpu; int cpu; @@ -438,8 +433,8 @@ void __cpu_map_entry_free(struct rcu_head *rcu) * cpu_map_kthread_stop, which waits for an RCU graze period before * stopping kthread, emptying the queue. */ -void __cpu_map_entry_replace(struct bpf_cpu_map *cmap, - u32 key_cpu, struct bpf_cpu_map_entry *rcpu) +static void __cpu_map_entry_replace(struct bpf_cpu_map *cmap, + u32 key_cpu, struct bpf_cpu_map_entry *rcpu) { struct bpf_cpu_map_entry *old_rcpu; @@ -451,7 +446,7 @@ void __cpu_map_entry_replace(struct bpf_cpu_map *cmap, } } -int cpu_map_delete_elem(struct bpf_map *map, void *key) +static int cpu_map_delete_elem(struct bpf_map *map, void *key) { struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map); u32 key_cpu = *(u32 *)key; @@ -464,8 +459,8 @@ int cpu_map_delete_elem(struct bpf_map *map, void *key) return 0; } -int cpu_map_update_elem(struct bpf_map *map, void *key, void *value, - u64 map_flags) +static int cpu_map_update_elem(struct bpf_map *map, void *key, void *value, + u64 map_flags) { struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map); struct bpf_cpu_map_entry *rcpu; @@ -502,7 +497,7 @@ int cpu_map_update_elem(struct bpf_map *map, void *key, void *value, return 0; } -void cpu_map_free(struct bpf_map *map) +static void cpu_map_free(struct bpf_map *map) { struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map); int cpu; diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index ebdef54bf7df..565f9ece9115 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -93,13 +93,7 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr) if (!dtab) return ERR_PTR(-ENOMEM); - /* mandatory map attributes */ - dtab->map.map_type = attr->map_type; - dtab->map.key_size = attr->key_size; - dtab->map.value_size = attr->value_size; - dtab->map.max_entries = attr->max_entries; - dtab->map.map_flags = attr->map_flags; - dtab->map.numa_node = bpf_map_attr_numa_node(attr); + bpf_map_init_from_attr(&dtab->map, attr); /* make sure page count doesn't overflow */ cost = (u64) dtab->map.max_entries * sizeof(struct bpf_dtab_netdev *); diff --git a/kernel/bpf/disasm.c b/kernel/bpf/disasm.c index e682850c9715..8740406df2cd 100644 --- a/kernel/bpf/disasm.c +++ b/kernel/bpf/disasm.c @@ -21,10 +21,39 @@ static const char * const func_id_str[] = { }; #undef __BPF_FUNC_STR_FN -const char *func_id_name(int id) +static const char *__func_get_name(const struct bpf_insn_cbs *cbs, + const struct bpf_insn *insn, + char *buff, size_t len) { BUILD_BUG_ON(ARRAY_SIZE(func_id_str) != __BPF_FUNC_MAX_ID); + if (insn->src_reg != BPF_PSEUDO_CALL && + insn->imm >= 0 && insn->imm < __BPF_FUNC_MAX_ID && + func_id_str[insn->imm]) + return func_id_str[insn->imm]; + + if (cbs && cbs->cb_call) + return cbs->cb_call(cbs->private_data, insn); + + if (insn->src_reg == BPF_PSEUDO_CALL) + snprintf(buff, len, "%+d", insn->imm); + + return buff; +} + +static const char *__func_imm_name(const struct bpf_insn_cbs *cbs, + const struct bpf_insn *insn, + u64 full_imm, char *buff, size_t len) +{ + if (cbs && cbs->cb_imm) + return cbs->cb_imm(cbs->private_data, insn, full_imm); + + snprintf(buff, len, "0x%llx", (unsigned long long)full_imm); + return buff; +} + +const char *func_id_name(int id) +{ if (id >= 0 && id < __BPF_FUNC_MAX_ID && func_id_str[id]) return func_id_str[id]; else @@ -83,7 +112,7 @@ static const char *const bpf_jmp_string[16] = { [BPF_EXIT >> 4] = "exit", }; -static void print_bpf_end_insn(bpf_insn_print_cb verbose, +static void print_bpf_end_insn(bpf_insn_print_t verbose, struct bpf_verifier_env *env, const struct bpf_insn *insn) { @@ -92,9 +121,12 @@ static void print_bpf_end_insn(bpf_insn_print_cb verbose, insn->imm, insn->dst_reg); } -void print_bpf_insn(bpf_insn_print_cb verbose, struct bpf_verifier_env *env, - const struct bpf_insn *insn, bool allow_ptr_leaks) +void print_bpf_insn(const struct bpf_insn_cbs *cbs, + struct bpf_verifier_env *env, + const struct bpf_insn *insn, + bool allow_ptr_leaks) { + const bpf_insn_print_t verbose = cbs->cb_print; u8 class = BPF_CLASS(insn->code); if (class == BPF_ALU || class == BPF_ALU64) { @@ -175,12 +207,15 @@ void print_bpf_insn(bpf_insn_print_cb verbose, struct bpf_verifier_env *env, */ u64 imm = ((u64)(insn + 1)->imm << 32) | (u32)insn->imm; bool map_ptr = insn->src_reg == BPF_PSEUDO_MAP_FD; + char tmp[64]; if (map_ptr && !allow_ptr_leaks) imm = 0; - verbose(env, "(%02x) r%d = 0x%llx\n", insn->code, - insn->dst_reg, (unsigned long long)imm); + verbose(env, "(%02x) r%d = %s\n", + insn->code, insn->dst_reg, + __func_imm_name(cbs, insn, imm, + tmp, sizeof(tmp))); } else { verbose(env, "BUG_ld_%02x\n", insn->code); return; @@ -189,8 +224,20 @@ void print_bpf_insn(bpf_insn_print_cb verbose, struct bpf_verifier_env *env, u8 opcode = BPF_OP(insn->code); if (opcode == BPF_CALL) { - verbose(env, "(%02x) call %s#%d\n", insn->code, - func_id_name(insn->imm), insn->imm); + char tmp[64]; + + if (insn->src_reg == BPF_PSEUDO_CALL) { + verbose(env, "(%02x) call pc%s\n", + insn->code, + __func_get_name(cbs, insn, + tmp, sizeof(tmp))); + } else { + strcpy(tmp, "unknown"); + verbose(env, "(%02x) call %s#%d\n", insn->code, + __func_get_name(cbs, insn, + tmp, sizeof(tmp)), + insn->imm); + } } else if (insn->code == (BPF_JMP | BPF_JA)) { verbose(env, "(%02x) goto pc%+d\n", insn->code, insn->off); diff --git a/kernel/bpf/disasm.h b/kernel/bpf/disasm.h index 8de977e420b6..266fe8ee542b 100644 --- a/kernel/bpf/disasm.h +++ b/kernel/bpf/disasm.h @@ -17,16 +17,35 @@ #include <linux/bpf.h> #include <linux/kernel.h> #include <linux/stringify.h> +#ifndef __KERNEL__ +#include <stdio.h> +#include <string.h> +#endif + +struct bpf_verifier_env; extern const char *const bpf_alu_string[16]; extern const char *const bpf_class_string[8]; const char *func_id_name(int id); -struct bpf_verifier_env; -typedef void (*bpf_insn_print_cb)(struct bpf_verifier_env *env, - const char *, ...); -void print_bpf_insn(bpf_insn_print_cb verbose, struct bpf_verifier_env *env, - const struct bpf_insn *insn, bool allow_ptr_leaks); +typedef __printf(2, 3) void (*bpf_insn_print_t)(struct bpf_verifier_env *env, + const char *, ...); +typedef const char *(*bpf_insn_revmap_call_t)(void *private_data, + const struct bpf_insn *insn); +typedef const char *(*bpf_insn_print_imm_t)(void *private_data, + const struct bpf_insn *insn, + __u64 full_imm); + +struct bpf_insn_cbs { + bpf_insn_print_t cb_print; + bpf_insn_revmap_call_t cb_call; + bpf_insn_print_imm_t cb_imm; + void *private_data; +}; +void print_bpf_insn(const struct bpf_insn_cbs *cbs, + struct bpf_verifier_env *env, + const struct bpf_insn *insn, + bool allow_ptr_leaks); #endif diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index e469e05c8e83..b76828f23b49 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -114,6 +114,7 @@ static void htab_free_elems(struct bpf_htab *htab) pptr = htab_elem_get_ptr(get_htab_elem(htab, i), htab->map.key_size); free_percpu(pptr); + cond_resched(); } free_elems: bpf_map_area_free(htab->elems); @@ -159,6 +160,7 @@ static int prealloc_init(struct bpf_htab *htab) goto free_elems; htab_elem_set_ptr(get_htab_elem(htab, i), htab->map.key_size, pptr); + cond_resched(); } skip_percpu_elems: @@ -225,7 +227,7 @@ static int alloc_extra_elems(struct bpf_htab *htab) } /* Called from syscall */ -static struct bpf_map *htab_map_alloc(union bpf_attr *attr) +static int htab_map_alloc_check(union bpf_attr *attr) { bool percpu = (attr->map_type == BPF_MAP_TYPE_PERCPU_HASH || attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH); @@ -239,9 +241,6 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) bool percpu_lru = (attr->map_flags & BPF_F_NO_COMMON_LRU); bool prealloc = !(attr->map_flags & BPF_F_NO_PREALLOC); int numa_node = bpf_map_attr_numa_node(attr); - struct bpf_htab *htab; - int err, i; - u64 cost; BUILD_BUG_ON(offsetof(struct htab_elem, htab) != offsetof(struct htab_elem, hash_node.pprev)); @@ -252,40 +251,68 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) /* LRU implementation is much complicated than other * maps. Hence, limit to CAP_SYS_ADMIN for now. */ - return ERR_PTR(-EPERM); + return -EPERM; if (attr->map_flags & ~HTAB_CREATE_FLAG_MASK) /* reserved bits should not be used */ - return ERR_PTR(-EINVAL); + return -EINVAL; if (!lru && percpu_lru) - return ERR_PTR(-EINVAL); + return -EINVAL; if (lru && !prealloc) - return ERR_PTR(-ENOTSUPP); + return -ENOTSUPP; if (numa_node != NUMA_NO_NODE && (percpu || percpu_lru)) - return ERR_PTR(-EINVAL); + return -EINVAL; + + /* check sanity of attributes. + * value_size == 0 may be allowed in the future to use map as a set + */ + if (attr->max_entries == 0 || attr->key_size == 0 || + attr->value_size == 0) + return -EINVAL; + + if (attr->key_size > MAX_BPF_STACK) + /* eBPF programs initialize keys on stack, so they cannot be + * larger than max stack size + */ + return -E2BIG; + + if (attr->value_size >= KMALLOC_MAX_SIZE - + MAX_BPF_STACK - sizeof(struct htab_elem)) + /* if value_size is bigger, the user space won't be able to + * access the elements via bpf syscall. This check also makes + * sure that the elem_size doesn't overflow and it's + * kmalloc-able later in htab_map_update_elem() + */ + return -E2BIG; + + return 0; +} + +static struct bpf_map *htab_map_alloc(union bpf_attr *attr) +{ + bool percpu = (attr->map_type == BPF_MAP_TYPE_PERCPU_HASH || + attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH); + bool lru = (attr->map_type == BPF_MAP_TYPE_LRU_HASH || + attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH); + /* percpu_lru means each cpu has its own LRU list. + * it is different from BPF_MAP_TYPE_PERCPU_HASH where + * the map's value itself is percpu. percpu_lru has + * nothing to do with the map's value. + */ + bool percpu_lru = (attr->map_flags & BPF_F_NO_COMMON_LRU); + bool prealloc = !(attr->map_flags & BPF_F_NO_PREALLOC); + struct bpf_htab *htab; + int err, i; + u64 cost; htab = kzalloc(sizeof(*htab), GFP_USER); if (!htab) return ERR_PTR(-ENOMEM); - /* mandatory map attributes */ - htab->map.map_type = attr->map_type; - htab->map.key_size = attr->key_size; - htab->map.value_size = attr->value_size; - htab->map.max_entries = attr->max_entries; - htab->map.map_flags = attr->map_flags; - htab->map.numa_node = numa_node; - - /* check sanity of attributes. - * value_size == 0 may be allowed in the future to use map as a set - */ - err = -EINVAL; - if (htab->map.max_entries == 0 || htab->map.key_size == 0 || - htab->map.value_size == 0) - goto free_htab; + bpf_map_init_from_attr(&htab->map, attr); if (percpu_lru) { /* ensure each CPU's lru list has >=1 elements. @@ -302,22 +329,6 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) /* hash table size must be power of 2 */ htab->n_buckets = roundup_pow_of_two(htab->map.max_entries); - err = -E2BIG; - if (htab->map.key_size > MAX_BPF_STACK) - /* eBPF programs initialize keys on stack, so they cannot be - * larger than max stack size - */ - goto free_htab; - - if (htab->map.value_size >= KMALLOC_MAX_SIZE - - MAX_BPF_STACK - sizeof(struct htab_elem)) - /* if value_size is bigger, the user space won't be able to - * access the elements via bpf syscall. This check also makes - * sure that the elem_size doesn't overflow and it's - * kmalloc-able later in htab_map_update_elem() - */ - goto free_htab; - htab->elem_size = sizeof(struct htab_elem) + round_up(htab->map.key_size, 8); if (percpu) @@ -325,6 +336,7 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) else htab->elem_size += round_up(htab->map.value_size, 8); + err = -E2BIG; /* prevent zero size kmalloc and check for u32 overflow */ if (htab->n_buckets == 0 || htab->n_buckets > U32_MAX / sizeof(struct bucket)) @@ -1141,6 +1153,7 @@ static void htab_map_free(struct bpf_map *map) } const struct bpf_map_ops htab_map_ops = { + .map_alloc_check = htab_map_alloc_check, .map_alloc = htab_map_alloc, .map_free = htab_map_free, .map_get_next_key = htab_map_get_next_key, @@ -1151,6 +1164,7 @@ const struct bpf_map_ops htab_map_ops = { }; const struct bpf_map_ops htab_lru_map_ops = { + .map_alloc_check = htab_map_alloc_check, .map_alloc = htab_map_alloc, .map_free = htab_map_free, .map_get_next_key = htab_map_get_next_key, @@ -1234,6 +1248,7 @@ int bpf_percpu_hash_update(struct bpf_map *map, void *key, void *value, } const struct bpf_map_ops htab_percpu_map_ops = { + .map_alloc_check = htab_map_alloc_check, .map_alloc = htab_map_alloc, .map_free = htab_map_free, .map_get_next_key = htab_map_get_next_key, @@ -1243,6 +1258,7 @@ const struct bpf_map_ops htab_percpu_map_ops = { }; const struct bpf_map_ops htab_lru_percpu_map_ops = { + .map_alloc_check = htab_map_alloc_check, .map_alloc = htab_map_alloc, .map_free = htab_map_free, .map_get_next_key = htab_map_get_next_key, @@ -1251,11 +1267,11 @@ const struct bpf_map_ops htab_lru_percpu_map_ops = { .map_delete_elem = htab_lru_map_delete_elem, }; -static struct bpf_map *fd_htab_map_alloc(union bpf_attr *attr) +static int fd_htab_map_alloc_check(union bpf_attr *attr) { if (attr->value_size != sizeof(u32)) - return ERR_PTR(-EINVAL); - return htab_map_alloc(attr); + return -EINVAL; + return htab_map_alloc_check(attr); } static void fd_htab_map_free(struct bpf_map *map) @@ -1326,7 +1342,7 @@ static struct bpf_map *htab_of_map_alloc(union bpf_attr *attr) if (IS_ERR(inner_map_meta)) return inner_map_meta; - map = fd_htab_map_alloc(attr); + map = htab_map_alloc(attr); if (IS_ERR(map)) { bpf_map_meta_free(inner_map_meta); return map; @@ -1370,6 +1386,7 @@ static void htab_of_map_free(struct bpf_map *map) } const struct bpf_map_ops htab_of_maps_map_ops = { + .map_alloc_check = fd_htab_map_alloc_check, .map_alloc = htab_of_map_alloc, .map_free = htab_of_map_free, .map_get_next_key = htab_map_get_next_key, diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index 01aaef1a77c5..81e2f6995adb 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -150,39 +150,29 @@ static int bpf_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) return 0; } -static int bpf_mkobj_ops(struct inode *dir, struct dentry *dentry, - umode_t mode, const struct inode_operations *iops) +static int bpf_mkobj_ops(struct dentry *dentry, umode_t mode, void *raw, + const struct inode_operations *iops) { - struct inode *inode; - - inode = bpf_get_inode(dir->i_sb, dir, mode | S_IFREG); + struct inode *dir = dentry->d_parent->d_inode; + struct inode *inode = bpf_get_inode(dir->i_sb, dir, mode); if (IS_ERR(inode)) return PTR_ERR(inode); inode->i_op = iops; - inode->i_private = dentry->d_fsdata; + inode->i_private = raw; bpf_dentry_finalize(dentry, inode, dir); return 0; } -static int bpf_mkobj(struct inode *dir, struct dentry *dentry, umode_t mode, - dev_t devt) +static int bpf_mkprog(struct dentry *dentry, umode_t mode, void *arg) { - enum bpf_type type = MINOR(devt); - - if (MAJOR(devt) != UNNAMED_MAJOR || !S_ISREG(mode) || - dentry->d_fsdata == NULL) - return -EPERM; + return bpf_mkobj_ops(dentry, mode, arg, &bpf_prog_iops); +} - switch (type) { - case BPF_TYPE_PROG: - return bpf_mkobj_ops(dir, dentry, mode, &bpf_prog_iops); - case BPF_TYPE_MAP: - return bpf_mkobj_ops(dir, dentry, mode, &bpf_map_iops); - default: - return -EPERM; - } +static int bpf_mkmap(struct dentry *dentry, umode_t mode, void *arg) +{ + return bpf_mkobj_ops(dentry, mode, arg, &bpf_map_iops); } static struct dentry * @@ -218,7 +208,6 @@ static int bpf_symlink(struct inode *dir, struct dentry *dentry, static const struct inode_operations bpf_dir_iops = { .lookup = bpf_lookup, - .mknod = bpf_mkobj, .mkdir = bpf_mkdir, .symlink = bpf_symlink, .rmdir = simple_rmdir, @@ -234,7 +223,6 @@ static int bpf_obj_do_pin(const struct filename *pathname, void *raw, struct inode *dir; struct path path; umode_t mode; - dev_t devt; int ret; dentry = kern_path_create(AT_FDCWD, pathname->name, &path, 0); @@ -242,9 +230,8 @@ static int bpf_obj_do_pin(const struct filename *pathname, void *raw, return PTR_ERR(dentry); mode = S_IFREG | ((S_IRUSR | S_IWUSR) & ~current_umask()); - devt = MKDEV(UNNAMED_MAJOR, type); - ret = security_path_mknod(&path, dentry, mode, devt); + ret = security_path_mknod(&path, dentry, mode, 0); if (ret) goto out; @@ -254,9 +241,16 @@ static int bpf_obj_do_pin(const struct filename *pathname, void *raw, goto out; } - dentry->d_fsdata = raw; - ret = vfs_mknod(dir, dentry, mode, devt); - dentry->d_fsdata = NULL; + switch (type) { + case BPF_TYPE_PROG: + ret = vfs_mkobj(dentry, mode, bpf_mkprog, raw); + break; + case BPF_TYPE_MAP: + ret = vfs_mkobj(dentry, mode, bpf_mkmap, raw); + break; + default: + ret = -EPERM; + } out: done_path_create(&path, dentry); return ret; @@ -368,7 +362,45 @@ out: putname(pname); return ret; } -EXPORT_SYMBOL_GPL(bpf_obj_get_user); + +static struct bpf_prog *__get_prog_inode(struct inode *inode, enum bpf_prog_type type) +{ + struct bpf_prog *prog; + int ret = inode_permission(inode, MAY_READ | MAY_WRITE); + if (ret) + return ERR_PTR(ret); + + if (inode->i_op == &bpf_map_iops) + return ERR_PTR(-EINVAL); + if (inode->i_op != &bpf_prog_iops) + return ERR_PTR(-EACCES); + + prog = inode->i_private; + + ret = security_bpf_prog(prog); + if (ret < 0) + return ERR_PTR(ret); + + if (!bpf_prog_get_ok(prog, &type, false)) + return ERR_PTR(-EINVAL); + + return bpf_prog_inc(prog); +} + +struct bpf_prog *bpf_prog_get_type_path(const char *name, enum bpf_prog_type type) +{ + struct bpf_prog *prog; + struct path path; + int ret = kern_path(name, LOOKUP_FOLLOW, &path); + if (ret) + return ERR_PTR(ret); + prog = __get_prog_inode(d_backing_inode(path.dentry), type); + if (!IS_ERR(prog)) + touch_atime(&path); + path_put(&path); + return prog; +} +EXPORT_SYMBOL(bpf_prog_get_type_path); static void bpf_evict_inode(struct inode *inode) { diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index 885e45479680..7b469d10d0e9 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -522,12 +522,7 @@ static struct bpf_map *trie_alloc(union bpf_attr *attr) return ERR_PTR(-ENOMEM); /* copy mandatory map attributes */ - trie->map.map_type = attr->map_type; - trie->map.key_size = attr->key_size; - trie->map.value_size = attr->value_size; - trie->map.max_entries = attr->max_entries; - trie->map.map_flags = attr->map_flags; - trie->map.numa_node = bpf_map_attr_numa_node(attr); + bpf_map_init_from_attr(&trie->map, attr); trie->data_size = attr->key_size - offsetof(struct bpf_lpm_trie_key, data); trie->max_prefixlen = trie->data_size * 8; @@ -596,9 +591,96 @@ unlock: raw_spin_unlock(&trie->lock); } -static int trie_get_next_key(struct bpf_map *map, void *key, void *next_key) +static int trie_get_next_key(struct bpf_map *map, void *_key, void *_next_key) { - return -ENOTSUPP; + struct lpm_trie_node *node, *next_node = NULL, *parent, *search_root; + struct lpm_trie *trie = container_of(map, struct lpm_trie, map); + struct bpf_lpm_trie_key *key = _key, *next_key = _next_key; + struct lpm_trie_node **node_stack = NULL; + int err = 0, stack_ptr = -1; + unsigned int next_bit; + size_t matchlen; + + /* The get_next_key follows postorder. For the 4 node example in + * the top of this file, the trie_get_next_key() returns the following + * one after another: + * 192.168.0.0/24 + * 192.168.1.0/24 + * 192.168.128.0/24 + * 192.168.0.0/16 + * + * The idea is to return more specific keys before less specific ones. + */ + + /* Empty trie */ + search_root = rcu_dereference(trie->root); + if (!search_root) + return -ENOENT; + + /* For invalid key, find the leftmost node in the trie */ + if (!key || key->prefixlen > trie->max_prefixlen) + goto find_leftmost; + + node_stack = kmalloc(trie->max_prefixlen * sizeof(struct lpm_trie_node *), + GFP_ATOMIC | __GFP_NOWARN); + if (!node_stack) + return -ENOMEM; + + /* Try to find the exact node for the given key */ + for (node = search_root; node;) { + node_stack[++stack_ptr] = node; + matchlen = longest_prefix_match(trie, node, key); + if (node->prefixlen != matchlen || + node->prefixlen == key->prefixlen) + break; + + next_bit = extract_bit(key->data, node->prefixlen); + node = rcu_dereference(node->child[next_bit]); + } + if (!node || node->prefixlen != key->prefixlen || + (node->flags & LPM_TREE_NODE_FLAG_IM)) + goto find_leftmost; + + /* The node with the exactly-matching key has been found, + * find the first node in postorder after the matched node. + */ + node = node_stack[stack_ptr]; + while (stack_ptr > 0) { + parent = node_stack[stack_ptr - 1]; + if (rcu_dereference(parent->child[0]) == node) { + search_root = rcu_dereference(parent->child[1]); + if (search_root) + goto find_leftmost; + } + if (!(parent->flags & LPM_TREE_NODE_FLAG_IM)) { + next_node = parent; + goto do_copy; + } + + node = parent; + stack_ptr--; + } + + /* did not find anything */ + err = -ENOENT; + goto free_stack; + +find_leftmost: + /* Find the leftmost non-intermediate node, all intermediate nodes + * have exact two children, so this function will never return NULL. + */ + for (node = search_root; node;) { + if (!(node->flags & LPM_TREE_NODE_FLAG_IM)) + next_node = node; + node = rcu_dereference(node->child[0]); + } +do_copy: + next_key->prefixlen = next_node->prefixlen; + memcpy((void *)next_key + offsetof(struct bpf_lpm_trie_key, data), + next_node->data, trie->data_size); +free_stack: + kfree(node_stack); + return err; } const struct bpf_map_ops trie_map_ops = { diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c index 68ec884440b7..c9401075b58c 100644 --- a/kernel/bpf/offload.c +++ b/kernel/bpf/offload.c @@ -1,18 +1,50 @@ +/* + * Copyright (C) 2017 Netronome Systems, Inc. + * + * This software is licensed under the GNU General License Version 2, + * June 1991 as shown in the file COPYING in the top-level directory of this + * source tree. + * + * THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" + * WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, + * BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE + * OF THE PROGRAM IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME + * THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION. + */ + #include <linux/bpf.h> #include <linux/bpf_verifier.h> #include <linux/bug.h> +#include <linux/kdev_t.h> #include <linux/list.h> #include <linux/netdevice.h> #include <linux/printk.h> +#include <linux/proc_ns.h> #include <linux/rtnetlink.h> +#include <linux/rwsem.h> -/* protected by RTNL */ +/* Protects bpf_prog_offload_devs, bpf_map_offload_devs and offload members + * of all progs. + * RTNL lock cannot be taken when holding this lock. + */ +static DECLARE_RWSEM(bpf_devs_lock); static LIST_HEAD(bpf_prog_offload_devs); +static LIST_HEAD(bpf_map_offload_devs); + +static int bpf_dev_offload_check(struct net_device *netdev) +{ + if (!netdev) + return -EINVAL; + if (!netdev->netdev_ops->ndo_bpf) + return -EOPNOTSUPP; + return 0; +} int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr) { - struct net *net = current->nsproxy->net_ns; - struct bpf_dev_offload *offload; + struct bpf_prog_offload *offload; + int err; if (attr->prog_type != BPF_PROG_TYPE_SCHED_CLS && attr->prog_type != BPF_PROG_TYPE_XDP) @@ -26,34 +58,44 @@ int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr) return -ENOMEM; offload->prog = prog; - init_waitqueue_head(&offload->verifier_done); - rtnl_lock(); - offload->netdev = __dev_get_by_index(net, attr->prog_ifindex); - if (!offload->netdev) { - rtnl_unlock(); - kfree(offload); - return -EINVAL; - } + offload->netdev = dev_get_by_index(current->nsproxy->net_ns, + attr->prog_ifindex); + err = bpf_dev_offload_check(offload->netdev); + if (err) + goto err_maybe_put; + down_write(&bpf_devs_lock); + if (offload->netdev->reg_state != NETREG_REGISTERED) { + err = -EINVAL; + goto err_unlock; + } prog->aux->offload = offload; list_add_tail(&offload->offloads, &bpf_prog_offload_devs); - rtnl_unlock(); + dev_put(offload->netdev); + up_write(&bpf_devs_lock); return 0; +err_unlock: + up_write(&bpf_devs_lock); +err_maybe_put: + if (offload->netdev) + dev_put(offload->netdev); + kfree(offload); + return err; } static int __bpf_offload_ndo(struct bpf_prog *prog, enum bpf_netdev_command cmd, struct netdev_bpf *data) { - struct net_device *netdev = prog->aux->offload->netdev; + struct bpf_prog_offload *offload = prog->aux->offload; + struct net_device *netdev; ASSERT_RTNL(); - if (!netdev) + if (!offload) return -ENODEV; - if (!netdev->netdev_ops->ndo_bpf) - return -EOPNOTSUPP; + netdev = offload->netdev; data->command = cmd; @@ -72,62 +114,63 @@ int bpf_prog_offload_verifier_prep(struct bpf_verifier_env *env) if (err) goto exit_unlock; - env->dev_ops = data.verifier.ops; - + env->prog->aux->offload->dev_ops = data.verifier.ops; env->prog->aux->offload->dev_state = true; - env->prog->aux->offload->verifier_running = true; exit_unlock: rtnl_unlock(); return err; } +int bpf_prog_offload_verify_insn(struct bpf_verifier_env *env, + int insn_idx, int prev_insn_idx) +{ + struct bpf_prog_offload *offload; + int ret = -ENODEV; + + down_read(&bpf_devs_lock); + offload = env->prog->aux->offload; + if (offload) + ret = offload->dev_ops->insn_hook(env, insn_idx, prev_insn_idx); + up_read(&bpf_devs_lock); + + return ret; +} + static void __bpf_prog_offload_destroy(struct bpf_prog *prog) { - struct bpf_dev_offload *offload = prog->aux->offload; + struct bpf_prog_offload *offload = prog->aux->offload; struct netdev_bpf data = {}; - /* Caution - if netdev is destroyed before the program, this function - * will be called twice. - */ - data.offload.prog = prog; - if (offload->verifier_running) - wait_event(offload->verifier_done, !offload->verifier_running); - if (offload->dev_state) WARN_ON(__bpf_offload_ndo(prog, BPF_OFFLOAD_DESTROY, &data)); - offload->dev_state = false; + /* Make sure BPF_PROG_GET_NEXT_ID can't find this dead program */ + bpf_prog_free_id(prog, true); + list_del_init(&offload->offloads); - offload->netdev = NULL; + kfree(offload); + prog->aux->offload = NULL; } void bpf_prog_offload_destroy(struct bpf_prog *prog) { - struct bpf_dev_offload *offload = prog->aux->offload; - - offload->verifier_running = false; - wake_up(&offload->verifier_done); - rtnl_lock(); - __bpf_prog_offload_destroy(prog); + down_write(&bpf_devs_lock); + if (prog->aux->offload) + __bpf_prog_offload_destroy(prog); + up_write(&bpf_devs_lock); rtnl_unlock(); - - kfree(offload); } static int bpf_prog_offload_translate(struct bpf_prog *prog) { - struct bpf_dev_offload *offload = prog->aux->offload; struct netdev_bpf data = {}; int ret; data.offload.prog = prog; - offload->verifier_running = false; - wake_up(&offload->verifier_done); - rtnl_lock(); ret = __bpf_offload_ndo(prog, BPF_OFFLOAD_TRANSLATE, &data); rtnl_unlock(); @@ -149,14 +192,323 @@ int bpf_prog_offload_compile(struct bpf_prog *prog) return bpf_prog_offload_translate(prog); } +struct ns_get_path_bpf_prog_args { + struct bpf_prog *prog; + struct bpf_prog_info *info; +}; + +static struct ns_common *bpf_prog_offload_info_fill_ns(void *private_data) +{ + struct ns_get_path_bpf_prog_args *args = private_data; + struct bpf_prog_aux *aux = args->prog->aux; + struct ns_common *ns; + struct net *net; + + rtnl_lock(); + down_read(&bpf_devs_lock); + + if (aux->offload) { + args->info->ifindex = aux->offload->netdev->ifindex; + net = dev_net(aux->offload->netdev); + get_net(net); + ns = &net->ns; + } else { + args->info->ifindex = 0; + ns = NULL; + } + + up_read(&bpf_devs_lock); + rtnl_unlock(); + + return ns; +} + +int bpf_prog_offload_info_fill(struct bpf_prog_info *info, + struct bpf_prog *prog) +{ + struct ns_get_path_bpf_prog_args args = { + .prog = prog, + .info = info, + }; + struct bpf_prog_aux *aux = prog->aux; + struct inode *ns_inode; + struct path ns_path; + char __user *uinsns; + void *res; + u32 ulen; + + res = ns_get_path_cb(&ns_path, bpf_prog_offload_info_fill_ns, &args); + if (IS_ERR(res)) { + if (!info->ifindex) + return -ENODEV; + return PTR_ERR(res); + } + + down_read(&bpf_devs_lock); + + if (!aux->offload) { + up_read(&bpf_devs_lock); + return -ENODEV; + } + + ulen = info->jited_prog_len; + info->jited_prog_len = aux->offload->jited_len; + if (info->jited_prog_len & ulen) { + uinsns = u64_to_user_ptr(info->jited_prog_insns); + ulen = min_t(u32, info->jited_prog_len, ulen); + if (copy_to_user(uinsns, aux->offload->jited_image, ulen)) { + up_read(&bpf_devs_lock); + return -EFAULT; + } + } + + up_read(&bpf_devs_lock); + + ns_inode = ns_path.dentry->d_inode; + info->netns_dev = new_encode_dev(ns_inode->i_sb->s_dev); + info->netns_ino = ns_inode->i_ino; + path_put(&ns_path); + + return 0; +} + const struct bpf_prog_ops bpf_offload_prog_ops = { }; +static int bpf_map_offload_ndo(struct bpf_offloaded_map *offmap, + enum bpf_netdev_command cmd) +{ + struct netdev_bpf data = {}; + struct net_device *netdev; + + ASSERT_RTNL(); + + data.command = cmd; + data.offmap = offmap; + /* Caller must make sure netdev is valid */ + netdev = offmap->netdev; + + return netdev->netdev_ops->ndo_bpf(netdev, &data); +} + +struct bpf_map *bpf_map_offload_map_alloc(union bpf_attr *attr) +{ + struct net *net = current->nsproxy->net_ns; + struct bpf_offloaded_map *offmap; + int err; + + if (!capable(CAP_SYS_ADMIN)) + return ERR_PTR(-EPERM); + if (attr->map_type != BPF_MAP_TYPE_ARRAY && + attr->map_type != BPF_MAP_TYPE_HASH) + return ERR_PTR(-EINVAL); + + offmap = kzalloc(sizeof(*offmap), GFP_USER); + if (!offmap) + return ERR_PTR(-ENOMEM); + + bpf_map_init_from_attr(&offmap->map, attr); + + rtnl_lock(); + down_write(&bpf_devs_lock); + offmap->netdev = __dev_get_by_index(net, attr->map_ifindex); + err = bpf_dev_offload_check(offmap->netdev); + if (err) + goto err_unlock; + + err = bpf_map_offload_ndo(offmap, BPF_OFFLOAD_MAP_ALLOC); + if (err) + goto err_unlock; + + list_add_tail(&offmap->offloads, &bpf_map_offload_devs); + up_write(&bpf_devs_lock); + rtnl_unlock(); + + return &offmap->map; + +err_unlock: + up_write(&bpf_devs_lock); + rtnl_unlock(); + kfree(offmap); + return ERR_PTR(err); +} + +static void __bpf_map_offload_destroy(struct bpf_offloaded_map *offmap) +{ + WARN_ON(bpf_map_offload_ndo(offmap, BPF_OFFLOAD_MAP_FREE)); + /* Make sure BPF_MAP_GET_NEXT_ID can't find this dead map */ + bpf_map_free_id(&offmap->map, true); + list_del_init(&offmap->offloads); + offmap->netdev = NULL; +} + +void bpf_map_offload_map_free(struct bpf_map *map) +{ + struct bpf_offloaded_map *offmap = map_to_offmap(map); + + rtnl_lock(); + down_write(&bpf_devs_lock); + if (offmap->netdev) + __bpf_map_offload_destroy(offmap); + up_write(&bpf_devs_lock); + rtnl_unlock(); + + kfree(offmap); +} + +int bpf_map_offload_lookup_elem(struct bpf_map *map, void *key, void *value) +{ + struct bpf_offloaded_map *offmap = map_to_offmap(map); + int ret = -ENODEV; + + down_read(&bpf_devs_lock); + if (offmap->netdev) + ret = offmap->dev_ops->map_lookup_elem(offmap, key, value); + up_read(&bpf_devs_lock); + + return ret; +} + +int bpf_map_offload_update_elem(struct bpf_map *map, + void *key, void *value, u64 flags) +{ + struct bpf_offloaded_map *offmap = map_to_offmap(map); + int ret = -ENODEV; + + if (unlikely(flags > BPF_EXIST)) + return -EINVAL; + + down_read(&bpf_devs_lock); + if (offmap->netdev) + ret = offmap->dev_ops->map_update_elem(offmap, key, value, + flags); + up_read(&bpf_devs_lock); + + return ret; +} + +int bpf_map_offload_delete_elem(struct bpf_map *map, void *key) +{ + struct bpf_offloaded_map *offmap = map_to_offmap(map); + int ret = -ENODEV; + + down_read(&bpf_devs_lock); + if (offmap->netdev) + ret = offmap->dev_ops->map_delete_elem(offmap, key); + up_read(&bpf_devs_lock); + + return ret; +} + +int bpf_map_offload_get_next_key(struct bpf_map *map, void *key, void *next_key) +{ + struct bpf_offloaded_map *offmap = map_to_offmap(map); + int ret = -ENODEV; + + down_read(&bpf_devs_lock); + if (offmap->netdev) + ret = offmap->dev_ops->map_get_next_key(offmap, key, next_key); + up_read(&bpf_devs_lock); + + return ret; +} + +struct ns_get_path_bpf_map_args { + struct bpf_offloaded_map *offmap; + struct bpf_map_info *info; +}; + +static struct ns_common *bpf_map_offload_info_fill_ns(void *private_data) +{ + struct ns_get_path_bpf_map_args *args = private_data; + struct ns_common *ns; + struct net *net; + + rtnl_lock(); + down_read(&bpf_devs_lock); + + if (args->offmap->netdev) { + args->info->ifindex = args->offmap->netdev->ifindex; + net = dev_net(args->offmap->netdev); + get_net(net); + ns = &net->ns; + } else { + args->info->ifindex = 0; + ns = NULL; + } + + up_read(&bpf_devs_lock); + rtnl_unlock(); + + return ns; +} + +int bpf_map_offload_info_fill(struct bpf_map_info *info, struct bpf_map *map) +{ + struct ns_get_path_bpf_map_args args = { + .offmap = map_to_offmap(map), + .info = info, + }; + struct inode *ns_inode; + struct path ns_path; + void *res; + + res = ns_get_path_cb(&ns_path, bpf_map_offload_info_fill_ns, &args); + if (IS_ERR(res)) { + if (!info->ifindex) + return -ENODEV; + return PTR_ERR(res); + } + + ns_inode = ns_path.dentry->d_inode; + info->netns_dev = new_encode_dev(ns_inode->i_sb->s_dev); + info->netns_ino = ns_inode->i_ino; + path_put(&ns_path); + + return 0; +} + +bool bpf_offload_dev_match(struct bpf_prog *prog, struct bpf_map *map) +{ + struct bpf_offloaded_map *offmap; + struct bpf_prog_offload *offload; + bool ret; + + if (!bpf_prog_is_dev_bound(prog->aux) || !bpf_map_is_dev_bound(map)) + return false; + + down_read(&bpf_devs_lock); + offload = prog->aux->offload; + offmap = map_to_offmap(map); + + ret = offload && offload->netdev == offmap->netdev; + up_read(&bpf_devs_lock); + + return ret; +} + +static void bpf_offload_orphan_all_progs(struct net_device *netdev) +{ + struct bpf_prog_offload *offload, *tmp; + + list_for_each_entry_safe(offload, tmp, &bpf_prog_offload_devs, offloads) + if (offload->netdev == netdev) + __bpf_prog_offload_destroy(offload->prog); +} + +static void bpf_offload_orphan_all_maps(struct net_device *netdev) +{ + struct bpf_offloaded_map *offmap, *tmp; + + list_for_each_entry_safe(offmap, tmp, &bpf_map_offload_devs, offloads) + if (offmap->netdev == netdev) + __bpf_map_offload_destroy(offmap); +} + static int bpf_offload_notification(struct notifier_block *notifier, ulong event, void *ptr) { struct net_device *netdev = netdev_notifier_info_to_dev(ptr); - struct bpf_dev_offload *offload, *tmp; ASSERT_RTNL(); @@ -166,11 +518,10 @@ static int bpf_offload_notification(struct notifier_block *notifier, if (netdev->reg_state != NETREG_UNREGISTERING) break; - list_for_each_entry_safe(offload, tmp, &bpf_prog_offload_devs, - offloads) { - if (offload->netdev == netdev) - __bpf_prog_offload_destroy(offload->prog); - } + down_write(&bpf_devs_lock); + bpf_offload_orphan_all_progs(netdev); + bpf_offload_orphan_all_maps(netdev); + up_write(&bpf_devs_lock); break; default: break; diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index 5ee2e41893d9..48c33417d13c 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -86,9 +86,10 @@ struct smap_psock { struct work_struct tx_work; struct work_struct gc_work; + struct proto *sk_proto; + void (*save_close)(struct sock *sk, long timeout); void (*save_data_ready)(struct sock *sk); void (*save_write_space)(struct sock *sk); - void (*save_state_change)(struct sock *sk); }; static inline struct smap_psock *smap_psock_sk(const struct sock *sk) @@ -96,12 +97,78 @@ static inline struct smap_psock *smap_psock_sk(const struct sock *sk) return rcu_dereference_sk_user_data(sk); } -/* compute the linear packet data range [data, data_end) for skb when - * sk_skb type programs are in use. - */ -static inline void bpf_compute_data_end_sk_skb(struct sk_buff *skb) +static struct proto tcp_bpf_proto; +static int bpf_tcp_init(struct sock *sk) +{ + struct smap_psock *psock; + + rcu_read_lock(); + psock = smap_psock_sk(sk); + if (unlikely(!psock)) { + rcu_read_unlock(); + return -EINVAL; + } + + if (unlikely(psock->sk_proto)) { + rcu_read_unlock(); + return -EBUSY; + } + + psock->save_close = sk->sk_prot->close; + psock->sk_proto = sk->sk_prot; + sk->sk_prot = &tcp_bpf_proto; + rcu_read_unlock(); + return 0; +} + +static void bpf_tcp_release(struct sock *sk) +{ + struct smap_psock *psock; + + rcu_read_lock(); + psock = smap_psock_sk(sk); + + if (likely(psock)) { + sk->sk_prot = psock->sk_proto; + psock->sk_proto = NULL; + } + rcu_read_unlock(); +} + +static void smap_release_sock(struct smap_psock *psock, struct sock *sock); + +static void bpf_tcp_close(struct sock *sk, long timeout) { - TCP_SKB_CB(skb)->bpf.data_end = skb->data + skb_headlen(skb); + void (*close_fun)(struct sock *sk, long timeout); + struct smap_psock_map_entry *e, *tmp; + struct smap_psock *psock; + struct sock *osk; + + rcu_read_lock(); + psock = smap_psock_sk(sk); + if (unlikely(!psock)) { + rcu_read_unlock(); + return sk->sk_prot->close(sk, timeout); + } + + /* The psock may be destroyed anytime after exiting the RCU critial + * section so by the time we use close_fun the psock may no longer + * be valid. However, bpf_tcp_close is called with the sock lock + * held so the close hook and sk are still valid. + */ + close_fun = psock->save_close; + + write_lock_bh(&sk->sk_callback_lock); + list_for_each_entry_safe(e, tmp, &psock->maps, list) { + osk = cmpxchg(e->entry, sk, NULL); + if (osk == sk) { + list_del(&e->list); + smap_release_sock(psock, sk); + } + } + write_unlock_bh(&sk->sk_callback_lock); + rcu_read_unlock(); + close_fun(sk, timeout); } enum __sk_action { @@ -110,6 +177,22 @@ enum __sk_action { __SK_REDIRECT, }; +static struct tcp_ulp_ops bpf_tcp_ulp_ops __read_mostly = { + .name = "bpf_tcp", + .uid = TCP_ULP_BPF, + .user_visible = false, + .owner = NULL, + .init = bpf_tcp_init, + .release = bpf_tcp_release, +}; + +static int bpf_tcp_ulp_register(void) +{ + tcp_bpf_proto = tcp_prot; + tcp_bpf_proto.close = bpf_tcp_close; + return tcp_register_ulp(&bpf_tcp_ulp_ops); +} + static int smap_verdict_func(struct smap_psock *psock, struct sk_buff *skb) { struct bpf_prog *prog = READ_ONCE(psock->bpf_verdict); @@ -174,68 +257,6 @@ static void smap_report_sk_error(struct smap_psock *psock, int err) sk->sk_error_report(sk); } -static void smap_release_sock(struct smap_psock *psock, struct sock *sock); - -/* Called with lock_sock(sk) held */ -static void smap_state_change(struct sock *sk) -{ - struct smap_psock_map_entry *e, *tmp; - struct smap_psock *psock; - struct socket_wq *wq; - struct sock *osk; - - rcu_read_lock(); - - /* Allowing transitions into an established syn_recv states allows - * for early binding sockets to a smap object before the connection - * is established. - */ - switch (sk->sk_state) { - case TCP_SYN_SENT: - case TCP_SYN_RECV: - case TCP_ESTABLISHED: - break; - case TCP_CLOSE_WAIT: - case TCP_CLOSING: - case TCP_LAST_ACK: - case TCP_FIN_WAIT1: - case TCP_FIN_WAIT2: - case TCP_LISTEN: - break; - case TCP_CLOSE: - /* Only release if the map entry is in fact the sock in - * question. There is a case where the operator deletes - * the sock from the map, but the TCP sock is closed before - * the psock is detached. Use cmpxchg to verify correct - * sock is removed. - */ - psock = smap_psock_sk(sk); - if (unlikely(!psock)) - break; - write_lock_bh(&sk->sk_callback_lock); - list_for_each_entry_safe(e, tmp, &psock->maps, list) { - osk = cmpxchg(e->entry, sk, NULL); - if (osk == sk) { - list_del(&e->list); - smap_release_sock(psock, sk); - } - } - write_unlock_bh(&sk->sk_callback_lock); - break; - default: - psock = smap_psock_sk(sk); - if (unlikely(!psock)) - break; - smap_report_sk_error(psock, EPIPE); - break; - } - - wq = rcu_dereference(sk->sk_wq); - if (skwq_has_sleeper(wq)) - wake_up_interruptible_all(&wq->wait); - rcu_read_unlock(); -} - static void smap_read_sock_strparser(struct strparser *strp, struct sk_buff *skb) { @@ -330,10 +351,8 @@ static void smap_stop_sock(struct smap_psock *psock, struct sock *sk) return; sk->sk_data_ready = psock->save_data_ready; sk->sk_write_space = psock->save_write_space; - sk->sk_state_change = psock->save_state_change; psock->save_data_ready = NULL; psock->save_write_space = NULL; - psock->save_state_change = NULL; strp_stop(&psock->strp); psock->strp_enabled = false; } @@ -358,6 +377,7 @@ static void smap_release_sock(struct smap_psock *psock, struct sock *sock) if (psock->refcnt) return; + tcp_cleanup_ulp(sock); smap_stop_sock(psock, sock); clear_bit(SMAP_TX_RUNNING, &psock->state); rcu_assign_sk_user_data(sock, NULL); @@ -435,10 +455,8 @@ static void smap_start_sock(struct smap_psock *psock, struct sock *sk) return; psock->save_data_ready = sk->sk_data_ready; psock->save_write_space = sk->sk_write_space; - psock->save_state_change = sk->sk_state_change; sk->sk_data_ready = smap_data_ready; sk->sk_write_space = smap_write_space; - sk->sk_state_change = smap_state_change; psock->strp_enabled = true; } @@ -517,17 +535,15 @@ static struct bpf_map *sock_map_alloc(union bpf_attr *attr) if (attr->value_size > KMALLOC_MAX_SIZE) return ERR_PTR(-E2BIG); + err = bpf_tcp_ulp_register(); + if (err && err != -EEXIST) + return ERR_PTR(err); + stab = kzalloc(sizeof(*stab), GFP_USER); if (!stab) return ERR_PTR(-ENOMEM); - /* mandatory map attributes */ - stab->map.map_type = attr->map_type; - stab->map.key_size = attr->key_size; - stab->map.value_size = attr->value_size; - stab->map.max_entries = attr->max_entries; - stab->map.map_flags = attr->map_flags; - stab->map.numa_node = bpf_map_attr_numa_node(attr); + bpf_map_init_from_attr(&stab->map, attr); /* make sure page count doesn't overflow */ cost = (u64) stab->map.max_entries * sizeof(struct sock *); @@ -591,17 +607,19 @@ static void sock_map_free(struct bpf_map *map) write_lock_bh(&sock->sk_callback_lock); psock = smap_psock_sk(sock); - smap_list_remove(psock, &stab->sock_map[i]); - smap_release_sock(psock, sock); + /* This check handles a racing sock event that can get the + * sk_callback_lock before this case but after xchg happens + * causing the refcnt to hit zero and sock user data (psock) + * to be null and queued for garbage collection. + */ + if (likely(psock)) { + smap_list_remove(psock, &stab->sock_map[i]); + smap_release_sock(psock, sock); + } write_unlock_bh(&sock->sk_callback_lock); } rcu_read_unlock(); - if (stab->bpf_verdict) - bpf_prog_put(stab->bpf_verdict); - if (stab->bpf_parse) - bpf_prog_put(stab->bpf_parse); - sock_map_remove_complete(stab); } @@ -761,6 +779,10 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops, goto out_progs; } + err = tcp_set_ulp_id(sock, TCP_ULP_BPF); + if (err) + goto out_progs; + set_bit(SMAP_TX_RUNNING, &psock->state); } @@ -873,6 +895,19 @@ static int sock_map_update_elem(struct bpf_map *map, return err; } +static void sock_map_release(struct bpf_map *map, struct file *map_file) +{ + struct bpf_stab *stab = container_of(map, struct bpf_stab, map); + struct bpf_prog *orig; + + orig = xchg(&stab->bpf_parse, NULL); + if (orig) + bpf_prog_put(orig); + orig = xchg(&stab->bpf_verdict, NULL); + if (orig) + bpf_prog_put(orig); +} + const struct bpf_map_ops sock_map_ops = { .map_alloc = sock_map_alloc, .map_free = sock_map_free, @@ -880,6 +915,7 @@ const struct bpf_map_ops sock_map_ops = { .map_get_next_key = sock_map_get_next_key, .map_update_elem = sock_map_update_elem, .map_delete_elem = sock_map_delete_elem, + .map_release = sock_map_release, }; BPF_CALL_4(bpf_sock_map_update, struct bpf_sock_ops_kern *, bpf_sock, diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index a15bc636cc98..b0ecf43f5894 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -88,14 +88,10 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr) if (cost >= U32_MAX - PAGE_SIZE) goto free_smap; - smap->map.map_type = attr->map_type; - smap->map.key_size = attr->key_size; + bpf_map_init_from_attr(&smap->map, attr); smap->map.value_size = value_size; - smap->map.max_entries = attr->max_entries; - smap->map.map_flags = attr->map_flags; smap->n_buckets = n_buckets; smap->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; - smap->map.numa_node = bpf_map_attr_numa_node(attr); err = bpf_map_precharge_memlock(smap->map.pages); if (err) @@ -226,9 +222,33 @@ int bpf_stackmap_copy(struct bpf_map *map, void *key, void *value) return 0; } -static int stack_map_get_next_key(struct bpf_map *map, void *key, void *next_key) +static int stack_map_get_next_key(struct bpf_map *map, void *key, + void *next_key) { - return -EINVAL; + struct bpf_stack_map *smap = container_of(map, + struct bpf_stack_map, map); + u32 id; + + WARN_ON_ONCE(!rcu_read_lock_held()); + + if (!key) { + id = 0; + } else { + id = *(u32 *)key; + if (id >= smap->n_buckets || !smap->buckets[id]) + id = 0; + else + id++; + } + + while (id < smap->n_buckets && !smap->buckets[id]) + id++; + + if (id >= smap->n_buckets) + return -ENOENT; + + *(u32 *)next_key = id; + return 0; } static int stack_map_update_elem(struct bpf_map *map, void *key, void *value, diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 2c4cfeaa8d5e..e24aa3241387 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -94,18 +94,34 @@ static int check_uarg_tail_zero(void __user *uaddr, return 0; } +const struct bpf_map_ops bpf_map_offload_ops = { + .map_alloc = bpf_map_offload_map_alloc, + .map_free = bpf_map_offload_map_free, +}; + static struct bpf_map *find_and_alloc_map(union bpf_attr *attr) { + const struct bpf_map_ops *ops; struct bpf_map *map; + int err; - if (attr->map_type >= ARRAY_SIZE(bpf_map_types) || - !bpf_map_types[attr->map_type]) + if (attr->map_type >= ARRAY_SIZE(bpf_map_types)) + return ERR_PTR(-EINVAL); + ops = bpf_map_types[attr->map_type]; + if (!ops) return ERR_PTR(-EINVAL); - map = bpf_map_types[attr->map_type]->map_alloc(attr); + if (ops->map_alloc_check) { + err = ops->map_alloc_check(attr); + if (err) + return ERR_PTR(err); + } + if (attr->map_ifindex) + ops = &bpf_map_offload_ops; + map = ops->map_alloc(attr); if (IS_ERR(map)) return map; - map->ops = bpf_map_types[attr->map_type]; + map->ops = ops; map->map_type = attr->map_type; return map; } @@ -134,6 +150,16 @@ void bpf_map_area_free(void *area) kvfree(area); } +void bpf_map_init_from_attr(struct bpf_map *map, union bpf_attr *attr) +{ + map->map_type = attr->map_type; + map->key_size = attr->key_size; + map->value_size = attr->value_size; + map->max_entries = attr->max_entries; + map->map_flags = attr->map_flags; + map->numa_node = bpf_map_attr_numa_node(attr); +} + int bpf_map_precharge_memlock(u32 pages) { struct user_struct *user = get_current_user(); @@ -189,16 +215,25 @@ static int bpf_map_alloc_id(struct bpf_map *map) return id > 0 ? 0 : id; } -static void bpf_map_free_id(struct bpf_map *map, bool do_idr_lock) +void bpf_map_free_id(struct bpf_map *map, bool do_idr_lock) { unsigned long flags; + /* Offloaded maps are removed from the IDR store when their device + * disappears - even if someone holds an fd to them they are unusable, + * the memory is gone, all ops will fail; they are simply waiting for + * refcnt to drop to be freed. + */ + if (!map->id) + return; + if (do_idr_lock) spin_lock_irqsave(&map_idr_lock, flags); else __acquire(&map_idr_lock); idr_remove(&map_idr, map->id); + map->id = 0; if (do_idr_lock) spin_unlock_irqrestore(&map_idr_lock, flags); @@ -378,7 +413,7 @@ static int bpf_obj_name_cpy(char *dst, const char *src) return 0; } -#define BPF_MAP_CREATE_LAST_FIELD map_name +#define BPF_MAP_CREATE_LAST_FIELD map_ifindex /* called via syscall */ static int map_create(union bpf_attr *attr) { @@ -566,8 +601,10 @@ static int map_lookup_elem(union bpf_attr *attr) if (!value) goto free_key; - if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || - map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) { + if (bpf_map_is_dev_bound(map)) { + err = bpf_map_offload_lookup_elem(map, key, value); + } else if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || + map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) { err = bpf_percpu_hash_copy(map, key, value); } else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) { err = bpf_percpu_array_copy(map, key, value); @@ -654,7 +691,10 @@ static int map_update_elem(union bpf_attr *attr) goto free_value; /* Need to create a kthread, thus must support schedule */ - if (map->map_type == BPF_MAP_TYPE_CPUMAP) { + if (bpf_map_is_dev_bound(map)) { + err = bpf_map_offload_update_elem(map, key, value, attr->flags); + goto out; + } else if (map->map_type == BPF_MAP_TYPE_CPUMAP) { err = map->ops->map_update_elem(map, key, value, attr->flags); goto out; } @@ -669,10 +709,7 @@ static int map_update_elem(union bpf_attr *attr) err = bpf_percpu_hash_update(map, key, value, attr->flags); } else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) { err = bpf_percpu_array_update(map, key, value, attr->flags); - } else if (map->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || - map->map_type == BPF_MAP_TYPE_PROG_ARRAY || - map->map_type == BPF_MAP_TYPE_CGROUP_ARRAY || - map->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS) { + } else if (IS_FD_ARRAY(map)) { rcu_read_lock(); err = bpf_fd_array_map_update_elem(map, f.file, key, value, attr->flags); @@ -731,6 +768,11 @@ static int map_delete_elem(union bpf_attr *attr) goto err_put; } + if (bpf_map_is_dev_bound(map)) { + err = bpf_map_offload_delete_elem(map, key); + goto out; + } + preempt_disable(); __this_cpu_inc(bpf_prog_active); rcu_read_lock(); @@ -738,7 +780,7 @@ static int map_delete_elem(union bpf_attr *attr) rcu_read_unlock(); __this_cpu_dec(bpf_prog_active); preempt_enable(); - +out: if (!err) trace_bpf_map_delete_elem(map, ufd, key); kfree(key); @@ -788,9 +830,15 @@ static int map_get_next_key(union bpf_attr *attr) if (!next_key) goto free_key; + if (bpf_map_is_dev_bound(map)) { + err = bpf_map_offload_get_next_key(map, key, next_key); + goto out; + } + rcu_read_lock(); err = map->ops->map_get_next_key(map, key, next_key); rcu_read_unlock(); +out: if (err) goto free_next_key; @@ -905,9 +953,13 @@ static int bpf_prog_alloc_id(struct bpf_prog *prog) return id > 0 ? 0 : id; } -static void bpf_prog_free_id(struct bpf_prog *prog, bool do_idr_lock) +void bpf_prog_free_id(struct bpf_prog *prog, bool do_idr_lock) { - /* cBPF to eBPF migrations are currently not in the idr store. */ + /* cBPF to eBPF migrations are currently not in the idr store. + * Offloaded programs are removed from the store when their device + * disappears - even if someone grabs an fd to them they are unusable, + * simply waiting for refcnt to drop to be freed. + */ if (!prog->aux->id) return; @@ -917,6 +969,7 @@ static void bpf_prog_free_id(struct bpf_prog *prog, bool do_idr_lock) __acquire(&prog_idr_lock); idr_remove(&prog_idr, prog->aux->id); + prog->aux->id = 0; if (do_idr_lock) spin_unlock_bh(&prog_idr_lock); @@ -937,10 +990,16 @@ static void __bpf_prog_put_rcu(struct rcu_head *rcu) static void __bpf_prog_put(struct bpf_prog *prog, bool do_idr_lock) { if (atomic_dec_and_test(&prog->aux->refcnt)) { + int i; + trace_bpf_prog_put_rcu(prog); /* bpf_prog_free_id() must be called first */ bpf_prog_free_id(prog, do_idr_lock); + + for (i = 0; i < prog->aux->func_cnt; i++) + bpf_prog_kallsyms_del(prog->aux->func[i]); bpf_prog_kallsyms_del(prog); + call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu); } } @@ -1057,7 +1116,7 @@ struct bpf_prog *bpf_prog_inc_not_zero(struct bpf_prog *prog) } EXPORT_SYMBOL_GPL(bpf_prog_inc_not_zero); -static bool bpf_prog_get_ok(struct bpf_prog *prog, +bool bpf_prog_get_ok(struct bpf_prog *prog, enum bpf_prog_type *attach_type, bool attach_drv) { /* not an attachment, just a refcount inc, always allow */ @@ -1151,6 +1210,8 @@ static int bpf_prog_load(union bpf_attr *attr) if (!prog) return -ENOMEM; + prog->aux->offload_requested = !!attr->prog_ifindex; + err = security_bpf_prog_alloc(prog->aux); if (err) goto free_prog_nouncharge; @@ -1172,7 +1233,7 @@ static int bpf_prog_load(union bpf_attr *attr) atomic_set(&prog->aux->refcnt, 1); prog->gpl_compatible = is_gpl ? 1 : 0; - if (attr->prog_ifindex) { + if (bpf_prog_is_dev_bound(prog->aux)) { err = bpf_prog_offload_init(prog, attr); if (err) goto free_prog; @@ -1194,7 +1255,8 @@ static int bpf_prog_load(union bpf_attr *attr) goto free_used_maps; /* eBPF program is ready to be JITed */ - prog = bpf_prog_select_runtime(prog, &err); + if (!prog->bpf_func) + prog = bpf_prog_select_runtime(prog, &err); if (err < 0) goto free_used_maps; @@ -1439,6 +1501,8 @@ static int bpf_prog_test_run(const union bpf_attr *attr, struct bpf_prog *prog; int ret = -ENOTSUPP; + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; if (CHECK_ATTR(BPF_PROG_TEST_RUN)) return -EINVAL; @@ -1551,6 +1615,67 @@ static int bpf_map_get_fd_by_id(const union bpf_attr *attr) return fd; } +static const struct bpf_map *bpf_map_from_imm(const struct bpf_prog *prog, + unsigned long addr) +{ + int i; + + for (i = 0; i < prog->aux->used_map_cnt; i++) + if (prog->aux->used_maps[i] == (void *)addr) + return prog->aux->used_maps[i]; + return NULL; +} + +static struct bpf_insn *bpf_insn_prepare_dump(const struct bpf_prog *prog) +{ + const struct bpf_map *map; + struct bpf_insn *insns; + u64 imm; + int i; + + insns = kmemdup(prog->insnsi, bpf_prog_insn_size(prog), + GFP_USER); + if (!insns) + return insns; + + for (i = 0; i < prog->len; i++) { + if (insns[i].code == (BPF_JMP | BPF_TAIL_CALL)) { + insns[i].code = BPF_JMP | BPF_CALL; + insns[i].imm = BPF_FUNC_tail_call; + /* fall-through */ + } + if (insns[i].code == (BPF_JMP | BPF_CALL) || + insns[i].code == (BPF_JMP | BPF_CALL_ARGS)) { + if (insns[i].code == (BPF_JMP | BPF_CALL_ARGS)) + insns[i].code = BPF_JMP | BPF_CALL; + if (!bpf_dump_raw_ok()) + insns[i].imm = 0; + continue; + } + + if (insns[i].code != (BPF_LD | BPF_IMM | BPF_DW)) + continue; + + imm = ((u64)insns[i + 1].imm << 32) | (u32)insns[i].imm; + map = bpf_map_from_imm(prog, imm); + if (map) { + insns[i].src_reg = BPF_PSEUDO_MAP_FD; + insns[i].imm = map->id; + insns[i + 1].imm = 0; + continue; + } + + if (!bpf_dump_raw_ok() && + imm == (unsigned long)prog->aux) { + insns[i].imm = 0; + insns[i + 1].imm = 0; + continue; + } + } + + return insns; +} + static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, const union bpf_attr *attr, union bpf_attr __user *uattr) @@ -1598,24 +1723,51 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, goto done; } - ulen = info.jited_prog_len; - info.jited_prog_len = prog->jited_len; - if (info.jited_prog_len && ulen) { - uinsns = u64_to_user_ptr(info.jited_prog_insns); - ulen = min_t(u32, info.jited_prog_len, ulen); - if (copy_to_user(uinsns, prog->bpf_func, ulen)) - return -EFAULT; - } - ulen = info.xlated_prog_len; info.xlated_prog_len = bpf_prog_insn_size(prog); if (info.xlated_prog_len && ulen) { + struct bpf_insn *insns_sanitized; + bool fault; + + if (prog->blinded && !bpf_dump_raw_ok()) { + info.xlated_prog_insns = 0; + goto done; + } + insns_sanitized = bpf_insn_prepare_dump(prog); + if (!insns_sanitized) + return -ENOMEM; uinsns = u64_to_user_ptr(info.xlated_prog_insns); ulen = min_t(u32, info.xlated_prog_len, ulen); - if (copy_to_user(uinsns, prog->insnsi, ulen)) + fault = copy_to_user(uinsns, insns_sanitized, ulen); + kfree(insns_sanitized); + if (fault) return -EFAULT; } + if (bpf_prog_is_dev_bound(prog->aux)) { + err = bpf_prog_offload_info_fill(&info, prog); + if (err) + return err; + goto done; + } + + /* NOTE: the following code is supposed to be skipped for offload. + * bpf_prog_offload_info_fill() is the place to fill similar fields + * for offload. + */ + ulen = info.jited_prog_len; + info.jited_prog_len = prog->jited_len; + if (info.jited_prog_len && ulen) { + if (bpf_dump_raw_ok()) { + uinsns = u64_to_user_ptr(info.jited_prog_insns); + ulen = min_t(u32, info.jited_prog_len, ulen); + if (copy_to_user(uinsns, prog->bpf_func, ulen)) + return -EFAULT; + } else { + info.jited_prog_insns = 0; + } + } + done: if (copy_to_user(uinfo, &info, info_len) || put_user(info_len, &uattr->info.info_len)) @@ -1646,6 +1798,12 @@ static int bpf_map_get_info_by_fd(struct bpf_map *map, info.map_flags = map->map_flags; memcpy(info.name, map->name, sizeof(map->name)); + if (bpf_map_is_dev_bound(map)) { + err = bpf_map_offload_info_fill(&info, map); + if (err) + return err; + } + if (copy_to_user(uinfo, &info, info_len) || put_user(info_len, &uattr->info.info_len)) return -EFAULT; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index d4593571c404..5fb69a85d967 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -20,6 +20,8 @@ #include <linux/file.h> #include <linux/vmalloc.h> #include <linux/stringify.h> +#include <linux/bsearch.h> +#include <linux/sort.h> #include "disasm.h" @@ -167,11 +169,11 @@ struct bpf_call_arg_meta { static DEFINE_MUTEX(bpf_verifier_lock); /* log_level controls verbosity level of eBPF verifier. - * verbose() is used to dump the verification trace to the log, so the user - * can figure out what's wrong with the program + * bpf_verifier_log_write() is used to dump the verification trace to the log, + * so the user can figure out what's wrong with the program */ -static __printf(2, 3) void verbose(struct bpf_verifier_env *env, - const char *fmt, ...) +__printf(2, 3) void bpf_verifier_log_write(struct bpf_verifier_env *env, + const char *fmt, ...) { struct bpf_verifer_log *log = &env->log; unsigned int n; @@ -195,6 +197,14 @@ static __printf(2, 3) void verbose(struct bpf_verifier_env *env, else log->ubuf = NULL; } +EXPORT_SYMBOL_GPL(bpf_verifier_log_write); +/* Historically bpf_verifier_log_write was called verbose, but the name was too + * generic for symbol export. The function was renamed, but not the calls in + * the verifier to avoid complicating backports. Hence the alias below. + */ +static __printf(2, 3) void verbose(struct bpf_verifier_env *env, + const char *fmt, ...) + __attribute__((alias("bpf_verifier_log_write"))); static bool type_is_pkt_pointer(enum bpf_reg_type type) { @@ -216,23 +226,48 @@ static const char * const reg_type_str[] = { [PTR_TO_PACKET_END] = "pkt_end", }; +static void print_liveness(struct bpf_verifier_env *env, + enum bpf_reg_liveness live) +{ + if (live & (REG_LIVE_READ | REG_LIVE_WRITTEN)) + verbose(env, "_"); + if (live & REG_LIVE_READ) + verbose(env, "r"); + if (live & REG_LIVE_WRITTEN) + verbose(env, "w"); +} + +static struct bpf_func_state *func(struct bpf_verifier_env *env, + const struct bpf_reg_state *reg) +{ + struct bpf_verifier_state *cur = env->cur_state; + + return cur->frame[reg->frameno]; +} + static void print_verifier_state(struct bpf_verifier_env *env, - struct bpf_verifier_state *state) + const struct bpf_func_state *state) { - struct bpf_reg_state *reg; + const struct bpf_reg_state *reg; enum bpf_reg_type t; int i; + if (state->frameno) + verbose(env, " frame%d:", state->frameno); for (i = 0; i < MAX_BPF_REG; i++) { reg = &state->regs[i]; t = reg->type; if (t == NOT_INIT) continue; - verbose(env, " R%d=%s", i, reg_type_str[t]); + verbose(env, " R%d", i); + print_liveness(env, reg->live); + verbose(env, "=%s", reg_type_str[t]); if ((t == SCALAR_VALUE || t == PTR_TO_STACK) && tnum_is_const(reg->var_off)) { /* reg->off should be 0 for SCALAR_VALUE */ verbose(env, "%lld", reg->var_off.value + reg->off); + if (t == PTR_TO_STACK) + verbose(env, ",call_%d", func(env, reg)->callsite); } else { verbose(env, "(id=%d", reg->id); if (t != SCALAR_VALUE) @@ -277,16 +312,21 @@ static void print_verifier_state(struct bpf_verifier_env *env, } } for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) { - if (state->stack[i].slot_type[0] == STACK_SPILL) - verbose(env, " fp%d=%s", - -MAX_BPF_STACK + i * BPF_REG_SIZE, + if (state->stack[i].slot_type[0] == STACK_SPILL) { + verbose(env, " fp%d", + (-i - 1) * BPF_REG_SIZE); + print_liveness(env, state->stack[i].spilled_ptr.live); + verbose(env, "=%s", reg_type_str[state->stack[i].spilled_ptr.type]); + } + if (state->stack[i].slot_type[0] == STACK_ZERO) + verbose(env, " fp%d=0", (-i - 1) * BPF_REG_SIZE); } verbose(env, "\n"); } -static int copy_stack_state(struct bpf_verifier_state *dst, - const struct bpf_verifier_state *src) +static int copy_stack_state(struct bpf_func_state *dst, + const struct bpf_func_state *src) { if (!src->stack) return 0; @@ -302,13 +342,13 @@ static int copy_stack_state(struct bpf_verifier_state *dst, /* do_check() starts with zero-sized stack in struct bpf_verifier_state to * make it consume minimal amount of memory. check_stack_write() access from - * the program calls into realloc_verifier_state() to grow the stack size. + * the program calls into realloc_func_state() to grow the stack size. * Note there is a non-zero 'parent' pointer inside bpf_verifier_state * which this function copies over. It points to previous bpf_verifier_state * which is never reallocated */ -static int realloc_verifier_state(struct bpf_verifier_state *state, int size, - bool copy_old) +static int realloc_func_state(struct bpf_func_state *state, int size, + bool copy_old) { u32 old_size = state->allocated_stack; struct bpf_stack_state *new_stack; @@ -341,10 +381,23 @@ static int realloc_verifier_state(struct bpf_verifier_state *state, int size, return 0; } +static void free_func_state(struct bpf_func_state *state) +{ + if (!state) + return; + kfree(state->stack); + kfree(state); +} + static void free_verifier_state(struct bpf_verifier_state *state, bool free_self) { - kfree(state->stack); + int i; + + for (i = 0; i <= state->curframe; i++) { + free_func_state(state->frame[i]); + state->frame[i] = NULL; + } if (free_self) kfree(state); } @@ -352,18 +405,46 @@ static void free_verifier_state(struct bpf_verifier_state *state, /* copy verifier state from src to dst growing dst stack space * when necessary to accommodate larger src stack */ -static int copy_verifier_state(struct bpf_verifier_state *dst, - const struct bpf_verifier_state *src) +static int copy_func_state(struct bpf_func_state *dst, + const struct bpf_func_state *src) { int err; - err = realloc_verifier_state(dst, src->allocated_stack, false); + err = realloc_func_state(dst, src->allocated_stack, false); if (err) return err; - memcpy(dst, src, offsetof(struct bpf_verifier_state, allocated_stack)); + memcpy(dst, src, offsetof(struct bpf_func_state, allocated_stack)); return copy_stack_state(dst, src); } +static int copy_verifier_state(struct bpf_verifier_state *dst_state, + const struct bpf_verifier_state *src) +{ + struct bpf_func_state *dst; + int i, err; + + /* if dst has more stack frames then src frame, free them */ + for (i = src->curframe + 1; i <= dst_state->curframe; i++) { + free_func_state(dst_state->frame[i]); + dst_state->frame[i] = NULL; + } + dst_state->curframe = src->curframe; + dst_state->parent = src->parent; + for (i = 0; i <= src->curframe; i++) { + dst = dst_state->frame[i]; + if (!dst) { + dst = kzalloc(sizeof(*dst), GFP_KERNEL); + if (!dst) + return -ENOMEM; + dst_state->frame[i] = dst; + } + err = copy_func_state(dst, src->frame[i]); + if (err) + return err; + } + return 0; +} + static int pop_stack(struct bpf_verifier_env *env, int *prev_insn_idx, int *insn_idx) { @@ -416,6 +497,8 @@ static struct bpf_verifier_state *push_stack(struct bpf_verifier_env *env, } return &elem->st; err: + free_verifier_state(env->cur_state, true); + env->cur_state = NULL; /* pop all elements and return */ while (!pop_stack(env, NULL, NULL)); return NULL; @@ -425,6 +508,10 @@ err: static const int caller_saved[CALLER_SAVED_REGS] = { BPF_REG_0, BPF_REG_1, BPF_REG_2, BPF_REG_3, BPF_REG_4, BPF_REG_5 }; +#define CALLEE_SAVED_REGS 5 +static const int callee_saved[CALLEE_SAVED_REGS] = { + BPF_REG_6, BPF_REG_7, BPF_REG_8, BPF_REG_9 +}; static void __mark_reg_not_init(struct bpf_reg_state *reg); @@ -449,6 +536,13 @@ static void __mark_reg_known_zero(struct bpf_reg_state *reg) __mark_reg_known(reg, 0); } +static void __mark_reg_const_zero(struct bpf_reg_state *reg) +{ + __mark_reg_known(reg, 0); + reg->off = 0; + reg->type = SCALAR_VALUE; +} + static void mark_reg_known_zero(struct bpf_verifier_env *env, struct bpf_reg_state *regs, u32 regno) { @@ -560,6 +654,7 @@ static void __mark_reg_unknown(struct bpf_reg_state *reg) reg->id = 0; reg->off = 0; reg->var_off = tnum_unknown; + reg->frameno = 0; __mark_reg_unbounded(reg); } @@ -568,8 +663,8 @@ static void mark_reg_unknown(struct bpf_verifier_env *env, { if (WARN_ON(regno >= MAX_BPF_REG)) { verbose(env, "mark_reg_unknown(regs, %u)\n", regno); - /* Something bad happened, let's kill all regs */ - for (regno = 0; regno < MAX_BPF_REG; regno++) + /* Something bad happened, let's kill all regs except FP */ + for (regno = 0; regno < BPF_REG_FP; regno++) __mark_reg_not_init(regs + regno); return; } @@ -587,8 +682,8 @@ static void mark_reg_not_init(struct bpf_verifier_env *env, { if (WARN_ON(regno >= MAX_BPF_REG)) { verbose(env, "mark_reg_not_init(regs, %u)\n", regno); - /* Something bad happened, let's kill all regs */ - for (regno = 0; regno < MAX_BPF_REG; regno++) + /* Something bad happened, let's kill all regs except FP */ + for (regno = 0; regno < BPF_REG_FP; regno++) __mark_reg_not_init(regs + regno); return; } @@ -596,8 +691,9 @@ static void mark_reg_not_init(struct bpf_verifier_env *env, } static void init_reg_state(struct bpf_verifier_env *env, - struct bpf_reg_state *regs) + struct bpf_func_state *state) { + struct bpf_reg_state *regs = state->regs; int i; for (i = 0; i < MAX_BPF_REG; i++) { @@ -608,41 +704,218 @@ static void init_reg_state(struct bpf_verifier_env *env, /* frame pointer */ regs[BPF_REG_FP].type = PTR_TO_STACK; mark_reg_known_zero(env, regs, BPF_REG_FP); + regs[BPF_REG_FP].frameno = state->frameno; /* 1st arg to a function */ regs[BPF_REG_1].type = PTR_TO_CTX; mark_reg_known_zero(env, regs, BPF_REG_1); } +#define BPF_MAIN_FUNC (-1) +static void init_func_state(struct bpf_verifier_env *env, + struct bpf_func_state *state, + int callsite, int frameno, int subprogno) +{ + state->callsite = callsite; + state->frameno = frameno; + state->subprogno = subprogno; + init_reg_state(env, state); +} + enum reg_arg_type { SRC_OP, /* register is used as source operand */ DST_OP, /* register is used as destination operand */ DST_OP_NO_MARK /* same as above, check only, don't mark */ }; -static void mark_reg_read(const struct bpf_verifier_state *state, u32 regno) +static int cmp_subprogs(const void *a, const void *b) { - struct bpf_verifier_state *parent = state->parent; + return *(int *)a - *(int *)b; +} + +static int find_subprog(struct bpf_verifier_env *env, int off) +{ + u32 *p; + + p = bsearch(&off, env->subprog_starts, env->subprog_cnt, + sizeof(env->subprog_starts[0]), cmp_subprogs); + if (!p) + return -ENOENT; + return p - env->subprog_starts; + +} + +static int add_subprog(struct bpf_verifier_env *env, int off) +{ + int insn_cnt = env->prog->len; + int ret; + + if (off >= insn_cnt || off < 0) { + verbose(env, "call to invalid destination\n"); + return -EINVAL; + } + ret = find_subprog(env, off); + if (ret >= 0) + return 0; + if (env->subprog_cnt >= BPF_MAX_SUBPROGS) { + verbose(env, "too many subprograms\n"); + return -E2BIG; + } + env->subprog_starts[env->subprog_cnt++] = off; + sort(env->subprog_starts, env->subprog_cnt, + sizeof(env->subprog_starts[0]), cmp_subprogs, NULL); + return 0; +} + +static int check_subprogs(struct bpf_verifier_env *env) +{ + int i, ret, subprog_start, subprog_end, off, cur_subprog = 0; + struct bpf_insn *insn = env->prog->insnsi; + int insn_cnt = env->prog->len; + + /* determine subprog starts. The end is one before the next starts */ + for (i = 0; i < insn_cnt; i++) { + if (insn[i].code != (BPF_JMP | BPF_CALL)) + continue; + if (insn[i].src_reg != BPF_PSEUDO_CALL) + continue; + if (!env->allow_ptr_leaks) { + verbose(env, "function calls to other bpf functions are allowed for root only\n"); + return -EPERM; + } + if (bpf_prog_is_dev_bound(env->prog->aux)) { + verbose(env, "function calls in offloaded programs are not supported yet\n"); + return -EINVAL; + } + ret = add_subprog(env, i + insn[i].imm + 1); + if (ret < 0) + return ret; + } + + if (env->log.level > 1) + for (i = 0; i < env->subprog_cnt; i++) + verbose(env, "func#%d @%d\n", i, env->subprog_starts[i]); + + /* now check that all jumps are within the same subprog */ + subprog_start = 0; + if (env->subprog_cnt == cur_subprog) + subprog_end = insn_cnt; + else + subprog_end = env->subprog_starts[cur_subprog++]; + for (i = 0; i < insn_cnt; i++) { + u8 code = insn[i].code; + + if (BPF_CLASS(code) != BPF_JMP) + goto next; + if (BPF_OP(code) == BPF_EXIT || BPF_OP(code) == BPF_CALL) + goto next; + off = i + insn[i].off + 1; + if (off < subprog_start || off >= subprog_end) { + verbose(env, "jump out of range from insn %d to %d\n", i, off); + return -EINVAL; + } +next: + if (i == subprog_end - 1) { + /* to avoid fall-through from one subprog into another + * the last insn of the subprog should be either exit + * or unconditional jump back + */ + if (code != (BPF_JMP | BPF_EXIT) && + code != (BPF_JMP | BPF_JA)) { + verbose(env, "last insn is not an exit or jmp\n"); + return -EINVAL; + } + subprog_start = subprog_end; + if (env->subprog_cnt == cur_subprog) + subprog_end = insn_cnt; + else + subprog_end = env->subprog_starts[cur_subprog++]; + } + } + return 0; +} + +static +struct bpf_verifier_state *skip_callee(struct bpf_verifier_env *env, + const struct bpf_verifier_state *state, + struct bpf_verifier_state *parent, + u32 regno) +{ + struct bpf_verifier_state *tmp = NULL; + + /* 'parent' could be a state of caller and + * 'state' could be a state of callee. In such case + * parent->curframe < state->curframe + * and it's ok for r1 - r5 registers + * + * 'parent' could be a callee's state after it bpf_exit-ed. + * In such case parent->curframe > state->curframe + * and it's ok for r0 only + */ + if (parent->curframe == state->curframe || + (parent->curframe < state->curframe && + regno >= BPF_REG_1 && regno <= BPF_REG_5) || + (parent->curframe > state->curframe && + regno == BPF_REG_0)) + return parent; + + if (parent->curframe > state->curframe && + regno >= BPF_REG_6) { + /* for callee saved regs we have to skip the whole chain + * of states that belong to callee and mark as LIVE_READ + * the registers before the call + */ + tmp = parent; + while (tmp && tmp->curframe != state->curframe) { + tmp = tmp->parent; + } + if (!tmp) + goto bug; + parent = tmp; + } else { + goto bug; + } + return parent; +bug: + verbose(env, "verifier bug regno %d tmp %p\n", regno, tmp); + verbose(env, "regno %d parent frame %d current frame %d\n", + regno, parent->curframe, state->curframe); + return NULL; +} + +static int mark_reg_read(struct bpf_verifier_env *env, + const struct bpf_verifier_state *state, + struct bpf_verifier_state *parent, + u32 regno) +{ + bool writes = parent == state->parent; /* Observe write marks */ if (regno == BPF_REG_FP) /* We don't need to worry about FP liveness because it's read-only */ - return; + return 0; while (parent) { /* if read wasn't screened by an earlier write ... */ - if (state->regs[regno].live & REG_LIVE_WRITTEN) + if (writes && state->frame[state->curframe]->regs[regno].live & REG_LIVE_WRITTEN) break; + parent = skip_callee(env, state, parent, regno); + if (!parent) + return -EFAULT; /* ... then we depend on parent's value */ - parent->regs[regno].live |= REG_LIVE_READ; + parent->frame[parent->curframe]->regs[regno].live |= REG_LIVE_READ; state = parent; parent = state->parent; + writes = true; } + return 0; } static int check_reg_arg(struct bpf_verifier_env *env, u32 regno, enum reg_arg_type t) { - struct bpf_reg_state *regs = env->cur_state->regs; + struct bpf_verifier_state *vstate = env->cur_state; + struct bpf_func_state *state = vstate->frame[vstate->curframe]; + struct bpf_reg_state *regs = state->regs; if (regno >= MAX_BPF_REG) { verbose(env, "R%d is invalid\n", regno); @@ -655,7 +928,7 @@ static int check_reg_arg(struct bpf_verifier_env *env, u32 regno, verbose(env, "R%d !read_ok\n", regno); return -EACCES; } - mark_reg_read(env->cur_state, regno); + return mark_reg_read(env, vstate, vstate->parent, regno); } else { /* check whether register used as dest operand can be written to */ if (regno == BPF_REG_FP) { @@ -686,17 +959,25 @@ static bool is_spillable_regtype(enum bpf_reg_type type) } } +/* Does this register contain a constant zero? */ +static bool register_is_null(struct bpf_reg_state *reg) +{ + return reg->type == SCALAR_VALUE && tnum_equals_const(reg->var_off, 0); +} + /* check_stack_read/write functions track spill/fill of registers, * stack boundary and alignment are checked in check_mem_access() */ static int check_stack_write(struct bpf_verifier_env *env, - struct bpf_verifier_state *state, int off, - int size, int value_regno) + struct bpf_func_state *state, /* func where register points to */ + int off, int size, int value_regno) { + struct bpf_func_state *cur; /* state of the current function */ int i, slot = -off - 1, spi = slot / BPF_REG_SIZE, err; + enum bpf_reg_type type; - err = realloc_verifier_state(state, round_up(slot + 1, BPF_REG_SIZE), - true); + err = realloc_func_state(state, round_up(slot + 1, BPF_REG_SIZE), + true); if (err) return err; /* caller checked that off % size == 0 and -MAX_BPF_STACK <= off < 0, @@ -709,8 +990,9 @@ static int check_stack_write(struct bpf_verifier_env *env, return -EACCES; } + cur = env->cur_state->frame[env->cur_state->curframe]; if (value_regno >= 0 && - is_spillable_regtype(state->regs[value_regno].type)) { + is_spillable_regtype((type = cur->regs[value_regno].type))) { /* register containing pointer is being spilled into stack */ if (size != BPF_REG_SIZE) { @@ -718,51 +1000,116 @@ static int check_stack_write(struct bpf_verifier_env *env, return -EACCES; } + if (state != cur && type == PTR_TO_STACK) { + verbose(env, "cannot spill pointers to stack into stack frame of the caller\n"); + return -EINVAL; + } + /* save register state */ - state->stack[spi].spilled_ptr = state->regs[value_regno]; + state->stack[spi].spilled_ptr = cur->regs[value_regno]; state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN; for (i = 0; i < BPF_REG_SIZE; i++) state->stack[spi].slot_type[i] = STACK_SPILL; } else { + u8 type = STACK_MISC; + /* regular write of data into stack */ state->stack[spi].spilled_ptr = (struct bpf_reg_state) {}; + /* only mark the slot as written if all 8 bytes were written + * otherwise read propagation may incorrectly stop too soon + * when stack slots are partially written. + * This heuristic means that read propagation will be + * conservative, since it will add reg_live_read marks + * to stack slots all the way to first state when programs + * writes+reads less than 8 bytes + */ + if (size == BPF_REG_SIZE) + state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN; + + /* when we zero initialize stack slots mark them as such */ + if (value_regno >= 0 && + register_is_null(&cur->regs[value_regno])) + type = STACK_ZERO; + for (i = 0; i < size; i++) state->stack[spi].slot_type[(slot - i) % BPF_REG_SIZE] = - STACK_MISC; + type; } return 0; } -static void mark_stack_slot_read(const struct bpf_verifier_state *state, int slot) +/* registers of every function are unique and mark_reg_read() propagates + * the liveness in the following cases: + * - from callee into caller for R1 - R5 that were used as arguments + * - from caller into callee for R0 that used as result of the call + * - from caller to the same caller skipping states of the callee for R6 - R9, + * since R6 - R9 are callee saved by implicit function prologue and + * caller's R6 != callee's R6, so when we propagate liveness up to + * parent states we need to skip callee states for R6 - R9. + * + * stack slot marking is different, since stacks of caller and callee are + * accessible in both (since caller can pass a pointer to caller's stack to + * callee which can pass it to another function), hence mark_stack_slot_read() + * has to propagate the stack liveness to all parent states at given frame number. + * Consider code: + * f1() { + * ptr = fp - 8; + * *ptr = ctx; + * call f2 { + * .. = *ptr; + * } + * .. = *ptr; + * } + * First *ptr is reading from f1's stack and mark_stack_slot_read() has + * to mark liveness at the f1's frame and not f2's frame. + * Second *ptr is also reading from f1's stack and mark_stack_slot_read() has + * to propagate liveness to f2 states at f1's frame level and further into + * f1 states at f1's frame level until write into that stack slot + */ +static void mark_stack_slot_read(struct bpf_verifier_env *env, + const struct bpf_verifier_state *state, + struct bpf_verifier_state *parent, + int slot, int frameno) { - struct bpf_verifier_state *parent = state->parent; + bool writes = parent == state->parent; /* Observe write marks */ while (parent) { + if (parent->frame[frameno]->allocated_stack <= slot * BPF_REG_SIZE) + /* since LIVE_WRITTEN mark is only done for full 8-byte + * write the read marks are conservative and parent + * state may not even have the stack allocated. In such case + * end the propagation, since the loop reached beginning + * of the function + */ + break; /* if read wasn't screened by an earlier write ... */ - if (state->stack[slot].spilled_ptr.live & REG_LIVE_WRITTEN) + if (writes && state->frame[frameno]->stack[slot].spilled_ptr.live & REG_LIVE_WRITTEN) break; /* ... then we depend on parent's value */ - parent->stack[slot].spilled_ptr.live |= REG_LIVE_READ; + parent->frame[frameno]->stack[slot].spilled_ptr.live |= REG_LIVE_READ; state = parent; parent = state->parent; + writes = true; } } static int check_stack_read(struct bpf_verifier_env *env, - struct bpf_verifier_state *state, int off, int size, - int value_regno) + struct bpf_func_state *reg_state /* func where register points to */, + int off, int size, int value_regno) { + struct bpf_verifier_state *vstate = env->cur_state; + struct bpf_func_state *state = vstate->frame[vstate->curframe]; int i, slot = -off - 1, spi = slot / BPF_REG_SIZE; u8 *stype; - if (state->allocated_stack <= slot) { + if (reg_state->allocated_stack <= slot) { verbose(env, "invalid read from stack off %d+0 size %d\n", off, size); return -EACCES; } - stype = state->stack[spi].slot_type; + stype = reg_state->stack[spi].slot_type; if (stype[0] == STACK_SPILL) { if (size != BPF_REG_SIZE) { @@ -778,21 +1125,44 @@ static int check_stack_read(struct bpf_verifier_env *env, if (value_regno >= 0) { /* restore register state from stack */ - state->regs[value_regno] = state->stack[spi].spilled_ptr; - mark_stack_slot_read(state, spi); + state->regs[value_regno] = reg_state->stack[spi].spilled_ptr; + /* mark reg as written since spilled pointer state likely + * has its liveness marks cleared by is_state_visited() + * which resets stack/reg liveness for state transitions + */ + state->regs[value_regno].live |= REG_LIVE_WRITTEN; } + mark_stack_slot_read(env, vstate, vstate->parent, spi, + reg_state->frameno); return 0; } else { + int zeros = 0; + for (i = 0; i < size; i++) { - if (stype[(slot - i) % BPF_REG_SIZE] != STACK_MISC) { - verbose(env, "invalid read from stack off %d+%d size %d\n", - off, i, size); - return -EACCES; + if (stype[(slot - i) % BPF_REG_SIZE] == STACK_MISC) + continue; + if (stype[(slot - i) % BPF_REG_SIZE] == STACK_ZERO) { + zeros++; + continue; } + verbose(env, "invalid read from stack off %d+%d size %d\n", + off, i, size); + return -EACCES; + } + mark_stack_slot_read(env, vstate, vstate->parent, spi, + reg_state->frameno); + if (value_regno >= 0) { + if (zeros == size) { + /* any size read into register is zero extended, + * so the whole register == const_zero + */ + __mark_reg_const_zero(&state->regs[value_regno]); + } else { + /* have read misc data from the stack */ + mark_reg_unknown(env, state->regs, value_regno); + } + state->regs[value_regno].live |= REG_LIVE_WRITTEN; } - if (value_regno >= 0) - /* have read misc data from the stack */ - mark_reg_unknown(env, state->regs, value_regno); return 0; } } @@ -817,7 +1187,8 @@ static int __check_map_access(struct bpf_verifier_env *env, u32 regno, int off, static int check_map_access(struct bpf_verifier_env *env, u32 regno, int off, int size, bool zero_size_allowed) { - struct bpf_verifier_state *state = env->cur_state; + struct bpf_verifier_state *vstate = env->cur_state; + struct bpf_func_state *state = vstate->frame[vstate->curframe]; struct bpf_reg_state *reg = &state->regs[regno]; int err; @@ -978,6 +1349,13 @@ static bool is_pointer_value(struct bpf_verifier_env *env, int regno) return __is_pointer_value(env->allow_ptr_leaks, cur_regs(env) + regno); } +static bool is_ctx_reg(struct bpf_verifier_env *env, int regno) +{ + const struct bpf_reg_state *reg = cur_regs(env) + regno; + + return reg->type == PTR_TO_CTX; +} + static int check_pkt_ptr_alignment(struct bpf_verifier_env *env, const struct bpf_reg_state *reg, int off, int size, bool strict) @@ -1059,6 +1437,11 @@ static int check_ptr_alignment(struct bpf_verifier_env *env, break; case PTR_TO_STACK: pointer_desc = "stack "; + /* The stack spill tracking logic in check_stack_write() + * and check_stack_read() relies on stack accesses being + * aligned. + */ + strict = true; break; default: break; @@ -1067,6 +1450,126 @@ static int check_ptr_alignment(struct bpf_verifier_env *env, strict); } +static int update_stack_depth(struct bpf_verifier_env *env, + const struct bpf_func_state *func, + int off) +{ + u16 stack = env->subprog_stack_depth[func->subprogno]; + + if (stack >= -off) + return 0; + + /* update known max for given subprogram */ + env->subprog_stack_depth[func->subprogno] = -off; + return 0; +} + +/* starting from main bpf function walk all instructions of the function + * and recursively walk all callees that given function can call. + * Ignore jump and exit insns. + * Since recursion is prevented by check_cfg() this algorithm + * only needs a local stack of MAX_CALL_FRAMES to remember callsites + */ +static int check_max_stack_depth(struct bpf_verifier_env *env) +{ + int depth = 0, frame = 0, subprog = 0, i = 0, subprog_end; + struct bpf_insn *insn = env->prog->insnsi; + int insn_cnt = env->prog->len; + int ret_insn[MAX_CALL_FRAMES]; + int ret_prog[MAX_CALL_FRAMES]; + +process_func: + /* round up to 32-bytes, since this is granularity + * of interpreter stack size + */ + depth += round_up(max_t(u32, env->subprog_stack_depth[subprog], 1), 32); + if (depth > MAX_BPF_STACK) { + verbose(env, "combined stack size of %d calls is %d. Too large\n", + frame + 1, depth); + return -EACCES; + } +continue_func: + if (env->subprog_cnt == subprog) + subprog_end = insn_cnt; + else + subprog_end = env->subprog_starts[subprog]; + for (; i < subprog_end; i++) { + if (insn[i].code != (BPF_JMP | BPF_CALL)) + continue; + if (insn[i].src_reg != BPF_PSEUDO_CALL) + continue; + /* remember insn and function to return to */ + ret_insn[frame] = i + 1; + ret_prog[frame] = subprog; + + /* find the callee */ + i = i + insn[i].imm + 1; + subprog = find_subprog(env, i); + if (subprog < 0) { + WARN_ONCE(1, "verifier bug. No program starts at insn %d\n", + i); + return -EFAULT; + } + subprog++; + frame++; + if (frame >= MAX_CALL_FRAMES) { + WARN_ONCE(1, "verifier bug. Call stack is too deep\n"); + return -EFAULT; + } + goto process_func; + } + /* end of for() loop means the last insn of the 'subprog' + * was reached. Doesn't matter whether it was JA or EXIT + */ + if (frame == 0) + return 0; + depth -= round_up(max_t(u32, env->subprog_stack_depth[subprog], 1), 32); + frame--; + i = ret_insn[frame]; + subprog = ret_prog[frame]; + goto continue_func; +} + +#ifndef CONFIG_BPF_JIT_ALWAYS_ON +static int get_callee_stack_depth(struct bpf_verifier_env *env, + const struct bpf_insn *insn, int idx) +{ + int start = idx + insn->imm + 1, subprog; + + subprog = find_subprog(env, start); + if (subprog < 0) { + WARN_ONCE(1, "verifier bug. No program starts at insn %d\n", + start); + return -EFAULT; + } + subprog++; + return env->subprog_stack_depth[subprog]; +} +#endif + +/* truncate register to smaller size (in bytes) + * must be called with size < BPF_REG_SIZE + */ +static void coerce_reg_to_size(struct bpf_reg_state *reg, int size) +{ + u64 mask; + + /* clear high bits in bit representation */ + reg->var_off = tnum_cast(reg->var_off, size); + + /* fix arithmetic bounds */ + mask = ((u64)1 << (size * 8)) - 1; + if ((reg->umin_value & ~mask) == (reg->umax_value & ~mask)) { + reg->umin_value &= mask; + reg->umax_value &= mask; + } else { + reg->umin_value = 0; + reg->umax_value = mask; + } + reg->smin_value = reg->umin_value; + reg->smax_value = reg->umax_value; +} + /* check whether memory at (regno + off) is accessible for t = (read | write) * if t==write, value_regno is a register which value is stored into memory * if t==read, value_regno is a register which will receive the value from memory @@ -1077,9 +1580,9 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn int bpf_size, enum bpf_access_type t, int value_regno) { - struct bpf_verifier_state *state = env->cur_state; struct bpf_reg_state *regs = cur_regs(env); struct bpf_reg_state *reg = regs + regno; + struct bpf_func_state *state; int size, err = 0; size = bpf_size_to_bytes(bpf_size); @@ -1168,8 +1671,10 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn return -EACCES; } - if (env->prog->aux->stack_depth < -off) - env->prog->aux->stack_depth = -off; + state = func(env, reg); + err = update_stack_depth(env, state, off); + if (err) + return err; if (t == BPF_WRITE) err = check_stack_write(env, state, off, size, @@ -1200,9 +1705,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn if (!err && size < BPF_REG_SIZE && value_regno >= 0 && t == BPF_READ && regs[value_regno].type == SCALAR_VALUE) { /* b/h/w load zero-extends, mark upper bits as known 0 */ - regs[value_regno].var_off = - tnum_cast(regs[value_regno].var_off, size); - __update_reg_bounds(®s[value_regno]); + coerce_reg_to_size(®s[value_regno], size); } return err; } @@ -1232,6 +1735,12 @@ static int check_xadd(struct bpf_verifier_env *env, int insn_idx, struct bpf_ins return -EACCES; } + if (is_ctx_reg(env, insn->dst_reg)) { + verbose(env, "BPF_XADD stores into R%d context is not allowed\n", + insn->dst_reg); + return -EACCES; + } + /* check whether atomic_add can read the memory */ err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off, BPF_SIZE(insn->code), BPF_READ, -1); @@ -1243,12 +1752,6 @@ static int check_xadd(struct bpf_verifier_env *env, int insn_idx, struct bpf_ins BPF_SIZE(insn->code), BPF_WRITE, -1); } -/* Does this register contain a constant zero? */ -static bool register_is_null(struct bpf_reg_state reg) -{ - return reg.type == SCALAR_VALUE && tnum_equals_const(reg.var_off, 0); -} - /* when register 'regno' is passed into function that will read 'access_size' * bytes from that pointer, make sure that it's within stack boundary * and all elements of stack are initialized. @@ -1259,31 +1762,32 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno, int access_size, bool zero_size_allowed, struct bpf_call_arg_meta *meta) { - struct bpf_verifier_state *state = env->cur_state; - struct bpf_reg_state *regs = state->regs; + struct bpf_reg_state *reg = cur_regs(env) + regno; + struct bpf_func_state *state = func(env, reg); int off, i, slot, spi; - if (regs[regno].type != PTR_TO_STACK) { + if (reg->type != PTR_TO_STACK) { /* Allow zero-byte read from NULL, regardless of pointer type */ if (zero_size_allowed && access_size == 0 && - register_is_null(regs[regno])) + register_is_null(reg)) return 0; verbose(env, "R%d type=%s expected=%s\n", regno, - reg_type_str[regs[regno].type], + reg_type_str[reg->type], reg_type_str[PTR_TO_STACK]); return -EACCES; } /* Only allow fixed-offset stack reads */ - if (!tnum_is_const(regs[regno].var_off)) { + if (!tnum_is_const(reg->var_off)) { char tn_buf[48]; - tnum_strn(tn_buf, sizeof(tn_buf), regs[regno].var_off); + tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); verbose(env, "invalid variable stack read R%d var_off=%s\n", regno, tn_buf); + return -EACCES; } - off = regs[regno].off + regs[regno].var_off.value; + off = reg->off + reg->var_off.value; if (off >= 0 || off < -MAX_BPF_STACK || off + access_size > 0 || access_size < 0 || (access_size == 0 && !zero_size_allowed)) { verbose(env, "invalid stack type R%d off=%d access_size=%d\n", @@ -1291,9 +1795,6 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno, return -EACCES; } - if (env->prog->aux->stack_depth < -off) - env->prog->aux->stack_depth = -off; - if (meta && meta->raw_mode) { meta->access_size = access_size; meta->regno = regno; @@ -1301,17 +1802,32 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno, } for (i = 0; i < access_size; i++) { + u8 *stype; + slot = -(off + i) - 1; spi = slot / BPF_REG_SIZE; - if (state->allocated_stack <= slot || - state->stack[spi].slot_type[slot % BPF_REG_SIZE] != - STACK_MISC) { - verbose(env, "invalid indirect read from stack off %d+%d size %d\n", - off, i, access_size); - return -EACCES; + if (state->allocated_stack <= slot) + goto err; + stype = &state->stack[spi].slot_type[slot % BPF_REG_SIZE]; + if (*stype == STACK_MISC) + goto mark; + if (*stype == STACK_ZERO) { + /* helper can write anything into the stack */ + *stype = STACK_MISC; + goto mark; } +err: + verbose(env, "invalid indirect read from stack off %d+%d size %d\n", + off, i, access_size); + return -EACCES; +mark: + /* reading any byte out of 8-byte 'spill_slot' will cause + * the whole slot to be marked as 'read' + */ + mark_stack_slot_read(env, env->cur_state, env->cur_state->parent, + spi, state->frameno); } - return 0; + return update_stack_depth(env, state, off); } static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, @@ -1334,6 +1850,19 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, } } +static bool arg_type_is_mem_ptr(enum bpf_arg_type type) +{ + return type == ARG_PTR_TO_MEM || + type == ARG_PTR_TO_MEM_OR_NULL || + type == ARG_PTR_TO_UNINIT_MEM; +} + +static bool arg_type_is_mem_size(enum bpf_arg_type type) +{ + return type == ARG_CONST_SIZE || + type == ARG_CONST_SIZE_OR_ZERO; +} + static int check_func_arg(struct bpf_verifier_env *env, u32 regno, enum bpf_arg_type arg_type, struct bpf_call_arg_meta *meta) @@ -1383,15 +1912,13 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, expected_type = PTR_TO_CTX; if (type != expected_type) goto err_type; - } else if (arg_type == ARG_PTR_TO_MEM || - arg_type == ARG_PTR_TO_MEM_OR_NULL || - arg_type == ARG_PTR_TO_UNINIT_MEM) { + } else if (arg_type_is_mem_ptr(arg_type)) { expected_type = PTR_TO_STACK; /* One exception here. In case function allows for NULL to be * passed in as argument, it's a SCALAR_VALUE type. Final test * happens during stack boundary checking. */ - if (register_is_null(*reg) && + if (register_is_null(reg) && arg_type == ARG_PTR_TO_MEM_OR_NULL) /* final test in check_stack_boundary() */; else if (!type_is_pkt_pointer(type) && @@ -1446,25 +1973,12 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, err = check_stack_boundary(env, regno, meta->map_ptr->value_size, false, NULL); - } else if (arg_type == ARG_CONST_SIZE || - arg_type == ARG_CONST_SIZE_OR_ZERO) { + } else if (arg_type_is_mem_size(arg_type)) { bool zero_size_allowed = (arg_type == ARG_CONST_SIZE_OR_ZERO); - /* bpf_xxx(..., buf, len) call will access 'len' bytes - * from stack pointer 'buf'. Check it - * note: regno == len, regno - 1 == buf - */ - if (regno == 0) { - /* kernel subsystem misconfigured verifier */ - verbose(env, - "ARG_CONST_SIZE cannot be first argument\n"); - return -EACCES; - } - /* The register is SCALAR_VALUE; the access check * happens using its boundaries. */ - if (!tnum_is_const(reg->var_off)) /* For unprivileged variable accesses, disable raw * mode so that the program is required to @@ -1564,6 +2078,10 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, case BPF_FUNC_tail_call: if (map->map_type != BPF_MAP_TYPE_PROG_ARRAY) goto error; + if (env->subprog_cnt) { + verbose(env, "tail_calls are not allowed in programs with bpf-to-bpf calls\n"); + return -EINVAL; + } break; case BPF_FUNC_perf_event_read: case BPF_FUNC_perf_event_output: @@ -1604,7 +2122,7 @@ error: return -EINVAL; } -static int check_raw_mode(const struct bpf_func_proto *fn) +static bool check_raw_mode_ok(const struct bpf_func_proto *fn) { int count = 0; @@ -1619,15 +2137,52 @@ static int check_raw_mode(const struct bpf_func_proto *fn) if (fn->arg5_type == ARG_PTR_TO_UNINIT_MEM) count++; - return count > 1 ? -EINVAL : 0; + /* We only support one arg being in raw mode at the moment, + * which is sufficient for the helper functions we have + * right now. + */ + return count <= 1; +} + +static bool check_args_pair_invalid(enum bpf_arg_type arg_curr, + enum bpf_arg_type arg_next) +{ + return (arg_type_is_mem_ptr(arg_curr) && + !arg_type_is_mem_size(arg_next)) || + (!arg_type_is_mem_ptr(arg_curr) && + arg_type_is_mem_size(arg_next)); +} + +static bool check_arg_pair_ok(const struct bpf_func_proto *fn) +{ + /* bpf_xxx(..., buf, len) call will access 'len' + * bytes from memory 'buf'. Both arg types need + * to be paired, so make sure there's no buggy + * helper function specification. + */ + if (arg_type_is_mem_size(fn->arg1_type) || + arg_type_is_mem_ptr(fn->arg5_type) || + check_args_pair_invalid(fn->arg1_type, fn->arg2_type) || + check_args_pair_invalid(fn->arg2_type, fn->arg3_type) || + check_args_pair_invalid(fn->arg3_type, fn->arg4_type) || + check_args_pair_invalid(fn->arg4_type, fn->arg5_type)) + return false; + + return true; +} + +static int check_func_proto(const struct bpf_func_proto *fn) +{ + return check_raw_mode_ok(fn) && + check_arg_pair_ok(fn) ? 0 : -EINVAL; } /* Packet data might have moved, any old PTR_TO_PACKET[_META,_END] * are now invalid, so turn them into unknown SCALAR_VALUE. */ -static void clear_all_pkt_pointers(struct bpf_verifier_env *env) +static void __clear_all_pkt_pointers(struct bpf_verifier_env *env, + struct bpf_func_state *state) { - struct bpf_verifier_state *state = env->cur_state; struct bpf_reg_state *regs = state->regs, *reg; int i; @@ -1644,7 +2199,121 @@ static void clear_all_pkt_pointers(struct bpf_verifier_env *env) } } -static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx) +static void clear_all_pkt_pointers(struct bpf_verifier_env *env) +{ + struct bpf_verifier_state *vstate = env->cur_state; + int i; + + for (i = 0; i <= vstate->curframe; i++) + __clear_all_pkt_pointers(env, vstate->frame[i]); +} + +static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn, + int *insn_idx) +{ + struct bpf_verifier_state *state = env->cur_state; + struct bpf_func_state *caller, *callee; + int i, subprog, target_insn; + + if (state->curframe + 1 >= MAX_CALL_FRAMES) { + verbose(env, "the call stack of %d frames is too deep\n", + state->curframe + 2); + return -E2BIG; + } + + target_insn = *insn_idx + insn->imm; + subprog = find_subprog(env, target_insn + 1); + if (subprog < 0) { + verbose(env, "verifier bug. No program starts at insn %d\n", + target_insn + 1); + return -EFAULT; + } + + caller = state->frame[state->curframe]; + if (state->frame[state->curframe + 1]) { + verbose(env, "verifier bug. Frame %d already allocated\n", + state->curframe + 1); + return -EFAULT; + } + + callee = kzalloc(sizeof(*callee), GFP_KERNEL); + if (!callee) + return -ENOMEM; + state->frame[state->curframe + 1] = callee; + + /* callee cannot access r0, r6 - r9 for reading and has to write + * into its own stack before reading from it. + * callee can read/write into caller's stack + */ + init_func_state(env, callee, + /* remember the callsite, it will be used by bpf_exit */ + *insn_idx /* callsite */, + state->curframe + 1 /* frameno within this callchain */, + subprog + 1 /* subprog number within this prog */); + + /* copy r1 - r5 args that callee can access */ + for (i = BPF_REG_1; i <= BPF_REG_5; i++) + callee->regs[i] = caller->regs[i]; + + /* after the call regsiters r0 - r5 were scratched */ + for (i = 0; i < CALLER_SAVED_REGS; i++) { + mark_reg_not_init(env, caller->regs, caller_saved[i]); + check_reg_arg(env, caller_saved[i], DST_OP_NO_MARK); + } + + /* only increment it after check_reg_arg() finished */ + state->curframe++; + + /* and go analyze first insn of the callee */ + *insn_idx = target_insn; + + if (env->log.level) { + verbose(env, "caller:\n"); + print_verifier_state(env, caller); + verbose(env, "callee:\n"); + print_verifier_state(env, callee); + } + return 0; +} + +static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx) +{ + struct bpf_verifier_state *state = env->cur_state; + struct bpf_func_state *caller, *callee; + struct bpf_reg_state *r0; + + callee = state->frame[state->curframe]; + r0 = &callee->regs[BPF_REG_0]; + if (r0->type == PTR_TO_STACK) { + /* technically it's ok to return caller's stack pointer + * (or caller's caller's pointer) back to the caller, + * since these pointers are valid. Only current stack + * pointer will be invalid as soon as function exits, + * but let's be conservative + */ + verbose(env, "cannot return stack pointer to the caller\n"); + return -EINVAL; + } + + state->curframe--; + caller = state->frame[state->curframe]; + /* return to the caller whatever r0 had in the callee */ + caller->regs[BPF_REG_0] = *r0; + + *insn_idx = callee->callsite + 1; + if (env->log.level) { + verbose(env, "returning from callee:\n"); + print_verifier_state(env, callee); + verbose(env, "to caller at %d:\n", *insn_idx); + print_verifier_state(env, caller); + } + /* clear everything in the callee */ + free_func_state(callee); + state->frame[state->curframe + 1] = NULL; + return 0; +} + +static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn_idx) { const struct bpf_func_proto *fn = NULL; struct bpf_reg_state *regs; @@ -1661,7 +2330,6 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx) if (env->ops->get_func_proto) fn = env->ops->get_func_proto(func_id); - if (!fn) { verbose(env, "unknown func %s#%d\n", func_id_name(func_id), func_id); @@ -1674,15 +2342,18 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx) return -EINVAL; } + /* With LD_ABS/IND some JITs save/restore skb from r1. */ changes_data = bpf_helper_changes_pkt_data(fn->func); + if (changes_data && fn->arg1_type != ARG_PTR_TO_CTX) { + verbose(env, "kernel subsystem misconfigured func %s#%d: r1 != ctx\n", + func_id_name(func_id), func_id); + return -EINVAL; + } memset(&meta, 0, sizeof(meta)); meta.pkt_access = fn->pkt_access; - /* We only support one arg being in raw mode at the moment, which - * is sufficient for the helper functions we have right now. - */ - err = check_raw_mode(fn); + err = check_func_proto(fn); if (err) { verbose(env, "kernel subsystem misconfigured func %s#%d\n", func_id_name(func_id), func_id); @@ -1696,6 +2367,13 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx) err = check_func_arg(env, BPF_REG_2, fn->arg2_type, &meta); if (err) return err; + if (func_id == BPF_FUNC_tail_call) { + if (meta.map_ptr == NULL) { + verbose(env, "verifier bug\n"); + return -EINVAL; + } + env->insn_aux_data[insn_idx].map_ptr = meta.map_ptr; + } err = check_func_arg(env, BPF_REG_3, fn->arg3_type, &meta); if (err) return err; @@ -1766,14 +2444,6 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx) return 0; } -static void coerce_reg_to_32(struct bpf_reg_state *reg) -{ - /* clear high 32 bits */ - reg->var_off = tnum_cast(reg->var_off, 4); - /* Update bounds */ - __update_reg_bounds(reg); -} - static bool signed_add_overflows(s64 a, s64 b) { /* Do the add in u64, where overflow is well-defined */ @@ -1794,6 +2464,41 @@ static bool signed_sub_overflows(s64 a, s64 b) return res > a; } +static bool check_reg_sane_offset(struct bpf_verifier_env *env, + const struct bpf_reg_state *reg, + enum bpf_reg_type type) +{ + bool known = tnum_is_const(reg->var_off); + s64 val = reg->var_off.value; + s64 smin = reg->smin_value; + + if (known && (val >= BPF_MAX_VAR_OFF || val <= -BPF_MAX_VAR_OFF)) { + verbose(env, "math between %s pointer and %lld is not allowed\n", + reg_type_str[type], val); + return false; + } + + if (reg->off >= BPF_MAX_VAR_OFF || reg->off <= -BPF_MAX_VAR_OFF) { + verbose(env, "%s pointer offset %d is not allowed\n", + reg_type_str[type], reg->off); + return false; + } + + if (smin == S64_MIN) { + verbose(env, "math between %s pointer and register with unbounded min value is not allowed\n", + reg_type_str[type]); + return false; + } + + if (smin >= BPF_MAX_VAR_OFF || smin <= -BPF_MAX_VAR_OFF) { + verbose(env, "value %lld makes %s pointer be out of bounds\n", + smin, reg_type_str[type]); + return false; + } + + return true; +} + /* Handles arithmetic on a pointer and a scalar: computes new min/max and var_off. * Caller should also handle BPF_MOV case separately. * If we return -EACCES, caller may want to try again treating pointer as a @@ -1804,7 +2509,9 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, const struct bpf_reg_state *ptr_reg, const struct bpf_reg_state *off_reg) { - struct bpf_reg_state *regs = cur_regs(env), *dst_reg; + struct bpf_verifier_state *vstate = env->cur_state; + struct bpf_func_state *state = vstate->frame[vstate->curframe]; + struct bpf_reg_state *regs = state->regs, *dst_reg; bool known = tnum_is_const(off_reg->var_off); s64 smin_val = off_reg->smin_value, smax_val = off_reg->smax_value, smin_ptr = ptr_reg->smin_value, smax_ptr = ptr_reg->smax_value; @@ -1815,44 +2522,36 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, dst_reg = ®s[dst]; - if (WARN_ON_ONCE(known && (smin_val != smax_val))) { - print_verifier_state(env, env->cur_state); - verbose(env, - "verifier internal error: known but bad sbounds\n"); - return -EINVAL; - } - if (WARN_ON_ONCE(known && (umin_val != umax_val))) { - print_verifier_state(env, env->cur_state); - verbose(env, - "verifier internal error: known but bad ubounds\n"); - return -EINVAL; + if ((known && (smin_val != smax_val || umin_val != umax_val)) || + smin_val > smax_val || umin_val > umax_val) { + /* Taint dst register if offset had invalid bounds derived from + * e.g. dead branches. + */ + __mark_reg_unknown(dst_reg); + return 0; } if (BPF_CLASS(insn->code) != BPF_ALU64) { /* 32-bit ALU ops on pointers produce (meaningless) scalars */ - if (!env->allow_ptr_leaks) - verbose(env, - "R%d 32-bit pointer arithmetic prohibited\n", - dst); + verbose(env, + "R%d 32-bit pointer arithmetic prohibited\n", + dst); return -EACCES; } if (ptr_reg->type == PTR_TO_MAP_VALUE_OR_NULL) { - if (!env->allow_ptr_leaks) - verbose(env, "R%d pointer arithmetic on PTR_TO_MAP_VALUE_OR_NULL prohibited, null-check it first\n", - dst); + verbose(env, "R%d pointer arithmetic on PTR_TO_MAP_VALUE_OR_NULL prohibited, null-check it first\n", + dst); return -EACCES; } if (ptr_reg->type == CONST_PTR_TO_MAP) { - if (!env->allow_ptr_leaks) - verbose(env, "R%d pointer arithmetic on CONST_PTR_TO_MAP prohibited\n", - dst); + verbose(env, "R%d pointer arithmetic on CONST_PTR_TO_MAP prohibited\n", + dst); return -EACCES; } if (ptr_reg->type == PTR_TO_PACKET_END) { - if (!env->allow_ptr_leaks) - verbose(env, "R%d pointer arithmetic on PTR_TO_PACKET_END prohibited\n", - dst); + verbose(env, "R%d pointer arithmetic on PTR_TO_PACKET_END prohibited\n", + dst); return -EACCES; } @@ -1862,6 +2561,10 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, dst_reg->type = ptr_reg->type; dst_reg->id = ptr_reg->id; + if (!check_reg_sane_offset(env, off_reg, ptr_reg->type) || + !check_reg_sane_offset(env, ptr_reg, ptr_reg->type)) + return -EINVAL; + switch (opcode) { case BPF_ADD: /* We can take a fixed offset as long as it doesn't overflow @@ -1915,9 +2618,8 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, case BPF_SUB: if (dst_reg == off_reg) { /* scalar -= pointer. Creates an unknown scalar */ - if (!env->allow_ptr_leaks) - verbose(env, "R%d tried to subtract pointer from scalar\n", - dst); + verbose(env, "R%d tried to subtract pointer from scalar\n", + dst); return -EACCES; } /* We don't allow subtraction from FP, because (according to @@ -1925,9 +2627,8 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, * be able to deal with it. */ if (ptr_reg->type == PTR_TO_STACK) { - if (!env->allow_ptr_leaks) - verbose(env, "R%d subtraction from stack pointer prohibited\n", - dst); + verbose(env, "R%d subtraction from stack pointer prohibited\n", + dst); return -EACCES; } if (known && (ptr_reg->off - smin_val == @@ -1976,28 +2677,30 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, case BPF_AND: case BPF_OR: case BPF_XOR: - /* bitwise ops on pointers are troublesome, prohibit for now. - * (However, in principle we could allow some cases, e.g. - * ptr &= ~3 which would reduce min_value by 3.) - */ - if (!env->allow_ptr_leaks) - verbose(env, "R%d bitwise operator %s on pointer prohibited\n", - dst, bpf_alu_string[opcode >> 4]); + /* bitwise ops on pointers are troublesome, prohibit. */ + verbose(env, "R%d bitwise operator %s on pointer prohibited\n", + dst, bpf_alu_string[opcode >> 4]); return -EACCES; default: /* other operators (e.g. MUL,LSH) produce non-pointer results */ - if (!env->allow_ptr_leaks) - verbose(env, "R%d pointer arithmetic with %s operator prohibited\n", - dst, bpf_alu_string[opcode >> 4]); + verbose(env, "R%d pointer arithmetic with %s operator prohibited\n", + dst, bpf_alu_string[opcode >> 4]); return -EACCES; } + if (!check_reg_sane_offset(env, dst_reg, ptr_reg->type)) + return -EINVAL; + __update_reg_bounds(dst_reg); __reg_deduce_bounds(dst_reg); __reg_bound_offset(dst_reg); return 0; } +/* WARNING: This function does calculations on 64-bit values, but the actual + * execution may occur on 32-bit values. Therefore, things like bitshifts + * need extra checks in the 32-bit case. + */ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, struct bpf_insn *insn, struct bpf_reg_state *dst_reg, @@ -2008,12 +2711,8 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, bool src_known, dst_known; s64 smin_val, smax_val; u64 umin_val, umax_val; + u64 insn_bitness = (BPF_CLASS(insn->code) == BPF_ALU64) ? 64 : 32; - if (BPF_CLASS(insn->code) != BPF_ALU64) { - /* 32-bit ALU ops are (32,32)->64 */ - coerce_reg_to_32(dst_reg); - coerce_reg_to_32(&src_reg); - } smin_val = src_reg.smin_value; smax_val = src_reg.smax_value; umin_val = src_reg.umin_value; @@ -2021,6 +2720,21 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, src_known = tnum_is_const(src_reg.var_off); dst_known = tnum_is_const(dst_reg->var_off); + if ((src_known && (smin_val != smax_val || umin_val != umax_val)) || + smin_val > smax_val || umin_val > umax_val) { + /* Taint dst register if offset had invalid bounds derived from + * e.g. dead branches. + */ + __mark_reg_unknown(dst_reg); + return 0; + } + + if (!src_known && + opcode != BPF_ADD && opcode != BPF_SUB && opcode != BPF_AND) { + __mark_reg_unknown(dst_reg); + return 0; + } + switch (opcode) { case BPF_ADD: if (signed_add_overflows(dst_reg->smin_value, smin_val) || @@ -2149,9 +2863,9 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, __update_reg_bounds(dst_reg); break; case BPF_LSH: - if (umax_val > 63) { - /* Shifts greater than 63 are undefined. This includes - * shifts by a negative number. + if (umax_val >= insn_bitness) { + /* Shifts greater than 31 or 63 are undefined. + * This includes shifts by a negative number. */ mark_reg_unknown(env, regs, insn->dst_reg); break; @@ -2177,27 +2891,29 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, __update_reg_bounds(dst_reg); break; case BPF_RSH: - if (umax_val > 63) { - /* Shifts greater than 63 are undefined. This includes - * shifts by a negative number. + if (umax_val >= insn_bitness) { + /* Shifts greater than 31 or 63 are undefined. + * This includes shifts by a negative number. */ mark_reg_unknown(env, regs, insn->dst_reg); break; } - /* BPF_RSH is an unsigned shift, so make the appropriate casts */ - if (dst_reg->smin_value < 0) { - if (umin_val) { - /* Sign bit will be cleared */ - dst_reg->smin_value = 0; - } else { - /* Lost sign bit information */ - dst_reg->smin_value = S64_MIN; - dst_reg->smax_value = S64_MAX; - } - } else { - dst_reg->smin_value = - (u64)(dst_reg->smin_value) >> umax_val; - } + /* BPF_RSH is an unsigned shift. If the value in dst_reg might + * be negative, then either: + * 1) src_reg might be zero, so the sign bit of the result is + * unknown, so we lose our signed bounds + * 2) it's known negative, thus the unsigned bounds capture the + * signed bounds + * 3) the signed bounds cross zero, so they tell us nothing + * about the result + * If the value in dst_reg is known nonnegative, then again the + * unsigned bounts capture the signed bounds. + * Thus, in all cases it suffices to blow away our signed bounds + * and rely on inferring new ones from the unsigned bounds and + * var_off of the result. + */ + dst_reg->smin_value = S64_MIN; + dst_reg->smax_value = S64_MAX; if (src_known) dst_reg->var_off = tnum_rshift(dst_reg->var_off, umin_val); @@ -2213,6 +2929,12 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, break; } + if (BPF_CLASS(insn->code) != BPF_ALU64) { + /* 32-bit ALU ops are (32,32)->32 */ + coerce_reg_to_size(dst_reg, 4); + coerce_reg_to_size(&src_reg, 4); + } + __reg_deduce_bounds(dst_reg); __reg_bound_offset(dst_reg); return 0; @@ -2224,10 +2946,11 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, static int adjust_reg_min_max_vals(struct bpf_verifier_env *env, struct bpf_insn *insn) { - struct bpf_reg_state *regs = cur_regs(env), *dst_reg, *src_reg; + struct bpf_verifier_state *vstate = env->cur_state; + struct bpf_func_state *state = vstate->frame[vstate->curframe]; + struct bpf_reg_state *regs = state->regs, *dst_reg, *src_reg; struct bpf_reg_state *ptr_reg = NULL, off_reg = {0}; u8 opcode = BPF_OP(insn->code); - int rc; dst_reg = ®s[insn->dst_reg]; src_reg = NULL; @@ -2238,43 +2961,29 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env, if (src_reg->type != SCALAR_VALUE) { if (dst_reg->type != SCALAR_VALUE) { /* Combining two pointers by any ALU op yields - * an arbitrary scalar. + * an arbitrary scalar. Disallow all math except + * pointer subtraction */ - if (!env->allow_ptr_leaks) { - verbose(env, "R%d pointer %s pointer prohibited\n", - insn->dst_reg, - bpf_alu_string[opcode >> 4]); - return -EACCES; + if (opcode == BPF_SUB){ + mark_reg_unknown(env, regs, insn->dst_reg); + return 0; } - mark_reg_unknown(env, regs, insn->dst_reg); - return 0; + verbose(env, "R%d pointer %s pointer prohibited\n", + insn->dst_reg, + bpf_alu_string[opcode >> 4]); + return -EACCES; } else { /* scalar += pointer * This is legal, but we have to reverse our * src/dest handling in computing the range */ - rc = adjust_ptr_min_max_vals(env, insn, - src_reg, dst_reg); - if (rc == -EACCES && env->allow_ptr_leaks) { - /* scalar += unknown scalar */ - __mark_reg_unknown(&off_reg); - return adjust_scalar_min_max_vals( - env, insn, - dst_reg, off_reg); - } - return rc; + return adjust_ptr_min_max_vals(env, insn, + src_reg, dst_reg); } } else if (ptr_reg) { /* pointer += scalar */ - rc = adjust_ptr_min_max_vals(env, insn, - dst_reg, src_reg); - if (rc == -EACCES && env->allow_ptr_leaks) { - /* unknown scalar += scalar */ - __mark_reg_unknown(dst_reg); - return adjust_scalar_min_max_vals( - env, insn, dst_reg, *src_reg); - } - return rc; + return adjust_ptr_min_max_vals(env, insn, + dst_reg, src_reg); } } else { /* Pretend the src is a reg with a known value, since we only @@ -2283,27 +2992,19 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env, off_reg.type = SCALAR_VALUE; __mark_reg_known(&off_reg, insn->imm); src_reg = &off_reg; - if (ptr_reg) { /* pointer += K */ - rc = adjust_ptr_min_max_vals(env, insn, - ptr_reg, src_reg); - if (rc == -EACCES && env->allow_ptr_leaks) { - /* unknown scalar += K */ - __mark_reg_unknown(dst_reg); - return adjust_scalar_min_max_vals( - env, insn, dst_reg, off_reg); - } - return rc; - } + if (ptr_reg) /* pointer += K */ + return adjust_ptr_min_max_vals(env, insn, + ptr_reg, src_reg); } /* Got here implies adding two SCALAR_VALUEs */ if (WARN_ON_ONCE(ptr_reg)) { - print_verifier_state(env, env->cur_state); + print_verifier_state(env, state); verbose(env, "verifier internal error: unexpected ptr_reg\n"); return -EINVAL; } if (WARN_ON(!src_reg)) { - print_verifier_state(env, env->cur_state); + print_verifier_state(env, state); verbose(env, "verifier internal error: no src_reg\n"); return -EINVAL; } @@ -2390,17 +3091,20 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) return -EACCES; } mark_reg_unknown(env, regs, insn->dst_reg); - /* high 32 bits are known zero. */ - regs[insn->dst_reg].var_off = tnum_cast( - regs[insn->dst_reg].var_off, 4); - __update_reg_bounds(®s[insn->dst_reg]); + coerce_reg_to_size(®s[insn->dst_reg], 4); } } else { /* case: R = imm * remember the value we stored into this reg */ regs[insn->dst_reg].type = SCALAR_VALUE; - __mark_reg_known(regs + insn->dst_reg, insn->imm); + if (BPF_CLASS(insn->code) == BPF_ALU64) { + __mark_reg_known(regs + insn->dst_reg, + insn->imm); + } else { + __mark_reg_known(regs + insn->dst_reg, + (u32)insn->imm); + } } } else if (opcode > BPF_END) { @@ -2436,6 +3140,11 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) return -EINVAL; } + if (opcode == BPF_ARSH && BPF_CLASS(insn->code) != BPF_ALU64) { + verbose(env, "BPF_ARSH not supported for 32 bit ALU\n"); + return -EINVAL; + } + if ((opcode == BPF_LSH || opcode == BPF_RSH || opcode == BPF_ARSH) && BPF_SRC(insn->code) == BPF_K) { int size = BPF_CLASS(insn->code) == BPF_ALU64 ? 64 : 32; @@ -2457,14 +3166,15 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) return 0; } -static void find_good_pkt_pointers(struct bpf_verifier_state *state, +static void find_good_pkt_pointers(struct bpf_verifier_state *vstate, struct bpf_reg_state *dst_reg, enum bpf_reg_type type, bool range_right_open) { + struct bpf_func_state *state = vstate->frame[vstate->curframe]; struct bpf_reg_state *regs = state->regs, *reg; u16 new_range; - int i; + int i, j; if (dst_reg->off < 0 || (dst_reg->off == 0 && range_right_open)) @@ -2534,12 +3244,15 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *state, /* keep the maximum range already checked */ regs[i].range = max(regs[i].range, new_range); - for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) { - if (state->stack[i].slot_type[0] != STACK_SPILL) - continue; - reg = &state->stack[i].spilled_ptr; - if (reg->type == type && reg->id == dst_reg->id) - reg->range = max(reg->range, new_range); + for (j = 0; j <= vstate->curframe; j++) { + state = vstate->frame[j]; + for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) { + if (state->stack[i].slot_type[0] != STACK_SPILL) + continue; + reg = &state->stack[i].spilled_ptr; + if (reg->type == type && reg->id == dst_reg->id) + reg->range = max(reg->range, new_range); + } } } @@ -2777,20 +3490,24 @@ static void mark_map_reg(struct bpf_reg_state *regs, u32 regno, u32 id, /* The logic is similar to find_good_pkt_pointers(), both could eventually * be folded together at some point. */ -static void mark_map_regs(struct bpf_verifier_state *state, u32 regno, +static void mark_map_regs(struct bpf_verifier_state *vstate, u32 regno, bool is_null) { + struct bpf_func_state *state = vstate->frame[vstate->curframe]; struct bpf_reg_state *regs = state->regs; u32 id = regs[regno].id; - int i; + int i, j; for (i = 0; i < MAX_BPF_REG; i++) mark_map_reg(regs, i, id, is_null); - for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) { - if (state->stack[i].slot_type[0] != STACK_SPILL) - continue; - mark_map_reg(&state->stack[i].spilled_ptr, 0, id, is_null); + for (j = 0; j <= vstate->curframe; j++) { + state = vstate->frame[j]; + for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) { + if (state->stack[i].slot_type[0] != STACK_SPILL) + continue; + mark_map_reg(&state->stack[i].spilled_ptr, 0, id, is_null); + } } } @@ -2890,8 +3607,10 @@ static bool try_match_pkt_pointers(const struct bpf_insn *insn, static int check_cond_jmp_op(struct bpf_verifier_env *env, struct bpf_insn *insn, int *insn_idx) { - struct bpf_verifier_state *other_branch, *this_branch = env->cur_state; - struct bpf_reg_state *regs = this_branch->regs, *dst_reg; + struct bpf_verifier_state *this_branch = env->cur_state; + struct bpf_verifier_state *other_branch; + struct bpf_reg_state *regs = this_branch->frame[this_branch->curframe]->regs; + struct bpf_reg_state *dst_reg, *other_branch_regs; u8 opcode = BPF_OP(insn->code); int err; @@ -2934,8 +3653,9 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, if (BPF_SRC(insn->code) == BPF_K && (opcode == BPF_JEQ || opcode == BPF_JNE) && dst_reg->type == SCALAR_VALUE && - tnum_equals_const(dst_reg->var_off, insn->imm)) { - if (opcode == BPF_JEQ) { + tnum_is_const(dst_reg->var_off)) { + if ((opcode == BPF_JEQ && dst_reg->var_off.value == insn->imm) || + (opcode == BPF_JNE && dst_reg->var_off.value != insn->imm)) { /* if (imm == imm) goto pc+off; * only follow the goto, ignore fall-through */ @@ -2953,6 +3673,7 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, other_branch = push_stack(env, *insn_idx + insn->off + 1, *insn_idx); if (!other_branch) return -EFAULT; + other_branch_regs = other_branch->frame[other_branch->curframe]->regs; /* detect if we are comparing against a constant value so we can adjust * our min/max values for our dst register. @@ -2965,22 +3686,22 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, if (dst_reg->type == SCALAR_VALUE && regs[insn->src_reg].type == SCALAR_VALUE) { if (tnum_is_const(regs[insn->src_reg].var_off)) - reg_set_min_max(&other_branch->regs[insn->dst_reg], + reg_set_min_max(&other_branch_regs[insn->dst_reg], dst_reg, regs[insn->src_reg].var_off.value, opcode); else if (tnum_is_const(dst_reg->var_off)) - reg_set_min_max_inv(&other_branch->regs[insn->src_reg], + reg_set_min_max_inv(&other_branch_regs[insn->src_reg], ®s[insn->src_reg], dst_reg->var_off.value, opcode); else if (opcode == BPF_JEQ || opcode == BPF_JNE) /* Comparing for equality, we can combine knowledge */ - reg_combine_min_max(&other_branch->regs[insn->src_reg], - &other_branch->regs[insn->dst_reg], + reg_combine_min_max(&other_branch_regs[insn->src_reg], + &other_branch_regs[insn->dst_reg], ®s[insn->src_reg], ®s[insn->dst_reg], opcode); } } else if (dst_reg->type == SCALAR_VALUE) { - reg_set_min_max(&other_branch->regs[insn->dst_reg], + reg_set_min_max(&other_branch_regs[insn->dst_reg], dst_reg, insn->imm, opcode); } @@ -3001,7 +3722,7 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, return -EACCES; } if (env->log.level) - print_verifier_state(env, this_branch); + print_verifier_state(env, this_branch->frame[this_branch->curframe]); return 0; } @@ -3086,6 +3807,18 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn) return -EINVAL; } + if (env->subprog_cnt) { + /* when program has LD_ABS insn JITs and interpreter assume + * that r1 == ctx == skb which is not the case for callees + * that can have arbitrary arguments. It's problematic + * for main prog as well since JITs would need to analyze + * all functions in order to make proper register save/restore + * decisions in the main prog. Hence disallow LD_ABS with calls + */ + verbose(env, "BPF_LD_[ABS|IND] instructions cannot be mixed with bpf-to-bpf calls\n"); + return -EINVAL; + } + if (insn->dst_reg != BPF_REG_0 || insn->off != 0 || BPF_SIZE(insn->code) == BPF_DW || (mode == BPF_ABS && insn->src_reg != BPF_REG_0)) { @@ -3262,6 +3995,10 @@ static int check_cfg(struct bpf_verifier_env *env) int ret = 0; int i, t; + ret = check_subprogs(env); + if (ret < 0) + return ret; + insn_state = kcalloc(insn_cnt, sizeof(int), GFP_KERNEL); if (!insn_state) return -ENOMEM; @@ -3294,6 +4031,14 @@ peek_stack: goto err_free; if (t + 1 < insn_cnt) env->explored_states[t + 1] = STATE_LIST_MARK; + if (insns[t].src_reg == BPF_PSEUDO_CALL) { + env->explored_states[t] = STATE_LIST_MARK; + ret = push_insn(t, t + insns[t].imm + 1, BRANCH, env); + if (ret == 1) + goto peek_stack; + else if (ret < 0) + goto err_free; + } } else if (opcode == BPF_JA) { if (BPF_SRC(insns[t].code) != BPF_K) { ret = -EINVAL; @@ -3412,11 +4157,21 @@ static bool check_ids(u32 old_id, u32 cur_id, struct idpair *idmap) static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur, struct idpair *idmap) { + bool equal; + if (!(rold->live & REG_LIVE_READ)) /* explored state didn't use this */ return true; - if (memcmp(rold, rcur, offsetof(struct bpf_reg_state, live)) == 0) + equal = memcmp(rold, rcur, offsetof(struct bpf_reg_state, frameno)) == 0; + + if (rold->type == PTR_TO_STACK) + /* two stack pointers are equal only if they're pointing to + * the same stack frame, since fp-8 in foo != fp-8 in bar + */ + return equal && rold->frameno == rcur->frameno; + + if (equal) return true; if (rold->type == NOT_INIT) @@ -3431,15 +4186,14 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur, return range_within(rold, rcur) && tnum_in(rold->var_off, rcur->var_off); } else { - /* if we knew anything about the old value, we're not - * equal, because we can't know anything about the - * scalar value of the pointer in the new value. + /* We're trying to use a pointer in place of a scalar. + * Even if the scalar was unbounded, this could lead to + * pointer leaks because scalars are allowed to leak + * while pointers are not. We could make this safe in + * special cases if root is calling us, but it's + * probably not worth the hassle. */ - return rold->umin_value == 0 && - rold->umax_value == U64_MAX && - rold->smin_value == S64_MIN && - rold->smax_value == S64_MAX && - tnum_is_unknown(rold->var_off); + return false; } case PTR_TO_MAP_VALUE: /* If the new min/max/var_off satisfy the old ones and @@ -3489,7 +4243,6 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur, tnum_in(rold->var_off, rcur->var_off); case PTR_TO_CTX: case CONST_PTR_TO_MAP: - case PTR_TO_STACK: case PTR_TO_PACKET_END: /* Only valid matches are exact, which memcmp() above * would have accepted @@ -3504,8 +4257,8 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur, return false; } -static bool stacksafe(struct bpf_verifier_state *old, - struct bpf_verifier_state *cur, +static bool stacksafe(struct bpf_func_state *old, + struct bpf_func_state *cur, struct idpair *idmap) { int i, spi; @@ -3523,8 +4276,19 @@ static bool stacksafe(struct bpf_verifier_state *old, for (i = 0; i < old->allocated_stack; i++) { spi = i / BPF_REG_SIZE; + if (!(old->stack[spi].spilled_ptr.live & REG_LIVE_READ)) + /* explored state didn't use this */ + continue; + if (old->stack[spi].slot_type[i % BPF_REG_SIZE] == STACK_INVALID) continue; + /* if old state was safe with misc data in the stack + * it will be safe with zero-initialized stack. + * The opposite is not true + */ + if (old->stack[spi].slot_type[i % BPF_REG_SIZE] == STACK_MISC && + cur->stack[spi].slot_type[i % BPF_REG_SIZE] == STACK_ZERO) + continue; if (old->stack[spi].slot_type[i % BPF_REG_SIZE] != cur->stack[spi].slot_type[i % BPF_REG_SIZE]) /* Ex: old explored (safe) state has STACK_SPILL in @@ -3581,9 +4345,8 @@ static bool stacksafe(struct bpf_verifier_state *old, * whereas register type in current state is meaningful, it means that * the current state will reach 'bpf_exit' instruction safely */ -static bool states_equal(struct bpf_verifier_env *env, - struct bpf_verifier_state *old, - struct bpf_verifier_state *cur) +static bool func_states_equal(struct bpf_func_state *old, + struct bpf_func_state *cur) { struct idpair *idmap; bool ret = false; @@ -3607,71 +4370,72 @@ out_free: return ret; } +static bool states_equal(struct bpf_verifier_env *env, + struct bpf_verifier_state *old, + struct bpf_verifier_state *cur) +{ + int i; + + if (old->curframe != cur->curframe) + return false; + + /* for states to be equal callsites have to be the same + * and all frame states need to be equivalent + */ + for (i = 0; i <= old->curframe; i++) { + if (old->frame[i]->callsite != cur->frame[i]->callsite) + return false; + if (!func_states_equal(old->frame[i], cur->frame[i])) + return false; + } + return true; +} + /* A write screens off any subsequent reads; but write marks come from the - * straight-line code between a state and its parent. When we arrive at a - * jump target (in the first iteration of the propagate_liveness() loop), - * we didn't arrive by the straight-line code, so read marks in state must - * propagate to parent regardless of state's write marks. + * straight-line code between a state and its parent. When we arrive at an + * equivalent state (jump target or such) we didn't arrive by the straight-line + * code, so read marks in the state must propagate to the parent regardless + * of the state's write marks. That's what 'parent == state->parent' comparison + * in mark_reg_read() and mark_stack_slot_read() is for. */ -static bool do_propagate_liveness(const struct bpf_verifier_state *state, - struct bpf_verifier_state *parent) +static int propagate_liveness(struct bpf_verifier_env *env, + const struct bpf_verifier_state *vstate, + struct bpf_verifier_state *vparent) { - bool writes = parent == state->parent; /* Observe write marks */ - bool touched = false; /* any changes made? */ - int i; + int i, frame, err = 0; + struct bpf_func_state *state, *parent; - if (!parent) - return touched; + if (vparent->curframe != vstate->curframe) { + WARN(1, "propagate_live: parent frame %d current frame %d\n", + vparent->curframe, vstate->curframe); + return -EFAULT; + } /* Propagate read liveness of registers... */ BUILD_BUG_ON(BPF_REG_FP + 1 != MAX_BPF_REG); /* We don't need to worry about FP liveness because it's read-only */ for (i = 0; i < BPF_REG_FP; i++) { - if (parent->regs[i].live & REG_LIVE_READ) - continue; - if (writes && (state->regs[i].live & REG_LIVE_WRITTEN)) + if (vparent->frame[vparent->curframe]->regs[i].live & REG_LIVE_READ) continue; - if (state->regs[i].live & REG_LIVE_READ) { - parent->regs[i].live |= REG_LIVE_READ; - touched = true; + if (vstate->frame[vstate->curframe]->regs[i].live & REG_LIVE_READ) { + err = mark_reg_read(env, vstate, vparent, i); + if (err) + return err; } } + /* ... and stack slots */ - for (i = 0; i < state->allocated_stack / BPF_REG_SIZE && - i < parent->allocated_stack / BPF_REG_SIZE; i++) { - if (parent->stack[i].slot_type[0] != STACK_SPILL) - continue; - if (state->stack[i].slot_type[0] != STACK_SPILL) - continue; - if (parent->stack[i].spilled_ptr.live & REG_LIVE_READ) - continue; - if (writes && - (state->stack[i].spilled_ptr.live & REG_LIVE_WRITTEN)) - continue; - if (state->stack[i].spilled_ptr.live & REG_LIVE_READ) { - parent->stack[i].spilled_ptr.live |= REG_LIVE_READ; - touched = true; + for (frame = 0; frame <= vstate->curframe; frame++) { + state = vstate->frame[frame]; + parent = vparent->frame[frame]; + for (i = 0; i < state->allocated_stack / BPF_REG_SIZE && + i < parent->allocated_stack / BPF_REG_SIZE; i++) { + if (parent->stack[i].spilled_ptr.live & REG_LIVE_READ) + continue; + if (state->stack[i].spilled_ptr.live & REG_LIVE_READ) + mark_stack_slot_read(env, vstate, vparent, i, frame); } } - return touched; -} - -/* "parent" is "a state from which we reach the current state", but initially - * it is not the state->parent (i.e. "the state whose straight-line code leads - * to the current state"), instead it is the state that happened to arrive at - * a (prunable) equivalent of the current state. See comment above - * do_propagate_liveness() for consequences of this. - * This function is just a more efficient way of calling mark_reg_read() or - * mark_stack_slot_read() on each reg in "parent" that is read in "state", - * though it requires that parent != state->parent in the call arguments. - */ -static void propagate_liveness(const struct bpf_verifier_state *state, - struct bpf_verifier_state *parent) -{ - while (do_propagate_liveness(state, parent)) { - /* Something changed, so we need to feed those changes onward */ - state = parent; - parent = state->parent; - } + return err; } static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) @@ -3679,7 +4443,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) struct bpf_verifier_state_list *new_sl; struct bpf_verifier_state_list *sl; struct bpf_verifier_state *cur = env->cur_state; - int i, err; + int i, j, err; sl = env->explored_states[insn_idx]; if (!sl) @@ -3700,7 +4464,9 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) * they'll be immediately forgotten as we're pruning * this state and will pop a new one. */ - propagate_liveness(&sl->state, cur); + err = propagate_liveness(env, &sl->state, cur); + if (err) + return err; return 1; } sl = sl->next; @@ -3708,9 +4474,10 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) /* there were no equivalent states, remember current one. * technically the current state is not proven to be safe yet, - * but it will either reach bpf_exit (which means it's safe) or - * it will be rejected. Since there are no loops, we won't be - * seeing this 'insn_idx' instruction again on the way to bpf_exit + * but it will either reach outer most bpf_exit (which means it's safe) + * or it will be rejected. Since there are no loops, we won't be + * seeing this tuple (frame[0].callsite, frame[1].callsite, .. insn_idx) + * again on the way to bpf_exit */ new_sl = kzalloc(sizeof(struct bpf_verifier_state_list), GFP_KERNEL); if (!new_sl) @@ -3734,19 +4501,15 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) * explored_states can get read marks.) */ for (i = 0; i < BPF_REG_FP; i++) - cur->regs[i].live = REG_LIVE_NONE; - for (i = 0; i < cur->allocated_stack / BPF_REG_SIZE; i++) - if (cur->stack[i].slot_type[0] == STACK_SPILL) - cur->stack[i].spilled_ptr.live = REG_LIVE_NONE; - return 0; -} + cur->frame[cur->curframe]->regs[i].live = REG_LIVE_NONE; -static int ext_analyzer_insn_hook(struct bpf_verifier_env *env, - int insn_idx, int prev_insn_idx) -{ - if (env->dev_ops && env->dev_ops->insn_hook) - return env->dev_ops->insn_hook(env, insn_idx, prev_insn_idx); + /* all stack frames are accessible from callee, clear them all */ + for (j = 0; j <= cur->curframe; j++) { + struct bpf_func_state *frame = cur->frame[j]; + for (i = 0; i < frame->allocated_stack / BPF_REG_SIZE; i++) + frame->stack[i].spilled_ptr.live = REG_LIVE_NONE; + } return 0; } @@ -3755,7 +4518,7 @@ static int do_check(struct bpf_verifier_env *env) struct bpf_verifier_state *state; struct bpf_insn *insns = env->prog->insnsi; struct bpf_reg_state *regs; - int insn_cnt = env->prog->len; + int insn_cnt = env->prog->len, i; int insn_idx, prev_insn_idx = 0; int insn_processed = 0; bool do_print_state = false; @@ -3763,9 +4526,18 @@ static int do_check(struct bpf_verifier_env *env) state = kzalloc(sizeof(struct bpf_verifier_state), GFP_KERNEL); if (!state) return -ENOMEM; - env->cur_state = state; - init_reg_state(env, state->regs); + state->curframe = 0; state->parent = NULL; + state->frame[0] = kzalloc(sizeof(struct bpf_func_state), GFP_KERNEL); + if (!state->frame[0]) { + kfree(state); + return -ENOMEM; + } + env->cur_state = state; + init_func_state(env, state->frame[0], + BPF_MAIN_FUNC /* callsite */, + 0 /* frameno */, + 0 /* subprogno, zero == main subprog */); insn_idx = 0; for (;;) { struct bpf_insn *insn; @@ -3812,19 +4584,25 @@ static int do_check(struct bpf_verifier_env *env) else verbose(env, "\nfrom %d to %d:", prev_insn_idx, insn_idx); - print_verifier_state(env, state); + print_verifier_state(env, state->frame[state->curframe]); do_print_state = false; } if (env->log.level) { + const struct bpf_insn_cbs cbs = { + .cb_print = verbose, + }; + verbose(env, "%d: ", insn_idx); - print_bpf_insn(verbose, env, insn, - env->allow_ptr_leaks); + print_bpf_insn(&cbs, env, insn, env->allow_ptr_leaks); } - err = ext_analyzer_insn_hook(env, insn_idx, prev_insn_idx); - if (err) - return err; + if (bpf_prog_is_dev_bound(env->prog->aux)) { + err = bpf_prog_offload_verify_insn(env, insn_idx, + prev_insn_idx); + if (err) + return err; + } regs = cur_regs(env); env->insn_aux_data[insn_idx].seen = true; @@ -3932,6 +4710,12 @@ static int do_check(struct bpf_verifier_env *env) if (err) return err; + if (is_ctx_reg(env, insn->dst_reg)) { + verbose(env, "BPF_ST stores into R%d context is not allowed\n", + insn->dst_reg); + return -EACCES; + } + /* check that memory (dst_reg + off) is writeable */ err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off, BPF_SIZE(insn->code), BPF_WRITE, @@ -3945,13 +4729,17 @@ static int do_check(struct bpf_verifier_env *env) if (opcode == BPF_CALL) { if (BPF_SRC(insn->code) != BPF_K || insn->off != 0 || - insn->src_reg != BPF_REG_0 || + (insn->src_reg != BPF_REG_0 && + insn->src_reg != BPF_PSEUDO_CALL) || insn->dst_reg != BPF_REG_0) { verbose(env, "BPF_CALL uses reserved fields\n"); return -EINVAL; } - err = check_call(env, insn->imm, insn_idx); + if (insn->src_reg == BPF_PSEUDO_CALL) + err = check_func_call(env, insn, &insn_idx); + else + err = check_helper_call(env, insn->imm, insn_idx); if (err) return err; @@ -3976,6 +4764,16 @@ static int do_check(struct bpf_verifier_env *env) return -EINVAL; } + if (state->curframe) { + /* exit from nested function */ + prev_insn_idx = insn_idx; + err = prepare_func_exit(env, &insn_idx); + if (err) + return err; + do_print_state = true; + continue; + } + /* eBPF calling convetion is such that R0 is used * to return the value from eBPF program. * Make sure that it's readable at this time @@ -4036,8 +4834,17 @@ process_bpf_exit: insn_idx++; } - verbose(env, "processed %d insns, stack depth %d\n", insn_processed, - env->prog->aux->stack_depth); + verbose(env, "processed %d insns (limit %d), stack depth ", + insn_processed, BPF_COMPLEXITY_LIMIT_INSNS); + for (i = 0; i < env->subprog_cnt + 1; i++) { + u32 depth = env->subprog_stack_depth[i]; + + verbose(env, "%d", depth); + if (i + 1 < env->subprog_cnt + 1) + verbose(env, "+"); + } + verbose(env, "\n"); + env->prog->aux->stack_depth = env->subprog_stack_depth[0]; return 0; } @@ -4070,6 +4877,13 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env, return -EINVAL; } } + + if ((bpf_prog_is_dev_bound(prog->aux) || bpf_map_is_dev_bound(map)) && + !bpf_offload_dev_match(prog, map)) { + verbose(env, "offload device mismatch between prog and map\n"); + return -EINVAL; + } + return 0; } @@ -4167,6 +4981,13 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env) next_insn: insn++; i++; + continue; + } + + /* Basic sanity check before we invest more work here. */ + if (!bpf_opcode_in_insntable(insn->code)) { + verbose(env, "unknown opcode %02x\n", insn->code); + return -EINVAL; } } @@ -4223,6 +5044,19 @@ static int adjust_insn_aux_data(struct bpf_verifier_env *env, u32 prog_len, return 0; } +static void adjust_subprog_starts(struct bpf_verifier_env *env, u32 off, u32 len) +{ + int i; + + if (len == 1) + return; + for (i = 0; i < env->subprog_cnt; i++) { + if (env->subprog_starts[i] < off) + continue; + env->subprog_starts[i] += len - 1; + } +} + static struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 off, const struct bpf_insn *patch, u32 len) { @@ -4233,17 +5067,25 @@ static struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 of return NULL; if (adjust_insn_aux_data(env, new_prog->len, off, len)) return NULL; + adjust_subprog_starts(env, off, len); return new_prog; } -/* The verifier does more data flow analysis than llvm and will not explore - * branches that are dead at run time. Malicious programs can have dead code - * too. Therefore replace all dead at-run-time code with nops. +/* The verifier does more data flow analysis than llvm and will not + * explore branches that are dead at run time. Malicious programs can + * have dead code too. Therefore replace all dead at-run-time code + * with 'ja -1'. + * + * Just nops are not optimal, e.g. if they would sit at the end of the + * program and through another bug we would manage to jump there, then + * we'd execute beyond program memory otherwise. Returning exception + * code also wouldn't work since we can have subprogs where the dead + * code could be located. */ static void sanitize_dead_code(struct bpf_verifier_env *env) { struct bpf_insn_aux_data *aux_data = env->insn_aux_data; - struct bpf_insn nop = BPF_MOV64_REG(BPF_REG_0, BPF_REG_0); + struct bpf_insn trap = BPF_JMP_IMM(BPF_JA, 0, 0, -1); struct bpf_insn *insn = env->prog->insnsi; const int insn_cnt = env->prog->len; int i; @@ -4251,7 +5093,7 @@ static void sanitize_dead_code(struct bpf_verifier_env *env) for (i = 0; i < insn_cnt; i++) { if (aux_data[i].seen) continue; - memcpy(insn + i, &nop, sizeof(nop)); + memcpy(insn + i, &trap, sizeof(trap)); } } @@ -4367,6 +5209,180 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) return 0; } +static int jit_subprogs(struct bpf_verifier_env *env) +{ + struct bpf_prog *prog = env->prog, **func, *tmp; + int i, j, subprog_start, subprog_end = 0, len, subprog; + struct bpf_insn *insn; + void *old_bpf_func; + int err = -ENOMEM; + + if (env->subprog_cnt == 0) + return 0; + + for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) { + if (insn->code != (BPF_JMP | BPF_CALL) || + insn->src_reg != BPF_PSEUDO_CALL) + continue; + subprog = find_subprog(env, i + insn->imm + 1); + if (subprog < 0) { + WARN_ONCE(1, "verifier bug. No program starts at insn %d\n", + i + insn->imm + 1); + return -EFAULT; + } + /* temporarily remember subprog id inside insn instead of + * aux_data, since next loop will split up all insns into funcs + */ + insn->off = subprog + 1; + /* remember original imm in case JIT fails and fallback + * to interpreter will be needed + */ + env->insn_aux_data[i].call_imm = insn->imm; + /* point imm to __bpf_call_base+1 from JITs point of view */ + insn->imm = 1; + } + + func = kzalloc(sizeof(prog) * (env->subprog_cnt + 1), GFP_KERNEL); + if (!func) + return -ENOMEM; + + for (i = 0; i <= env->subprog_cnt; i++) { + subprog_start = subprog_end; + if (env->subprog_cnt == i) + subprog_end = prog->len; + else + subprog_end = env->subprog_starts[i]; + + len = subprog_end - subprog_start; + func[i] = bpf_prog_alloc(bpf_prog_size(len), GFP_USER); + if (!func[i]) + goto out_free; + memcpy(func[i]->insnsi, &prog->insnsi[subprog_start], + len * sizeof(struct bpf_insn)); + func[i]->type = prog->type; + func[i]->len = len; + if (bpf_prog_calc_tag(func[i])) + goto out_free; + func[i]->is_func = 1; + /* Use bpf_prog_F_tag to indicate functions in stack traces. + * Long term would need debug info to populate names + */ + func[i]->aux->name[0] = 'F'; + func[i]->aux->stack_depth = env->subprog_stack_depth[i]; + func[i]->jit_requested = 1; + func[i] = bpf_int_jit_compile(func[i]); + if (!func[i]->jited) { + err = -ENOTSUPP; + goto out_free; + } + cond_resched(); + } + /* at this point all bpf functions were successfully JITed + * now populate all bpf_calls with correct addresses and + * run last pass of JIT + */ + for (i = 0; i <= env->subprog_cnt; i++) { + insn = func[i]->insnsi; + for (j = 0; j < func[i]->len; j++, insn++) { + if (insn->code != (BPF_JMP | BPF_CALL) || + insn->src_reg != BPF_PSEUDO_CALL) + continue; + subprog = insn->off; + insn->off = 0; + insn->imm = (u64 (*)(u64, u64, u64, u64, u64)) + func[subprog]->bpf_func - + __bpf_call_base; + } + } + for (i = 0; i <= env->subprog_cnt; i++) { + old_bpf_func = func[i]->bpf_func; + tmp = bpf_int_jit_compile(func[i]); + if (tmp != func[i] || func[i]->bpf_func != old_bpf_func) { + verbose(env, "JIT doesn't support bpf-to-bpf calls\n"); + err = -EFAULT; + goto out_free; + } + cond_resched(); + } + + /* finally lock prog and jit images for all functions and + * populate kallsysm + */ + for (i = 0; i <= env->subprog_cnt; i++) { + bpf_prog_lock_ro(func[i]); + bpf_prog_kallsyms_add(func[i]); + } + + /* Last step: make now unused interpreter insns from main + * prog consistent for later dump requests, so they can + * later look the same as if they were interpreted only. + */ + for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) { + unsigned long addr; + + if (insn->code != (BPF_JMP | BPF_CALL) || + insn->src_reg != BPF_PSEUDO_CALL) + continue; + insn->off = env->insn_aux_data[i].call_imm; + subprog = find_subprog(env, i + insn->off + 1); + addr = (unsigned long)func[subprog + 1]->bpf_func; + addr &= PAGE_MASK; + insn->imm = (u64 (*)(u64, u64, u64, u64, u64)) + addr - __bpf_call_base; + } + + prog->jited = 1; + prog->bpf_func = func[0]->bpf_func; + prog->aux->func = func; + prog->aux->func_cnt = env->subprog_cnt + 1; + return 0; +out_free: + for (i = 0; i <= env->subprog_cnt; i++) + if (func[i]) + bpf_jit_free(func[i]); + kfree(func); + /* cleanup main prog to be interpreted */ + prog->jit_requested = 0; + for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) { + if (insn->code != (BPF_JMP | BPF_CALL) || + insn->src_reg != BPF_PSEUDO_CALL) + continue; + insn->off = 0; + insn->imm = env->insn_aux_data[i].call_imm; + } + return err; +} + +static int fixup_call_args(struct bpf_verifier_env *env) +{ +#ifndef CONFIG_BPF_JIT_ALWAYS_ON + struct bpf_prog *prog = env->prog; + struct bpf_insn *insn = prog->insnsi; + int i, depth; +#endif + int err; + + err = 0; + if (env->prog->jit_requested) { + err = jit_subprogs(env); + if (err == 0) + return 0; + } +#ifndef CONFIG_BPF_JIT_ALWAYS_ON + for (i = 0; i < prog->len; i++, insn++) { + if (insn->code != (BPF_JMP | BPF_CALL) || + insn->src_reg != BPF_PSEUDO_CALL) + continue; + depth = get_callee_stack_depth(env, insn, i); + if (depth < 0) + return depth; + bpf_patch_call_args(insn, depth); + } + err = 0; +#endif + return err; +} + /* fixup insn->imm field of bpf_call instructions * and inline eligible helpers as explicit sequence of BPF instructions * @@ -4384,13 +5400,57 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) int i, cnt, delta = 0; for (i = 0; i < insn_cnt; i++, insn++) { + if (insn->code == (BPF_ALU64 | BPF_MOD | BPF_X) || + insn->code == (BPF_ALU64 | BPF_DIV | BPF_X) || + insn->code == (BPF_ALU | BPF_MOD | BPF_X) || + insn->code == (BPF_ALU | BPF_DIV | BPF_X)) { + bool is64 = BPF_CLASS(insn->code) == BPF_ALU64; + struct bpf_insn mask_and_div[] = { + BPF_MOV32_REG(insn->src_reg, insn->src_reg), + /* Rx div 0 -> 0 */ + BPF_JMP_IMM(BPF_JNE, insn->src_reg, 0, 2), + BPF_ALU32_REG(BPF_XOR, insn->dst_reg, insn->dst_reg), + BPF_JMP_IMM(BPF_JA, 0, 0, 1), + *insn, + }; + struct bpf_insn mask_and_mod[] = { + BPF_MOV32_REG(insn->src_reg, insn->src_reg), + /* Rx mod 0 -> Rx */ + BPF_JMP_IMM(BPF_JEQ, insn->src_reg, 0, 1), + *insn, + }; + struct bpf_insn *patchlet; + + if (insn->code == (BPF_ALU64 | BPF_DIV | BPF_X) || + insn->code == (BPF_ALU | BPF_DIV | BPF_X)) { + patchlet = mask_and_div + (is64 ? 1 : 0); + cnt = ARRAY_SIZE(mask_and_div) - (is64 ? 1 : 0); + } else { + patchlet = mask_and_mod + (is64 ? 1 : 0); + cnt = ARRAY_SIZE(mask_and_mod) - (is64 ? 1 : 0); + } + + new_prog = bpf_patch_insn_data(env, i + delta, patchlet, cnt); + if (!new_prog) + return -ENOMEM; + + delta += cnt - 1; + env->prog = prog = new_prog; + insn = new_prog->insnsi + i + delta; + continue; + } + if (insn->code != (BPF_JMP | BPF_CALL)) continue; + if (insn->src_reg == BPF_PSEUDO_CALL) + continue; if (insn->imm == BPF_FUNC_get_route_realm) prog->dst_needed = 1; if (insn->imm == BPF_FUNC_get_prandom_u32) bpf_user_rnd_init_once(); + if (insn->imm == BPF_FUNC_override_return) + prog->kprobe_override = 1; if (insn->imm == BPF_FUNC_tail_call) { /* If we tail call into other programs, we * cannot make any assumptions since they can @@ -4407,13 +5467,42 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) */ insn->imm = 0; insn->code = BPF_JMP | BPF_TAIL_CALL; + + /* instead of changing every JIT dealing with tail_call + * emit two extra insns: + * if (index >= max_entries) goto out; + * index &= array->index_mask; + * to avoid out-of-bounds cpu speculation + */ + map_ptr = env->insn_aux_data[i + delta].map_ptr; + if (map_ptr == BPF_MAP_PTR_POISON) { + verbose(env, "tail_call abusing map_ptr\n"); + return -EINVAL; + } + if (!map_ptr->unpriv_array) + continue; + insn_buf[0] = BPF_JMP_IMM(BPF_JGE, BPF_REG_3, + map_ptr->max_entries, 2); + insn_buf[1] = BPF_ALU32_IMM(BPF_AND, BPF_REG_3, + container_of(map_ptr, + struct bpf_array, + map)->index_mask); + insn_buf[2] = *insn; + cnt = 3; + new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); + if (!new_prog) + return -ENOMEM; + + delta += cnt - 1; + env->prog = prog = new_prog; + insn = new_prog->insnsi + i + delta; continue; } /* BPF_EMIT_CALL() assumptions in some of the map_gen_lookup * handlers are currently limited to 64 bit only. */ - if (ebpf_jit_enabled() && BITS_PER_LONG == 64 && + if (prog->jit_requested && BITS_PER_LONG == 64 && insn->imm == BPF_FUNC_map_lookup_elem) { map_ptr = env->insn_aux_data[i + delta].map_ptr; if (map_ptr == BPF_MAP_PTR_POISON || @@ -4548,7 +5637,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr) if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)) env->strict_alignment = true; - if (env->prog->aux->offload) { + if (bpf_prog_is_dev_bound(env->prog->aux)) { ret = bpf_prog_offload_verifier_prep(env); if (ret) goto err_unlock; @@ -4565,12 +5654,12 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr) if (!env->explored_states) goto skip_full_check; + env->allow_ptr_leaks = capable(CAP_SYS_ADMIN); + ret = check_cfg(env); if (ret < 0) goto skip_full_check; - env->allow_ptr_leaks = capable(CAP_SYS_ADMIN); - ret = do_check(env); if (env->cur_state) { free_verifier_state(env->cur_state, true); @@ -4585,12 +5674,18 @@ skip_full_check: sanitize_dead_code(env); if (ret == 0) + ret = check_max_stack_depth(env); + + if (ret == 0) /* program is valid, convert *(u32*)(ctx + off) accesses */ ret = convert_ctx_accesses(env); if (ret == 0) ret = fixup_bpf_calls(env); + if (ret == 0) + ret = fixup_call_args(env); + if (log->level && bpf_verifier_log_full(log)) ret = -ENOSPC; if (log->level && !log->ubuf) { diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index 024085daab1a..a2c05d2476ac 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -123,7 +123,11 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) */ do { css_task_iter_start(&from->self, 0, &it); - task = css_task_iter_next(&it); + + do { + task = css_task_iter_next(&it); + } while (task && (task->flags & PF_EXITING)); + if (task) get_task_struct(task); css_task_iter_end(&it); diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 0b1ffe147f24..8cda3bc3ae22 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -1397,7 +1397,7 @@ static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft, cgroup_on_dfl(cgrp) ? ss->name : ss->legacy_name, cft->name); else - strncpy(buf, cft->name, CGROUP_FILE_NAME_MAX); + strscpy(buf, cft->name, CGROUP_FILE_NAME_MAX); return buf; } @@ -1864,9 +1864,9 @@ void init_cgroup_root(struct cgroup_root *root, struct cgroup_sb_opts *opts) root->flags = opts->flags; if (opts->release_agent) - strcpy(root->release_agent_path, opts->release_agent); + strscpy(root->release_agent_path, opts->release_agent, PATH_MAX); if (opts->name) - strcpy(root->name, opts->name); + strscpy(root->name, opts->name, MAX_CGROUP_ROOT_NAMELEN); if (opts->cpuset_clone_children) set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags); } @@ -4125,26 +4125,24 @@ static void css_task_iter_advance_css_set(struct css_task_iter *it) static void css_task_iter_advance(struct css_task_iter *it) { - struct list_head *l = it->task_pos; + struct list_head *next; lockdep_assert_held(&css_set_lock); - WARN_ON_ONCE(!l); - repeat: /* * Advance iterator to find next entry. cset->tasks is consumed * first and then ->mg_tasks. After ->mg_tasks, we move onto the * next cset. */ - l = l->next; + next = it->task_pos->next; - if (l == it->tasks_head) - l = it->mg_tasks_head->next; + if (next == it->tasks_head) + next = it->mg_tasks_head->next; - if (l == it->mg_tasks_head) + if (next == it->mg_tasks_head) css_task_iter_advance_css_set(it); else - it->task_pos = l; + it->task_pos = next; /* if PROCS, skip over tasks which aren't group leaders */ if ((it->flags & CSS_TASK_ITER_PROCS) && it->task_pos && @@ -4449,6 +4447,7 @@ static struct cftype cgroup_base_files[] = { }, { .name = "cgroup.threads", + .flags = CFTYPE_NS_DELEGATABLE, .release = cgroup_procs_release, .seq_start = cgroup_threads_start, .seq_next = cgroup_procs_next, diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index f7efa7b4d825..b42037e6e81d 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -1254,9 +1254,9 @@ done: return retval; } -int current_cpuset_is_being_rebound(void) +bool current_cpuset_is_being_rebound(void) { - int ret; + bool ret; rcu_read_lock(); ret = task_cs(current) == cpuset_being_rebound; diff --git a/kernel/cgroup/debug.c b/kernel/cgroup/debug.c index 5f780d8f6a9d..9caeda610249 100644 --- a/kernel/cgroup/debug.c +++ b/kernel/cgroup/debug.c @@ -50,7 +50,7 @@ static int current_css_set_read(struct seq_file *seq, void *v) spin_lock_irq(&css_set_lock); rcu_read_lock(); - cset = rcu_dereference(current->cgroups); + cset = task_css_set(current); refcnt = refcount_read(&cset->refcount); seq_printf(seq, "css_set %pK %d", cset, refcnt); if (refcnt > cset->nr_tasks) @@ -96,7 +96,7 @@ static int current_css_set_cg_links_read(struct seq_file *seq, void *v) spin_lock_irq(&css_set_lock); rcu_read_lock(); - cset = rcu_dereference(current->cgroups); + cset = task_css_set(current); list_for_each_entry(link, &cset->cgrp_links, cgrp_link) { struct cgroup *c = link->cgrp; diff --git a/kernel/cgroup/stat.c b/kernel/cgroup/stat.c index 133b465691d6..1e111dd455c4 100644 --- a/kernel/cgroup/stat.c +++ b/kernel/cgroup/stat.c @@ -296,8 +296,12 @@ int cgroup_stat_init(struct cgroup *cgrp) } /* ->updated_children list is self terminated */ - for_each_possible_cpu(cpu) - cgroup_cpu_stat(cgrp, cpu)->updated_children = cgrp; + for_each_possible_cpu(cpu) { + struct cgroup_cpu_stat *cstat = cgroup_cpu_stat(cgrp, cpu); + + cstat->updated_children = cgrp; + u64_stats_init(&cstat->sync); + } prev_cputime_init(&cgrp->stat.prev_cputime); diff --git a/kernel/compat.c b/kernel/compat.c index d1cee656a7ed..3247fe761f60 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -355,7 +355,7 @@ COMPAT_SYSCALL_DEFINE3(sched_getaffinity, compat_pid_t, pid, unsigned int, len, ret = sched_getaffinity(pid, mask); if (ret == 0) { - size_t retlen = min_t(size_t, len, cpumask_size()); + unsigned int retlen = min(len, cpumask_size()); if (compat_put_bitmap(user_mask_ptr, cpumask_bits(mask), retlen * 8)) ret = -EFAULT; diff --git a/kernel/configs/kvm_guest.config b/kernel/configs/kvm_guest.config index 8d9643767142..108fecc20fc1 100644 --- a/kernel/configs/kvm_guest.config +++ b/kernel/configs/kvm_guest.config @@ -18,6 +18,7 @@ CONFIG_VIRTUALIZATION=y CONFIG_HYPERVISOR_GUEST=y CONFIG_PARAVIRT=y CONFIG_KVM_GUEST=y +CONFIG_S390_GUEST=y CONFIG_VIRTIO=y CONFIG_VIRTIO_PCI=y CONFIG_VIRTIO_BLK=y diff --git a/kernel/configs/nopm.config b/kernel/configs/nopm.config new file mode 100644 index 000000000000..81ff07863576 --- /dev/null +++ b/kernel/configs/nopm.config @@ -0,0 +1,15 @@ +CONFIG_PM=n +CONFIG_SUSPEND=n +CONFIG_HIBERNATION=n + +# Triggers PM on OMAP +CONFIG_CPU_IDLE=n + +# Triggers enablement via hibernate callbacks +CONFIG_XEN=n + +# ARM/ARM64 architectures that select PM unconditionally +CONFIG_ARCH_OMAP2PLUS_TYPICAL=n +CONFIG_ARCH_RENESAS=n +CONFIG_ARCH_TEGRA=n +CONFIG_ARCH_VEXPRESS=n diff --git a/kernel/configs/tiny.config b/kernel/configs/tiny.config index 7fa0c4ae6394..9bfdffc100da 100644 --- a/kernel/configs/tiny.config +++ b/kernel/configs/tiny.config @@ -10,3 +10,7 @@ CONFIG_OPTIMIZE_INLINING=y # CONFIG_SLAB is not set # CONFIG_SLUB is not set CONFIG_SLOB=y +CONFIG_CC_STACKPROTECTOR_NONE=y +# CONFIG_CC_STACKPROTECTOR_REGULAR is not set +# CONFIG_CC_STACKPROTECTOR_STRONG is not set +# CONFIG_CC_STACKPROTECTOR_AUTO is not set diff --git a/kernel/cpu.c b/kernel/cpu.c index 04892a82f6ac..53f7dc65f9a3 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -80,19 +80,19 @@ static struct lockdep_map cpuhp_state_down_map = STATIC_LOCKDEP_MAP_INIT("cpuhp_state-down", &cpuhp_state_down_map); -static void inline cpuhp_lock_acquire(bool bringup) +static inline void cpuhp_lock_acquire(bool bringup) { lock_map_acquire(bringup ? &cpuhp_state_up_map : &cpuhp_state_down_map); } -static void inline cpuhp_lock_release(bool bringup) +static inline void cpuhp_lock_release(bool bringup) { lock_map_release(bringup ? &cpuhp_state_up_map : &cpuhp_state_down_map); } #else -static void inline cpuhp_lock_acquire(bool bringup) { } -static void inline cpuhp_lock_release(bool bringup) { } +static inline void cpuhp_lock_acquire(bool bringup) { } +static inline void cpuhp_lock_release(bool bringup) { } #endif @@ -780,8 +780,8 @@ static int takedown_cpu(unsigned int cpu) BUG_ON(cpu_online(cpu)); /* - * The CPUHP_AP_SCHED_MIGRATE_DYING callback will have removed all - * runnable tasks from the cpu, there's only the idle task left now + * The teardown callback for CPUHP_AP_SCHED_STARTING will have removed + * all runnable tasks from the CPU, there's only the idle task left now * that the migration thread is done doing the stop_machine thing. * * Wait for the stop thread to go away. @@ -1277,9 +1277,9 @@ static struct cpuhp_step cpuhp_bp_states[] = { * before blk_mq_queue_reinit_notify() from notify_dead(), * otherwise a RCU stall occurs. */ - [CPUHP_TIMERS_DEAD] = { + [CPUHP_TIMERS_PREPARE] = { .name = "timers:dead", - .startup.single = NULL, + .startup.single = timers_prepare_cpu, .teardown.single = timers_dead_cpu, }, /* Kicks the plugged cpu into life */ @@ -1289,11 +1289,6 @@ static struct cpuhp_step cpuhp_bp_states[] = { .teardown.single = NULL, .cant_stop = true, }, - [CPUHP_AP_SMPCFD_DYING] = { - .name = "smpcfd:dying", - .startup.single = NULL, - .teardown.single = smpcfd_dying_cpu, - }, /* * Handled on controll processor until the plugged processor manages * this itself. @@ -1335,6 +1330,11 @@ static struct cpuhp_step cpuhp_ap_states[] = { .startup.single = NULL, .teardown.single = rcutree_dying_cpu, }, + [CPUHP_AP_SMPCFD_DYING] = { + .name = "smpcfd:dying", + .startup.single = NULL, + .teardown.single = smpcfd_dying_cpu, + }, /* Entry state on starting. Interrupts enabled from here on. Transient * state for synchronsization */ [CPUHP_AP_ONLINE] = { diff --git a/kernel/crash_core.c b/kernel/crash_core.c index b3663896278e..4f63597c824d 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -410,7 +410,7 @@ static int __init crash_save_vmcoreinfo_init(void) VMCOREINFO_SYMBOL(contig_page_data); #endif #ifdef CONFIG_SPARSEMEM - VMCOREINFO_SYMBOL(mem_section); + VMCOREINFO_SYMBOL_ARRAY(mem_section); VMCOREINFO_LENGTH(mem_section, NR_SECTION_ROOTS); VMCOREINFO_STRUCT_SIZE(mem_section); VMCOREINFO_OFFSET(mem_section, section_mem_map); diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c index e74be38245ad..ed5d34925ad0 100644 --- a/kernel/debug/kdb/kdb_io.c +++ b/kernel/debug/kdb/kdb_io.c @@ -350,7 +350,7 @@ poll_again: } kdb_printf("\n"); for (i = 0; i < count; i++) { - if (kallsyms_symbol_next(p_tmp, i) < 0) + if (WARN_ON(!kallsyms_symbol_next(p_tmp, i))) break; kdb_printf("%s ", p_tmp); *(p_tmp + len) = '\0'; diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index c8146d53ca67..dbb0781a0533 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -2441,7 +2441,6 @@ static int kdb_kill(int argc, const char **argv) long sig, pid; char *endp; struct task_struct *p; - struct siginfo info; if (argc != 2) return KDB_ARGCOUNT; @@ -2449,7 +2448,7 @@ static int kdb_kill(int argc, const char **argv) sig = simple_strtol(argv[1], &endp, 0); if (*endp) return KDB_BADINT; - if (sig >= 0) { + if ((sig >= 0) || !valid_signal(-sig)) { kdb_printf("Invalid signal parameter.<-signal>\n"); return 0; } @@ -2470,12 +2469,7 @@ static int kdb_kill(int argc, const char **argv) return 0; } p = p->group_leader; - info.si_signo = sig; - info.si_errno = 0; - info.si_code = SI_USER; - info.si_pid = pid; /* same capabilities as process being signalled */ - info.si_uid = 0; /* kdb has root authority */ - kdb_send_sig_info(p, &info); + kdb_send_sig(p, sig); return 0; } diff --git a/kernel/debug/kdb/kdb_private.h b/kernel/debug/kdb/kdb_private.h index fc224fbcf954..1e5a502ba4a7 100644 --- a/kernel/debug/kdb/kdb_private.h +++ b/kernel/debug/kdb/kdb_private.h @@ -208,7 +208,7 @@ extern unsigned long kdb_task_state(const struct task_struct *p, extern void kdb_ps_suppressed(void); extern void kdb_ps1(const struct task_struct *p); extern void kdb_print_nameval(const char *name, unsigned long val); -extern void kdb_send_sig_info(struct task_struct *p, struct siginfo *info); +extern void kdb_send_sig(struct task_struct *p, int sig); extern void kdb_meminfo_proc_show(void); extern char *kdb_getstr(char *, size_t, const char *); extern void kdb_gdb_state_pass(char *buf); diff --git a/kernel/delayacct.c b/kernel/delayacct.c index 4a1c33416b6a..e2764d767f18 100644 --- a/kernel/delayacct.c +++ b/kernel/delayacct.c @@ -51,16 +51,16 @@ void __delayacct_tsk_init(struct task_struct *tsk) * Finish delay accounting for a statistic using its timestamps (@start), * accumalator (@total) and @count */ -static void delayacct_end(u64 *start, u64 *total, u32 *count) +static void delayacct_end(spinlock_t *lock, u64 *start, u64 *total, u32 *count) { s64 ns = ktime_get_ns() - *start; unsigned long flags; if (ns > 0) { - spin_lock_irqsave(¤t->delays->lock, flags); + spin_lock_irqsave(lock, flags); *total += ns; (*count)++; - spin_unlock_irqrestore(¤t->delays->lock, flags); + spin_unlock_irqrestore(lock, flags); } } @@ -69,17 +69,25 @@ void __delayacct_blkio_start(void) current->delays->blkio_start = ktime_get_ns(); } -void __delayacct_blkio_end(void) +/* + * We cannot rely on the `current` macro, as we haven't yet switched back to + * the process being woken. + */ +void __delayacct_blkio_end(struct task_struct *p) { - if (current->delays->flags & DELAYACCT_PF_SWAPIN) - /* Swapin block I/O */ - delayacct_end(¤t->delays->blkio_start, - ¤t->delays->swapin_delay, - ¤t->delays->swapin_count); - else /* Other block I/O */ - delayacct_end(¤t->delays->blkio_start, - ¤t->delays->blkio_delay, - ¤t->delays->blkio_count); + struct task_delay_info *delays = p->delays; + u64 *total; + u32 *count; + + if (p->delays->flags & DELAYACCT_PF_SWAPIN) { + total = &delays->swapin_delay; + count = &delays->swapin_count; + } else { + total = &delays->blkio_delay; + count = &delays->blkio_count; + } + + delayacct_end(&delays->lock, &delays->blkio_start, total, count); } int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) @@ -153,8 +161,10 @@ void __delayacct_freepages_start(void) void __delayacct_freepages_end(void) { - delayacct_end(¤t->delays->freepages_start, - ¤t->delays->freepages_delay, - ¤t->delays->freepages_count); + delayacct_end( + ¤t->delays->lock, + ¤t->delays->freepages_start, + ¤t->delays->freepages_delay, + ¤t->delays->freepages_count); } diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c index 1b2be63c8528..772a43fea825 100644 --- a/kernel/events/callchain.c +++ b/kernel/events/callchain.c @@ -179,21 +179,6 @@ put_callchain_entry(int rctx) } struct perf_callchain_entry * -perf_callchain(struct perf_event *event, struct pt_regs *regs) -{ - bool kernel = !event->attr.exclude_callchain_kernel; - bool user = !event->attr.exclude_callchain_user; - /* Disallow cross-task user callchains. */ - bool crosstask = event->ctx->task && event->ctx->task != current; - const u32 max_stack = event->attr.sample_max_stack; - - if (!kernel && !user) - return NULL; - - return get_perf_callchain(regs, 0, kernel, user, max_stack, crosstask, true); -} - -struct perf_callchain_entry * get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user, u32 max_stack, bool crosstask, bool add_mark) { diff --git a/kernel/events/core.c b/kernel/events/core.c index 16beab4767e1..96db9ae5d5af 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1231,6 +1231,10 @@ static void put_ctx(struct perf_event_context *ctx) * perf_event_context::lock * perf_event::mmap_mutex * mmap_sem + * + * cpu_hotplug_lock + * pmus_lock + * cpuctx->mutex / perf_event_context::mutex */ static struct perf_event_context * perf_event_ctx_lock_nested(struct perf_event *event, int nesting) @@ -4196,6 +4200,7 @@ int perf_event_release_kernel(struct perf_event *event) { struct perf_event_context *ctx = event->ctx; struct perf_event *child, *tmp; + LIST_HEAD(free_list); /* * If we got here through err_file: fput(event_file); we will not have @@ -4268,8 +4273,7 @@ again: struct perf_event, child_list); if (tmp == child) { perf_remove_from_context(child, DETACH_GROUP); - list_del(&child->child_list); - free_event(child); + list_move(&child->child_list, &free_list); /* * This matches the refcount bump in inherit_event(); * this can't be the last reference. @@ -4284,6 +4288,11 @@ again: } mutex_unlock(&event->child_mutex); + list_for_each_entry_safe(child, tmp, &free_list, child_list) { + list_del(&child->child_list); + free_event(child); + } + no_ctx: put_event(event); /* Must be the 'last' reference */ return 0; @@ -4511,11 +4520,11 @@ perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) return ret; } -static unsigned int perf_poll(struct file *file, poll_table *wait) +static __poll_t perf_poll(struct file *file, poll_table *wait) { struct perf_event *event = file->private_data; struct ring_buffer *rb; - unsigned int events = POLLHUP; + __poll_t events = EPOLLHUP; poll_wait(file, &event->waitq, wait); @@ -4723,6 +4732,9 @@ static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned lon rcu_read_unlock(); return 0; } + + case PERF_EVENT_IOC_QUERY_BPF: + return perf_event_query_prog_array(event, (void __user *)arg); default: return -ENOTTY; } @@ -4904,6 +4916,7 @@ void perf_event_update_userpage(struct perf_event *event) unlock: rcu_read_unlock(); } +EXPORT_SYMBOL_GPL(perf_event_update_userpage); static int perf_mmap_fault(struct vm_fault *vmf) { @@ -5815,19 +5828,11 @@ void perf_output_sample(struct perf_output_handle *handle, perf_output_read(handle, event); if (sample_type & PERF_SAMPLE_CALLCHAIN) { - if (data->callchain) { - int size = 1; - - if (data->callchain) - size += data->callchain->nr; - - size *= sizeof(u64); + int size = 1; - __output_copy(handle, data->callchain, size); - } else { - u64 nr = 0; - perf_output_put(handle, nr); - } + size += data->callchain->nr; + size *= sizeof(u64); + __output_copy(handle, data->callchain, size); } if (sample_type & PERF_SAMPLE_RAW) { @@ -5980,6 +5985,26 @@ static u64 perf_virt_to_phys(u64 virt) return phys_addr; } +static struct perf_callchain_entry __empty_callchain = { .nr = 0, }; + +static struct perf_callchain_entry * +perf_callchain(struct perf_event *event, struct pt_regs *regs) +{ + bool kernel = !event->attr.exclude_callchain_kernel; + bool user = !event->attr.exclude_callchain_user; + /* Disallow cross-task user callchains. */ + bool crosstask = event->ctx->task && event->ctx->task != current; + const u32 max_stack = event->attr.sample_max_stack; + struct perf_callchain_entry *callchain; + + if (!kernel && !user) + return &__empty_callchain; + + callchain = get_perf_callchain(regs, 0, kernel, user, + max_stack, crosstask, true); + return callchain ?: &__empty_callchain; +} + void perf_prepare_sample(struct perf_event_header *header, struct perf_sample_data *data, struct perf_event *event, @@ -6002,9 +6027,7 @@ void perf_prepare_sample(struct perf_event_header *header, int size = 1; data->callchain = perf_callchain(event, regs); - - if (data->callchain) - size += data->callchain->nr; + size += data->callchain->nr; header->size += size * sizeof(u64); } @@ -6639,6 +6662,7 @@ static void perf_event_namespaces_output(struct perf_event *event, struct perf_namespaces_event *namespaces_event = data; struct perf_output_handle handle; struct perf_sample_data sample; + u16 header_size = namespaces_event->event_id.header.size; int ret; if (!perf_event_namespaces_match(event)) @@ -6649,7 +6673,7 @@ static void perf_event_namespaces_output(struct perf_event *event, ret = perf_output_begin(&handle, event, namespaces_event->event_id.header.size); if (ret) - return; + goto out; namespaces_event->event_id.pid = perf_event_pid(event, namespaces_event->task); @@ -6661,6 +6685,8 @@ static void perf_event_namespaces_output(struct perf_event *event, perf_event__output_id_sample(event, &handle, &sample); perf_output_end(&handle); +out: + namespaces_event->event_id.header.size = header_size; } static void perf_fill_ns_link_info(struct perf_ns_link_info *ns_link_info, @@ -7987,11 +8013,11 @@ static void bpf_overflow_handler(struct perf_event *event, { struct bpf_perf_event_data_kern ctx = { .data = data, - .regs = regs, .event = event, }; int ret = 0; + ctx.regs = perf_arch_bpf_user_pt_regs(regs); preempt_disable(); if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1)) goto out; @@ -8077,6 +8103,13 @@ static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd) return -EINVAL; } + /* Kprobe override only works for kprobes, not uprobes. */ + if (prog->kprobe_override && + !(event->tp_event->flags & TRACE_EVENT_FL_KPROBE)) { + bpf_prog_put(prog); + return -EINVAL; + } + if (is_tracepoint || is_syscall_tp) { int off = trace_event_get_offsets(event->tp_event); @@ -8513,6 +8546,29 @@ fail_clear_files: return ret; } +static int +perf_tracepoint_set_filter(struct perf_event *event, char *filter_str) +{ + struct perf_event_context *ctx = event->ctx; + int ret; + + /* + * Beware, here be dragons!! + * + * the tracepoint muck will deadlock against ctx->mutex, but the tracepoint + * stuff does not actually need it. So temporarily drop ctx->mutex. As per + * perf_event_ctx_lock() we already have a reference on ctx. + * + * This can result in event getting moved to a different ctx, but that + * does not affect the tracepoint state. + */ + mutex_unlock(&ctx->mutex); + ret = ftrace_profile_set_filter(event, event->attr.config, filter_str); + mutex_lock(&ctx->mutex); + + return ret; +} + static int perf_event_set_filter(struct perf_event *event, void __user *arg) { char *filter_str; @@ -8529,8 +8585,7 @@ static int perf_event_set_filter(struct perf_event *event, void __user *arg) if (IS_ENABLED(CONFIG_EVENT_TRACING) && event->attr.type == PERF_TYPE_TRACEPOINT) - ret = ftrace_profile_set_filter(event, event->attr.config, - filter_str); + ret = perf_tracepoint_set_filter(event, filter_str); else if (has_addr_filter(event)) ret = perf_event_set_addr_filter(event, filter_str); @@ -9165,7 +9220,13 @@ static int perf_try_init_event(struct pmu *pmu, struct perf_event *event) if (!try_module_get(pmu->module)) return -ENODEV; - if (event->group_leader != event) { + /* + * A number of pmu->event_init() methods iterate the sibling_list to, + * for example, validate if the group fits on the PMU. Therefore, + * if this is a sibling event, acquire the ctx->mutex to protect + * the sibling_list. + */ + if (event->group_leader != event && pmu->task_ctx_nr != perf_sw_context) { /* * This ctx->mutex can nest when we're called through * inheritance. See the perf_event_ctx_lock_nested() comment. @@ -10700,6 +10761,19 @@ inherit_event(struct perf_event *parent_event, if (IS_ERR(child_event)) return child_event; + + if ((child_event->attach_state & PERF_ATTACH_TASK_DATA) && + !child_ctx->task_ctx_data) { + struct pmu *pmu = child_event->pmu; + + child_ctx->task_ctx_data = kzalloc(pmu->task_ctx_size, + GFP_KERNEL); + if (!child_ctx->task_ctx_data) { + free_event(child_event); + return NULL; + } + } + /* * is_orphaned_event() and list_add_tail(&parent_event->child_list) * must be under the same lock in order to serialize against @@ -10710,6 +10784,7 @@ inherit_event(struct perf_event *parent_event, if (is_orphaned_event(parent_event) || !atomic_long_inc_not_zero(&parent_event->refcount)) { mutex_unlock(&parent_event->child_mutex); + /* task_ctx_data is freed with child_ctx */ free_event(child_event); return NULL; } diff --git a/kernel/events/internal.h b/kernel/events/internal.h index 09b1537ae06c..6dc725a7e7bc 100644 --- a/kernel/events/internal.h +++ b/kernel/events/internal.h @@ -201,10 +201,6 @@ arch_perf_out_copy_user(void *dst, const void *src, unsigned long n) DEFINE_OUTPUT_COPY(__output_copy_user, arch_perf_out_copy_user) -/* Callchain handling */ -extern struct perf_callchain_entry * -perf_callchain(struct perf_event *event, struct pt_regs *regs); - static inline int get_recursion_context(int *recursion) { int rctx; diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index 141aa2ca8728..6c6b3c48db71 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -19,7 +19,7 @@ static void perf_output_wakeup(struct perf_output_handle *handle) { - atomic_set(&handle->rb->poll, POLLIN); + atomic_set(&handle->rb->poll, EPOLLIN); handle->event->pending_wakeup = 1; irq_work_queue(&handle->event->pending); diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 267f6ef91d97..ce6848e46e94 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1167,8 +1167,8 @@ static int xol_add_vma(struct mm_struct *mm, struct xol_area *area) } ret = 0; - smp_wmb(); /* pairs with get_xol_area() */ - mm->uprobes_state.xol_area = area; + /* pairs with get_xol_area() */ + smp_store_release(&mm->uprobes_state.xol_area, area); /* ^^^ */ fail: up_write(&mm->mmap_sem); @@ -1230,8 +1230,8 @@ static struct xol_area *get_xol_area(void) if (!mm->uprobes_state.xol_area) __create_xol_area(0); - area = mm->uprobes_state.xol_area; - smp_read_barrier_depends(); /* pairs with wmb in xol_add_vma() */ + /* Pairs with xol_add_vma() smp_store_release() */ + area = READ_ONCE(mm->uprobes_state.xol_area); /* ^^^ */ return area; } @@ -1528,8 +1528,8 @@ static unsigned long get_trampoline_vaddr(void) struct xol_area *area; unsigned long trampoline_vaddr = -1; - area = current->mm->uprobes_state.xol_area; - smp_read_barrier_depends(); + /* Pairs with xol_add_vma() smp_store_release() */ + area = READ_ONCE(current->mm->uprobes_state.xol_area); /* ^^^ */ if (area) trampoline_vaddr = area->vaddr; diff --git a/kernel/exit.c b/kernel/exit.c index 6b4298a41167..995453d9fb55 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1755,3 +1755,12 @@ Efault: return -EFAULT; } #endif + +__weak void abort(void) +{ + BUG(); + + /* if that doesn't kill us, halt */ + panic("Oops failed to kill thread"); +} +EXPORT_SYMBOL(abort); diff --git a/kernel/fail_function.c b/kernel/fail_function.c new file mode 100644 index 000000000000..21b0122cb39c --- /dev/null +++ b/kernel/fail_function.c @@ -0,0 +1,349 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * fail_function.c: Function-based error injection + */ +#include <linux/error-injection.h> +#include <linux/debugfs.h> +#include <linux/fault-inject.h> +#include <linux/kallsyms.h> +#include <linux/kprobes.h> +#include <linux/module.h> +#include <linux/mutex.h> +#include <linux/slab.h> +#include <linux/uaccess.h> + +static int fei_kprobe_handler(struct kprobe *kp, struct pt_regs *regs); + +struct fei_attr { + struct list_head list; + struct kprobe kp; + unsigned long retval; +}; +static DEFINE_MUTEX(fei_lock); +static LIST_HEAD(fei_attr_list); +static DECLARE_FAULT_ATTR(fei_fault_attr); +static struct dentry *fei_debugfs_dir; + +static unsigned long adjust_error_retval(unsigned long addr, unsigned long retv) +{ + switch (get_injectable_error_type(addr)) { + case EI_ETYPE_NULL: + if (retv != 0) + return 0; + break; + case EI_ETYPE_ERRNO: + if (retv < (unsigned long)-MAX_ERRNO) + return (unsigned long)-EINVAL; + break; + case EI_ETYPE_ERRNO_NULL: + if (retv != 0 && retv < (unsigned long)-MAX_ERRNO) + return (unsigned long)-EINVAL; + break; + } + + return retv; +} + +static struct fei_attr *fei_attr_new(const char *sym, unsigned long addr) +{ + struct fei_attr *attr; + + attr = kzalloc(sizeof(*attr), GFP_KERNEL); + if (attr) { + attr->kp.symbol_name = kstrdup(sym, GFP_KERNEL); + if (!attr->kp.symbol_name) { + kfree(attr); + return NULL; + } + attr->kp.pre_handler = fei_kprobe_handler; + attr->retval = adjust_error_retval(addr, 0); + INIT_LIST_HEAD(&attr->list); + } + return attr; +} + +static void fei_attr_free(struct fei_attr *attr) +{ + if (attr) { + kfree(attr->kp.symbol_name); + kfree(attr); + } +} + +static struct fei_attr *fei_attr_lookup(const char *sym) +{ + struct fei_attr *attr; + + list_for_each_entry(attr, &fei_attr_list, list) { + if (!strcmp(attr->kp.symbol_name, sym)) + return attr; + } + + return NULL; +} + +static bool fei_attr_is_valid(struct fei_attr *_attr) +{ + struct fei_attr *attr; + + list_for_each_entry(attr, &fei_attr_list, list) { + if (attr == _attr) + return true; + } + + return false; +} + +static int fei_retval_set(void *data, u64 val) +{ + struct fei_attr *attr = data; + unsigned long retv = (unsigned long)val; + int err = 0; + + mutex_lock(&fei_lock); + /* + * Since this operation can be done after retval file is removed, + * It is safer to check the attr is still valid before accessing + * its member. + */ + if (!fei_attr_is_valid(attr)) { + err = -ENOENT; + goto out; + } + + if (attr->kp.addr) { + if (adjust_error_retval((unsigned long)attr->kp.addr, + val) != retv) + err = -EINVAL; + } + if (!err) + attr->retval = val; +out: + mutex_unlock(&fei_lock); + + return err; +} + +static int fei_retval_get(void *data, u64 *val) +{ + struct fei_attr *attr = data; + int err = 0; + + mutex_lock(&fei_lock); + /* Here we also validate @attr to ensure it still exists. */ + if (!fei_attr_is_valid(attr)) + err = -ENOENT; + else + *val = attr->retval; + mutex_unlock(&fei_lock); + + return err; +} +DEFINE_DEBUGFS_ATTRIBUTE(fei_retval_ops, fei_retval_get, fei_retval_set, + "%llx\n"); + +static int fei_debugfs_add_attr(struct fei_attr *attr) +{ + struct dentry *dir; + + dir = debugfs_create_dir(attr->kp.symbol_name, fei_debugfs_dir); + if (!dir) + return -ENOMEM; + + if (!debugfs_create_file("retval", 0600, dir, attr, &fei_retval_ops)) { + debugfs_remove_recursive(dir); + return -ENOMEM; + } + + return 0; +} + +static void fei_debugfs_remove_attr(struct fei_attr *attr) +{ + struct dentry *dir; + + dir = debugfs_lookup(attr->kp.symbol_name, fei_debugfs_dir); + if (dir) + debugfs_remove_recursive(dir); +} + +static int fei_kprobe_handler(struct kprobe *kp, struct pt_regs *regs) +{ + struct fei_attr *attr = container_of(kp, struct fei_attr, kp); + + if (should_fail(&fei_fault_attr, 1)) { + regs_set_return_value(regs, attr->retval); + override_function_with_return(regs); + /* Kprobe specific fixup */ + reset_current_kprobe(); + preempt_enable_no_resched(); + return 1; + } + + return 0; +} +NOKPROBE_SYMBOL(fei_kprobe_handler) + +static void *fei_seq_start(struct seq_file *m, loff_t *pos) +{ + mutex_lock(&fei_lock); + return seq_list_start(&fei_attr_list, *pos); +} + +static void fei_seq_stop(struct seq_file *m, void *v) +{ + mutex_unlock(&fei_lock); +} + +static void *fei_seq_next(struct seq_file *m, void *v, loff_t *pos) +{ + return seq_list_next(v, &fei_attr_list, pos); +} + +static int fei_seq_show(struct seq_file *m, void *v) +{ + struct fei_attr *attr = list_entry(v, struct fei_attr, list); + + seq_printf(m, "%pf\n", attr->kp.addr); + return 0; +} + +static const struct seq_operations fei_seq_ops = { + .start = fei_seq_start, + .next = fei_seq_next, + .stop = fei_seq_stop, + .show = fei_seq_show, +}; + +static int fei_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &fei_seq_ops); +} + +static void fei_attr_remove(struct fei_attr *attr) +{ + fei_debugfs_remove_attr(attr); + unregister_kprobe(&attr->kp); + list_del(&attr->list); + fei_attr_free(attr); +} + +static void fei_attr_remove_all(void) +{ + struct fei_attr *attr, *n; + + list_for_each_entry_safe(attr, n, &fei_attr_list, list) { + fei_attr_remove(attr); + } +} + +static ssize_t fei_write(struct file *file, const char __user *buffer, + size_t count, loff_t *ppos) +{ + struct fei_attr *attr; + unsigned long addr; + char *buf, *sym; + int ret; + + /* cut off if it is too long */ + if (count > KSYM_NAME_LEN) + count = KSYM_NAME_LEN; + buf = kmalloc(sizeof(char) * (count + 1), GFP_KERNEL); + if (!buf) + return -ENOMEM; + + if (copy_from_user(buf, buffer, count)) { + ret = -EFAULT; + goto out; + } + buf[count] = '\0'; + sym = strstrip(buf); + + mutex_lock(&fei_lock); + + /* Writing just spaces will remove all injection points */ + if (sym[0] == '\0') { + fei_attr_remove_all(); + ret = count; + goto out; + } + /* Writing !function will remove one injection point */ + if (sym[0] == '!') { + attr = fei_attr_lookup(sym + 1); + if (!attr) { + ret = -ENOENT; + goto out; + } + fei_attr_remove(attr); + ret = count; + goto out; + } + + addr = kallsyms_lookup_name(sym); + if (!addr) { + ret = -EINVAL; + goto out; + } + if (!within_error_injection_list(addr)) { + ret = -ERANGE; + goto out; + } + if (fei_attr_lookup(sym)) { + ret = -EBUSY; + goto out; + } + attr = fei_attr_new(sym, addr); + if (!attr) { + ret = -ENOMEM; + goto out; + } + + ret = register_kprobe(&attr->kp); + if (!ret) + ret = fei_debugfs_add_attr(attr); + if (ret < 0) + fei_attr_remove(attr); + else { + list_add_tail(&attr->list, &fei_attr_list); + ret = count; + } +out: + kfree(buf); + mutex_unlock(&fei_lock); + return ret; +} + +static const struct file_operations fei_ops = { + .open = fei_open, + .read = seq_read, + .write = fei_write, + .llseek = seq_lseek, + .release = seq_release, +}; + +static int __init fei_debugfs_init(void) +{ + struct dentry *dir; + + dir = fault_create_debugfs_attr("fail_function", NULL, + &fei_fault_attr); + if (IS_ERR(dir)) + return PTR_ERR(dir); + + /* injectable attribute is just a symlink of error_inject/list */ + if (!debugfs_create_symlink("injectable", dir, + "../error_injection/list")) + goto error; + + if (!debugfs_create_file("inject", 0600, dir, NULL, &fei_ops)) + goto error; + + fei_debugfs_dir = dir; + + return 0; +error: + debugfs_remove_recursive(dir); + return -ENOMEM; +} + +late_initcall(fei_debugfs_init); diff --git a/kernel/fork.c b/kernel/fork.c index 432eadf6b58c..be8aa5b98666 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -77,6 +77,7 @@ #include <linux/blkdev.h> #include <linux/fs_struct.h> #include <linux/magic.h> +#include <linux/sched/mm.h> #include <linux/perf_event.h> #include <linux/posix-timers.h> #include <linux/user-return-notifier.h> @@ -282,8 +283,9 @@ static void free_thread_stack(struct task_struct *tsk) void thread_stack_cache_init(void) { - thread_stack_cache = kmem_cache_create("thread_stack", THREAD_SIZE, - THREAD_SIZE, 0, NULL); + thread_stack_cache = kmem_cache_create_usercopy("thread_stack", + THREAD_SIZE, THREAD_SIZE, 0, 0, + THREAD_SIZE, NULL); BUG_ON(thread_stack_cache == NULL); } # endif @@ -390,6 +392,246 @@ void free_task(struct task_struct *tsk) } EXPORT_SYMBOL(free_task); +#ifdef CONFIG_MMU +static __latent_entropy int dup_mmap(struct mm_struct *mm, + struct mm_struct *oldmm) +{ + struct vm_area_struct *mpnt, *tmp, *prev, **pprev; + struct rb_node **rb_link, *rb_parent; + int retval; + unsigned long charge; + LIST_HEAD(uf); + + uprobe_start_dup_mmap(); + if (down_write_killable(&oldmm->mmap_sem)) { + retval = -EINTR; + goto fail_uprobe_end; + } + flush_cache_dup_mm(oldmm); + uprobe_dup_mmap(oldmm, mm); + /* + * Not linked in yet - no deadlock potential: + */ + down_write_nested(&mm->mmap_sem, SINGLE_DEPTH_NESTING); + + /* No ordering required: file already has been exposed. */ + RCU_INIT_POINTER(mm->exe_file, get_mm_exe_file(oldmm)); + + mm->total_vm = oldmm->total_vm; + mm->data_vm = oldmm->data_vm; + mm->exec_vm = oldmm->exec_vm; + mm->stack_vm = oldmm->stack_vm; + + rb_link = &mm->mm_rb.rb_node; + rb_parent = NULL; + pprev = &mm->mmap; + retval = ksm_fork(mm, oldmm); + if (retval) + goto out; + retval = khugepaged_fork(mm, oldmm); + if (retval) + goto out; + + prev = NULL; + for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) { + struct file *file; + + if (mpnt->vm_flags & VM_DONTCOPY) { + vm_stat_account(mm, mpnt->vm_flags, -vma_pages(mpnt)); + continue; + } + charge = 0; + if (mpnt->vm_flags & VM_ACCOUNT) { + unsigned long len = vma_pages(mpnt); + + if (security_vm_enough_memory_mm(oldmm, len)) /* sic */ + goto fail_nomem; + charge = len; + } + tmp = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); + if (!tmp) + goto fail_nomem; + *tmp = *mpnt; + INIT_LIST_HEAD(&tmp->anon_vma_chain); + retval = vma_dup_policy(mpnt, tmp); + if (retval) + goto fail_nomem_policy; + tmp->vm_mm = mm; + retval = dup_userfaultfd(tmp, &uf); + if (retval) + goto fail_nomem_anon_vma_fork; + if (tmp->vm_flags & VM_WIPEONFORK) { + /* VM_WIPEONFORK gets a clean slate in the child. */ + tmp->anon_vma = NULL; + if (anon_vma_prepare(tmp)) + goto fail_nomem_anon_vma_fork; + } else if (anon_vma_fork(tmp, mpnt)) + goto fail_nomem_anon_vma_fork; + tmp->vm_flags &= ~(VM_LOCKED | VM_LOCKONFAULT); + tmp->vm_next = tmp->vm_prev = NULL; + file = tmp->vm_file; + if (file) { + struct inode *inode = file_inode(file); + struct address_space *mapping = file->f_mapping; + + get_file(file); + if (tmp->vm_flags & VM_DENYWRITE) + atomic_dec(&inode->i_writecount); + i_mmap_lock_write(mapping); + if (tmp->vm_flags & VM_SHARED) + atomic_inc(&mapping->i_mmap_writable); + flush_dcache_mmap_lock(mapping); + /* insert tmp into the share list, just after mpnt */ + vma_interval_tree_insert_after(tmp, mpnt, + &mapping->i_mmap); + flush_dcache_mmap_unlock(mapping); + i_mmap_unlock_write(mapping); + } + + /* + * Clear hugetlb-related page reserves for children. This only + * affects MAP_PRIVATE mappings. Faults generated by the child + * are not guaranteed to succeed, even if read-only + */ + if (is_vm_hugetlb_page(tmp)) + reset_vma_resv_huge_pages(tmp); + + /* + * Link in the new vma and copy the page table entries. + */ + *pprev = tmp; + pprev = &tmp->vm_next; + tmp->vm_prev = prev; + prev = tmp; + + __vma_link_rb(mm, tmp, rb_link, rb_parent); + rb_link = &tmp->vm_rb.rb_right; + rb_parent = &tmp->vm_rb; + + mm->map_count++; + if (!(tmp->vm_flags & VM_WIPEONFORK)) + retval = copy_page_range(mm, oldmm, mpnt); + + if (tmp->vm_ops && tmp->vm_ops->open) + tmp->vm_ops->open(tmp); + + if (retval) + goto out; + } + /* a new mm has just been created */ + arch_dup_mmap(oldmm, mm); + retval = 0; +out: + up_write(&mm->mmap_sem); + flush_tlb_mm(oldmm); + up_write(&oldmm->mmap_sem); + dup_userfaultfd_complete(&uf); +fail_uprobe_end: + uprobe_end_dup_mmap(); + return retval; +fail_nomem_anon_vma_fork: + mpol_put(vma_policy(tmp)); +fail_nomem_policy: + kmem_cache_free(vm_area_cachep, tmp); +fail_nomem: + retval = -ENOMEM; + vm_unacct_memory(charge); + goto out; +} + +static inline int mm_alloc_pgd(struct mm_struct *mm) +{ + mm->pgd = pgd_alloc(mm); + if (unlikely(!mm->pgd)) + return -ENOMEM; + return 0; +} + +static inline void mm_free_pgd(struct mm_struct *mm) +{ + pgd_free(mm, mm->pgd); +} +#else +static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) +{ + down_write(&oldmm->mmap_sem); + RCU_INIT_POINTER(mm->exe_file, get_mm_exe_file(oldmm)); + up_write(&oldmm->mmap_sem); + return 0; +} +#define mm_alloc_pgd(mm) (0) +#define mm_free_pgd(mm) +#endif /* CONFIG_MMU */ + +static void check_mm(struct mm_struct *mm) +{ + int i; + + for (i = 0; i < NR_MM_COUNTERS; i++) { + long x = atomic_long_read(&mm->rss_stat.count[i]); + + if (unlikely(x)) + printk(KERN_ALERT "BUG: Bad rss-counter state " + "mm:%p idx:%d val:%ld\n", mm, i, x); + } + + if (mm_pgtables_bytes(mm)) + pr_alert("BUG: non-zero pgtables_bytes on freeing mm: %ld\n", + mm_pgtables_bytes(mm)); + +#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS + VM_BUG_ON_MM(mm->pmd_huge_pte, mm); +#endif +} + +#define allocate_mm() (kmem_cache_alloc(mm_cachep, GFP_KERNEL)) +#define free_mm(mm) (kmem_cache_free(mm_cachep, (mm))) + +/* + * Called when the last reference to the mm + * is dropped: either by a lazy thread or by + * mmput. Free the page directory and the mm. + */ +static void __mmdrop(struct mm_struct *mm) +{ + BUG_ON(mm == &init_mm); + mm_free_pgd(mm); + destroy_context(mm); + hmm_mm_destroy(mm); + mmu_notifier_mm_destroy(mm); + check_mm(mm); + put_user_ns(mm->user_ns); + free_mm(mm); +} + +void mmdrop(struct mm_struct *mm) +{ + /* + * The implicit full barrier implied by atomic_dec_and_test() is + * required by the membarrier system call before returning to + * user-space, after storing to rq->curr. + */ + if (unlikely(atomic_dec_and_test(&mm->mm_count))) + __mmdrop(mm); +} +EXPORT_SYMBOL_GPL(mmdrop); + +static void mmdrop_async_fn(struct work_struct *work) +{ + struct mm_struct *mm; + + mm = container_of(work, struct mm_struct, async_put_work); + __mmdrop(mm); +} + +static void mmdrop_async(struct mm_struct *mm) +{ + if (unlikely(atomic_dec_and_test(&mm->mm_count))) { + INIT_WORK(&mm->async_put_work, mmdrop_async_fn); + schedule_work(&mm->async_put_work); + } +} + static inline void free_signal_struct(struct signal_struct *sig) { taskstats_tgid_free(sig); @@ -457,6 +699,21 @@ static void set_max_threads(unsigned int max_threads_suggested) int arch_task_struct_size __read_mostly; #endif +static void task_struct_whitelist(unsigned long *offset, unsigned long *size) +{ + /* Fetch thread_struct whitelist for the architecture. */ + arch_thread_struct_whitelist(offset, size); + + /* + * Handle zero-sized whitelist or empty thread_struct, otherwise + * adjust offset to position of thread_struct in task_struct. + */ + if (unlikely(*size == 0)) + *offset = 0; + else + *offset += offsetof(struct task_struct, thread); +} + void __init fork_init(void) { int i; @@ -465,11 +722,14 @@ void __init fork_init(void) #define ARCH_MIN_TASKALIGN 0 #endif int align = max_t(int, L1_CACHE_BYTES, ARCH_MIN_TASKALIGN); + unsigned long useroffset, usersize; /* create a slab on which task_structs can be allocated */ - task_struct_cachep = kmem_cache_create("task_struct", + task_struct_whitelist(&useroffset, &usersize); + task_struct_cachep = kmem_cache_create_usercopy("task_struct", arch_task_struct_size, align, - SLAB_PANIC|SLAB_ACCOUNT, NULL); + SLAB_PANIC|SLAB_ACCOUNT, + useroffset, usersize, NULL); #endif /* do the arch specific task caches init */ @@ -594,182 +854,8 @@ free_tsk: return NULL; } -#ifdef CONFIG_MMU -static __latent_entropy int dup_mmap(struct mm_struct *mm, - struct mm_struct *oldmm) -{ - struct vm_area_struct *mpnt, *tmp, *prev, **pprev; - struct rb_node **rb_link, *rb_parent; - int retval; - unsigned long charge; - LIST_HEAD(uf); - - uprobe_start_dup_mmap(); - if (down_write_killable(&oldmm->mmap_sem)) { - retval = -EINTR; - goto fail_uprobe_end; - } - flush_cache_dup_mm(oldmm); - uprobe_dup_mmap(oldmm, mm); - /* - * Not linked in yet - no deadlock potential: - */ - down_write_nested(&mm->mmap_sem, SINGLE_DEPTH_NESTING); - - /* No ordering required: file already has been exposed. */ - RCU_INIT_POINTER(mm->exe_file, get_mm_exe_file(oldmm)); - - mm->total_vm = oldmm->total_vm; - mm->data_vm = oldmm->data_vm; - mm->exec_vm = oldmm->exec_vm; - mm->stack_vm = oldmm->stack_vm; - - rb_link = &mm->mm_rb.rb_node; - rb_parent = NULL; - pprev = &mm->mmap; - retval = ksm_fork(mm, oldmm); - if (retval) - goto out; - retval = khugepaged_fork(mm, oldmm); - if (retval) - goto out; - - prev = NULL; - for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) { - struct file *file; - - if (mpnt->vm_flags & VM_DONTCOPY) { - vm_stat_account(mm, mpnt->vm_flags, -vma_pages(mpnt)); - continue; - } - charge = 0; - if (mpnt->vm_flags & VM_ACCOUNT) { - unsigned long len = vma_pages(mpnt); - - if (security_vm_enough_memory_mm(oldmm, len)) /* sic */ - goto fail_nomem; - charge = len; - } - tmp = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); - if (!tmp) - goto fail_nomem; - *tmp = *mpnt; - INIT_LIST_HEAD(&tmp->anon_vma_chain); - retval = vma_dup_policy(mpnt, tmp); - if (retval) - goto fail_nomem_policy; - tmp->vm_mm = mm; - retval = dup_userfaultfd(tmp, &uf); - if (retval) - goto fail_nomem_anon_vma_fork; - if (tmp->vm_flags & VM_WIPEONFORK) { - /* VM_WIPEONFORK gets a clean slate in the child. */ - tmp->anon_vma = NULL; - if (anon_vma_prepare(tmp)) - goto fail_nomem_anon_vma_fork; - } else if (anon_vma_fork(tmp, mpnt)) - goto fail_nomem_anon_vma_fork; - tmp->vm_flags &= ~(VM_LOCKED | VM_LOCKONFAULT); - tmp->vm_next = tmp->vm_prev = NULL; - file = tmp->vm_file; - if (file) { - struct inode *inode = file_inode(file); - struct address_space *mapping = file->f_mapping; - - get_file(file); - if (tmp->vm_flags & VM_DENYWRITE) - atomic_dec(&inode->i_writecount); - i_mmap_lock_write(mapping); - if (tmp->vm_flags & VM_SHARED) - atomic_inc(&mapping->i_mmap_writable); - flush_dcache_mmap_lock(mapping); - /* insert tmp into the share list, just after mpnt */ - vma_interval_tree_insert_after(tmp, mpnt, - &mapping->i_mmap); - flush_dcache_mmap_unlock(mapping); - i_mmap_unlock_write(mapping); - } - - /* - * Clear hugetlb-related page reserves for children. This only - * affects MAP_PRIVATE mappings. Faults generated by the child - * are not guaranteed to succeed, even if read-only - */ - if (is_vm_hugetlb_page(tmp)) - reset_vma_resv_huge_pages(tmp); - - /* - * Link in the new vma and copy the page table entries. - */ - *pprev = tmp; - pprev = &tmp->vm_next; - tmp->vm_prev = prev; - prev = tmp; - - __vma_link_rb(mm, tmp, rb_link, rb_parent); - rb_link = &tmp->vm_rb.rb_right; - rb_parent = &tmp->vm_rb; - - mm->map_count++; - if (!(tmp->vm_flags & VM_WIPEONFORK)) - retval = copy_page_range(mm, oldmm, mpnt); - - if (tmp->vm_ops && tmp->vm_ops->open) - tmp->vm_ops->open(tmp); - - if (retval) - goto out; - } - /* a new mm has just been created */ - arch_dup_mmap(oldmm, mm); - retval = 0; -out: - up_write(&mm->mmap_sem); - flush_tlb_mm(oldmm); - up_write(&oldmm->mmap_sem); - dup_userfaultfd_complete(&uf); -fail_uprobe_end: - uprobe_end_dup_mmap(); - return retval; -fail_nomem_anon_vma_fork: - mpol_put(vma_policy(tmp)); -fail_nomem_policy: - kmem_cache_free(vm_area_cachep, tmp); -fail_nomem: - retval = -ENOMEM; - vm_unacct_memory(charge); - goto out; -} - -static inline int mm_alloc_pgd(struct mm_struct *mm) -{ - mm->pgd = pgd_alloc(mm); - if (unlikely(!mm->pgd)) - return -ENOMEM; - return 0; -} - -static inline void mm_free_pgd(struct mm_struct *mm) -{ - pgd_free(mm, mm->pgd); -} -#else -static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) -{ - down_write(&oldmm->mmap_sem); - RCU_INIT_POINTER(mm->exe_file, get_mm_exe_file(oldmm)); - up_write(&oldmm->mmap_sem); - return 0; -} -#define mm_alloc_pgd(mm) (0) -#define mm_free_pgd(mm) -#endif /* CONFIG_MMU */ - __cacheline_aligned_in_smp DEFINE_SPINLOCK(mmlist_lock); -#define allocate_mm() (kmem_cache_alloc(mm_cachep, GFP_KERNEL)) -#define free_mm(mm) (kmem_cache_free(mm_cachep, (mm))) - static unsigned long default_dump_filter = MMF_DUMP_FILTER_DEFAULT; static int __init coredump_filter_setup(char *s) @@ -859,27 +945,6 @@ fail_nopgd: return NULL; } -static void check_mm(struct mm_struct *mm) -{ - int i; - - for (i = 0; i < NR_MM_COUNTERS; i++) { - long x = atomic_long_read(&mm->rss_stat.count[i]); - - if (unlikely(x)) - printk(KERN_ALERT "BUG: Bad rss-counter state " - "mm:%p idx:%d val:%ld\n", mm, i, x); - } - - if (mm_pgtables_bytes(mm)) - pr_alert("BUG: non-zero pgtables_bytes on freeing mm: %ld\n", - mm_pgtables_bytes(mm)); - -#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS - VM_BUG_ON_MM(mm->pmd_huge_pte, mm); -#endif -} - /* * Allocate and initialize an mm_struct. */ @@ -895,24 +960,6 @@ struct mm_struct *mm_alloc(void) return mm_init(mm, current, current_user_ns()); } -/* - * Called when the last reference to the mm - * is dropped: either by a lazy thread or by - * mmput. Free the page directory and the mm. - */ -void __mmdrop(struct mm_struct *mm) -{ - BUG_ON(mm == &init_mm); - mm_free_pgd(mm); - destroy_context(mm); - hmm_mm_destroy(mm); - mmu_notifier_mm_destroy(mm); - check_mm(mm); - put_user_ns(mm->user_ns); - free_mm(mm); -} -EXPORT_SYMBOL_GPL(__mmdrop); - static inline void __mmput(struct mm_struct *mm) { VM_BUG_ON(atomic_read(&mm->mm_users)); @@ -1545,6 +1592,10 @@ static __latent_entropy struct task_struct *copy_process( int retval; struct task_struct *p; + /* + * Don't allow sharing the root directory with processes in a different + * namespace + */ if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS)) return ERR_PTR(-EINVAL); @@ -2020,6 +2071,8 @@ long _do_fork(unsigned long clone_flags, int __user *child_tidptr, unsigned long tls) { + struct completion vfork; + struct pid *pid; struct task_struct *p; int trace = 0; long nr; @@ -2045,43 +2098,40 @@ long _do_fork(unsigned long clone_flags, p = copy_process(clone_flags, stack_start, stack_size, child_tidptr, NULL, trace, tls, NUMA_NO_NODE); add_latent_entropy(); + + if (IS_ERR(p)) + return PTR_ERR(p); + /* * Do this prior waking up the new thread - the thread pointer * might get invalid after that point, if the thread exits quickly. */ - if (!IS_ERR(p)) { - struct completion vfork; - struct pid *pid; + trace_sched_process_fork(current, p); - trace_sched_process_fork(current, p); + pid = get_task_pid(p, PIDTYPE_PID); + nr = pid_vnr(pid); - pid = get_task_pid(p, PIDTYPE_PID); - nr = pid_vnr(pid); + if (clone_flags & CLONE_PARENT_SETTID) + put_user(nr, parent_tidptr); - if (clone_flags & CLONE_PARENT_SETTID) - put_user(nr, parent_tidptr); - - if (clone_flags & CLONE_VFORK) { - p->vfork_done = &vfork; - init_completion(&vfork); - get_task_struct(p); - } + if (clone_flags & CLONE_VFORK) { + p->vfork_done = &vfork; + init_completion(&vfork); + get_task_struct(p); + } - wake_up_new_task(p); + wake_up_new_task(p); - /* forking complete and child started to run, tell ptracer */ - if (unlikely(trace)) - ptrace_event_pid(trace, pid); + /* forking complete and child started to run, tell ptracer */ + if (unlikely(trace)) + ptrace_event_pid(trace, pid); - if (clone_flags & CLONE_VFORK) { - if (!wait_for_vfork_done(p, &vfork)) - ptrace_event_pid(PTRACE_EVENT_VFORK_DONE, pid); - } - - put_pid(pid); - } else { - nr = PTR_ERR(p); + if (clone_flags & CLONE_VFORK) { + if (!wait_for_vfork_done(p, &vfork)) + ptrace_event_pid(PTRACE_EVENT_VFORK_DONE, pid); } + + put_pid(pid); return nr; } @@ -2225,9 +2275,11 @@ void __init proc_caches_init(void) * maximum number of CPU's we can ever have. The cpumask_allocation * is at the end of the structure, exactly for that reason. */ - mm_cachep = kmem_cache_create("mm_struct", + mm_cachep = kmem_cache_create_usercopy("mm_struct", sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN, SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, + offsetof(struct mm_struct, saved_auxv), + sizeof_field(struct mm_struct, saved_auxv), NULL); vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC|SLAB_ACCOUNT); mmap_init(); diff --git a/kernel/futex.c b/kernel/futex.c index 76ed5921117a..1f450e092c74 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -862,24 +862,6 @@ static void put_pi_state(struct futex_pi_state *pi_state) } } -/* - * Look up the task based on what TID userspace gave us. - * We dont trust it. - */ -static struct task_struct *futex_find_get_task(pid_t pid) -{ - struct task_struct *p; - - rcu_read_lock(); - p = find_task_by_vpid(pid); - if (p) - get_task_struct(p); - - rcu_read_unlock(); - - return p; -} - #ifdef CONFIG_FUTEX_PI /* @@ -1183,7 +1165,7 @@ static int attach_to_pi_owner(u32 uval, union futex_key *key, */ if (!pid) return -ESRCH; - p = futex_find_get_task(pid); + p = find_get_task_by_vpid(pid); if (!p) return -ESRCH; @@ -1582,8 +1564,8 @@ static int futex_atomic_op_inuser(unsigned int encoded_op, u32 __user *uaddr) { unsigned int op = (encoded_op & 0x70000000) >> 28; unsigned int cmp = (encoded_op & 0x0f000000) >> 24; - int oparg = sign_extend32((encoded_op & 0x00fff000) >> 12, 12); - int cmparg = sign_extend32(encoded_op & 0x00000fff, 12); + int oparg = sign_extend32((encoded_op & 0x00fff000) >> 12, 11); + int cmparg = sign_extend32(encoded_op & 0x00000fff, 11); int oldval, ret; if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) { @@ -1878,6 +1860,9 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags, struct futex_q *this, *next; DEFINE_WAKE_Q(wake_q); + if (nr_wake < 0 || nr_requeue < 0) + return -EINVAL; + /* * When PI not supported: return -ENOSYS if requeue_pi is true, * consequently the compiler knows requeue_pi is always false past @@ -2294,34 +2279,33 @@ static void unqueue_me_pi(struct futex_q *q) spin_unlock(q->lock_ptr); } -/* - * Fixup the pi_state owner with the new owner. - * - * Must be called with hash bucket lock held and mm->sem held for non - * private futexes. - */ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, - struct task_struct *newowner) + struct task_struct *argowner) { - u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS; struct futex_pi_state *pi_state = q->pi_state; u32 uval, uninitialized_var(curval), newval; - struct task_struct *oldowner; + struct task_struct *oldowner, *newowner; + u32 newtid; int ret; + lockdep_assert_held(q->lock_ptr); + raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); oldowner = pi_state->owner; - /* Owner died? */ - if (!pi_state->owner) - newtid |= FUTEX_OWNER_DIED; /* - * We are here either because we stole the rtmutex from the - * previous highest priority waiter or we are the highest priority - * waiter but have failed to get the rtmutex the first time. + * We are here because either: + * + * - we stole the lock and pi_state->owner needs updating to reflect + * that (@argowner == current), * - * We have to replace the newowner TID in the user space variable. + * or: + * + * - someone stole our lock and we need to fix things to point to the + * new owner (@argowner == NULL). + * + * Either way, we have to replace the TID in the user space variable. * This must be atomic as we have to preserve the owner died bit here. * * Note: We write the user space value _before_ changing the pi_state @@ -2334,6 +2318,45 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, * in the PID check in lookup_pi_state. */ retry: + if (!argowner) { + if (oldowner != current) { + /* + * We raced against a concurrent self; things are + * already fixed up. Nothing to do. + */ + ret = 0; + goto out_unlock; + } + + if (__rt_mutex_futex_trylock(&pi_state->pi_mutex)) { + /* We got the lock after all, nothing to fix. */ + ret = 0; + goto out_unlock; + } + + /* + * Since we just failed the trylock; there must be an owner. + */ + newowner = rt_mutex_owner(&pi_state->pi_mutex); + BUG_ON(!newowner); + } else { + WARN_ON_ONCE(argowner != current); + if (oldowner == current) { + /* + * We raced against a concurrent self; things are + * already fixed up. Nothing to do. + */ + ret = 0; + goto out_unlock; + } + newowner = argowner; + } + + newtid = task_pid_vnr(newowner) | FUTEX_WAITERS; + /* Owner died? */ + if (!pi_state->owner) + newtid |= FUTEX_OWNER_DIED; + if (get_futex_value_locked(&uval, uaddr)) goto handle_fault; @@ -2434,9 +2457,9 @@ static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked) * Got the lock. We might not be the anticipated owner if we * did a lock-steal - fix up the PI-state in that case: * - * We can safely read pi_state->owner without holding wait_lock - * because we now own the rt_mutex, only the owner will attempt - * to change it. + * Speculative pi_state->owner read (we don't hold wait_lock); + * since we own the lock pi_state->owner == current is the + * stable state, anything else needs more attention. */ if (q->pi_state->owner != current) ret = fixup_pi_state_owner(uaddr, q, current); @@ -2444,6 +2467,19 @@ static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked) } /* + * If we didn't get the lock; check if anybody stole it from us. In + * that case, we need to fix up the uval to point to them instead of + * us, otherwise bad things happen. [10] + * + * Another speculative read; pi_state->owner == current is unstable + * but needs our attention. + */ + if (q->pi_state->owner == current) { + ret = fixup_pi_state_owner(uaddr, q, NULL); + goto out; + } + + /* * Paranoia check. If we did not take the lock, then we should not be * the owner of the rt_mutex. */ diff --git a/kernel/groups.c b/kernel/groups.c index e357bc800111..daae2f2dc6d4 100644 --- a/kernel/groups.c +++ b/kernel/groups.c @@ -86,11 +86,12 @@ static int gid_cmp(const void *_a, const void *_b) return gid_gt(a, b) - gid_lt(a, b); } -static void groups_sort(struct group_info *group_info) +void groups_sort(struct group_info *group_info) { sort(group_info->gid, group_info->ngroups, sizeof(*group_info->gid), gid_cmp, NULL); } +EXPORT_SYMBOL(groups_sort); /* a simple bsearch */ int groups_search(const struct group_info *group_info, kgid_t grp) @@ -122,7 +123,6 @@ int groups_search(const struct group_info *group_info, kgid_t grp) void set_groups(struct cred *new, struct group_info *group_info) { put_group_info(new->group_info); - groups_sort(group_info); get_group_info(group_info); new->group_info = group_info; } @@ -206,6 +206,7 @@ SYSCALL_DEFINE2(setgroups, int, gidsetsize, gid_t __user *, grouplist) return retval; } + groups_sort(group_info); retval = set_current_groups(group_info); put_group_info(group_info); diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index 89e355866450..6fc87ccda1d7 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig @@ -103,16 +103,6 @@ config GENERIC_IRQ_MATRIX_ALLOCATOR config GENERIC_IRQ_RESERVATION_MODE bool -config IRQ_DOMAIN_DEBUG - bool "Expose hardware/virtual IRQ mapping via debugfs" - depends on IRQ_DOMAIN && DEBUG_FS - help - This option will show the mapping relationship between hardware irq - numbers and Linux irq numbers. The mapping is exposed via debugfs - in the file "irq_domain_mapping". - - If you don't know what this means you don't need it. - # Support forced irq threading config IRQ_FORCED_THREADING bool diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c index e12d35108225..a37a3b4b6342 100644 --- a/kernel/irq/affinity.c +++ b/kernel/irq/affinity.c @@ -39,7 +39,7 @@ static void irq_spread_init_one(struct cpumask *irqmsk, struct cpumask *nmsk, } } -static cpumask_var_t *alloc_node_to_present_cpumask(void) +static cpumask_var_t *alloc_node_to_possible_cpumask(void) { cpumask_var_t *masks; int node; @@ -62,7 +62,7 @@ out_unwind: return NULL; } -static void free_node_to_present_cpumask(cpumask_var_t *masks) +static void free_node_to_possible_cpumask(cpumask_var_t *masks) { int node; @@ -71,22 +71,22 @@ static void free_node_to_present_cpumask(cpumask_var_t *masks) kfree(masks); } -static void build_node_to_present_cpumask(cpumask_var_t *masks) +static void build_node_to_possible_cpumask(cpumask_var_t *masks) { int cpu; - for_each_present_cpu(cpu) + for_each_possible_cpu(cpu) cpumask_set_cpu(cpu, masks[cpu_to_node(cpu)]); } -static int get_nodes_in_cpumask(cpumask_var_t *node_to_present_cpumask, +static int get_nodes_in_cpumask(cpumask_var_t *node_to_possible_cpumask, const struct cpumask *mask, nodemask_t *nodemsk) { int n, nodes = 0; /* Calculate the number of nodes in the supplied affinity mask */ for_each_node(n) { - if (cpumask_intersects(mask, node_to_present_cpumask[n])) { + if (cpumask_intersects(mask, node_to_possible_cpumask[n])) { node_set(n, *nodemsk); nodes++; } @@ -109,7 +109,7 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd) int last_affv = affv + affd->pre_vectors; nodemask_t nodemsk = NODE_MASK_NONE; struct cpumask *masks; - cpumask_var_t nmsk, *node_to_present_cpumask; + cpumask_var_t nmsk, *node_to_possible_cpumask; /* * If there aren't any vectors left after applying the pre/post @@ -125,8 +125,8 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd) if (!masks) goto out; - node_to_present_cpumask = alloc_node_to_present_cpumask(); - if (!node_to_present_cpumask) + node_to_possible_cpumask = alloc_node_to_possible_cpumask(); + if (!node_to_possible_cpumask) goto out; /* Fill out vectors at the beginning that don't need affinity */ @@ -135,8 +135,8 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd) /* Stabilize the cpumasks */ get_online_cpus(); - build_node_to_present_cpumask(node_to_present_cpumask); - nodes = get_nodes_in_cpumask(node_to_present_cpumask, cpu_present_mask, + build_node_to_possible_cpumask(node_to_possible_cpumask); + nodes = get_nodes_in_cpumask(node_to_possible_cpumask, cpu_possible_mask, &nodemsk); /* @@ -146,7 +146,7 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd) if (affv <= nodes) { for_each_node_mask(n, nodemsk) { cpumask_copy(masks + curvec, - node_to_present_cpumask[n]); + node_to_possible_cpumask[n]); if (++curvec == last_affv) break; } @@ -160,7 +160,7 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd) vecs_per_node = (affv - (curvec - affd->pre_vectors)) / nodes; /* Get the cpus on this node which are in the mask */ - cpumask_and(nmsk, cpu_present_mask, node_to_present_cpumask[n]); + cpumask_and(nmsk, cpu_possible_mask, node_to_possible_cpumask[n]); /* Calculate the number of cpus per vector */ ncpus = cpumask_weight(nmsk); @@ -192,7 +192,7 @@ done: /* Fill out vectors at the end that don't need affinity */ for (; curvec < nvecs; curvec++) cpumask_copy(masks + curvec, irq_default_affinity); - free_node_to_present_cpumask(node_to_present_cpumask); + free_node_to_possible_cpumask(node_to_possible_cpumask); out: free_cpumask_var(nmsk); return masks; @@ -214,7 +214,7 @@ int irq_calc_affinity_vectors(int minvec, int maxvec, const struct irq_affinity return 0; get_online_cpus(); - ret = min_t(int, cpumask_weight(cpu_present_mask), vecs) + resv; + ret = min_t(int, cpumask_weight(cpu_possible_mask), vecs) + resv; put_online_cpus(); return ret; } diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c index 4e8089b319ae..8c82ea26e837 100644 --- a/kernel/irq/autoprobe.c +++ b/kernel/irq/autoprobe.c @@ -71,7 +71,7 @@ unsigned long probe_irq_on(void) raw_spin_lock_irq(&desc->lock); if (!desc->action && irq_settings_can_probe(desc)) { desc->istate |= IRQS_AUTODETECT | IRQS_WAITING; - if (irq_startup(desc, IRQ_NORESEND, IRQ_START_FORCE)) + if (irq_activate_and_startup(desc, IRQ_NORESEND)) desc->istate |= IRQS_PENDING; } raw_spin_unlock_irq(&desc->lock); diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 043bfc35b353..c69357a43849 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -294,11 +294,11 @@ int irq_activate(struct irq_desc *desc) return 0; } -void irq_activate_and_startup(struct irq_desc *desc, bool resend) +int irq_activate_and_startup(struct irq_desc *desc, bool resend) { if (WARN_ON(irq_activate(desc))) - return; - irq_startup(desc, resend, IRQ_START_FORCE); + return 0; + return irq_startup(desc, resend, IRQ_START_FORCE); } static void __irq_disable(struct irq_desc *desc, bool mask); diff --git a/kernel/irq/debug.h b/kernel/irq/debug.h index 17f05ef8f575..8ccb326d2977 100644 --- a/kernel/irq/debug.h +++ b/kernel/irq/debug.h @@ -3,8 +3,6 @@ * Debugging printout: */ -#include <linux/kallsyms.h> - #define ___P(f) if (desc->status_use_accessors & f) printk("%14s set\n", #f) #define ___PS(f) if (desc->istate & f) printk("%14s set\n", #f) /* FIXME */ @@ -12,16 +10,21 @@ static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc) { + static DEFINE_RATELIMIT_STATE(ratelimit, 5 * HZ, 5); + + if (!__ratelimit(&ratelimit)) + return; + printk("irq %d, desc: %p, depth: %d, count: %d, unhandled: %d\n", irq, desc, desc->depth, desc->irq_count, desc->irqs_unhandled); - printk("->handle_irq(): %p, ", desc->handle_irq); - print_symbol("%s\n", (unsigned long)desc->handle_irq); - printk("->irq_data.chip(): %p, ", desc->irq_data.chip); - print_symbol("%s\n", (unsigned long)desc->irq_data.chip); + printk("->handle_irq(): %p, %pS\n", + desc->handle_irq, desc->handle_irq); + printk("->irq_data.chip(): %p, %pS\n", + desc->irq_data.chip, desc->irq_data.chip); printk("->action(): %p\n", desc->action); if (desc->action) { - printk("->action->handler(): %p, ", desc->action->handler); - print_symbol("%s\n", (unsigned long)desc->action->handler); + printk("->action->handler(): %p, %pS\n", + desc->action->handler, desc->action->handler); } ___P(IRQ_LEVEL); diff --git a/kernel/irq/debugfs.c b/kernel/irq/debugfs.c index 7f608ac39653..acfaaef8672a 100644 --- a/kernel/irq/debugfs.c +++ b/kernel/irq/debugfs.c @@ -113,6 +113,7 @@ static const struct irq_bit_descr irqdata_states[] = { BIT_MASK_DESCR(IRQD_SETAFFINITY_PENDING), BIT_MASK_DESCR(IRQD_AFFINITY_MANAGED), BIT_MASK_DESCR(IRQD_MANAGED_SHUTDOWN), + BIT_MASK_DESCR(IRQD_CAN_RESERVE), BIT_MASK_DESCR(IRQD_FORWARDED_TO_VCPU), diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c index c26c5bb6b491..508c03dfef25 100644 --- a/kernel/irq/generic-chip.c +++ b/kernel/irq/generic-chip.c @@ -364,10 +364,11 @@ irq_get_domain_generic_chip(struct irq_domain *d, unsigned int hw_irq) EXPORT_SYMBOL_GPL(irq_get_domain_generic_chip); /* - * Separate lockdep class for interrupt chip which can nest irq_desc - * lock. + * Separate lockdep classes for interrupt chip which can nest irq_desc + * lock and request mutex. */ static struct lock_class_key irq_nested_lock_class; +static struct lock_class_key irq_nested_request_class; /* * irq_map_generic_chip - Map a generic chip for an irq domain @@ -409,7 +410,8 @@ int irq_map_generic_chip(struct irq_domain *d, unsigned int virq, set_bit(idx, &gc->installed); if (dgc->gc_flags & IRQ_GC_INIT_NESTED_LOCK) - irq_set_lockdep_class(virq, &irq_nested_lock_class); + irq_set_lockdep_class(virq, &irq_nested_lock_class, + &irq_nested_request_class); if (chip->irq_calc_mask) chip->irq_calc_mask(data); @@ -479,7 +481,8 @@ void irq_setup_generic_chip(struct irq_chip_generic *gc, u32 msk, continue; if (flags & IRQ_GC_INIT_NESTED_LOCK) - irq_set_lockdep_class(i, &irq_nested_lock_class); + irq_set_lockdep_class(i, &irq_nested_lock_class, + &irq_nested_request_class); if (!(flags & IRQ_GC_NO_MASK)) { struct irq_data *d = irq_get_irq_data(i); diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 07d08ca701ec..ca6afa267070 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -76,7 +76,7 @@ extern void __enable_irq(struct irq_desc *desc); #define IRQ_START_COND false extern int irq_activate(struct irq_desc *desc); -extern void irq_activate_and_startup(struct irq_desc *desc, bool resend); +extern int irq_activate_and_startup(struct irq_desc *desc, bool resend); extern int irq_startup(struct irq_desc *desc, bool resend, bool force); extern void irq_shutdown(struct irq_desc *desc); @@ -440,7 +440,7 @@ static inline bool irq_fixup_move_pending(struct irq_desc *desc, bool fclear) #endif /* !CONFIG_GENERIC_PENDING_IRQ */ #if !defined(CONFIG_IRQ_DOMAIN) || !defined(CONFIG_IRQ_DOMAIN_HIERARCHY) -static inline int irq_domain_activate_irq(struct irq_data *data, bool early) +static inline int irq_domain_activate_irq(struct irq_data *data, bool reserve) { irqd_set_activated(data); return 0; diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 4f4f60015e8a..82b8b18ee1eb 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -897,124 +897,6 @@ unsigned int irq_find_mapping(struct irq_domain *domain, } EXPORT_SYMBOL_GPL(irq_find_mapping); -#ifdef CONFIG_IRQ_DOMAIN_DEBUG -static void virq_debug_show_one(struct seq_file *m, struct irq_desc *desc) -{ - struct irq_domain *domain; - struct irq_data *data; - - domain = desc->irq_data.domain; - data = &desc->irq_data; - - while (domain) { - unsigned int irq = data->irq; - unsigned long hwirq = data->hwirq; - struct irq_chip *chip; - bool direct; - - if (data == &desc->irq_data) - seq_printf(m, "%5d ", irq); - else - seq_printf(m, "%5d+ ", irq); - seq_printf(m, "0x%05lx ", hwirq); - - chip = irq_data_get_irq_chip(data); - seq_printf(m, "%-15s ", (chip && chip->name) ? chip->name : "none"); - - seq_printf(m, "0x%p ", irq_data_get_irq_chip_data(data)); - - seq_printf(m, " %c ", (desc->action && desc->action->handler) ? '*' : ' '); - direct = (irq == hwirq) && (irq < domain->revmap_direct_max_irq); - seq_printf(m, "%6s%-8s ", - (hwirq < domain->revmap_size) ? "LINEAR" : "RADIX", - direct ? "(DIRECT)" : ""); - seq_printf(m, "%s\n", domain->name); -#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY - domain = domain->parent; - data = data->parent_data; -#else - domain = NULL; -#endif - } -} - -static int virq_debug_show(struct seq_file *m, void *private) -{ - unsigned long flags; - struct irq_desc *desc; - struct irq_domain *domain; - struct radix_tree_iter iter; - void __rcu **slot; - int i; - - seq_printf(m, " %-16s %-6s %-10s %-10s %s\n", - "name", "mapped", "linear-max", "direct-max", "devtree-node"); - mutex_lock(&irq_domain_mutex); - list_for_each_entry(domain, &irq_domain_list, link) { - struct device_node *of_node; - const char *name; - - int count = 0; - - of_node = irq_domain_get_of_node(domain); - if (of_node) - name = of_node_full_name(of_node); - else if (is_fwnode_irqchip(domain->fwnode)) - name = container_of(domain->fwnode, struct irqchip_fwid, - fwnode)->name; - else - name = ""; - - radix_tree_for_each_slot(slot, &domain->revmap_tree, &iter, 0) - count++; - seq_printf(m, "%c%-16s %6u %10u %10u %s\n", - domain == irq_default_domain ? '*' : ' ', domain->name, - domain->revmap_size + count, domain->revmap_size, - domain->revmap_direct_max_irq, - name); - } - mutex_unlock(&irq_domain_mutex); - - seq_printf(m, "%-5s %-7s %-15s %-*s %6s %-14s %s\n", "irq", "hwirq", - "chip name", (int)(2 * sizeof(void *) + 2), "chip data", - "active", "type", "domain"); - - for (i = 1; i < nr_irqs; i++) { - desc = irq_to_desc(i); - if (!desc) - continue; - - raw_spin_lock_irqsave(&desc->lock, flags); - virq_debug_show_one(m, desc); - raw_spin_unlock_irqrestore(&desc->lock, flags); - } - - return 0; -} - -static int virq_debug_open(struct inode *inode, struct file *file) -{ - return single_open(file, virq_debug_show, inode->i_private); -} - -static const struct file_operations virq_debug_fops = { - .open = virq_debug_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - -static int __init irq_debugfs_init(void) -{ - if (debugfs_create_file("irq_domain_mapping", S_IRUGO, NULL, - NULL, &virq_debug_fops) == NULL) - return -ENOMEM; - - return 0; -} -__initcall(irq_debugfs_init); -#endif /* CONFIG_IRQ_DOMAIN_DEBUG */ - /** * irq_domain_xlate_onecell() - Generic xlate for direct one cell bindings * @@ -1693,7 +1575,7 @@ static void __irq_domain_deactivate_irq(struct irq_data *irq_data) } } -static int __irq_domain_activate_irq(struct irq_data *irqd, bool early) +static int __irq_domain_activate_irq(struct irq_data *irqd, bool reserve) { int ret = 0; @@ -1702,9 +1584,9 @@ static int __irq_domain_activate_irq(struct irq_data *irqd, bool early) if (irqd->parent_data) ret = __irq_domain_activate_irq(irqd->parent_data, - early); + reserve); if (!ret && domain->ops->activate) { - ret = domain->ops->activate(domain, irqd, early); + ret = domain->ops->activate(domain, irqd, reserve); /* Rollback in case of error */ if (ret && irqd->parent_data) __irq_domain_deactivate_irq(irqd->parent_data); @@ -1716,17 +1598,18 @@ static int __irq_domain_activate_irq(struct irq_data *irqd, bool early) /** * irq_domain_activate_irq - Call domain_ops->activate recursively to activate * interrupt - * @irq_data: outermost irq_data associated with interrupt + * @irq_data: Outermost irq_data associated with interrupt + * @reserve: If set only reserve an interrupt vector instead of assigning one * * This is the second step to call domain_ops->activate to program interrupt * controllers, so the interrupt could actually get delivered. */ -int irq_domain_activate_irq(struct irq_data *irq_data, bool early) +int irq_domain_activate_irq(struct irq_data *irq_data, bool reserve) { int ret = 0; if (!irqd_is_activated(irq_data)) - ret = __irq_domain_activate_irq(irq_data, early); + ret = __irq_domain_activate_irq(irq_data, reserve); if (!ret) irqd_set_activated(irq_data); return ret; @@ -1843,25 +1726,14 @@ static int irq_domain_debug_show(struct seq_file *m, void *p) irq_domain_debug_show_one(m, d, 0); return 0; } - -static int irq_domain_debug_open(struct inode *inode, struct file *file) -{ - return single_open(file, irq_domain_debug_show, inode->i_private); -} - -static const struct file_operations dfs_domain_ops = { - .open = irq_domain_debug_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; +DEFINE_SHOW_ATTRIBUTE(irq_domain_debug); static void debugfs_add_domain_dir(struct irq_domain *d) { if (!d->name || !domain_dir || d->debugfs_file) return; d->debugfs_file = debugfs_create_file(d->name, 0444, domain_dir, d, - &dfs_domain_ops); + &irq_domain_debug_fops); } static void debugfs_remove_domain_dir(struct irq_domain *d) @@ -1877,7 +1749,8 @@ void __init irq_domain_debugfs_init(struct dentry *root) if (!domain_dir) return; - debugfs_create_file("default", 0444, domain_dir, NULL, &dfs_domain_ops); + debugfs_create_file("default", 0444, domain_dir, NULL, + &irq_domain_debug_fops); mutex_lock(&irq_domain_mutex); list_for_each_entry(d, &irq_domain_list, link) debugfs_add_domain_dir(d); diff --git a/kernel/irq/matrix.c b/kernel/irq/matrix.c index 7df2480005f8..5187dfe809ac 100644 --- a/kernel/irq/matrix.c +++ b/kernel/irq/matrix.c @@ -321,15 +321,23 @@ void irq_matrix_remove_reserved(struct irq_matrix *m) int irq_matrix_alloc(struct irq_matrix *m, const struct cpumask *msk, bool reserved, unsigned int *mapped_cpu) { - unsigned int cpu; + unsigned int cpu, best_cpu, maxavl = 0; + struct cpumap *cm; + unsigned int bit; + best_cpu = UINT_MAX; for_each_cpu(cpu, msk) { - struct cpumap *cm = per_cpu_ptr(m->maps, cpu); - unsigned int bit; + cm = per_cpu_ptr(m->maps, cpu); - if (!cm->online) + if (!cm->online || cm->available <= maxavl) continue; + best_cpu = cpu; + maxavl = cm->available; + } + + if (maxavl) { + cm = per_cpu_ptr(m->maps, best_cpu); bit = matrix_alloc_area(m, cm, 1, false); if (bit < m->alloc_end) { cm->allocated++; @@ -338,8 +346,8 @@ int irq_matrix_alloc(struct irq_matrix *m, const struct cpumask *msk, m->global_available--; if (reserved) m->global_reserved--; - *mapped_cpu = cpu; - trace_irq_matrix_alloc(bit, cpu, m, cm); + *mapped_cpu = best_cpu; + trace_irq_matrix_alloc(bit, best_cpu, m, cm); return bit; } } @@ -384,7 +392,9 @@ unsigned int irq_matrix_available(struct irq_matrix *m, bool cpudown) { struct cpumap *cm = this_cpu_ptr(m->maps); - return (m->global_available - cpudown) ? cm->available : 0; + if (!cpudown) + return m->global_available; + return m->global_available - cm->available; } /** diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index edb987b2c58d..2f3c4f5382cc 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -339,6 +339,40 @@ int msi_domain_populate_irqs(struct irq_domain *domain, struct device *dev, return ret; } +/* + * Carefully check whether the device can use reservation mode. If + * reservation mode is enabled then the early activation will assign a + * dummy vector to the device. If the PCI/MSI device does not support + * masking of the entry then this can result in spurious interrupts when + * the device driver is not absolutely careful. But even then a malfunction + * of the hardware could result in a spurious interrupt on the dummy vector + * and render the device unusable. If the entry can be masked then the core + * logic will prevent the spurious interrupt and reservation mode can be + * used. For now reservation mode is restricted to PCI/MSI. + */ +static bool msi_check_reservation_mode(struct irq_domain *domain, + struct msi_domain_info *info, + struct device *dev) +{ + struct msi_desc *desc; + + if (domain->bus_token != DOMAIN_BUS_PCI_MSI) + return false; + + if (!(info->flags & MSI_FLAG_MUST_REACTIVATE)) + return false; + + if (IS_ENABLED(CONFIG_PCI_MSI) && pci_msi_ignore_mask) + return false; + + /* + * Checking the first MSI descriptor is sufficient. MSIX supports + * masking and MSI does so when the maskbit is set. + */ + desc = first_msi_entry(dev); + return desc->msi_attrib.is_msix || desc->msi_attrib.maskbit; +} + /** * msi_domain_alloc_irqs - Allocate interrupts from a MSI interrupt domain * @domain: The domain to allocate from @@ -353,9 +387,11 @@ int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev, { struct msi_domain_info *info = domain->host_data; struct msi_domain_ops *ops = info->ops; - msi_alloc_info_t arg; + struct irq_data *irq_data; struct msi_desc *desc; + msi_alloc_info_t arg; int i, ret, virq; + bool can_reserve; ret = msi_domain_prepare_irqs(domain, dev, nvec, &arg); if (ret) @@ -385,6 +421,8 @@ int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev, if (ops->msi_finish) ops->msi_finish(&arg, 0); + can_reserve = msi_check_reservation_mode(domain, info, dev); + for_each_msi_entry(desc, dev) { virq = desc->irq; if (desc->nvec_used == 1) @@ -397,15 +435,25 @@ int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev, * the MSI entries before the PCI layer enables MSI in the * card. Otherwise the card latches a random msi message. */ - if (info->flags & MSI_FLAG_ACTIVATE_EARLY) { - struct irq_data *irq_data; + if (!(info->flags & MSI_FLAG_ACTIVATE_EARLY)) + continue; + irq_data = irq_domain_get_irq_data(domain, desc->irq); + if (!can_reserve) + irqd_clr_can_reserve(irq_data); + ret = irq_domain_activate_irq(irq_data, can_reserve); + if (ret) + goto cleanup; + } + + /* + * If these interrupts use reservation mode, clear the activated bit + * so request_irq() will assign the final vector. + */ + if (can_reserve) { + for_each_msi_entry(desc, dev) { irq_data = irq_domain_get_irq_data(domain, desc->irq); - ret = irq_domain_activate_irq(irq_data, true); - if (ret) - goto cleanup; - if (info->flags & MSI_FLAG_MUST_REACTIVATE) - irqd_clr_activated(irq_data); + irqd_clr_activated(irq_data); } } return 0; diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index ef2a47e0eab6..6cdecc6f4c53 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -10,7 +10,6 @@ #include <linux/jiffies.h> #include <linux/irq.h> #include <linux/module.h> -#include <linux/kallsyms.h> #include <linux/interrupt.h> #include <linux/moduleparam.h> #include <linux/timer.h> diff --git a/kernel/irq_work.c b/kernel/irq_work.c index 40e9d739c169..6b7cdf17ccf8 100644 --- a/kernel/irq_work.c +++ b/kernel/irq_work.c @@ -36,7 +36,7 @@ static bool irq_work_claim(struct irq_work *work) */ flags = work->flags & ~IRQ_WORK_PENDING; for (;;) { - nflags = flags | IRQ_WORK_FLAGS; + nflags = flags | IRQ_WORK_CLAIMED; oflags = cmpxchg(&work->flags, flags, nflags); if (oflags == flags) break; diff --git a/kernel/jump_label.c b/kernel/jump_label.c index 8594d24e4adc..b4517095db6a 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -79,7 +79,7 @@ int static_key_count(struct static_key *key) } EXPORT_SYMBOL_GPL(static_key_count); -static void static_key_slow_inc_cpuslocked(struct static_key *key) +void static_key_slow_inc_cpuslocked(struct static_key *key) { int v, v1; @@ -180,7 +180,7 @@ void static_key_disable(struct static_key *key) } EXPORT_SYMBOL_GPL(static_key_disable); -static void static_key_slow_dec_cpuslocked(struct static_key *key, +static void __static_key_slow_dec_cpuslocked(struct static_key *key, unsigned long rate_limit, struct delayed_work *work) { @@ -211,7 +211,7 @@ static void __static_key_slow_dec(struct static_key *key, struct delayed_work *work) { cpus_read_lock(); - static_key_slow_dec_cpuslocked(key, rate_limit, work); + __static_key_slow_dec_cpuslocked(key, rate_limit, work); cpus_read_unlock(); } @@ -229,6 +229,12 @@ void static_key_slow_dec(struct static_key *key) } EXPORT_SYMBOL_GPL(static_key_slow_dec); +void static_key_slow_dec_cpuslocked(struct static_key *key) +{ + STATIC_KEY_CHECK_USE(key); + __static_key_slow_dec_cpuslocked(key, 0, NULL); +} + void static_key_slow_dec_deferred(struct static_key_deferred *key) { STATIC_KEY_CHECK_USE(key); diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 531ffa984bc2..a23e21ada81b 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -12,7 +12,6 @@ * compression (see scripts/kallsyms.c for a more complete description) */ #include <linux/kallsyms.h> -#include <linux/module.h> #include <linux/init.h> #include <linux/seq_file.h> #include <linux/fs.h> @@ -20,15 +19,12 @@ #include <linux/err.h> #include <linux/proc_fs.h> #include <linux/sched.h> /* for cond_resched */ -#include <linux/mm.h> #include <linux/ctype.h> #include <linux/slab.h> #include <linux/filter.h> #include <linux/ftrace.h> #include <linux/compiler.h> -#include <asm/sections.h> - /* * These will be re-linked against their real values * during the second link stage. @@ -52,37 +48,6 @@ extern const u16 kallsyms_token_index[] __weak; extern const unsigned long kallsyms_markers[] __weak; -static inline int is_kernel_inittext(unsigned long addr) -{ - if (addr >= (unsigned long)_sinittext - && addr <= (unsigned long)_einittext) - return 1; - return 0; -} - -static inline int is_kernel_text(unsigned long addr) -{ - if ((addr >= (unsigned long)_stext && addr <= (unsigned long)_etext) || - arch_is_kernel_text(addr)) - return 1; - return in_gate_area_no_mm(addr); -} - -static inline int is_kernel(unsigned long addr) -{ - if (addr >= (unsigned long)_stext && addr <= (unsigned long)_end) - return 1; - return in_gate_area_no_mm(addr); -} - -static int is_ksym_addr(unsigned long addr) -{ - if (IS_ENABLED(CONFIG_KALLSYMS_ALL)) - return is_kernel(addr); - - return is_kernel_text(addr) || is_kernel_inittext(addr); -} - /* * Expand a compressed symbol data into the resulting uncompressed string, * if uncompressed string is too long (>= maxlen), it will be truncated, @@ -464,17 +429,6 @@ int sprint_backtrace(char *buffer, unsigned long address) return __sprint_symbol(buffer, address, -1, 1); } -/* Look up a kernel symbol and print it to the kernel messages. */ -void __print_symbol(const char *fmt, unsigned long address) -{ - char buffer[KSYM_SYMBOL_LEN]; - - sprint_symbol(buffer, address); - - printk(fmt, buffer); -} -EXPORT_SYMBOL(__print_symbol); - /* To avoid using get_symbol_offset for every symbol, we carry prefix along. */ struct kallsym_iter { loff_t pos; @@ -614,14 +568,14 @@ static void s_stop(struct seq_file *m, void *p) static int s_show(struct seq_file *m, void *p) { - unsigned long value; + void *value; struct kallsym_iter *iter = m->private; /* Some debugging symbols have no name. Ignore them. */ if (!iter->name[0]) return 0; - value = iter->show_value ? iter->value : 0; + value = iter->show_value ? (void *)iter->value : NULL; if (iter->module_name[0]) { char type; @@ -632,10 +586,10 @@ static int s_show(struct seq_file *m, void *p) */ type = iter->exported ? toupper(iter->type) : tolower(iter->type); - seq_printf(m, KALLSYM_FMT " %c %s\t[%s]\n", value, + seq_printf(m, "%px %c %s\t[%s]\n", value, type, iter->name, iter->module_name); } else - seq_printf(m, KALLSYM_FMT " %c %s\n", value, + seq_printf(m, "%px %c %s\n", value, iter->type, iter->name); return 0; } diff --git a/kernel/kcov.c b/kernel/kcov.c index 15f33faf4013..2c16f1ab5e10 100644 --- a/kernel/kcov.c +++ b/kernel/kcov.c @@ -157,7 +157,7 @@ void notrace __sanitizer_cov_trace_cmp2(u16 arg1, u16 arg2) } EXPORT_SYMBOL(__sanitizer_cov_trace_cmp2); -void notrace __sanitizer_cov_trace_cmp4(u16 arg1, u16 arg2) +void notrace __sanitizer_cov_trace_cmp4(u32 arg1, u32 arg2) { write_comp_data(KCOV_CMP_SIZE(2), arg1, arg2, _RET_IP_); } @@ -183,7 +183,7 @@ void notrace __sanitizer_cov_trace_const_cmp2(u16 arg1, u16 arg2) } EXPORT_SYMBOL(__sanitizer_cov_trace_const_cmp2); -void notrace __sanitizer_cov_trace_const_cmp4(u16 arg1, u16 arg2) +void notrace __sanitizer_cov_trace_const_cmp4(u32 arg1, u32 arg2) { write_comp_data(KCOV_CMP_SIZE(2) | KCOV_CMP_CONST, arg1, arg2, _RET_IP_); @@ -358,7 +358,8 @@ static int kcov_ioctl_locked(struct kcov *kcov, unsigned int cmd, */ if (kcov->mode != KCOV_MODE_INIT || !kcov->area) return -EINVAL; - if (kcov->t != NULL) + t = current; + if (kcov->t != NULL || t->kcov != NULL) return -EBUSY; if (arg == KCOV_TRACE_PC) kcov->mode = KCOV_MODE_TRACE_PC; @@ -370,7 +371,6 @@ static int kcov_ioctl_locked(struct kcov *kcov, unsigned int cmd, #endif else return -EINVAL; - t = current; /* Cache in task struct for performance. */ t->kcov_size = kcov->size; t->kcov_area = kcov->area; diff --git a/kernel/kprobes.c b/kernel/kprobes.c index da2ccf142358..102160ff5c66 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -978,67 +978,90 @@ static int prepare_kprobe(struct kprobe *p) } /* Caller must lock kprobe_mutex */ -static void arm_kprobe_ftrace(struct kprobe *p) +static int arm_kprobe_ftrace(struct kprobe *p) { - int ret; + int ret = 0; ret = ftrace_set_filter_ip(&kprobe_ftrace_ops, (unsigned long)p->addr, 0, 0); - WARN(ret < 0, "Failed to arm kprobe-ftrace at %p (%d)\n", p->addr, ret); - kprobe_ftrace_enabled++; - if (kprobe_ftrace_enabled == 1) { + if (ret) { + pr_debug("Failed to arm kprobe-ftrace at %p (%d)\n", p->addr, ret); + return ret; + } + + if (kprobe_ftrace_enabled == 0) { ret = register_ftrace_function(&kprobe_ftrace_ops); - WARN(ret < 0, "Failed to init kprobe-ftrace (%d)\n", ret); + if (ret) { + pr_debug("Failed to init kprobe-ftrace (%d)\n", ret); + goto err_ftrace; + } } + + kprobe_ftrace_enabled++; + return ret; + +err_ftrace: + /* + * Note: Since kprobe_ftrace_ops has IPMODIFY set, and ftrace requires a + * non-empty filter_hash for IPMODIFY ops, we're safe from an accidental + * empty filter_hash which would undesirably trace all functions. + */ + ftrace_set_filter_ip(&kprobe_ftrace_ops, (unsigned long)p->addr, 1, 0); + return ret; } /* Caller must lock kprobe_mutex */ -static void disarm_kprobe_ftrace(struct kprobe *p) +static int disarm_kprobe_ftrace(struct kprobe *p) { - int ret; + int ret = 0; - kprobe_ftrace_enabled--; - if (kprobe_ftrace_enabled == 0) { + if (kprobe_ftrace_enabled == 1) { ret = unregister_ftrace_function(&kprobe_ftrace_ops); - WARN(ret < 0, "Failed to init kprobe-ftrace (%d)\n", ret); + if (WARN(ret < 0, "Failed to unregister kprobe-ftrace (%d)\n", ret)) + return ret; } + + kprobe_ftrace_enabled--; + ret = ftrace_set_filter_ip(&kprobe_ftrace_ops, (unsigned long)p->addr, 1, 0); WARN(ret < 0, "Failed to disarm kprobe-ftrace at %p (%d)\n", p->addr, ret); + return ret; } #else /* !CONFIG_KPROBES_ON_FTRACE */ #define prepare_kprobe(p) arch_prepare_kprobe(p) -#define arm_kprobe_ftrace(p) do {} while (0) -#define disarm_kprobe_ftrace(p) do {} while (0) +#define arm_kprobe_ftrace(p) (-ENODEV) +#define disarm_kprobe_ftrace(p) (-ENODEV) #endif /* Arm a kprobe with text_mutex */ -static void arm_kprobe(struct kprobe *kp) +static int arm_kprobe(struct kprobe *kp) { - if (unlikely(kprobe_ftrace(kp))) { - arm_kprobe_ftrace(kp); - return; - } + if (unlikely(kprobe_ftrace(kp))) + return arm_kprobe_ftrace(kp); + cpus_read_lock(); mutex_lock(&text_mutex); __arm_kprobe(kp); mutex_unlock(&text_mutex); cpus_read_unlock(); + + return 0; } /* Disarm a kprobe with text_mutex */ -static void disarm_kprobe(struct kprobe *kp, bool reopt) +static int disarm_kprobe(struct kprobe *kp, bool reopt) { - if (unlikely(kprobe_ftrace(kp))) { - disarm_kprobe_ftrace(kp); - return; - } + if (unlikely(kprobe_ftrace(kp))) + return disarm_kprobe_ftrace(kp); cpus_read_lock(); mutex_lock(&text_mutex); __disarm_kprobe(kp, reopt); mutex_unlock(&text_mutex); cpus_read_unlock(); + + return 0; } /* @@ -1362,9 +1385,15 @@ out: if (ret == 0 && kprobe_disabled(ap) && !kprobe_disabled(p)) { ap->flags &= ~KPROBE_FLAG_DISABLED; - if (!kprobes_all_disarmed) + if (!kprobes_all_disarmed) { /* Arm the breakpoint again. */ - arm_kprobe(ap); + ret = arm_kprobe(ap); + if (ret) { + ap->flags |= KPROBE_FLAG_DISABLED; + list_del_rcu(&p->list); + synchronize_sched(); + } + } } return ret; } @@ -1573,8 +1602,14 @@ int register_kprobe(struct kprobe *p) hlist_add_head_rcu(&p->hlist, &kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]); - if (!kprobes_all_disarmed && !kprobe_disabled(p)) - arm_kprobe(p); + if (!kprobes_all_disarmed && !kprobe_disabled(p)) { + ret = arm_kprobe(p); + if (ret) { + hlist_del_rcu(&p->hlist); + synchronize_sched(); + goto out; + } + } /* Try to optimize kprobe */ try_to_optimize_kprobe(p); @@ -1608,11 +1643,12 @@ static int aggr_kprobe_disabled(struct kprobe *ap) static struct kprobe *__disable_kprobe(struct kprobe *p) { struct kprobe *orig_p; + int ret; /* Get an original kprobe for return */ orig_p = __get_valid_kprobe(p); if (unlikely(orig_p == NULL)) - return NULL; + return ERR_PTR(-EINVAL); if (!kprobe_disabled(p)) { /* Disable probe if it is a child probe */ @@ -1626,8 +1662,13 @@ static struct kprobe *__disable_kprobe(struct kprobe *p) * should have already been disarmed, so * skip unneed disarming process. */ - if (!kprobes_all_disarmed) - disarm_kprobe(orig_p, true); + if (!kprobes_all_disarmed) { + ret = disarm_kprobe(orig_p, true); + if (ret) { + p->flags &= ~KPROBE_FLAG_DISABLED; + return ERR_PTR(ret); + } + } orig_p->flags |= KPROBE_FLAG_DISABLED; } } @@ -1644,8 +1685,8 @@ static int __unregister_kprobe_top(struct kprobe *p) /* Disable kprobe. This will disarm it if needed. */ ap = __disable_kprobe(p); - if (ap == NULL) - return -EINVAL; + if (IS_ERR(ap)) + return PTR_ERR(ap); if (ap == p) /* @@ -2078,12 +2119,14 @@ static void kill_kprobe(struct kprobe *p) int disable_kprobe(struct kprobe *kp) { int ret = 0; + struct kprobe *p; mutex_lock(&kprobe_mutex); /* Disable this kprobe */ - if (__disable_kprobe(kp) == NULL) - ret = -EINVAL; + p = __disable_kprobe(kp); + if (IS_ERR(p)) + ret = PTR_ERR(p); mutex_unlock(&kprobe_mutex); return ret; @@ -2116,7 +2159,9 @@ int enable_kprobe(struct kprobe *kp) if (!kprobes_all_disarmed && kprobe_disabled(p)) { p->flags &= ~KPROBE_FLAG_DISABLED; - arm_kprobe(p); + ret = arm_kprobe(p); + if (ret) + p->flags |= KPROBE_FLAG_DISABLED; } out: mutex_unlock(&kprobe_mutex); @@ -2407,11 +2452,12 @@ static const struct file_operations debugfs_kprobe_blacklist_ops = { .release = seq_release, }; -static void arm_all_kprobes(void) +static int arm_all_kprobes(void) { struct hlist_head *head; struct kprobe *p; - unsigned int i; + unsigned int i, total = 0, errors = 0; + int err, ret = 0; mutex_lock(&kprobe_mutex); @@ -2428,46 +2474,74 @@ static void arm_all_kprobes(void) /* Arming kprobes doesn't optimize kprobe itself */ for (i = 0; i < KPROBE_TABLE_SIZE; i++) { head = &kprobe_table[i]; - hlist_for_each_entry_rcu(p, head, hlist) - if (!kprobe_disabled(p)) - arm_kprobe(p); + /* Arm all kprobes on a best-effort basis */ + hlist_for_each_entry_rcu(p, head, hlist) { + if (!kprobe_disabled(p)) { + err = arm_kprobe(p); + if (err) { + errors++; + ret = err; + } + total++; + } + } } - printk(KERN_INFO "Kprobes globally enabled\n"); + if (errors) + pr_warn("Kprobes globally enabled, but failed to arm %d out of %d probes\n", + errors, total); + else + pr_info("Kprobes globally enabled\n"); already_enabled: mutex_unlock(&kprobe_mutex); - return; + return ret; } -static void disarm_all_kprobes(void) +static int disarm_all_kprobes(void) { struct hlist_head *head; struct kprobe *p; - unsigned int i; + unsigned int i, total = 0, errors = 0; + int err, ret = 0; mutex_lock(&kprobe_mutex); /* If kprobes are already disarmed, just return */ if (kprobes_all_disarmed) { mutex_unlock(&kprobe_mutex); - return; + return 0; } kprobes_all_disarmed = true; - printk(KERN_INFO "Kprobes globally disabled\n"); for (i = 0; i < KPROBE_TABLE_SIZE; i++) { head = &kprobe_table[i]; + /* Disarm all kprobes on a best-effort basis */ hlist_for_each_entry_rcu(p, head, hlist) { - if (!arch_trampoline_kprobe(p) && !kprobe_disabled(p)) - disarm_kprobe(p, false); + if (!arch_trampoline_kprobe(p) && !kprobe_disabled(p)) { + err = disarm_kprobe(p, false); + if (err) { + errors++; + ret = err; + } + total++; + } } } + + if (errors) + pr_warn("Kprobes globally disabled, but failed to disarm %d out of %d probes\n", + errors, total); + else + pr_info("Kprobes globally disabled\n"); + mutex_unlock(&kprobe_mutex); /* Wait for disarming all kprobes by optimizer */ wait_for_kprobe_optimizer(); + + return ret; } /* @@ -2494,6 +2568,7 @@ static ssize_t write_enabled_file_bool(struct file *file, { char buf[32]; size_t buf_size; + int ret = 0; buf_size = min(count, (sizeof(buf)-1)); if (copy_from_user(buf, user_buf, buf_size)) @@ -2504,17 +2579,20 @@ static ssize_t write_enabled_file_bool(struct file *file, case 'y': case 'Y': case '1': - arm_all_kprobes(); + ret = arm_all_kprobes(); break; case 'n': case 'N': case '0': - disarm_all_kprobes(); + ret = disarm_all_kprobes(); break; default: return -EINVAL; } + if (ret) + return ret; + return count; } diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index de9e45dca70f..3a4656fb7047 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c @@ -366,11 +366,6 @@ static int __klp_enable_patch(struct klp_patch *patch) /* * A reference is taken on the patch module to prevent it from being * unloaded. - * - * Note: For immediate (no consistency model) patches we don't allow - * patch modules to unload since there is no safe/sane method to - * determine if a thread is still running in the patched code contained - * in the patch module once the ftrace registration is successful. */ if (!try_module_get(patch->mod)) return -ENODEV; @@ -454,6 +449,8 @@ EXPORT_SYMBOL_GPL(klp_enable_patch); * /sys/kernel/livepatch/<patch> * /sys/kernel/livepatch/<patch>/enabled * /sys/kernel/livepatch/<patch>/transition + * /sys/kernel/livepatch/<patch>/signal + * /sys/kernel/livepatch/<patch>/force * /sys/kernel/livepatch/<patch>/<object> * /sys/kernel/livepatch/<patch>/<object>/<function,sympos> */ @@ -528,11 +525,73 @@ static ssize_t transition_show(struct kobject *kobj, patch == klp_transition_patch); } +static ssize_t signal_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct klp_patch *patch; + int ret; + bool val; + + ret = kstrtobool(buf, &val); + if (ret) + return ret; + + if (!val) + return count; + + mutex_lock(&klp_mutex); + + patch = container_of(kobj, struct klp_patch, kobj); + if (patch != klp_transition_patch) { + mutex_unlock(&klp_mutex); + return -EINVAL; + } + + klp_send_signals(); + + mutex_unlock(&klp_mutex); + + return count; +} + +static ssize_t force_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct klp_patch *patch; + int ret; + bool val; + + ret = kstrtobool(buf, &val); + if (ret) + return ret; + + if (!val) + return count; + + mutex_lock(&klp_mutex); + + patch = container_of(kobj, struct klp_patch, kobj); + if (patch != klp_transition_patch) { + mutex_unlock(&klp_mutex); + return -EINVAL; + } + + klp_force_transition(); + + mutex_unlock(&klp_mutex); + + return count; +} + static struct kobj_attribute enabled_kobj_attr = __ATTR_RW(enabled); static struct kobj_attribute transition_kobj_attr = __ATTR_RO(transition); +static struct kobj_attribute signal_kobj_attr = __ATTR_WO(signal); +static struct kobj_attribute force_kobj_attr = __ATTR_WO(force); static struct attribute *klp_patch_attrs[] = { &enabled_kobj_attr.attr, &transition_kobj_attr.attr, + &signal_kobj_attr.attr, + &force_kobj_attr.attr, NULL }; @@ -830,12 +889,7 @@ int klp_register_patch(struct klp_patch *patch) if (!klp_initialized()) return -ENODEV; - /* - * Architectures without reliable stack traces have to set - * patch->immediate because there's currently no way to patch kthreads - * with the consistency model. - */ - if (!klp_have_reliable_stack() && !patch->immediate) { + if (!klp_have_reliable_stack()) { pr_err("This architecture doesn't have support for the livepatch consistency model.\n"); return -ENOSYS; } diff --git a/kernel/livepatch/transition.c b/kernel/livepatch/transition.c index 56add6327736..7c6631e693bc 100644 --- a/kernel/livepatch/transition.c +++ b/kernel/livepatch/transition.c @@ -33,6 +33,8 @@ struct klp_patch *klp_transition_patch; static int klp_target_state = KLP_UNDEFINED; +static bool klp_forced = false; + /* * This work can be performed periodically to finish patching or unpatching any * "straggler" tasks which failed to transition in the first attempt. @@ -80,7 +82,6 @@ static void klp_complete_transition(void) struct klp_func *func; struct task_struct *g, *task; unsigned int cpu; - bool immediate_func = false; pr_debug("'%s': completing %s transition\n", klp_transition_patch->mod->name, @@ -102,16 +103,9 @@ static void klp_complete_transition(void) klp_synchronize_transition(); } - if (klp_transition_patch->immediate) - goto done; - - klp_for_each_object(klp_transition_patch, obj) { - klp_for_each_func(obj, func) { + klp_for_each_object(klp_transition_patch, obj) + klp_for_each_func(obj, func) func->transition = false; - if (func->immediate) - immediate_func = true; - } - } /* Prevent klp_ftrace_handler() from seeing KLP_UNDEFINED state */ if (klp_target_state == KLP_PATCHED) @@ -130,7 +124,6 @@ static void klp_complete_transition(void) task->patch_state = KLP_UNDEFINED; } -done: klp_for_each_object(klp_transition_patch, obj) { if (!klp_is_object_loaded(obj)) continue; @@ -144,13 +137,11 @@ done: klp_target_state == KLP_PATCHED ? "patching" : "unpatching"); /* - * See complementary comment in __klp_enable_patch() for why we - * keep the module reference for immediate patches. + * klp_forced set implies unbounded increase of module's ref count if + * the module is disabled/enabled in a loop. */ - if (!klp_transition_patch->immediate && !immediate_func && - klp_target_state == KLP_UNPATCHED) { + if (!klp_forced && klp_target_state == KLP_UNPATCHED) module_put(klp_transition_patch->mod); - } klp_target_state = KLP_UNDEFINED; klp_transition_patch = NULL; @@ -218,9 +209,6 @@ static int klp_check_stack_func(struct klp_func *func, struct klp_ops *ops; int i; - if (func->immediate) - return 0; - for (i = 0; i < trace->nr_entries; i++) { address = trace->entries[i]; @@ -383,13 +371,6 @@ void klp_try_complete_transition(void) WARN_ON_ONCE(klp_target_state == KLP_UNDEFINED); /* - * If the patch can be applied or reverted immediately, skip the - * per-task transitions. - */ - if (klp_transition_patch->immediate) - goto success; - - /* * Try to switch the tasks to the target patch state by walking their * stacks and looking for any to-be-patched or to-be-unpatched * functions. If such functions are found on a stack, or if the stack @@ -432,7 +413,6 @@ void klp_try_complete_transition(void) return; } -success: /* we're done, now cleanup the data structures */ klp_complete_transition(); } @@ -453,13 +433,6 @@ void klp_start_transition(void) klp_target_state == KLP_PATCHED ? "patching" : "unpatching"); /* - * If the patch can be applied or reverted immediately, skip the - * per-task transitions. - */ - if (klp_transition_patch->immediate) - return; - - /* * Mark all normal tasks as needing a patch state update. They'll * switch either in klp_try_complete_transition() or as they exit the * kernel. @@ -509,13 +482,6 @@ void klp_init_transition(struct klp_patch *patch, int state) klp_target_state == KLP_PATCHED ? "patching" : "unpatching"); /* - * If the patch can be applied or reverted immediately, skip the - * per-task transitions. - */ - if (patch->immediate) - return; - - /* * Initialize all tasks to the initial patch state to prepare them for * switching to the target state. */ @@ -608,3 +574,71 @@ void klp_copy_process(struct task_struct *child) /* TIF_PATCH_PENDING gets copied in setup_thread_stack() */ } + +/* + * Sends a fake signal to all non-kthread tasks with TIF_PATCH_PENDING set. + * Kthreads with TIF_PATCH_PENDING set are woken up. Only admin can request this + * action currently. + */ +void klp_send_signals(void) +{ + struct task_struct *g, *task; + + pr_notice("signaling remaining tasks\n"); + + read_lock(&tasklist_lock); + for_each_process_thread(g, task) { + if (!klp_patch_pending(task)) + continue; + + /* + * There is a small race here. We could see TIF_PATCH_PENDING + * set and decide to wake up a kthread or send a fake signal. + * Meanwhile the task could migrate itself and the action + * would be meaningless. It is not serious though. + */ + if (task->flags & PF_KTHREAD) { + /* + * Wake up a kthread which sleeps interruptedly and + * still has not been migrated. + */ + wake_up_state(task, TASK_INTERRUPTIBLE); + } else { + /* + * Send fake signal to all non-kthread tasks which are + * still not migrated. + */ + spin_lock_irq(&task->sighand->siglock); + signal_wake_up(task, 0); + spin_unlock_irq(&task->sighand->siglock); + } + } + read_unlock(&tasklist_lock); +} + +/* + * Drop TIF_PATCH_PENDING of all tasks on admin's request. This forces an + * existing transition to finish. + * + * NOTE: klp_update_patch_state(task) requires the task to be inactive or + * 'current'. This is not the case here and the consistency model could be + * broken. Administrator, who is the only one to execute the + * klp_force_transitions(), has to be aware of this. + */ +void klp_force_transition(void) +{ + struct task_struct *g, *task; + unsigned int cpu; + + pr_warn("forcing remaining tasks to the patched state\n"); + + read_lock(&tasklist_lock); + for_each_process_thread(g, task) + klp_update_patch_state(task); + read_unlock(&tasklist_lock); + + for_each_possible_cpu(cpu) + klp_update_patch_state(idle_task(cpu)); + + klp_forced = true; +} diff --git a/kernel/livepatch/transition.h b/kernel/livepatch/transition.h index 0f6e27c481f9..f9d0bc016067 100644 --- a/kernel/livepatch/transition.h +++ b/kernel/livepatch/transition.h @@ -11,5 +11,7 @@ void klp_cancel_transition(void); void klp_start_transition(void); void klp_try_complete_transition(void); void klp_reverse_transition(void); +void klp_send_signals(void); +void klp_force_transition(void); #endif /* _LIVEPATCH_TRANSITION_H */ diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 9776da8db180..89b5f83f1969 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -49,6 +49,7 @@ #include <linux/gfp.h> #include <linux/random.h> #include <linux/jhash.h> +#include <linux/nmi.h> #include <asm/sections.h> @@ -57,10 +58,6 @@ #define CREATE_TRACE_POINTS #include <trace/events/lock.h> -#ifdef CONFIG_LOCKDEP_CROSSRELEASE -#include <linux/slab.h> -#endif - #ifdef CONFIG_PROVE_LOCKING int prove_locking = 1; module_param(prove_locking, int, 0644); @@ -75,19 +72,6 @@ module_param(lock_stat, int, 0644); #define lock_stat 0 #endif -#ifdef CONFIG_BOOTPARAM_LOCKDEP_CROSSRELEASE_FULLSTACK -static int crossrelease_fullstack = 1; -#else -static int crossrelease_fullstack; -#endif -static int __init allow_crossrelease_fullstack(char *str) -{ - crossrelease_fullstack = 1; - return 0; -} - -early_param("crossrelease_fullstack", allow_crossrelease_fullstack); - /* * lockdep_lock: protects the lockdep graph, the hashes and the * class/list/hash allocators. @@ -664,18 +648,12 @@ static int count_matching_names(struct lock_class *new_class) return count + 1; } -/* - * Register a lock's class in the hash-table, if the class is not present - * yet. Otherwise we look it up. We cache the result in the lock object - * itself, so actual lookup of the hash should be once per lock object. - */ static inline struct lock_class * -look_up_lock_class(struct lockdep_map *lock, unsigned int subclass) +look_up_lock_class(const struct lockdep_map *lock, unsigned int subclass) { struct lockdep_subclass_key *key; struct hlist_head *hash_head; struct lock_class *class; - bool is_static = false; if (unlikely(subclass >= MAX_LOCKDEP_SUBCLASSES)) { debug_locks_off(); @@ -688,24 +666,11 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass) } /* - * Static locks do not have their class-keys yet - for them the key - * is the lock object itself. If the lock is in the per cpu area, - * the canonical address of the lock (per cpu offset removed) is - * used. + * If it is not initialised then it has never been locked, + * so it won't be present in the hash table. */ - if (unlikely(!lock->key)) { - unsigned long can_addr, addr = (unsigned long)lock; - - if (__is_kernel_percpu_address(addr, &can_addr)) - lock->key = (void *)can_addr; - else if (__is_module_percpu_address(addr, &can_addr)) - lock->key = (void *)can_addr; - else if (static_obj(lock)) - lock->key = (void *)lock; - else - return ERR_PTR(-EINVAL); - is_static = true; - } + if (unlikely(!lock->key)) + return NULL; /* * NOTE: the class-key must be unique. For dynamic locks, a static @@ -737,20 +702,36 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass) } } - return is_static || static_obj(lock->key) ? NULL : ERR_PTR(-EINVAL); + return NULL; } -#ifdef CONFIG_LOCKDEP_CROSSRELEASE -static void cross_init(struct lockdep_map *lock, int cross); -static int cross_lock(struct lockdep_map *lock); -static int lock_acquire_crosslock(struct held_lock *hlock); -static int lock_release_crosslock(struct lockdep_map *lock); -#else -static inline void cross_init(struct lockdep_map *lock, int cross) {} -static inline int cross_lock(struct lockdep_map *lock) { return 0; } -static inline int lock_acquire_crosslock(struct held_lock *hlock) { return 2; } -static inline int lock_release_crosslock(struct lockdep_map *lock) { return 2; } -#endif +/* + * Static locks do not have their class-keys yet - for them the key is + * the lock object itself. If the lock is in the per cpu area, the + * canonical address of the lock (per cpu offset removed) is used. + */ +static bool assign_lock_key(struct lockdep_map *lock) +{ + unsigned long can_addr, addr = (unsigned long)lock; + + if (__is_kernel_percpu_address(addr, &can_addr)) + lock->key = (void *)can_addr; + else if (__is_module_percpu_address(addr, &can_addr)) + lock->key = (void *)can_addr; + else if (static_obj(lock)) + lock->key = (void *)lock; + else { + /* Debug-check: all keys must be persistent! */ + debug_locks_off(); + pr_err("INFO: trying to register non-static key.\n"); + pr_err("the code is fine but needs lockdep annotation.\n"); + pr_err("turning off the locking correctness validator.\n"); + dump_stack(); + return false; + } + + return true; +} /* * Register a lock's class in the hash-table, if the class is not present @@ -767,18 +748,13 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) DEBUG_LOCKS_WARN_ON(!irqs_disabled()); class = look_up_lock_class(lock, subclass); - if (likely(!IS_ERR_OR_NULL(class))) + if (likely(class)) goto out_set_class_cache; - /* - * Debug-check: all keys must be persistent! - */ - if (IS_ERR(class)) { - debug_locks_off(); - printk("INFO: trying to register non-static key.\n"); - printk("the code is fine but needs lockdep annotation.\n"); - printk("turning off the locking correctness validator.\n"); - dump_stack(); + if (!lock->key) { + if (!assign_lock_key(lock)) + return NULL; + } else if (!static_obj(lock->key)) { return NULL; } @@ -1151,41 +1127,22 @@ print_circular_lock_scenario(struct held_lock *src, printk(KERN_CONT "\n\n"); } - if (cross_lock(tgt->instance)) { - printk(" Possible unsafe locking scenario by crosslock:\n\n"); - printk(" CPU0 CPU1\n"); - printk(" ---- ----\n"); - printk(" lock("); - __print_lock_name(parent); - printk(KERN_CONT ");\n"); - printk(" lock("); - __print_lock_name(target); - printk(KERN_CONT ");\n"); - printk(" lock("); - __print_lock_name(source); - printk(KERN_CONT ");\n"); - printk(" unlock("); - __print_lock_name(target); - printk(KERN_CONT ");\n"); - printk("\n *** DEADLOCK ***\n\n"); - } else { - printk(" Possible unsafe locking scenario:\n\n"); - printk(" CPU0 CPU1\n"); - printk(" ---- ----\n"); - printk(" lock("); - __print_lock_name(target); - printk(KERN_CONT ");\n"); - printk(" lock("); - __print_lock_name(parent); - printk(KERN_CONT ");\n"); - printk(" lock("); - __print_lock_name(target); - printk(KERN_CONT ");\n"); - printk(" lock("); - __print_lock_name(source); - printk(KERN_CONT ");\n"); - printk("\n *** DEADLOCK ***\n\n"); - } + printk(" Possible unsafe locking scenario:\n\n"); + printk(" CPU0 CPU1\n"); + printk(" ---- ----\n"); + printk(" lock("); + __print_lock_name(target); + printk(KERN_CONT ");\n"); + printk(" lock("); + __print_lock_name(parent); + printk(KERN_CONT ");\n"); + printk(" lock("); + __print_lock_name(target); + printk(KERN_CONT ");\n"); + printk(" lock("); + __print_lock_name(source); + printk(KERN_CONT ");\n"); + printk("\n *** DEADLOCK ***\n\n"); } /* @@ -1211,10 +1168,7 @@ print_circular_bug_header(struct lock_list *entry, unsigned int depth, curr->comm, task_pid_nr(curr)); print_lock(check_src); - if (cross_lock(check_tgt->instance)) - pr_warn("\nbut now in release context of a crosslock acquired at the following:\n"); - else - pr_warn("\nbut task is already holding lock:\n"); + pr_warn("\nbut task is already holding lock:\n"); print_lock(check_tgt); pr_warn("\nwhich lock already depends on the new lock.\n\n"); @@ -1244,9 +1198,7 @@ static noinline int print_circular_bug(struct lock_list *this, if (!debug_locks_off_graph_unlock() || debug_locks_silent) return 0; - if (cross_lock(check_tgt->instance)) - this->trace = *trace; - else if (!save_trace(&this->trace)) + if (!save_trace(&this->trace)) return 0; depth = get_lock_depth(target); @@ -1850,9 +1802,6 @@ check_deadlock(struct task_struct *curr, struct held_lock *next, if (nest) return 2; - if (cross_lock(prev->instance)) - continue; - return print_deadlock_bug(curr, prev, next); } return 1; @@ -2018,31 +1967,26 @@ check_prevs_add(struct task_struct *curr, struct held_lock *next) for (;;) { int distance = curr->lockdep_depth - depth + 1; hlock = curr->held_locks + depth - 1; + /* - * Only non-crosslock entries get new dependencies added. - * Crosslock entries will be added by commit later: + * Only non-recursive-read entries get new dependencies + * added: */ - if (!cross_lock(hlock->instance)) { + if (hlock->read != 2 && hlock->check) { + int ret = check_prev_add(curr, hlock, next, distance, &trace, save_trace); + if (!ret) + return 0; + /* - * Only non-recursive-read entries get new dependencies - * added: + * Stop after the first non-trylock entry, + * as non-trylock entries have added their + * own direct dependencies already, so this + * lock is connected to them indirectly: */ - if (hlock->read != 2 && hlock->check) { - int ret = check_prev_add(curr, hlock, next, - distance, &trace, save_trace); - if (!ret) - return 0; - - /* - * Stop after the first non-trylock entry, - * as non-trylock entries have added their - * own direct dependencies already, so this - * lock is connected to them indirectly: - */ - if (!hlock->trylock) - break; - } + if (!hlock->trylock) + break; } + depth--; /* * End of lock-stack? @@ -3292,21 +3236,10 @@ static void __lockdep_init_map(struct lockdep_map *lock, const char *name, void lockdep_init_map(struct lockdep_map *lock, const char *name, struct lock_class_key *key, int subclass) { - cross_init(lock, 0); __lockdep_init_map(lock, name, key, subclass); } EXPORT_SYMBOL_GPL(lockdep_init_map); -#ifdef CONFIG_LOCKDEP_CROSSRELEASE -void lockdep_init_map_crosslock(struct lockdep_map *lock, const char *name, - struct lock_class_key *key, int subclass) -{ - cross_init(lock, 1); - __lockdep_init_map(lock, name, key, subclass); -} -EXPORT_SYMBOL_GPL(lockdep_init_map_crosslock); -#endif - struct lock_class_key __lockdep_no_validate__; EXPORT_SYMBOL_GPL(__lockdep_no_validate__); @@ -3344,7 +3277,7 @@ print_lock_nested_lock_not_held(struct task_struct *curr, return 0; } -static int __lock_is_held(struct lockdep_map *lock, int read); +static int __lock_is_held(const struct lockdep_map *lock, int read); /* * This gets called for every mutex_lock*()/spin_lock*() operation. @@ -3362,7 +3295,6 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, int chain_head = 0; int class_idx; u64 chain_key; - int ret; if (unlikely(!debug_locks)) return 0; @@ -3411,8 +3343,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, class_idx = class - lock_classes + 1; - /* TODO: nest_lock is not implemented for crosslock yet. */ - if (depth && !cross_lock(lock)) { + if (depth) { hlock = curr->held_locks + depth - 1; if (hlock->class_idx == class_idx && nest_lock) { if (hlock->references) { @@ -3500,14 +3431,6 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, if (!validate_chain(curr, lock, hlock, chain_head, chain_key)) return 0; - ret = lock_acquire_crosslock(hlock); - /* - * 2 means normal acquire operations are needed. Otherwise, it's - * ok just to return with '0:fail, 1:success'. - */ - if (ret != 2) - return ret; - curr->curr_chain_key = chain_key; curr->lockdep_depth++; check_chain_key(curr); @@ -3563,13 +3486,14 @@ print_unlock_imbalance_bug(struct task_struct *curr, struct lockdep_map *lock, return 0; } -static int match_held_lock(struct held_lock *hlock, struct lockdep_map *lock) +static int match_held_lock(const struct held_lock *hlock, + const struct lockdep_map *lock) { if (hlock->instance == lock) return 1; if (hlock->references) { - struct lock_class *class = lock->class_cache[0]; + const struct lock_class *class = lock->class_cache[0]; if (!class) class = look_up_lock_class(lock, 0); @@ -3580,7 +3504,7 @@ static int match_held_lock(struct held_lock *hlock, struct lockdep_map *lock) * Clearly if the lock hasn't been acquired _ever_, we're not * holding it either, so report failure. */ - if (IS_ERR_OR_NULL(class)) + if (!class) return 0; /* @@ -3745,19 +3669,11 @@ __lock_release(struct lockdep_map *lock, int nested, unsigned long ip) struct task_struct *curr = current; struct held_lock *hlock; unsigned int depth; - int ret, i; + int i; if (unlikely(!debug_locks)) return 0; - ret = lock_release_crosslock(lock); - /* - * 2 means normal release operations are needed. Otherwise, it's - * ok just to return with '0:fail, 1:success'. - */ - if (ret != 2) - return ret; - depth = curr->lockdep_depth; /* * So we're all set to release this lock.. wait what lock? We don't @@ -3813,7 +3729,7 @@ __lock_release(struct lockdep_map *lock, int nested, unsigned long ip) return 1; } -static int __lock_is_held(struct lockdep_map *lock, int read) +static int __lock_is_held(const struct lockdep_map *lock, int read) { struct task_struct *curr = current; int i; @@ -4027,7 +3943,7 @@ void lock_release(struct lockdep_map *lock, int nested, } EXPORT_SYMBOL_GPL(lock_release); -int lock_is_held_type(struct lockdep_map *lock, int read) +int lock_is_held_type(const struct lockdep_map *lock, int read) { unsigned long flags; int ret = 0; @@ -4384,7 +4300,7 @@ void lockdep_reset_lock(struct lockdep_map *lock) * If the class exists we look it up and zap it: */ class = look_up_lock_class(lock, j); - if (!IS_ERR_OR_NULL(class)) + if (class) zap_class(class); } /* @@ -4580,6 +4496,7 @@ retry: if (!unlock) if (read_trylock(&tasklist_lock)) unlock = 1; + touch_nmi_watchdog(); } while_each_thread(g, p); pr_warn("\n"); @@ -4675,494 +4592,3 @@ void lockdep_rcu_suspicious(const char *file, const int line, const char *s) dump_stack(); } EXPORT_SYMBOL_GPL(lockdep_rcu_suspicious); - -#ifdef CONFIG_LOCKDEP_CROSSRELEASE - -/* - * Crossrelease works by recording a lock history for each thread and - * connecting those historic locks that were taken after the - * wait_for_completion() in the complete() context. - * - * Task-A Task-B - * - * mutex_lock(&A); - * mutex_unlock(&A); - * - * wait_for_completion(&C); - * lock_acquire_crosslock(); - * atomic_inc_return(&cross_gen_id); - * | - * | mutex_lock(&B); - * | mutex_unlock(&B); - * | - * | complete(&C); - * `-- lock_commit_crosslock(); - * - * Which will then add a dependency between B and C. - */ - -#define xhlock(i) (current->xhlocks[(i) % MAX_XHLOCKS_NR]) - -/* - * Whenever a crosslock is held, cross_gen_id will be increased. - */ -static atomic_t cross_gen_id; /* Can be wrapped */ - -/* - * Make an entry of the ring buffer invalid. - */ -static inline void invalidate_xhlock(struct hist_lock *xhlock) -{ - /* - * Normally, xhlock->hlock.instance must be !NULL. - */ - xhlock->hlock.instance = NULL; -} - -/* - * Lock history stacks; we have 2 nested lock history stacks: - * - * HARD(IRQ) - * SOFT(IRQ) - * - * The thing is that once we complete a HARD/SOFT IRQ the future task locks - * should not depend on any of the locks observed while running the IRQ. So - * what we do is rewind the history buffer and erase all our knowledge of that - * temporal event. - */ - -void crossrelease_hist_start(enum xhlock_context_t c) -{ - struct task_struct *cur = current; - - if (!cur->xhlocks) - return; - - cur->xhlock_idx_hist[c] = cur->xhlock_idx; - cur->hist_id_save[c] = cur->hist_id; -} - -void crossrelease_hist_end(enum xhlock_context_t c) -{ - struct task_struct *cur = current; - - if (cur->xhlocks) { - unsigned int idx = cur->xhlock_idx_hist[c]; - struct hist_lock *h = &xhlock(idx); - - cur->xhlock_idx = idx; - - /* Check if the ring was overwritten. */ - if (h->hist_id != cur->hist_id_save[c]) - invalidate_xhlock(h); - } -} - -/* - * lockdep_invariant_state() is used to annotate independence inside a task, to - * make one task look like multiple independent 'tasks'. - * - * Take for instance workqueues; each work is independent of the last. The - * completion of a future work does not depend on the completion of a past work - * (in general). Therefore we must not carry that (lock) dependency across - * works. - * - * This is true for many things; pretty much all kthreads fall into this - * pattern, where they have an invariant state and future completions do not - * depend on past completions. Its just that since they all have the 'same' - * form -- the kthread does the same over and over -- it doesn't typically - * matter. - * - * The same is true for system-calls, once a system call is completed (we've - * returned to userspace) the next system call does not depend on the lock - * history of the previous system call. - * - * They key property for independence, this invariant state, is that it must be - * a point where we hold no locks and have no history. Because if we were to - * hold locks, the restore at _end() would not necessarily recover it's history - * entry. Similarly, independence per-definition means it does not depend on - * prior state. - */ -void lockdep_invariant_state(bool force) -{ - /* - * We call this at an invariant point, no current state, no history. - * Verify the former, enforce the latter. - */ - WARN_ON_ONCE(!force && current->lockdep_depth); - invalidate_xhlock(&xhlock(current->xhlock_idx)); -} - -static int cross_lock(struct lockdep_map *lock) -{ - return lock ? lock->cross : 0; -} - -/* - * This is needed to decide the relationship between wrapable variables. - */ -static inline int before(unsigned int a, unsigned int b) -{ - return (int)(a - b) < 0; -} - -static inline struct lock_class *xhlock_class(struct hist_lock *xhlock) -{ - return hlock_class(&xhlock->hlock); -} - -static inline struct lock_class *xlock_class(struct cross_lock *xlock) -{ - return hlock_class(&xlock->hlock); -} - -/* - * Should we check a dependency with previous one? - */ -static inline int depend_before(struct held_lock *hlock) -{ - return hlock->read != 2 && hlock->check && !hlock->trylock; -} - -/* - * Should we check a dependency with next one? - */ -static inline int depend_after(struct held_lock *hlock) -{ - return hlock->read != 2 && hlock->check; -} - -/* - * Check if the xhlock is valid, which would be false if, - * - * 1. Has not used after initializaion yet. - * 2. Got invalidated. - * - * Remind hist_lock is implemented as a ring buffer. - */ -static inline int xhlock_valid(struct hist_lock *xhlock) -{ - /* - * xhlock->hlock.instance must be !NULL. - */ - return !!xhlock->hlock.instance; -} - -/* - * Record a hist_lock entry. - * - * Irq disable is only required. - */ -static void add_xhlock(struct held_lock *hlock) -{ - unsigned int idx = ++current->xhlock_idx; - struct hist_lock *xhlock = &xhlock(idx); - -#ifdef CONFIG_DEBUG_LOCKDEP - /* - * This can be done locklessly because they are all task-local - * state, we must however ensure IRQs are disabled. - */ - WARN_ON_ONCE(!irqs_disabled()); -#endif - - /* Initialize hist_lock's members */ - xhlock->hlock = *hlock; - xhlock->hist_id = ++current->hist_id; - - xhlock->trace.nr_entries = 0; - xhlock->trace.max_entries = MAX_XHLOCK_TRACE_ENTRIES; - xhlock->trace.entries = xhlock->trace_entries; - - if (crossrelease_fullstack) { - xhlock->trace.skip = 3; - save_stack_trace(&xhlock->trace); - } else { - xhlock->trace.nr_entries = 1; - xhlock->trace.entries[0] = hlock->acquire_ip; - } -} - -static inline int same_context_xhlock(struct hist_lock *xhlock) -{ - return xhlock->hlock.irq_context == task_irq_context(current); -} - -/* - * This should be lockless as far as possible because this would be - * called very frequently. - */ -static void check_add_xhlock(struct held_lock *hlock) -{ - /* - * Record a hist_lock, only in case that acquisitions ahead - * could depend on the held_lock. For example, if the held_lock - * is trylock then acquisitions ahead never depends on that. - * In that case, we don't need to record it. Just return. - */ - if (!current->xhlocks || !depend_before(hlock)) - return; - - add_xhlock(hlock); -} - -/* - * For crosslock. - */ -static int add_xlock(struct held_lock *hlock) -{ - struct cross_lock *xlock; - unsigned int gen_id; - - if (!graph_lock()) - return 0; - - xlock = &((struct lockdep_map_cross *)hlock->instance)->xlock; - - /* - * When acquisitions for a crosslock are overlapped, we use - * nr_acquire to perform commit for them, based on cross_gen_id - * of the first acquisition, which allows to add additional - * dependencies. - * - * Moreover, when no acquisition of a crosslock is in progress, - * we should not perform commit because the lock might not exist - * any more, which might cause incorrect memory access. So we - * have to track the number of acquisitions of a crosslock. - * - * depend_after() is necessary to initialize only the first - * valid xlock so that the xlock can be used on its commit. - */ - if (xlock->nr_acquire++ && depend_after(&xlock->hlock)) - goto unlock; - - gen_id = (unsigned int)atomic_inc_return(&cross_gen_id); - xlock->hlock = *hlock; - xlock->hlock.gen_id = gen_id; -unlock: - graph_unlock(); - return 1; -} - -/* - * Called for both normal and crosslock acquires. Normal locks will be - * pushed on the hist_lock queue. Cross locks will record state and - * stop regular lock_acquire() to avoid being placed on the held_lock - * stack. - * - * Return: 0 - failure; - * 1 - crosslock, done; - * 2 - normal lock, continue to held_lock[] ops. - */ -static int lock_acquire_crosslock(struct held_lock *hlock) -{ - /* - * CONTEXT 1 CONTEXT 2 - * --------- --------- - * lock A (cross) - * X = atomic_inc_return(&cross_gen_id) - * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - * Y = atomic_read_acquire(&cross_gen_id) - * lock B - * - * atomic_read_acquire() is for ordering between A and B, - * IOW, A happens before B, when CONTEXT 2 see Y >= X. - * - * Pairs with atomic_inc_return() in add_xlock(). - */ - hlock->gen_id = (unsigned int)atomic_read_acquire(&cross_gen_id); - - if (cross_lock(hlock->instance)) - return add_xlock(hlock); - - check_add_xhlock(hlock); - return 2; -} - -static int copy_trace(struct stack_trace *trace) -{ - unsigned long *buf = stack_trace + nr_stack_trace_entries; - unsigned int max_nr = MAX_STACK_TRACE_ENTRIES - nr_stack_trace_entries; - unsigned int nr = min(max_nr, trace->nr_entries); - - trace->nr_entries = nr; - memcpy(buf, trace->entries, nr * sizeof(trace->entries[0])); - trace->entries = buf; - nr_stack_trace_entries += nr; - - if (nr_stack_trace_entries >= MAX_STACK_TRACE_ENTRIES-1) { - if (!debug_locks_off_graph_unlock()) - return 0; - - print_lockdep_off("BUG: MAX_STACK_TRACE_ENTRIES too low!"); - dump_stack(); - - return 0; - } - - return 1; -} - -static int commit_xhlock(struct cross_lock *xlock, struct hist_lock *xhlock) -{ - unsigned int xid, pid; - u64 chain_key; - - xid = xlock_class(xlock) - lock_classes; - chain_key = iterate_chain_key((u64)0, xid); - pid = xhlock_class(xhlock) - lock_classes; - chain_key = iterate_chain_key(chain_key, pid); - - if (lookup_chain_cache(chain_key)) - return 1; - - if (!add_chain_cache_classes(xid, pid, xhlock->hlock.irq_context, - chain_key)) - return 0; - - if (!check_prev_add(current, &xlock->hlock, &xhlock->hlock, 1, - &xhlock->trace, copy_trace)) - return 0; - - return 1; -} - -static void commit_xhlocks(struct cross_lock *xlock) -{ - unsigned int cur = current->xhlock_idx; - unsigned int prev_hist_id = xhlock(cur).hist_id; - unsigned int i; - - if (!graph_lock()) - return; - - if (xlock->nr_acquire) { - for (i = 0; i < MAX_XHLOCKS_NR; i++) { - struct hist_lock *xhlock = &xhlock(cur - i); - - if (!xhlock_valid(xhlock)) - break; - - if (before(xhlock->hlock.gen_id, xlock->hlock.gen_id)) - break; - - if (!same_context_xhlock(xhlock)) - break; - - /* - * Filter out the cases where the ring buffer was - * overwritten and the current entry has a bigger - * hist_id than the previous one, which is impossible - * otherwise: - */ - if (unlikely(before(prev_hist_id, xhlock->hist_id))) - break; - - prev_hist_id = xhlock->hist_id; - - /* - * commit_xhlock() returns 0 with graph_lock already - * released if fail. - */ - if (!commit_xhlock(xlock, xhlock)) - return; - } - } - - graph_unlock(); -} - -void lock_commit_crosslock(struct lockdep_map *lock) -{ - struct cross_lock *xlock; - unsigned long flags; - - if (unlikely(!debug_locks || current->lockdep_recursion)) - return; - - if (!current->xhlocks) - return; - - /* - * Do commit hist_locks with the cross_lock, only in case that - * the cross_lock could depend on acquisitions after that. - * - * For example, if the cross_lock does not have the 'check' flag - * then we don't need to check dependencies and commit for that. - * Just skip it. In that case, of course, the cross_lock does - * not depend on acquisitions ahead, either. - * - * WARNING: Don't do that in add_xlock() in advance. When an - * acquisition context is different from the commit context, - * invalid(skipped) cross_lock might be accessed. - */ - if (!depend_after(&((struct lockdep_map_cross *)lock)->xlock.hlock)) - return; - - raw_local_irq_save(flags); - check_flags(flags); - current->lockdep_recursion = 1; - xlock = &((struct lockdep_map_cross *)lock)->xlock; - commit_xhlocks(xlock); - current->lockdep_recursion = 0; - raw_local_irq_restore(flags); -} -EXPORT_SYMBOL_GPL(lock_commit_crosslock); - -/* - * Return: 0 - failure; - * 1 - crosslock, done; - * 2 - normal lock, continue to held_lock[] ops. - */ -static int lock_release_crosslock(struct lockdep_map *lock) -{ - if (cross_lock(lock)) { - if (!graph_lock()) - return 0; - ((struct lockdep_map_cross *)lock)->xlock.nr_acquire--; - graph_unlock(); - return 1; - } - return 2; -} - -static void cross_init(struct lockdep_map *lock, int cross) -{ - if (cross) - ((struct lockdep_map_cross *)lock)->xlock.nr_acquire = 0; - - lock->cross = cross; - - /* - * Crossrelease assumes that the ring buffer size of xhlocks - * is aligned with power of 2. So force it on build. - */ - BUILD_BUG_ON(MAX_XHLOCKS_NR & (MAX_XHLOCKS_NR - 1)); -} - -void lockdep_init_task(struct task_struct *task) -{ - int i; - - task->xhlock_idx = UINT_MAX; - task->hist_id = 0; - - for (i = 0; i < XHLOCK_CTX_NR; i++) { - task->xhlock_idx_hist[i] = UINT_MAX; - task->hist_id_save[i] = 0; - } - - task->xhlocks = kzalloc(sizeof(struct hist_lock) * MAX_XHLOCKS_NR, - GFP_KERNEL); -} - -void lockdep_free_task(struct task_struct *task) -{ - if (task->xhlocks) { - void *tmp = task->xhlocks; - /* Diable crossrelease for current */ - task->xhlocks = NULL; - kfree(tmp); - } -} -#endif diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c index f24582d4dad3..6850ffd69125 100644 --- a/kernel/locking/locktorture.c +++ b/kernel/locking/locktorture.c @@ -77,10 +77,6 @@ struct lock_stress_stats { long n_lock_acquired; }; -int torture_runnable = IS_ENABLED(MODULE); -module_param(torture_runnable, int, 0444); -MODULE_PARM_DESC(torture_runnable, "Start locktorture at module init"); - /* Forward reference. */ static void lock_torture_cleanup(void); @@ -130,10 +126,8 @@ static void torture_lock_busted_write_delay(struct torture_random_state *trsp) if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 2000 * longdelay_ms))) mdelay(longdelay_ms); -#ifdef CONFIG_PREEMPT if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 20000))) - preempt_schedule(); /* Allow test to be preempted. */ -#endif + torture_preempt_schedule(); /* Allow test to be preempted. */ } static void torture_lock_busted_write_unlock(void) @@ -179,10 +173,8 @@ static void torture_spin_lock_write_delay(struct torture_random_state *trsp) if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 2 * shortdelay_us))) udelay(shortdelay_us); -#ifdef CONFIG_PREEMPT if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 20000))) - preempt_schedule(); /* Allow test to be preempted. */ -#endif + torture_preempt_schedule(); /* Allow test to be preempted. */ } static void torture_spin_lock_write_unlock(void) __releases(torture_spinlock) @@ -352,10 +344,8 @@ static void torture_mutex_delay(struct torture_random_state *trsp) mdelay(longdelay_ms * 5); else mdelay(longdelay_ms / 5); -#ifdef CONFIG_PREEMPT if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 20000))) - preempt_schedule(); /* Allow test to be preempted. */ -#endif + torture_preempt_schedule(); /* Allow test to be preempted. */ } static void torture_mutex_unlock(void) __releases(torture_mutex) @@ -507,10 +497,8 @@ static void torture_rtmutex_delay(struct torture_random_state *trsp) if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 2 * shortdelay_us))) udelay(shortdelay_us); -#ifdef CONFIG_PREEMPT if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 20000))) - preempt_schedule(); /* Allow test to be preempted. */ -#endif + torture_preempt_schedule(); /* Allow test to be preempted. */ } static void torture_rtmutex_unlock(void) __releases(torture_rtmutex) @@ -547,10 +535,8 @@ static void torture_rwsem_write_delay(struct torture_random_state *trsp) mdelay(longdelay_ms * 10); else mdelay(longdelay_ms / 10); -#ifdef CONFIG_PREEMPT if (!(torture_random(trsp) % (cxt.nrealwriters_stress * 20000))) - preempt_schedule(); /* Allow test to be preempted. */ -#endif + torture_preempt_schedule(); /* Allow test to be preempted. */ } static void torture_rwsem_up_write(void) __releases(torture_rwsem) @@ -570,14 +556,12 @@ static void torture_rwsem_read_delay(struct torture_random_state *trsp) /* We want a long delay occasionally to force massive contention. */ if (!(torture_random(trsp) % - (cxt.nrealwriters_stress * 2000 * longdelay_ms))) + (cxt.nrealreaders_stress * 2000 * longdelay_ms))) mdelay(longdelay_ms * 2); else mdelay(longdelay_ms / 2); -#ifdef CONFIG_PREEMPT if (!(torture_random(trsp) % (cxt.nrealreaders_stress * 20000))) - preempt_schedule(); /* Allow test to be preempted. */ -#endif + torture_preempt_schedule(); /* Allow test to be preempted. */ } static void torture_rwsem_up_read(void) __releases(torture_rwsem) @@ -715,8 +699,7 @@ static void __torture_print_stats(char *page, { bool fail = 0; int i, n_stress; - long max = 0; - long min = statp[0].n_lock_acquired; + long max = 0, min = statp ? statp[0].n_lock_acquired : 0; long long sum = 0; n_stress = write ? cxt.nrealwriters_stress : cxt.nrealreaders_stress; @@ -823,7 +806,7 @@ static void lock_torture_cleanup(void) * such, only perform the underlying torture-specific cleanups, * and avoid anything related to locktorture. */ - if (!cxt.lwsa) + if (!cxt.lwsa && !cxt.lrsa) goto end; if (writer_tasks) { @@ -879,7 +862,7 @@ static int __init lock_torture_init(void) &percpu_rwsem_lock_ops, }; - if (!torture_init_begin(torture_type, verbose, &torture_runnable)) + if (!torture_init_begin(torture_type, verbose)) return -EBUSY; /* Process args and tell the world that the torturer is on the job. */ @@ -898,6 +881,13 @@ static int __init lock_torture_init(void) firsterr = -EINVAL; goto unwind; } + + if (nwriters_stress == 0 && nreaders_stress == 0) { + pr_alert("lock-torture: must run at least one locking thread\n"); + firsterr = -EINVAL; + goto unwind; + } + if (cxt.cur_ops->init) cxt.cur_ops->init(); @@ -921,17 +911,19 @@ static int __init lock_torture_init(void) #endif /* Initialize the statistics so that each run gets its own numbers. */ + if (nwriters_stress) { + lock_is_write_held = 0; + cxt.lwsa = kmalloc(sizeof(*cxt.lwsa) * cxt.nrealwriters_stress, GFP_KERNEL); + if (cxt.lwsa == NULL) { + VERBOSE_TOROUT_STRING("cxt.lwsa: Out of memory"); + firsterr = -ENOMEM; + goto unwind; + } - lock_is_write_held = 0; - cxt.lwsa = kmalloc(sizeof(*cxt.lwsa) * cxt.nrealwriters_stress, GFP_KERNEL); - if (cxt.lwsa == NULL) { - VERBOSE_TOROUT_STRING("cxt.lwsa: Out of memory"); - firsterr = -ENOMEM; - goto unwind; - } - for (i = 0; i < cxt.nrealwriters_stress; i++) { - cxt.lwsa[i].n_lock_fail = 0; - cxt.lwsa[i].n_lock_acquired = 0; + for (i = 0; i < cxt.nrealwriters_stress; i++) { + cxt.lwsa[i].n_lock_fail = 0; + cxt.lwsa[i].n_lock_acquired = 0; + } } if (cxt.cur_ops->readlock) { @@ -948,19 +940,21 @@ static int __init lock_torture_init(void) cxt.nrealreaders_stress = cxt.nrealwriters_stress; } - lock_is_read_held = 0; - cxt.lrsa = kmalloc(sizeof(*cxt.lrsa) * cxt.nrealreaders_stress, GFP_KERNEL); - if (cxt.lrsa == NULL) { - VERBOSE_TOROUT_STRING("cxt.lrsa: Out of memory"); - firsterr = -ENOMEM; - kfree(cxt.lwsa); - cxt.lwsa = NULL; - goto unwind; - } - - for (i = 0; i < cxt.nrealreaders_stress; i++) { - cxt.lrsa[i].n_lock_fail = 0; - cxt.lrsa[i].n_lock_acquired = 0; + if (nreaders_stress) { + lock_is_read_held = 0; + cxt.lrsa = kmalloc(sizeof(*cxt.lrsa) * cxt.nrealreaders_stress, GFP_KERNEL); + if (cxt.lrsa == NULL) { + VERBOSE_TOROUT_STRING("cxt.lrsa: Out of memory"); + firsterr = -ENOMEM; + kfree(cxt.lwsa); + cxt.lwsa = NULL; + goto unwind; + } + + for (i = 0; i < cxt.nrealreaders_stress; i++) { + cxt.lrsa[i].n_lock_fail = 0; + cxt.lrsa[i].n_lock_acquired = 0; + } } } @@ -990,12 +984,14 @@ static int __init lock_torture_init(void) goto unwind; } - writer_tasks = kzalloc(cxt.nrealwriters_stress * sizeof(writer_tasks[0]), - GFP_KERNEL); - if (writer_tasks == NULL) { - VERBOSE_TOROUT_ERRSTRING("writer_tasks: Out of memory"); - firsterr = -ENOMEM; - goto unwind; + if (nwriters_stress) { + writer_tasks = kzalloc(cxt.nrealwriters_stress * sizeof(writer_tasks[0]), + GFP_KERNEL); + if (writer_tasks == NULL) { + VERBOSE_TOROUT_ERRSTRING("writer_tasks: Out of memory"); + firsterr = -ENOMEM; + goto unwind; + } } if (cxt.cur_ops->readlock) { diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c index 294294c71ba4..d880296245c5 100644 --- a/kernel/locking/qspinlock.c +++ b/kernel/locking/qspinlock.c @@ -170,7 +170,7 @@ static __always_inline void clear_pending_set_locked(struct qspinlock *lock) * @tail : The new queue tail code word * Return: The previous queue tail code word * - * xchg(lock, tail) + * xchg(lock, tail), which heads an address dependency * * p,*,* -> n,*,* ; prev = xchg(lock, node) */ @@ -379,6 +379,14 @@ queue: tail = encode_tail(smp_processor_id(), idx); node += idx; + + /* + * Ensure that we increment the head node->count before initialising + * the actual node. If the compiler is kind enough to reorder these + * stores, then an IRQ could overwrite our assignments. + */ + barrier(); + node->locked = 0; node->next = NULL; pv_init_node(node); @@ -408,16 +416,15 @@ queue: */ if (old & _Q_TAIL_MASK) { prev = decode_tail(old); + /* - * The above xchg_tail() is also a load of @lock which generates, - * through decode_tail(), a pointer. - * - * The address dependency matches the RELEASE of xchg_tail() - * such that the access to @prev must happen after. + * We must ensure that the stores to @node are observed before + * the write to prev->next. The address dependency from + * xchg_tail is not sufficient to ensure this because the read + * component of xchg_tail is unordered with respect to the + * initialisation of @node. */ - smp_read_barrier_depends(); - - WRITE_ONCE(prev->next, node); + smp_store_release(&prev->next, node); pv_wait_node(node, prev); arch_mcs_spin_lock_contended(&node->locked); diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 6f3dba6e4e9e..65cc0cb984e6 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -1290,6 +1290,19 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state, return ret; } +static inline int __rt_mutex_slowtrylock(struct rt_mutex *lock) +{ + int ret = try_to_take_rt_mutex(lock, current, NULL); + + /* + * try_to_take_rt_mutex() sets the lock waiters bit + * unconditionally. Clean this up. + */ + fixup_rt_mutex_waiters(lock); + + return ret; +} + /* * Slow path try-lock function: */ @@ -1312,13 +1325,7 @@ static inline int rt_mutex_slowtrylock(struct rt_mutex *lock) */ raw_spin_lock_irqsave(&lock->wait_lock, flags); - ret = try_to_take_rt_mutex(lock, current, NULL); - - /* - * try_to_take_rt_mutex() sets the lock waiters bit - * unconditionally. Clean this up. - */ - fixup_rt_mutex_waiters(lock); + ret = __rt_mutex_slowtrylock(lock); raw_spin_unlock_irqrestore(&lock->wait_lock, flags); @@ -1505,6 +1512,11 @@ int __sched rt_mutex_futex_trylock(struct rt_mutex *lock) return rt_mutex_slowtrylock(lock); } +int __sched __rt_mutex_futex_trylock(struct rt_mutex *lock) +{ + return __rt_mutex_slowtrylock(lock); +} + /** * rt_mutex_timed_lock - lock a rt_mutex interruptible * the timeout structure is provided diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h index 124e98ca0b17..68686b3ec3c1 100644 --- a/kernel/locking/rtmutex_common.h +++ b/kernel/locking/rtmutex_common.h @@ -148,6 +148,7 @@ extern bool rt_mutex_cleanup_proxy_lock(struct rt_mutex *lock, struct rt_mutex_waiter *waiter); extern int rt_mutex_futex_trylock(struct rt_mutex *l); +extern int __rt_mutex_futex_trylock(struct rt_mutex *l); extern void rt_mutex_futex_unlock(struct rt_mutex *lock); extern bool __rt_mutex_futex_unlock(struct rt_mutex *lock, diff --git a/kernel/locking/spinlock.c b/kernel/locking/spinlock.c index 1fd1a7543cdd..936f3d14dd6b 100644 --- a/kernel/locking/spinlock.c +++ b/kernel/locking/spinlock.c @@ -66,12 +66,8 @@ void __lockfunc __raw_##op##_lock(locktype##_t *lock) \ break; \ preempt_enable(); \ \ - if (!(lock)->break_lock) \ - (lock)->break_lock = 1; \ - while ((lock)->break_lock) \ - arch_##op##_relax(&lock->raw_lock); \ + arch_##op##_relax(&lock->raw_lock); \ } \ - (lock)->break_lock = 0; \ } \ \ unsigned long __lockfunc __raw_##op##_lock_irqsave(locktype##_t *lock) \ @@ -86,12 +82,9 @@ unsigned long __lockfunc __raw_##op##_lock_irqsave(locktype##_t *lock) \ local_irq_restore(flags); \ preempt_enable(); \ \ - if (!(lock)->break_lock) \ - (lock)->break_lock = 1; \ - while ((lock)->break_lock) \ - arch_##op##_relax(&lock->raw_lock); \ + arch_##op##_relax(&lock->raw_lock); \ } \ - (lock)->break_lock = 0; \ + \ return flags; \ } \ \ diff --git a/kernel/memremap.c b/kernel/memremap.c index 403ab9cdb949..4849be5f9b3c 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -188,13 +188,6 @@ static RADIX_TREE(pgmap_radix, GFP_KERNEL); #define SECTION_MASK ~((1UL << PA_SECTION_SHIFT) - 1) #define SECTION_SIZE (1UL << PA_SECTION_SHIFT) -struct page_map { - struct resource res; - struct percpu_ref *ref; - struct dev_pagemap pgmap; - struct vmem_altmap altmap; -}; - static unsigned long order_at(struct resource *res, unsigned long pgoff) { unsigned long phys_pgoff = PHYS_PFN(res->start) + pgoff; @@ -248,34 +241,36 @@ int device_private_entry_fault(struct vm_area_struct *vma, EXPORT_SYMBOL(device_private_entry_fault); #endif /* CONFIG_DEVICE_PRIVATE */ -static void pgmap_radix_release(struct resource *res) +static void pgmap_radix_release(struct resource *res, unsigned long end_pgoff) { unsigned long pgoff, order; mutex_lock(&pgmap_lock); - foreach_order_pgoff(res, order, pgoff) + foreach_order_pgoff(res, order, pgoff) { + if (pgoff >= end_pgoff) + break; radix_tree_delete(&pgmap_radix, PHYS_PFN(res->start) + pgoff); + } mutex_unlock(&pgmap_lock); synchronize_rcu(); } -static unsigned long pfn_first(struct page_map *page_map) +static unsigned long pfn_first(struct dev_pagemap *pgmap) { - struct dev_pagemap *pgmap = &page_map->pgmap; - const struct resource *res = &page_map->res; - struct vmem_altmap *altmap = pgmap->altmap; + const struct resource *res = &pgmap->res; + struct vmem_altmap *altmap = &pgmap->altmap; unsigned long pfn; pfn = res->start >> PAGE_SHIFT; - if (altmap) + if (pgmap->altmap_valid) pfn += vmem_altmap_offset(altmap); return pfn; } -static unsigned long pfn_end(struct page_map *page_map) +static unsigned long pfn_end(struct dev_pagemap *pgmap) { - const struct resource *res = &page_map->res; + const struct resource *res = &pgmap->res; return (res->start + resource_size(res)) >> PAGE_SHIFT; } @@ -283,15 +278,15 @@ static unsigned long pfn_end(struct page_map *page_map) #define for_each_device_pfn(pfn, map) \ for (pfn = pfn_first(map); pfn < pfn_end(map); pfn++) -static void devm_memremap_pages_release(struct device *dev, void *data) +static void devm_memremap_pages_release(void *data) { - struct page_map *page_map = data; - struct resource *res = &page_map->res; + struct dev_pagemap *pgmap = data; + struct device *dev = pgmap->dev; + struct resource *res = &pgmap->res; resource_size_t align_start, align_size; - struct dev_pagemap *pgmap = &page_map->pgmap; unsigned long pfn; - for_each_device_pfn(pfn, page_map) + for_each_device_pfn(pfn, pgmap) put_page(pfn_to_page(pfn)); if (percpu_ref_tryget_live(pgmap->ref)) { @@ -301,56 +296,51 @@ static void devm_memremap_pages_release(struct device *dev, void *data) /* pages are dead and unused, undo the arch mapping */ align_start = res->start & ~(SECTION_SIZE - 1); - align_size = ALIGN(resource_size(res), SECTION_SIZE); + align_size = ALIGN(res->start + resource_size(res), SECTION_SIZE) + - align_start; mem_hotplug_begin(); - arch_remove_memory(align_start, align_size); + arch_remove_memory(align_start, align_size, pgmap->altmap_valid ? + &pgmap->altmap : NULL); mem_hotplug_done(); untrack_pfn(NULL, PHYS_PFN(align_start), align_size); - pgmap_radix_release(res); - dev_WARN_ONCE(dev, pgmap->altmap && pgmap->altmap->alloc, - "%s: failed to free all reserved pages\n", __func__); -} - -/* assumes rcu_read_lock() held at entry */ -struct dev_pagemap *find_dev_pagemap(resource_size_t phys) -{ - struct page_map *page_map; - - WARN_ON_ONCE(!rcu_read_lock_held()); - - page_map = radix_tree_lookup(&pgmap_radix, PHYS_PFN(phys)); - return page_map ? &page_map->pgmap : NULL; + pgmap_radix_release(res, -1); + dev_WARN_ONCE(dev, pgmap->altmap.alloc, + "%s: failed to free all reserved pages\n", __func__); } /** * devm_memremap_pages - remap and provide memmap backing for the given resource * @dev: hosting device for @res - * @res: "host memory" address range - * @ref: a live per-cpu reference count - * @altmap: optional descriptor for allocating the memmap from @res + * @pgmap: pointer to a struct dev_pgmap * * Notes: - * 1/ @ref must be 'live' on entry and 'dead' before devm_memunmap_pages() time - * (or devm release event). The expected order of events is that @ref has + * 1/ At a minimum the res, ref and type members of @pgmap must be initialized + * by the caller before passing it to this function + * + * 2/ The altmap field may optionally be initialized, in which case altmap_valid + * must be set to true + * + * 3/ pgmap.ref must be 'live' on entry and 'dead' before devm_memunmap_pages() + * time (or devm release event). The expected order of events is that ref has * been through percpu_ref_kill() before devm_memremap_pages_release(). The * wait for the completion of all references being dropped and * percpu_ref_exit() must occur after devm_memremap_pages_release(). * - * 2/ @res is expected to be a host memory range that could feasibly be + * 4/ res is expected to be a host memory range that could feasibly be * treated as a "System RAM" range, i.e. not a device mmio range, but * this is not enforced. */ -void *devm_memremap_pages(struct device *dev, struct resource *res, - struct percpu_ref *ref, struct vmem_altmap *altmap) +void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) { resource_size_t align_start, align_size, align_end; + struct vmem_altmap *altmap = pgmap->altmap_valid ? + &pgmap->altmap : NULL; unsigned long pfn, pgoff, order; pgprot_t pgprot = PAGE_KERNEL; - struct dev_pagemap *pgmap; - struct page_map *page_map; int error, nid, is_ram, i = 0; + struct resource *res = &pgmap->res; align_start = res->start & ~(SECTION_SIZE - 1); align_size = ALIGN(res->start + resource_size(res), SECTION_SIZE) @@ -367,47 +357,18 @@ void *devm_memremap_pages(struct device *dev, struct resource *res, if (is_ram == REGION_INTERSECTS) return __va(res->start); - if (!ref) + if (!pgmap->ref) return ERR_PTR(-EINVAL); - page_map = devres_alloc_node(devm_memremap_pages_release, - sizeof(*page_map), GFP_KERNEL, dev_to_node(dev)); - if (!page_map) - return ERR_PTR(-ENOMEM); - pgmap = &page_map->pgmap; - - memcpy(&page_map->res, res, sizeof(*res)); - pgmap->dev = dev; - if (altmap) { - memcpy(&page_map->altmap, altmap, sizeof(*altmap)); - pgmap->altmap = &page_map->altmap; - } - pgmap->ref = ref; - pgmap->res = &page_map->res; - pgmap->type = MEMORY_DEVICE_HOST; - pgmap->page_fault = NULL; - pgmap->page_free = NULL; - pgmap->data = NULL; mutex_lock(&pgmap_lock); error = 0; align_end = align_start + align_size - 1; foreach_order_pgoff(res, order, pgoff) { - struct dev_pagemap *dup; - - rcu_read_lock(); - dup = find_dev_pagemap(res->start + PFN_PHYS(pgoff)); - rcu_read_unlock(); - if (dup) { - dev_err(dev, "%s: %pr collides with mapping for %s\n", - __func__, res, dev_name(dup->dev)); - error = -EBUSY; - break; - } error = __radix_tree_insert(&pgmap_radix, - PHYS_PFN(res->start) + pgoff, order, page_map); + PHYS_PFN(res->start) + pgoff, order, pgmap); if (error) { dev_err(dev, "%s: failed: %d\n", __func__, error); break; @@ -427,16 +388,16 @@ void *devm_memremap_pages(struct device *dev, struct resource *res, goto err_pfn_remap; mem_hotplug_begin(); - error = arch_add_memory(nid, align_start, align_size, false); + error = arch_add_memory(nid, align_start, align_size, altmap, false); if (!error) move_pfn_range_to_zone(&NODE_DATA(nid)->node_zones[ZONE_DEVICE], align_start >> PAGE_SHIFT, - align_size >> PAGE_SHIFT); + align_size >> PAGE_SHIFT, altmap); mem_hotplug_done(); if (error) goto err_add_memory; - for_each_device_pfn(pfn, page_map) { + for_each_device_pfn(pfn, pgmap) { struct page *page = pfn_to_page(pfn); /* @@ -447,19 +408,21 @@ void *devm_memremap_pages(struct device *dev, struct resource *res, */ list_del(&page->lru); page->pgmap = pgmap; - percpu_ref_get(ref); + percpu_ref_get(pgmap->ref); if (!(++i % 1024)) cond_resched(); } - devres_add(dev, page_map); + + devm_add_action(dev, devm_memremap_pages_release, pgmap); + return __va(res->start); err_add_memory: untrack_pfn(NULL, PHYS_PFN(align_start), align_size); err_pfn_remap: err_radix: - pgmap_radix_release(res); - devres_free(page_map); + pgmap_radix_release(res, pgoff); + devres_free(pgmap); return ERR_PTR(error); } EXPORT_SYMBOL(devm_memremap_pages); @@ -475,34 +438,39 @@ void vmem_altmap_free(struct vmem_altmap *altmap, unsigned long nr_pfns) altmap->alloc -= nr_pfns; } -struct vmem_altmap *to_vmem_altmap(unsigned long memmap_start) +/** + * get_dev_pagemap() - take a new live reference on the dev_pagemap for @pfn + * @pfn: page frame number to lookup page_map + * @pgmap: optional known pgmap that already has a reference + * + * If @pgmap is non-NULL and covers @pfn it will be returned as-is. If @pgmap + * is non-NULL but does not cover @pfn the reference to it will be released. + */ +struct dev_pagemap *get_dev_pagemap(unsigned long pfn, + struct dev_pagemap *pgmap) { - /* - * 'memmap_start' is the virtual address for the first "struct - * page" in this range of the vmemmap array. In the case of - * CONFIG_SPARSEMEM_VMEMMAP a page_to_pfn conversion is simple - * pointer arithmetic, so we can perform this to_vmem_altmap() - * conversion without concern for the initialization state of - * the struct page fields. - */ - struct page *page = (struct page *) memmap_start; - struct dev_pagemap *pgmap; + resource_size_t phys = PFN_PHYS(pfn); /* - * Unconditionally retrieve a dev_pagemap associated with the - * given physical address, this is only for use in the - * arch_{add|remove}_memory() for setting up and tearing down - * the memmap. + * In the cached case we're already holding a live reference. */ + if (pgmap) { + if (phys >= pgmap->res.start && phys <= pgmap->res.end) + return pgmap; + put_dev_pagemap(pgmap); + } + + /* fall back to slow path lookup */ rcu_read_lock(); - pgmap = find_dev_pagemap(__pfn_to_phys(page_to_pfn(page))); + pgmap = radix_tree_lookup(&pgmap_radix, PHYS_PFN(phys)); + if (pgmap && !percpu_ref_tryget_live(pgmap->ref)) + pgmap = NULL; rcu_read_unlock(); - return pgmap ? pgmap->altmap : NULL; + return pgmap; } #endif /* CONFIG_ZONE_DEVICE */ - #if IS_ENABLED(CONFIG_DEVICE_PRIVATE) || IS_ENABLED(CONFIG_DEVICE_PUBLIC) void put_zone_device_private_or_public_page(struct page *page) { diff --git a/kernel/module.c b/kernel/module.c index f0411a271765..ad2d420024f6 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2863,6 +2863,15 @@ static int check_modinfo_livepatch(struct module *mod, struct load_info *info) } #endif /* CONFIG_LIVEPATCH */ +static void check_modinfo_retpoline(struct module *mod, struct load_info *info) +{ + if (retpoline_module_ok(get_modinfo(info, "retpoline"))) + return; + + pr_warn("%s: loading module not compiled with retpoline compiler.\n", + mod->name); +} + /* Sets info->hdr and info->len. */ static int copy_module_from_user(const void __user *umod, unsigned long len, struct load_info *info) @@ -3029,6 +3038,8 @@ static int check_modinfo(struct module *mod, struct load_info *info, int flags) add_taint_module(mod, TAINT_OOT_MODULE, LOCKDEP_STILL_OK); } + check_modinfo_retpoline(mod, info); + if (get_modinfo(info, "staging")) { add_taint_module(mod, TAINT_CRAP, LOCKDEP_STILL_OK); pr_warn("%s: module is from the staging directory, the quality " @@ -3118,7 +3129,11 @@ static int find_module_sections(struct module *mod, struct load_info *info) sizeof(*mod->ftrace_callsites), &mod->num_ftrace_callsites); #endif - +#ifdef CONFIG_FUNCTION_ERROR_INJECTION + mod->ei_funcs = section_objs(info, "_error_injection_whitelist", + sizeof(*mod->ei_funcs), + &mod->num_ei_funcs); +#endif mod->extable = section_objs(info, "__ex_table", sizeof(*mod->extable), &mod->num_exentries); @@ -3789,6 +3804,7 @@ static int load_module(struct load_info *info, const char __user *uargs, module_disable_nx(mod); ddebug_cleanup: + ftrace_release_mod(mod); dynamic_debug_remove(mod, info->debug); synchronize_sched(); kfree(mod->args); @@ -3808,12 +3824,6 @@ static int load_module(struct load_info *info, const char __user *uargs, synchronize_sched(); mutex_unlock(&module_mutex); free_module: - /* - * Ftrace needs to clean up what it initialized. - * This does nothing if ftrace_module_init() wasn't called, - * but it must be called outside of module_mutex. - */ - ftrace_release_mod(mod); /* Free lock-classes; relies on the preceding sync_rcu() */ lockdep_free_key_range(mod->core_layout.base, mod->core_layout.size); @@ -3938,6 +3948,12 @@ static const char *get_ksymbol(struct module *mod, return symname(kallsyms, best); } +void * __weak dereference_module_function_descriptor(struct module *mod, + void *ptr) +{ + return ptr; +} + /* For kallsyms to ask for address resolution. NULL means not found. Careful * not to lock to avoid deadlock on oopses, simply disable preemption. */ const char *module_address_lookup(unsigned long addr, @@ -4157,7 +4173,7 @@ static int m_show(struct seq_file *m, void *p) { struct module *mod = list_entry(p, struct module, list); char buf[MODULE_FLAGS_BUF_SIZE]; - unsigned long value; + void *value; /* We always ignore unformed modules. */ if (mod->state == MODULE_STATE_UNFORMED) @@ -4173,8 +4189,8 @@ static int m_show(struct seq_file *m, void *p) mod->state == MODULE_STATE_COMING ? "Loading" : "Live"); /* Used by oprofile and other similar tools. */ - value = m->private ? 0 : (unsigned long)mod->core_layout.base; - seq_printf(m, " 0x" KALLSYM_FMT, value); + value = m->private ? NULL : mod->core_layout.base; + seq_printf(m, " 0x%px", value); /* Taints info */ if (mod->taints) diff --git a/kernel/padata.c b/kernel/padata.c index 57c0074d50cc..d568cc56405f 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * padata.c - generic interface to process data streams in parallel * diff --git a/kernel/pid.c b/kernel/pid.c index b13b624e2c49..ed6c343fe50d 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -41,7 +41,19 @@ #include <linux/sched/task.h> #include <linux/idr.h> -struct pid init_struct_pid = INIT_STRUCT_PID; +struct pid init_struct_pid = { + .count = ATOMIC_INIT(1), + .tasks = { + { .first = NULL }, + { .first = NULL }, + { .first = NULL }, + }, + .level = 0, + .numbers = { { + .nr = 0, + .ns = &init_pid_ns, + }, } +}; int pid_max = PID_MAX_DEFAULT; @@ -193,10 +205,8 @@ struct pid *alloc_pid(struct pid_namespace *ns) } if (unlikely(is_child_reaper(pid))) { - if (pid_ns_prepare_proc(ns)) { - disable_pid_allocation(ns); + if (pid_ns_prepare_proc(ns)) goto out_free; - } } get_pid_ns(ns); @@ -226,6 +236,10 @@ out_free: while (++i <= ns->level) idr_remove(&ns->idr, (pid->numbers + i)->nr); + /* On failure to allocate the first pid, reset the state */ + if (ns->pid_allocated == PIDNS_ADDING) + idr_set_cursor(&ns->idr, 0); + spin_unlock_irq(&pidmap_lock); kmem_cache_free(ns->pid_cachep, pid); @@ -329,6 +343,19 @@ struct task_struct *find_task_by_vpid(pid_t vnr) return find_task_by_pid_ns(vnr, task_active_pid_ns(current)); } +struct task_struct *find_get_task_by_vpid(pid_t nr) +{ + struct task_struct *task; + + rcu_read_lock(); + task = find_task_by_vpid(nr); + if (task) + get_task_struct(task); + rcu_read_unlock(); + + return task; +} + struct pid *get_task_pid(struct task_struct *task, enum pid_type type) { struct pid *pid; diff --git a/kernel/power/main.c b/kernel/power/main.c index 3a2ca9066583..705c2366dafe 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -22,6 +22,35 @@ DEFINE_MUTEX(pm_mutex); #ifdef CONFIG_PM_SLEEP +void lock_system_sleep(void) +{ + current->flags |= PF_FREEZER_SKIP; + mutex_lock(&pm_mutex); +} +EXPORT_SYMBOL_GPL(lock_system_sleep); + +void unlock_system_sleep(void) +{ + /* + * Don't use freezer_count() because we don't want the call to + * try_to_freeze() here. + * + * Reason: + * Fundamentally, we just don't need it, because freezing condition + * doesn't come into effect until we release the pm_mutex lock, + * since the freezer always works with pm_mutex held. + * + * More importantly, in the case of hibernation, + * unlock_system_sleep() gets called in snapshot_read() and + * snapshot_write() when the freezing condition is still in effect. + * Which means, if we use try_to_freeze() here, it would make them + * enter the refrigerator, thus causing hibernation to lockup. + */ + current->flags &= ~PF_FREEZER_SKIP; + mutex_unlock(&pm_mutex); +} +EXPORT_SYMBOL_GPL(unlock_system_sleep); + /* Routines for PM-transition notifications */ static BLOCKING_NOTIFIER_HEAD(pm_chain_head); diff --git a/kernel/power/power.h b/kernel/power/power.h index f29cd178df90..9e58bdc8a562 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -104,9 +104,6 @@ extern int in_suspend; extern dev_t swsusp_resume_device; extern sector_t swsusp_resume_block; -extern asmlinkage int swsusp_arch_suspend(void); -extern asmlinkage int swsusp_arch_resume(void); - extern int create_basic_memory_bitmaps(void); extern void free_basic_memory_bitmaps(void); extern int hibernate_preallocate_memory(void); diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index bce0464524d8..3d37c279c090 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -1645,8 +1645,7 @@ static unsigned long free_unnecessary_pages(void) * [number of saveable pages] - [number of pages that can be freed in theory] * * where the second term is the sum of (1) reclaimable slab pages, (2) active - * and (3) inactive anonymous pages, (4) active and (5) inactive file pages, - * minus mapped file pages. + * and (3) inactive anonymous pages, (4) active and (5) inactive file pages. */ static unsigned long minimum_image_size(unsigned long saveable) { @@ -1656,8 +1655,7 @@ static unsigned long minimum_image_size(unsigned long saveable) + global_node_page_state(NR_ACTIVE_ANON) + global_node_page_state(NR_INACTIVE_ANON) + global_node_page_state(NR_ACTIVE_FILE) - + global_node_page_state(NR_INACTIVE_FILE) - - global_node_page_state(NR_FILE_MAPPED); + + global_node_page_state(NR_INACTIVE_FILE); return saveable <= size ? 0 : saveable - size; } diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 293ead59eccc..11b4282c2d20 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -240,7 +240,7 @@ static void hib_init_batch(struct hib_bio_batch *hb) static void hib_end_io(struct bio *bio) { struct hib_bio_batch *hb = bio->bi_private; - struct page *page = bio->bi_io_vec[0].bv_page; + struct page *page = bio_first_page_all(bio); if (bio->bi_status) { pr_alert("Read-error on swap-device (%u:%u:%Lu)\n", @@ -879,7 +879,7 @@ out_clean: * space avaiable from the resume partition. */ -static int enough_swap(unsigned int nr_pages, unsigned int flags) +static int enough_swap(unsigned int nr_pages) { unsigned int free_swap = count_swap_pages(root_swap, 1); unsigned int required; @@ -915,7 +915,7 @@ int swsusp_write(unsigned int flags) return error; } if (flags & SF_NOCOMPRESS_MODE) { - if (!enough_swap(pages, flags)) { + if (!enough_swap(pages)) { pr_err("Not enough free swap\n"); error = -ENOSPC; goto out_finish; diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 5d81206a572d..fc1123583fa6 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -131,13 +131,10 @@ static int __init control_devkmsg(char *str) /* * Set sysctl string accordingly: */ - if (devkmsg_log == DEVKMSG_LOG_MASK_ON) { - memset(devkmsg_log_str, 0, DEVKMSG_STR_MAX_SIZE); - strncpy(devkmsg_log_str, "on", 2); - } else if (devkmsg_log == DEVKMSG_LOG_MASK_OFF) { - memset(devkmsg_log_str, 0, DEVKMSG_STR_MAX_SIZE); - strncpy(devkmsg_log_str, "off", 3); - } + if (devkmsg_log == DEVKMSG_LOG_MASK_ON) + strcpy(devkmsg_log_str, "on"); + else if (devkmsg_log == DEVKMSG_LOG_MASK_OFF) + strcpy(devkmsg_log_str, "off"); /* else "ratelimit" which is set by default. */ /* @@ -277,6 +274,13 @@ EXPORT_SYMBOL(console_set_on_cmdline); /* Flag: console code may call schedule() */ static int console_may_schedule; +enum con_msg_format_flags { + MSG_FORMAT_DEFAULT = 0, + MSG_FORMAT_SYSLOG = (1 << 0), +}; + +static int console_msg_format = MSG_FORMAT_DEFAULT; + /* * The printk log buffer consists of a chain of concatenated variable * length records. Every record starts with a record header, containing @@ -920,13 +924,13 @@ static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence) return ret; } -static unsigned int devkmsg_poll(struct file *file, poll_table *wait) +static __poll_t devkmsg_poll(struct file *file, poll_table *wait) { struct devkmsg_user *user = file->private_data; - int ret = 0; + __poll_t ret = 0; if (!user) - return POLLERR|POLLNVAL; + return EPOLLERR|EPOLLNVAL; poll_wait(file, &log_wait, wait); @@ -934,9 +938,9 @@ static unsigned int devkmsg_poll(struct file *file, poll_table *wait) if (user->seq < log_next_seq) { /* return error when data has vanished underneath us */ if (user->seq < log_first_seq) - ret = POLLIN|POLLRDNORM|POLLERR|POLLPRI; + ret = EPOLLIN|EPOLLRDNORM|EPOLLERR|EPOLLPRI; else - ret = POLLIN|POLLRDNORM; + ret = EPOLLIN|EPOLLRDNORM; } logbuf_unlock_irq(); @@ -1544,6 +1548,146 @@ SYSCALL_DEFINE3(syslog, int, type, char __user *, buf, int, len) } /* + * Special console_lock variants that help to reduce the risk of soft-lockups. + * They allow to pass console_lock to another printk() call using a busy wait. + */ + +#ifdef CONFIG_LOCKDEP +static struct lockdep_map console_owner_dep_map = { + .name = "console_owner" +}; +#endif + +static DEFINE_RAW_SPINLOCK(console_owner_lock); +static struct task_struct *console_owner; +static bool console_waiter; + +/** + * console_lock_spinning_enable - mark beginning of code where another + * thread might safely busy wait + * + * This basically converts console_lock into a spinlock. This marks + * the section where the console_lock owner can not sleep, because + * there may be a waiter spinning (like a spinlock). Also it must be + * ready to hand over the lock at the end of the section. + */ +static void console_lock_spinning_enable(void) +{ + raw_spin_lock(&console_owner_lock); + console_owner = current; + raw_spin_unlock(&console_owner_lock); + + /* The waiter may spin on us after setting console_owner */ + spin_acquire(&console_owner_dep_map, 0, 0, _THIS_IP_); +} + +/** + * console_lock_spinning_disable_and_check - mark end of code where another + * thread was able to busy wait and check if there is a waiter + * + * This is called at the end of the section where spinning is allowed. + * It has two functions. First, it is a signal that it is no longer + * safe to start busy waiting for the lock. Second, it checks if + * there is a busy waiter and passes the lock rights to her. + * + * Important: Callers lose the lock if there was a busy waiter. + * They must not touch items synchronized by console_lock + * in this case. + * + * Return: 1 if the lock rights were passed, 0 otherwise. + */ +static int console_lock_spinning_disable_and_check(void) +{ + int waiter; + + raw_spin_lock(&console_owner_lock); + waiter = READ_ONCE(console_waiter); + console_owner = NULL; + raw_spin_unlock(&console_owner_lock); + + if (!waiter) { + spin_release(&console_owner_dep_map, 1, _THIS_IP_); + return 0; + } + + /* The waiter is now free to continue */ + WRITE_ONCE(console_waiter, false); + + spin_release(&console_owner_dep_map, 1, _THIS_IP_); + + /* + * Hand off console_lock to waiter. The waiter will perform + * the up(). After this, the waiter is the console_lock owner. + */ + mutex_release(&console_lock_dep_map, 1, _THIS_IP_); + return 1; +} + +/** + * console_trylock_spinning - try to get console_lock by busy waiting + * + * This allows to busy wait for the console_lock when the current + * owner is running in specially marked sections. It means that + * the current owner is running and cannot reschedule until it + * is ready to lose the lock. + * + * Return: 1 if we got the lock, 0 othrewise + */ +static int console_trylock_spinning(void) +{ + struct task_struct *owner = NULL; + bool waiter; + bool spin = false; + unsigned long flags; + + if (console_trylock()) + return 1; + + printk_safe_enter_irqsave(flags); + + raw_spin_lock(&console_owner_lock); + owner = READ_ONCE(console_owner); + waiter = READ_ONCE(console_waiter); + if (!waiter && owner && owner != current) { + WRITE_ONCE(console_waiter, true); + spin = true; + } + raw_spin_unlock(&console_owner_lock); + + /* + * If there is an active printk() writing to the + * consoles, instead of having it write our data too, + * see if we can offload that load from the active + * printer, and do some printing ourselves. + * Go into a spin only if there isn't already a waiter + * spinning, and there is an active printer, and + * that active printer isn't us (recursive printk?). + */ + if (!spin) { + printk_safe_exit_irqrestore(flags); + return 0; + } + + /* We spin waiting for the owner to release us */ + spin_acquire(&console_owner_dep_map, 0, 0, _THIS_IP_); + /* Owner will clear console_waiter on hand off */ + while (READ_ONCE(console_waiter)) + cpu_relax(); + spin_release(&console_owner_dep_map, 1, _THIS_IP_); + + printk_safe_exit_irqrestore(flags); + /* + * The owner passed the console lock to us. + * Since we did not spin on console lock, annotate + * this as a trylock. Otherwise lockdep will + * complain. + */ + mutex_acquire(&console_lock_dep_map, 0, 1, _THIS_IP_); + + return 1; +} + +/* * Call the console drivers, asking them to write out * log_buf[start] to log_buf[end - 1]. * The console_lock must be held. @@ -1749,12 +1893,19 @@ asmlinkage int vprintk_emit(int facility, int level, /* If called from the scheduler, we can not call up(). */ if (!in_sched) { /* + * Disable preemption to avoid being preempted while holding + * console_sem which would prevent anyone from printing to + * console + */ + preempt_disable(); + /* * Try to acquire and then immediately release the console * semaphore. The release will print out buffers and wake up * /dev/kmsg and syslog() users. */ - if (console_trylock()) + if (console_trylock_spinning()) console_unlock(); + preempt_enable(); } return printed_len; @@ -1855,6 +2006,8 @@ static ssize_t msg_print_ext_header(char *buf, size_t size, static ssize_t msg_print_ext_body(char *buf, size_t size, char *dict, size_t dict_len, char *text, size_t text_len) { return 0; } +static void console_lock_spinning_enable(void) { } +static int console_lock_spinning_disable_and_check(void) { return 0; } static void call_console_drivers(const char *ext_text, size_t ext_len, const char *text, size_t len) {} static size_t msg_print_text(const struct printk_log *msg, @@ -1913,6 +2066,17 @@ static int __add_preferred_console(char *name, int idx, char *options, c->index = idx; return 0; } + +static int __init console_msg_format_setup(char *str) +{ + if (!strcmp(str, "syslog")) + console_msg_format = MSG_FORMAT_SYSLOG; + if (!strcmp(str, "default")) + console_msg_format = MSG_FORMAT_DEFAULT; + return 1; +} +__setup("console_msg_format=", console_msg_format_setup); + /* * Set up a console. Called via do_early_param() in init/main.c * for each "console=" parameter in the boot command line. @@ -2069,20 +2233,7 @@ int console_trylock(void) return 0; } console_locked = 1; - /* - * When PREEMPT_COUNT disabled we can't reliably detect if it's - * safe to schedule (e.g. calling printk while holding a spin_lock), - * because preempt_disable()/preempt_enable() are just barriers there - * and preempt_count() is always 0. - * - * RCU read sections have a separate preemption counter when - * PREEMPT_RCU enabled thus we must take extra care and check - * rcu_preempt_depth(), otherwise RCU read sections modify - * preempt_count(). - */ - console_may_schedule = !oops_in_progress && - preemptible() && - !rcu_preempt_depth(); + console_may_schedule = 0; return 1; } EXPORT_SYMBOL(console_trylock); @@ -2215,7 +2366,10 @@ skip: goto skip; } - len += msg_print_text(msg, false, text + len, sizeof(text) - len); + len += msg_print_text(msg, + console_msg_format & MSG_FORMAT_SYSLOG, + text + len, + sizeof(text) - len); if (nr_ext_console_drivers) { ext_len = msg_print_ext_header(ext_text, sizeof(ext_text), @@ -2229,14 +2383,29 @@ skip: console_seq++; raw_spin_unlock(&logbuf_lock); + /* + * While actively printing out messages, if another printk() + * were to occur on another CPU, it may wait for this one to + * finish. This task can not be preempted if there is a + * waiter waiting to take over. + */ + console_lock_spinning_enable(); + stop_critical_timings(); /* don't trace print latency */ call_console_drivers(ext_text, ext_len, text, len); start_critical_timings(); + + if (console_lock_spinning_disable_and_check()) { + printk_safe_exit_irqrestore(flags); + return; + } + printk_safe_exit_irqrestore(flags); if (do_cond_resched) cond_resched(); } + console_locked = 0; /* Release the exclusive_console once it is used */ @@ -3141,9 +3310,6 @@ void dump_stack_print_info(const char *log_lvl) void show_regs_print_info(const char *log_lvl) { dump_stack_print_info(log_lvl); - - printk("%stask: %p task.stack: %p\n", - log_lvl, current, task_stack_page(current)); } #endif diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 58291e9f3276..21fec73d45d4 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -659,7 +659,7 @@ static int ptrace_getsiginfo(struct task_struct *child, siginfo_t *info) if (lock_task_sighand(child, &flags)) { error = -EINVAL; if (likely(child->last_siginfo != NULL)) { - *info = *child->last_siginfo; + copy_siginfo(info, child->last_siginfo); error = 0; } unlock_task_sighand(child, &flags); @@ -675,7 +675,7 @@ static int ptrace_setsiginfo(struct task_struct *child, const siginfo_t *info) if (lock_task_sighand(child, &flags)) { error = -EINVAL; if (likely(child->last_siginfo != NULL)) { - *child->last_siginfo = *info; + copy_siginfo(child->last_siginfo, info); error = 0; } unlock_task_sighand(child, &flags); @@ -1103,21 +1103,6 @@ int ptrace_request(struct task_struct *child, long request, return ret; } -static struct task_struct *ptrace_get_task_struct(pid_t pid) -{ - struct task_struct *child; - - rcu_read_lock(); - child = find_task_by_vpid(pid); - if (child) - get_task_struct(child); - rcu_read_unlock(); - - if (!child) - return ERR_PTR(-ESRCH); - return child; -} - #ifndef arch_ptrace_attach #define arch_ptrace_attach(child) do { } while (0) #endif @@ -1135,9 +1120,9 @@ SYSCALL_DEFINE4(ptrace, long, request, long, pid, unsigned long, addr, goto out; } - child = ptrace_get_task_struct(pid); - if (IS_ERR(child)) { - ret = PTR_ERR(child); + child = find_get_task_by_vpid(pid); + if (!child) { + ret = -ESRCH; goto out; } @@ -1230,7 +1215,6 @@ int compat_ptrace_request(struct task_struct *child, compat_long_t request, break; case PTRACE_SETSIGINFO: - memset(&siginfo, 0, sizeof siginfo); if (copy_siginfo_from_user32( &siginfo, (struct compat_siginfo __user *) datap)) ret = -EFAULT; @@ -1282,9 +1266,9 @@ COMPAT_SYSCALL_DEFINE4(ptrace, compat_long_t, request, compat_long_t, pid, goto out; } - child = ptrace_get_task_struct(pid); - if (IS_ERR(child)) { - ret = PTR_ERR(child); + child = find_get_task_by_vpid(pid); + if (!child) { + ret = -ESRCH; goto out; } diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 59c471de342a..6334f2c1abd0 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -30,31 +30,8 @@ #define RCU_TRACE(stmt) #endif /* #else #ifdef CONFIG_RCU_TRACE */ -/* - * Process-level increment to ->dynticks_nesting field. This allows for - * architectures that use half-interrupts and half-exceptions from - * process context. - * - * DYNTICK_TASK_NEST_MASK defines a field of width DYNTICK_TASK_NEST_WIDTH - * that counts the number of process-based reasons why RCU cannot - * consider the corresponding CPU to be idle, and DYNTICK_TASK_NEST_VALUE - * is the value used to increment or decrement this field. - * - * The rest of the bits could in principle be used to count interrupts, - * but this would mean that a negative-one value in the interrupt - * field could incorrectly zero out the DYNTICK_TASK_NEST_MASK field. - * We therefore provide a two-bit guard field defined by DYNTICK_TASK_MASK - * that is set to DYNTICK_TASK_FLAG upon initial exit from idle. - * The DYNTICK_TASK_EXIT_IDLE value is thus the combined value used upon - * initial exit from idle. - */ -#define DYNTICK_TASK_NEST_WIDTH 7 -#define DYNTICK_TASK_NEST_VALUE ((LLONG_MAX >> DYNTICK_TASK_NEST_WIDTH) + 1) -#define DYNTICK_TASK_NEST_MASK (LLONG_MAX - DYNTICK_TASK_NEST_VALUE + 1) -#define DYNTICK_TASK_FLAG ((DYNTICK_TASK_NEST_VALUE / 8) * 2) -#define DYNTICK_TASK_MASK ((DYNTICK_TASK_NEST_VALUE / 8) * 3) -#define DYNTICK_TASK_EXIT_IDLE (DYNTICK_TASK_NEST_VALUE + \ - DYNTICK_TASK_FLAG) +/* Offset to allow for unmatched rcu_irq_{enter,exit}(). */ +#define DYNTICK_IRQ_NONIDLE ((LONG_MAX / 2) + 1) /* diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c index 1f87a02c3399..d1ebdf9868bb 100644 --- a/kernel/rcu/rcuperf.c +++ b/kernel/rcu/rcuperf.c @@ -106,10 +106,6 @@ static int rcu_perf_writer_state; #define MAX_MEAS 10000 #define MIN_MEAS 100 -static int perf_runnable = IS_ENABLED(MODULE); -module_param(perf_runnable, int, 0444); -MODULE_PARM_DESC(perf_runnable, "Start rcuperf at boot"); - /* * Operations vector for selecting different types of tests. */ @@ -646,7 +642,7 @@ rcu_perf_init(void) &tasks_ops, }; - if (!torture_init_begin(perf_type, verbose, &perf_runnable)) + if (!torture_init_begin(perf_type, verbose)) return -EBUSY; /* Process args and tell the world that the perf'er is on the job. */ diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 74f6b0146b98..308e6fdbced8 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -187,10 +187,6 @@ static const char *rcu_torture_writer_state_getname(void) return rcu_torture_writer_state_names[i]; } -static int torture_runnable = IS_ENABLED(MODULE); -module_param(torture_runnable, int, 0444); -MODULE_PARM_DESC(torture_runnable, "Start rcutorture at boot"); - #if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) #define rcu_can_boost() 1 #else /* #if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) */ @@ -315,11 +311,9 @@ static void rcu_read_delay(struct torture_random_state *rrsp) } if (!(torture_random(rrsp) % (nrealreaders * 2 * shortdelay_us))) udelay(shortdelay_us); -#ifdef CONFIG_PREEMPT if (!preempt_count() && - !(torture_random(rrsp) % (nrealreaders * 20000))) - preempt_schedule(); /* No QS if preempt_disable() in effect */ -#endif + !(torture_random(rrsp) % (nrealreaders * 500))) + torture_preempt_schedule(); /* QS only if preemptible. */ } static void rcu_torture_read_unlock(int idx) __releases(RCU) @@ -1731,7 +1725,7 @@ rcu_torture_init(void) &sched_ops, &tasks_ops, }; - if (!torture_init_begin(torture_type, verbose, &torture_runnable)) + if (!torture_init_begin(torture_type, verbose)) return -EBUSY; /* Process args and tell the world that the torturer is on the job. */ diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index 6d5880089ff6..d5cea81378cc 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -53,6 +53,33 @@ static void srcu_invoke_callbacks(struct work_struct *work); static void srcu_reschedule(struct srcu_struct *sp, unsigned long delay); static void process_srcu(struct work_struct *work); +/* Wrappers for lock acquisition and release, see raw_spin_lock_rcu_node(). */ +#define spin_lock_rcu_node(p) \ +do { \ + spin_lock(&ACCESS_PRIVATE(p, lock)); \ + smp_mb__after_unlock_lock(); \ +} while (0) + +#define spin_unlock_rcu_node(p) spin_unlock(&ACCESS_PRIVATE(p, lock)) + +#define spin_lock_irq_rcu_node(p) \ +do { \ + spin_lock_irq(&ACCESS_PRIVATE(p, lock)); \ + smp_mb__after_unlock_lock(); \ +} while (0) + +#define spin_unlock_irq_rcu_node(p) \ + spin_unlock_irq(&ACCESS_PRIVATE(p, lock)) + +#define spin_lock_irqsave_rcu_node(p, flags) \ +do { \ + spin_lock_irqsave(&ACCESS_PRIVATE(p, lock), flags); \ + smp_mb__after_unlock_lock(); \ +} while (0) + +#define spin_unlock_irqrestore_rcu_node(p, flags) \ + spin_unlock_irqrestore(&ACCESS_PRIVATE(p, lock), flags) \ + /* * Initialize SRCU combining tree. Note that statically allocated * srcu_struct structures might already have srcu_read_lock() and @@ -77,7 +104,7 @@ static void init_srcu_struct_nodes(struct srcu_struct *sp, bool is_static) /* Each pass through this loop initializes one srcu_node structure. */ rcu_for_each_node_breadth_first(sp, snp) { - raw_spin_lock_init(&ACCESS_PRIVATE(snp, lock)); + spin_lock_init(&ACCESS_PRIVATE(snp, lock)); WARN_ON_ONCE(ARRAY_SIZE(snp->srcu_have_cbs) != ARRAY_SIZE(snp->srcu_data_have_cbs)); for (i = 0; i < ARRAY_SIZE(snp->srcu_have_cbs); i++) { @@ -111,7 +138,7 @@ static void init_srcu_struct_nodes(struct srcu_struct *sp, bool is_static) snp_first = sp->level[level]; for_each_possible_cpu(cpu) { sdp = per_cpu_ptr(sp->sda, cpu); - raw_spin_lock_init(&ACCESS_PRIVATE(sdp, lock)); + spin_lock_init(&ACCESS_PRIVATE(sdp, lock)); rcu_segcblist_init(&sdp->srcu_cblist); sdp->srcu_cblist_invoking = false; sdp->srcu_gp_seq_needed = sp->srcu_gp_seq; @@ -170,7 +197,7 @@ int __init_srcu_struct(struct srcu_struct *sp, const char *name, /* Don't re-initialize a lock while it is held. */ debug_check_no_locks_freed((void *)sp, sizeof(*sp)); lockdep_init_map(&sp->dep_map, name, key, 0); - raw_spin_lock_init(&ACCESS_PRIVATE(sp, lock)); + spin_lock_init(&ACCESS_PRIVATE(sp, lock)); return init_srcu_struct_fields(sp, false); } EXPORT_SYMBOL_GPL(__init_srcu_struct); @@ -187,7 +214,7 @@ EXPORT_SYMBOL_GPL(__init_srcu_struct); */ int init_srcu_struct(struct srcu_struct *sp) { - raw_spin_lock_init(&ACCESS_PRIVATE(sp, lock)); + spin_lock_init(&ACCESS_PRIVATE(sp, lock)); return init_srcu_struct_fields(sp, false); } EXPORT_SYMBOL_GPL(init_srcu_struct); @@ -210,13 +237,13 @@ static void check_init_srcu_struct(struct srcu_struct *sp) /* The smp_load_acquire() pairs with the smp_store_release(). */ if (!rcu_seq_state(smp_load_acquire(&sp->srcu_gp_seq_needed))) /*^^^*/ return; /* Already initialized. */ - raw_spin_lock_irqsave_rcu_node(sp, flags); + spin_lock_irqsave_rcu_node(sp, flags); if (!rcu_seq_state(sp->srcu_gp_seq_needed)) { - raw_spin_unlock_irqrestore_rcu_node(sp, flags); + spin_unlock_irqrestore_rcu_node(sp, flags); return; } init_srcu_struct_fields(sp, true); - raw_spin_unlock_irqrestore_rcu_node(sp, flags); + spin_unlock_irqrestore_rcu_node(sp, flags); } /* @@ -513,7 +540,7 @@ static void srcu_gp_end(struct srcu_struct *sp) mutex_lock(&sp->srcu_cb_mutex); /* End the current grace period. */ - raw_spin_lock_irq_rcu_node(sp); + spin_lock_irq_rcu_node(sp); idx = rcu_seq_state(sp->srcu_gp_seq); WARN_ON_ONCE(idx != SRCU_STATE_SCAN2); cbdelay = srcu_get_delay(sp); @@ -522,7 +549,7 @@ static void srcu_gp_end(struct srcu_struct *sp) gpseq = rcu_seq_current(&sp->srcu_gp_seq); if (ULONG_CMP_LT(sp->srcu_gp_seq_needed_exp, gpseq)) sp->srcu_gp_seq_needed_exp = gpseq; - raw_spin_unlock_irq_rcu_node(sp); + spin_unlock_irq_rcu_node(sp); mutex_unlock(&sp->srcu_gp_mutex); /* A new grace period can start at this point. But only one. */ @@ -530,7 +557,7 @@ static void srcu_gp_end(struct srcu_struct *sp) idx = rcu_seq_ctr(gpseq) % ARRAY_SIZE(snp->srcu_have_cbs); idxnext = (idx + 1) % ARRAY_SIZE(snp->srcu_have_cbs); rcu_for_each_node_breadth_first(sp, snp) { - raw_spin_lock_irq_rcu_node(snp); + spin_lock_irq_rcu_node(snp); cbs = false; if (snp >= sp->level[rcu_num_lvls - 1]) cbs = snp->srcu_have_cbs[idx] == gpseq; @@ -540,7 +567,7 @@ static void srcu_gp_end(struct srcu_struct *sp) snp->srcu_gp_seq_needed_exp = gpseq; mask = snp->srcu_data_have_cbs[idx]; snp->srcu_data_have_cbs[idx] = 0; - raw_spin_unlock_irq_rcu_node(snp); + spin_unlock_irq_rcu_node(snp); if (cbs) srcu_schedule_cbs_snp(sp, snp, mask, cbdelay); @@ -548,11 +575,11 @@ static void srcu_gp_end(struct srcu_struct *sp) if (!(gpseq & counter_wrap_check)) for (cpu = snp->grplo; cpu <= snp->grphi; cpu++) { sdp = per_cpu_ptr(sp->sda, cpu); - raw_spin_lock_irqsave_rcu_node(sdp, flags); + spin_lock_irqsave_rcu_node(sdp, flags); if (ULONG_CMP_GE(gpseq, sdp->srcu_gp_seq_needed + 100)) sdp->srcu_gp_seq_needed = gpseq; - raw_spin_unlock_irqrestore_rcu_node(sdp, flags); + spin_unlock_irqrestore_rcu_node(sdp, flags); } } @@ -560,17 +587,17 @@ static void srcu_gp_end(struct srcu_struct *sp) mutex_unlock(&sp->srcu_cb_mutex); /* Start a new grace period if needed. */ - raw_spin_lock_irq_rcu_node(sp); + spin_lock_irq_rcu_node(sp); gpseq = rcu_seq_current(&sp->srcu_gp_seq); if (!rcu_seq_state(gpseq) && ULONG_CMP_LT(gpseq, sp->srcu_gp_seq_needed)) { srcu_gp_start(sp); - raw_spin_unlock_irq_rcu_node(sp); + spin_unlock_irq_rcu_node(sp); /* Throttle expedited grace periods: Should be rare! */ srcu_reschedule(sp, rcu_seq_ctr(gpseq) & 0x3ff ? 0 : SRCU_INTERVAL); } else { - raw_spin_unlock_irq_rcu_node(sp); + spin_unlock_irq_rcu_node(sp); } } @@ -590,18 +617,18 @@ static void srcu_funnel_exp_start(struct srcu_struct *sp, struct srcu_node *snp, if (rcu_seq_done(&sp->srcu_gp_seq, s) || ULONG_CMP_GE(READ_ONCE(snp->srcu_gp_seq_needed_exp), s)) return; - raw_spin_lock_irqsave_rcu_node(snp, flags); + spin_lock_irqsave_rcu_node(snp, flags); if (ULONG_CMP_GE(snp->srcu_gp_seq_needed_exp, s)) { - raw_spin_unlock_irqrestore_rcu_node(snp, flags); + spin_unlock_irqrestore_rcu_node(snp, flags); return; } WRITE_ONCE(snp->srcu_gp_seq_needed_exp, s); - raw_spin_unlock_irqrestore_rcu_node(snp, flags); + spin_unlock_irqrestore_rcu_node(snp, flags); } - raw_spin_lock_irqsave_rcu_node(sp, flags); + spin_lock_irqsave_rcu_node(sp, flags); if (!ULONG_CMP_LT(sp->srcu_gp_seq_needed_exp, s)) sp->srcu_gp_seq_needed_exp = s; - raw_spin_unlock_irqrestore_rcu_node(sp, flags); + spin_unlock_irqrestore_rcu_node(sp, flags); } /* @@ -623,12 +650,12 @@ static void srcu_funnel_gp_start(struct srcu_struct *sp, struct srcu_data *sdp, for (; snp != NULL; snp = snp->srcu_parent) { if (rcu_seq_done(&sp->srcu_gp_seq, s) && snp != sdp->mynode) return; /* GP already done and CBs recorded. */ - raw_spin_lock_irqsave_rcu_node(snp, flags); + spin_lock_irqsave_rcu_node(snp, flags); if (ULONG_CMP_GE(snp->srcu_have_cbs[idx], s)) { snp_seq = snp->srcu_have_cbs[idx]; if (snp == sdp->mynode && snp_seq == s) snp->srcu_data_have_cbs[idx] |= sdp->grpmask; - raw_spin_unlock_irqrestore_rcu_node(snp, flags); + spin_unlock_irqrestore_rcu_node(snp, flags); if (snp == sdp->mynode && snp_seq != s) { srcu_schedule_cbs_sdp(sdp, do_norm ? SRCU_INTERVAL @@ -644,11 +671,11 @@ static void srcu_funnel_gp_start(struct srcu_struct *sp, struct srcu_data *sdp, snp->srcu_data_have_cbs[idx] |= sdp->grpmask; if (!do_norm && ULONG_CMP_LT(snp->srcu_gp_seq_needed_exp, s)) snp->srcu_gp_seq_needed_exp = s; - raw_spin_unlock_irqrestore_rcu_node(snp, flags); + spin_unlock_irqrestore_rcu_node(snp, flags); } /* Top of tree, must ensure the grace period will be started. */ - raw_spin_lock_irqsave_rcu_node(sp, flags); + spin_lock_irqsave_rcu_node(sp, flags); if (ULONG_CMP_LT(sp->srcu_gp_seq_needed, s)) { /* * Record need for grace period s. Pair with load @@ -667,7 +694,7 @@ static void srcu_funnel_gp_start(struct srcu_struct *sp, struct srcu_data *sdp, queue_delayed_work(system_power_efficient_wq, &sp->work, srcu_get_delay(sp)); } - raw_spin_unlock_irqrestore_rcu_node(sp, flags); + spin_unlock_irqrestore_rcu_node(sp, flags); } /* @@ -830,7 +857,7 @@ void __call_srcu(struct srcu_struct *sp, struct rcu_head *rhp, rhp->func = func; local_irq_save(flags); sdp = this_cpu_ptr(sp->sda); - raw_spin_lock_rcu_node(sdp); + spin_lock_rcu_node(sdp); rcu_segcblist_enqueue(&sdp->srcu_cblist, rhp, false); rcu_segcblist_advance(&sdp->srcu_cblist, rcu_seq_current(&sp->srcu_gp_seq)); @@ -844,7 +871,7 @@ void __call_srcu(struct srcu_struct *sp, struct rcu_head *rhp, sdp->srcu_gp_seq_needed_exp = s; needexp = true; } - raw_spin_unlock_irqrestore_rcu_node(sdp, flags); + spin_unlock_irqrestore_rcu_node(sdp, flags); if (needgp) srcu_funnel_gp_start(sp, sdp, s, do_norm); else if (needexp) @@ -900,7 +927,7 @@ static void __synchronize_srcu(struct srcu_struct *sp, bool do_norm) /* * Make sure that later code is ordered after the SRCU grace - * period. This pairs with the raw_spin_lock_irq_rcu_node() + * period. This pairs with the spin_lock_irq_rcu_node() * in srcu_invoke_callbacks(). Unlike Tree RCU, this is needed * because the current CPU might have been totally uninvolved with * (and thus unordered against) that grace period. @@ -1024,7 +1051,7 @@ void srcu_barrier(struct srcu_struct *sp) */ for_each_possible_cpu(cpu) { sdp = per_cpu_ptr(sp->sda, cpu); - raw_spin_lock_irq_rcu_node(sdp); + spin_lock_irq_rcu_node(sdp); atomic_inc(&sp->srcu_barrier_cpu_cnt); sdp->srcu_barrier_head.func = srcu_barrier_cb; debug_rcu_head_queue(&sdp->srcu_barrier_head); @@ -1033,7 +1060,7 @@ void srcu_barrier(struct srcu_struct *sp) debug_rcu_head_unqueue(&sdp->srcu_barrier_head); atomic_dec(&sp->srcu_barrier_cpu_cnt); } - raw_spin_unlock_irq_rcu_node(sdp); + spin_unlock_irq_rcu_node(sdp); } /* Remove the initial count, at which point reaching zero can happen. */ @@ -1082,17 +1109,17 @@ static void srcu_advance_state(struct srcu_struct *sp) */ idx = rcu_seq_state(smp_load_acquire(&sp->srcu_gp_seq)); /* ^^^ */ if (idx == SRCU_STATE_IDLE) { - raw_spin_lock_irq_rcu_node(sp); + spin_lock_irq_rcu_node(sp); if (ULONG_CMP_GE(sp->srcu_gp_seq, sp->srcu_gp_seq_needed)) { WARN_ON_ONCE(rcu_seq_state(sp->srcu_gp_seq)); - raw_spin_unlock_irq_rcu_node(sp); + spin_unlock_irq_rcu_node(sp); mutex_unlock(&sp->srcu_gp_mutex); return; } idx = rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)); if (idx == SRCU_STATE_IDLE) srcu_gp_start(sp); - raw_spin_unlock_irq_rcu_node(sp); + spin_unlock_irq_rcu_node(sp); if (idx != SRCU_STATE_IDLE) { mutex_unlock(&sp->srcu_gp_mutex); return; /* Someone else started the grace period. */ @@ -1141,19 +1168,19 @@ static void srcu_invoke_callbacks(struct work_struct *work) sdp = container_of(work, struct srcu_data, work.work); sp = sdp->sp; rcu_cblist_init(&ready_cbs); - raw_spin_lock_irq_rcu_node(sdp); + spin_lock_irq_rcu_node(sdp); rcu_segcblist_advance(&sdp->srcu_cblist, rcu_seq_current(&sp->srcu_gp_seq)); if (sdp->srcu_cblist_invoking || !rcu_segcblist_ready_cbs(&sdp->srcu_cblist)) { - raw_spin_unlock_irq_rcu_node(sdp); + spin_unlock_irq_rcu_node(sdp); return; /* Someone else on the job or nothing to do. */ } /* We are on the job! Extract and invoke ready callbacks. */ sdp->srcu_cblist_invoking = true; rcu_segcblist_extract_done_cbs(&sdp->srcu_cblist, &ready_cbs); - raw_spin_unlock_irq_rcu_node(sdp); + spin_unlock_irq_rcu_node(sdp); rhp = rcu_cblist_dequeue(&ready_cbs); for (; rhp != NULL; rhp = rcu_cblist_dequeue(&ready_cbs)) { debug_rcu_head_unqueue(rhp); @@ -1166,13 +1193,13 @@ static void srcu_invoke_callbacks(struct work_struct *work) * Update counts, accelerate new callbacks, and if needed, * schedule another round of callback invocation. */ - raw_spin_lock_irq_rcu_node(sdp); + spin_lock_irq_rcu_node(sdp); rcu_segcblist_insert_count(&sdp->srcu_cblist, &ready_cbs); (void)rcu_segcblist_accelerate(&sdp->srcu_cblist, rcu_seq_snap(&sp->srcu_gp_seq)); sdp->srcu_cblist_invoking = false; more = rcu_segcblist_ready_cbs(&sdp->srcu_cblist); - raw_spin_unlock_irq_rcu_node(sdp); + spin_unlock_irq_rcu_node(sdp); if (more) srcu_schedule_cbs_sdp(sdp, 0); } @@ -1185,7 +1212,7 @@ static void srcu_reschedule(struct srcu_struct *sp, unsigned long delay) { bool pushgp = true; - raw_spin_lock_irq_rcu_node(sp); + spin_lock_irq_rcu_node(sp); if (ULONG_CMP_GE(sp->srcu_gp_seq, sp->srcu_gp_seq_needed)) { if (!WARN_ON_ONCE(rcu_seq_state(sp->srcu_gp_seq))) { /* All requests fulfilled, time to go idle. */ @@ -1195,7 +1222,7 @@ static void srcu_reschedule(struct srcu_struct *sp, unsigned long delay) /* Outstanding request and no GP. Start one. */ srcu_gp_start(sp); } - raw_spin_unlock_irq_rcu_node(sp); + spin_unlock_irq_rcu_node(sp); if (pushgp) queue_delayed_work(system_power_efficient_wq, &sp->work, delay); diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index f9c0ca2ccf0c..491bdf39f276 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -265,25 +265,12 @@ void rcu_bh_qs(void) #endif static DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { - .dynticks_nesting = DYNTICK_TASK_EXIT_IDLE, + .dynticks_nesting = 1, + .dynticks_nmi_nesting = DYNTICK_IRQ_NONIDLE, .dynticks = ATOMIC_INIT(RCU_DYNTICK_CTRL_CTR), }; /* - * There's a few places, currently just in the tracing infrastructure, - * that uses rcu_irq_enter() to make sure RCU is watching. But there's - * a small location where that will not even work. In those cases - * rcu_irq_enter_disabled() needs to be checked to make sure rcu_irq_enter() - * can be called. - */ -static DEFINE_PER_CPU(bool, disable_rcu_irq_enter); - -bool rcu_irq_enter_disabled(void) -{ - return this_cpu_read(disable_rcu_irq_enter); -} - -/* * Record entry into an extended quiescent state. This is only to be * called when not already in an extended quiescent state. */ @@ -762,68 +749,39 @@ cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp) } /* - * rcu_eqs_enter_common - current CPU is entering an extended quiescent state + * Enter an RCU extended quiescent state, which can be either the + * idle loop or adaptive-tickless usermode execution. * - * Enter idle, doing appropriate accounting. The caller must have - * disabled interrupts. + * We crowbar the ->dynticks_nmi_nesting field to zero to allow for + * the possibility of usermode upcalls having messed up our count + * of interrupt nesting level during the prior busy period. */ -static void rcu_eqs_enter_common(bool user) +static void rcu_eqs_enter(bool user) { struct rcu_state *rsp; struct rcu_data *rdp; - struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); + struct rcu_dynticks *rdtp; - lockdep_assert_irqs_disabled(); - trace_rcu_dyntick(TPS("Start"), rdtp->dynticks_nesting, 0); - if (IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && - !user && !is_idle_task(current)) { - struct task_struct *idle __maybe_unused = - idle_task(smp_processor_id()); - - trace_rcu_dyntick(TPS("Error on entry: not idle task"), rdtp->dynticks_nesting, 0); - rcu_ftrace_dump(DUMP_ORIG); - WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", - current->pid, current->comm, - idle->pid, idle->comm); /* must be idle task! */ + rdtp = this_cpu_ptr(&rcu_dynticks); + WRITE_ONCE(rdtp->dynticks_nmi_nesting, 0); + WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && + rdtp->dynticks_nesting == 0); + if (rdtp->dynticks_nesting != 1) { + rdtp->dynticks_nesting--; + return; } + + lockdep_assert_irqs_disabled(); + trace_rcu_dyntick(TPS("Start"), rdtp->dynticks_nesting, 0, rdtp->dynticks); + WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current)); for_each_rcu_flavor(rsp) { rdp = this_cpu_ptr(rsp->rda); do_nocb_deferred_wakeup(rdp); } rcu_prepare_for_idle(); - __this_cpu_inc(disable_rcu_irq_enter); - rdtp->dynticks_nesting = 0; /* Breaks tracing momentarily. */ - rcu_dynticks_eqs_enter(); /* After this, tracing works again. */ - __this_cpu_dec(disable_rcu_irq_enter); + WRITE_ONCE(rdtp->dynticks_nesting, 0); /* Avoid irq-access tearing. */ + rcu_dynticks_eqs_enter(); rcu_dynticks_task_enter(); - - /* - * It is illegal to enter an extended quiescent state while - * in an RCU read-side critical section. - */ - RCU_LOCKDEP_WARN(lock_is_held(&rcu_lock_map), - "Illegal idle entry in RCU read-side critical section."); - RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map), - "Illegal idle entry in RCU-bh read-side critical section."); - RCU_LOCKDEP_WARN(lock_is_held(&rcu_sched_lock_map), - "Illegal idle entry in RCU-sched read-side critical section."); -} - -/* - * Enter an RCU extended quiescent state, which can be either the - * idle loop or adaptive-tickless usermode execution. - */ -static void rcu_eqs_enter(bool user) -{ - struct rcu_dynticks *rdtp; - - rdtp = this_cpu_ptr(&rcu_dynticks); - WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && - (rdtp->dynticks_nesting & DYNTICK_TASK_NEST_MASK) == 0); - if ((rdtp->dynticks_nesting & DYNTICK_TASK_NEST_MASK) == DYNTICK_TASK_NEST_VALUE) - rcu_eqs_enter_common(user); - else - rdtp->dynticks_nesting -= DYNTICK_TASK_NEST_VALUE; } /** @@ -834,10 +792,6 @@ static void rcu_eqs_enter(bool user) * critical sections can occur in irq handlers in idle, a possibility * handled by irq_enter() and irq_exit().) * - * We crowbar the ->dynticks_nesting field to zero to allow for - * the possibility of usermode upcalls having messed up our count - * of interrupt nesting level during the prior busy period. - * * If you add or remove a call to rcu_idle_enter(), be sure to test with * CONFIG_RCU_EQS_DEBUG=y. */ @@ -867,6 +821,46 @@ void rcu_user_enter(void) #endif /* CONFIG_NO_HZ_FULL */ /** + * rcu_nmi_exit - inform RCU of exit from NMI context + * + * If we are returning from the outermost NMI handler that interrupted an + * RCU-idle period, update rdtp->dynticks and rdtp->dynticks_nmi_nesting + * to let the RCU grace-period handling know that the CPU is back to + * being RCU-idle. + * + * If you add or remove a call to rcu_nmi_exit(), be sure to test + * with CONFIG_RCU_EQS_DEBUG=y. + */ +void rcu_nmi_exit(void) +{ + struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); + + /* + * Check for ->dynticks_nmi_nesting underflow and bad ->dynticks. + * (We are exiting an NMI handler, so RCU better be paying attention + * to us!) + */ + WARN_ON_ONCE(rdtp->dynticks_nmi_nesting <= 0); + WARN_ON_ONCE(rcu_dynticks_curr_cpu_in_eqs()); + + /* + * If the nesting level is not 1, the CPU wasn't RCU-idle, so + * leave it in non-RCU-idle state. + */ + if (rdtp->dynticks_nmi_nesting != 1) { + trace_rcu_dyntick(TPS("--="), rdtp->dynticks_nmi_nesting, rdtp->dynticks_nmi_nesting - 2, rdtp->dynticks); + WRITE_ONCE(rdtp->dynticks_nmi_nesting, /* No store tearing. */ + rdtp->dynticks_nmi_nesting - 2); + return; + } + + /* This NMI interrupted an RCU-idle CPU, restore RCU-idleness. */ + trace_rcu_dyntick(TPS("Startirq"), rdtp->dynticks_nmi_nesting, 0, rdtp->dynticks); + WRITE_ONCE(rdtp->dynticks_nmi_nesting, 0); /* Avoid store tearing. */ + rcu_dynticks_eqs_enter(); +} + +/** * rcu_irq_exit - inform RCU that current CPU is exiting irq towards idle * * Exit from an interrupt handler, which might possibly result in entering @@ -875,8 +869,8 @@ void rcu_user_enter(void) * * This code assumes that the idle loop never does anything that might * result in unbalanced calls to irq_enter() and irq_exit(). If your - * architecture violates this assumption, RCU will give you what you - * deserve, good and hard. But very infrequently and irreproducibly. + * architecture's idle loop violates this assumption, RCU will give you what + * you deserve, good and hard. But very infrequently and irreproducibly. * * Use things like work queues to work around this limitation. * @@ -887,23 +881,14 @@ void rcu_user_enter(void) */ void rcu_irq_exit(void) { - struct rcu_dynticks *rdtp; + struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); lockdep_assert_irqs_disabled(); - rdtp = this_cpu_ptr(&rcu_dynticks); - - /* Page faults can happen in NMI handlers, so check... */ - if (rdtp->dynticks_nmi_nesting) - return; - - WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && - rdtp->dynticks_nesting < 1); - if (rdtp->dynticks_nesting <= 1) { - rcu_eqs_enter_common(true); - } else { - trace_rcu_dyntick(TPS("--="), rdtp->dynticks_nesting, rdtp->dynticks_nesting - 1); - rdtp->dynticks_nesting--; - } + if (rdtp->dynticks_nmi_nesting == 1) + rcu_prepare_for_idle(); + rcu_nmi_exit(); + if (rdtp->dynticks_nmi_nesting == 0) + rcu_dynticks_task_enter(); } /* @@ -922,55 +907,33 @@ void rcu_irq_exit_irqson(void) } /* - * rcu_eqs_exit_common - current CPU moving away from extended quiescent state - * - * If the new value of the ->dynticks_nesting counter was previously zero, - * we really have exited idle, and must do the appropriate accounting. - * The caller must have disabled interrupts. - */ -static void rcu_eqs_exit_common(long long oldval, int user) -{ - RCU_TRACE(struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);) - - rcu_dynticks_task_exit(); - rcu_dynticks_eqs_exit(); - rcu_cleanup_after_idle(); - trace_rcu_dyntick(TPS("End"), oldval, rdtp->dynticks_nesting); - if (IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && - !user && !is_idle_task(current)) { - struct task_struct *idle __maybe_unused = - idle_task(smp_processor_id()); - - trace_rcu_dyntick(TPS("Error on exit: not idle task"), - oldval, rdtp->dynticks_nesting); - rcu_ftrace_dump(DUMP_ORIG); - WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", - current->pid, current->comm, - idle->pid, idle->comm); /* must be idle task! */ - } -} - -/* * Exit an RCU extended quiescent state, which can be either the * idle loop or adaptive-tickless usermode execution. + * + * We crowbar the ->dynticks_nmi_nesting field to DYNTICK_IRQ_NONIDLE to + * allow for the possibility of usermode upcalls messing up our count of + * interrupt nesting level during the busy period that is just now starting. */ static void rcu_eqs_exit(bool user) { struct rcu_dynticks *rdtp; - long long oldval; + long oldval; lockdep_assert_irqs_disabled(); rdtp = this_cpu_ptr(&rcu_dynticks); oldval = rdtp->dynticks_nesting; WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && oldval < 0); - if (oldval & DYNTICK_TASK_NEST_MASK) { - rdtp->dynticks_nesting += DYNTICK_TASK_NEST_VALUE; - } else { - __this_cpu_inc(disable_rcu_irq_enter); - rdtp->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; - rcu_eqs_exit_common(oldval, user); - __this_cpu_dec(disable_rcu_irq_enter); + if (oldval) { + rdtp->dynticks_nesting++; + return; } + rcu_dynticks_task_exit(); + rcu_dynticks_eqs_exit(); + rcu_cleanup_after_idle(); + trace_rcu_dyntick(TPS("End"), rdtp->dynticks_nesting, 1, rdtp->dynticks); + WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current)); + WRITE_ONCE(rdtp->dynticks_nesting, 1); + WRITE_ONCE(rdtp->dynticks_nmi_nesting, DYNTICK_IRQ_NONIDLE); } /** @@ -979,11 +942,6 @@ static void rcu_eqs_exit(bool user) * Exit idle mode, in other words, -enter- the mode in which RCU * read-side critical sections can occur. * - * We crowbar the ->dynticks_nesting field to DYNTICK_TASK_NEST to - * allow for the possibility of usermode upcalls messing up our count - * of interrupt nesting level during the busy period that is just - * now starting. - * * If you add or remove a call to rcu_idle_exit(), be sure to test with * CONFIG_RCU_EQS_DEBUG=y. */ @@ -1013,65 +971,6 @@ void rcu_user_exit(void) #endif /* CONFIG_NO_HZ_FULL */ /** - * rcu_irq_enter - inform RCU that current CPU is entering irq away from idle - * - * Enter an interrupt handler, which might possibly result in exiting - * idle mode, in other words, entering the mode in which read-side critical - * sections can occur. The caller must have disabled interrupts. - * - * Note that the Linux kernel is fully capable of entering an interrupt - * handler that it never exits, for example when doing upcalls to - * user mode! This code assumes that the idle loop never does upcalls to - * user mode. If your architecture does do upcalls from the idle loop (or - * does anything else that results in unbalanced calls to the irq_enter() - * and irq_exit() functions), RCU will give you what you deserve, good - * and hard. But very infrequently and irreproducibly. - * - * Use things like work queues to work around this limitation. - * - * You have been warned. - * - * If you add or remove a call to rcu_irq_enter(), be sure to test with - * CONFIG_RCU_EQS_DEBUG=y. - */ -void rcu_irq_enter(void) -{ - struct rcu_dynticks *rdtp; - long long oldval; - - lockdep_assert_irqs_disabled(); - rdtp = this_cpu_ptr(&rcu_dynticks); - - /* Page faults can happen in NMI handlers, so check... */ - if (rdtp->dynticks_nmi_nesting) - return; - - oldval = rdtp->dynticks_nesting; - rdtp->dynticks_nesting++; - WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && - rdtp->dynticks_nesting == 0); - if (oldval) - trace_rcu_dyntick(TPS("++="), oldval, rdtp->dynticks_nesting); - else - rcu_eqs_exit_common(oldval, true); -} - -/* - * Wrapper for rcu_irq_enter() where interrupts are enabled. - * - * If you add or remove a call to rcu_irq_enter_irqson(), be sure to test - * with CONFIG_RCU_EQS_DEBUG=y. - */ -void rcu_irq_enter_irqson(void) -{ - unsigned long flags; - - local_irq_save(flags); - rcu_irq_enter(); - local_irq_restore(flags); -} - -/** * rcu_nmi_enter - inform RCU of entry to NMI context * * If the CPU was idle from RCU's viewpoint, update rdtp->dynticks and @@ -1086,7 +985,7 @@ void rcu_irq_enter_irqson(void) void rcu_nmi_enter(void) { struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); - int incby = 2; + long incby = 2; /* Complain about underflow. */ WARN_ON_ONCE(rdtp->dynticks_nmi_nesting < 0); @@ -1103,45 +1002,61 @@ void rcu_nmi_enter(void) rcu_dynticks_eqs_exit(); incby = 1; } - rdtp->dynticks_nmi_nesting += incby; + trace_rcu_dyntick(incby == 1 ? TPS("Endirq") : TPS("++="), + rdtp->dynticks_nmi_nesting, + rdtp->dynticks_nmi_nesting + incby, rdtp->dynticks); + WRITE_ONCE(rdtp->dynticks_nmi_nesting, /* Prevent store tearing. */ + rdtp->dynticks_nmi_nesting + incby); barrier(); } /** - * rcu_nmi_exit - inform RCU of exit from NMI context + * rcu_irq_enter - inform RCU that current CPU is entering irq away from idle * - * If we are returning from the outermost NMI handler that interrupted an - * RCU-idle period, update rdtp->dynticks and rdtp->dynticks_nmi_nesting - * to let the RCU grace-period handling know that the CPU is back to - * being RCU-idle. + * Enter an interrupt handler, which might possibly result in exiting + * idle mode, in other words, entering the mode in which read-side critical + * sections can occur. The caller must have disabled interrupts. * - * If you add or remove a call to rcu_nmi_exit(), be sure to test - * with CONFIG_RCU_EQS_DEBUG=y. + * Note that the Linux kernel is fully capable of entering an interrupt + * handler that it never exits, for example when doing upcalls to user mode! + * This code assumes that the idle loop never does upcalls to user mode. + * If your architecture's idle loop does do upcalls to user mode (or does + * anything else that results in unbalanced calls to the irq_enter() and + * irq_exit() functions), RCU will give you what you deserve, good and hard. + * But very infrequently and irreproducibly. + * + * Use things like work queues to work around this limitation. + * + * You have been warned. + * + * If you add or remove a call to rcu_irq_enter(), be sure to test with + * CONFIG_RCU_EQS_DEBUG=y. */ -void rcu_nmi_exit(void) +void rcu_irq_enter(void) { struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); - /* - * Check for ->dynticks_nmi_nesting underflow and bad ->dynticks. - * (We are exiting an NMI handler, so RCU better be paying attention - * to us!) - */ - WARN_ON_ONCE(rdtp->dynticks_nmi_nesting <= 0); - WARN_ON_ONCE(rcu_dynticks_curr_cpu_in_eqs()); + lockdep_assert_irqs_disabled(); + if (rdtp->dynticks_nmi_nesting == 0) + rcu_dynticks_task_exit(); + rcu_nmi_enter(); + if (rdtp->dynticks_nmi_nesting == 1) + rcu_cleanup_after_idle(); +} - /* - * If the nesting level is not 1, the CPU wasn't RCU-idle, so - * leave it in non-RCU-idle state. - */ - if (rdtp->dynticks_nmi_nesting != 1) { - rdtp->dynticks_nmi_nesting -= 2; - return; - } +/* + * Wrapper for rcu_irq_enter() where interrupts are enabled. + * + * If you add or remove a call to rcu_irq_enter_irqson(), be sure to test + * with CONFIG_RCU_EQS_DEBUG=y. + */ +void rcu_irq_enter_irqson(void) +{ + unsigned long flags; - /* This NMI interrupted an RCU-idle CPU, restore RCU-idleness. */ - rdtp->dynticks_nmi_nesting = 0; - rcu_dynticks_eqs_enter(); + local_irq_save(flags); + rcu_irq_enter(); + local_irq_restore(flags); } /** @@ -1233,7 +1148,8 @@ EXPORT_SYMBOL_GPL(rcu_lockdep_current_cpu_online); */ static int rcu_is_cpu_rrupt_from_idle(void) { - return __this_cpu_read(rcu_dynticks.dynticks_nesting) <= 1; + return __this_cpu_read(rcu_dynticks.dynticks_nesting) <= 0 && + __this_cpu_read(rcu_dynticks.dynticks_nmi_nesting) <= 1; } /* @@ -2789,6 +2705,11 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) rdp->n_force_qs_snap = rsp->n_force_qs; } else if (count < rdp->qlen_last_fqs_check - qhimark) rdp->qlen_last_fqs_check = count; + + /* + * The following usually indicates a double call_rcu(). To track + * this down, try building with CONFIG_DEBUG_OBJECTS_RCU_HEAD=y. + */ WARN_ON_ONCE(rcu_segcblist_empty(&rdp->cblist) != (count == 0)); local_irq_restore(flags); @@ -3723,7 +3644,7 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp) raw_spin_lock_irqsave_rcu_node(rnp, flags); rdp->grpmask = leaf_node_cpu_bit(rdp->mynode, cpu); rdp->dynticks = &per_cpu(rcu_dynticks, cpu); - WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != DYNTICK_TASK_EXIT_IDLE); + WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != 1); WARN_ON_ONCE(rcu_dynticks_in_eqs(rcu_dynticks_snap(rdp->dynticks))); rdp->cpu = cpu; rdp->rsp = rsp; @@ -3752,7 +3673,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp) if (rcu_segcblist_empty(&rdp->cblist) && /* No early-boot CBs? */ !init_nocb_callback_list(rdp)) rcu_segcblist_init(&rdp->cblist); /* Re-enable callbacks. */ - rdp->dynticks->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; + rdp->dynticks->dynticks_nesting = 1; /* CPU not up, no tearing. */ rcu_dynticks_eqs_online(); raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */ diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 46a5d1991450..6488a3b0e729 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -38,9 +38,8 @@ * Dynticks per-CPU state. */ struct rcu_dynticks { - long long dynticks_nesting; /* Track irq/process nesting level. */ - /* Process level is worth LLONG_MAX/2. */ - int dynticks_nmi_nesting; /* Track NMI nesting level. */ + long dynticks_nesting; /* Track process nesting level. */ + long dynticks_nmi_nesting; /* Track irq/NMI nesting level. */ atomic_t dynticks; /* Even value for idle, else odd. */ bool rcu_need_heavy_qs; /* GP old, need heavy quiescent state. */ unsigned long rcu_qs_ctr; /* Light universal quiescent state ctr. */ diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index db85ca3975f1..fb88a028deec 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -61,7 +61,6 @@ DEFINE_PER_CPU(char, rcu_cpu_has_work); #ifdef CONFIG_RCU_NOCB_CPU static cpumask_var_t rcu_nocb_mask; /* CPUs to have callbacks offloaded. */ -static bool have_rcu_nocb_mask; /* Was rcu_nocb_mask allocated? */ static bool __read_mostly rcu_nocb_poll; /* Offload kthread are to poll. */ #endif /* #ifdef CONFIG_RCU_NOCB_CPU */ @@ -1687,7 +1686,7 @@ static void print_cpu_stall_info(struct rcu_state *rsp, int cpu) } print_cpu_stall_fast_no_hz(fast_no_hz, cpu); delta = rdp->mynode->gpnum - rdp->rcu_iw_gpnum; - pr_err("\t%d-%c%c%c%c: (%lu %s) idle=%03x/%llx/%d softirq=%u/%u fqs=%ld %s\n", + pr_err("\t%d-%c%c%c%c: (%lu %s) idle=%03x/%ld/%ld softirq=%u/%u fqs=%ld %s\n", cpu, "O."[!!cpu_online(cpu)], "o."[!!(rdp->grpmask & rdp->mynode->qsmaskinit)], @@ -1752,7 +1751,6 @@ static void increment_cpu_stall_ticks(void) static int __init rcu_nocb_setup(char *str) { alloc_bootmem_cpumask_var(&rcu_nocb_mask); - have_rcu_nocb_mask = true; cpulist_parse(str, rcu_nocb_mask); return 1; } @@ -1801,7 +1799,7 @@ static void rcu_init_one_nocb(struct rcu_node *rnp) /* Is the specified CPU a no-CBs CPU? */ bool rcu_is_nocb_cpu(int cpu) { - if (have_rcu_nocb_mask) + if (cpumask_available(rcu_nocb_mask)) return cpumask_test_cpu(cpu, rcu_nocb_mask); return false; } @@ -2295,14 +2293,13 @@ void __init rcu_init_nohz(void) need_rcu_nocb_mask = true; #endif /* #if defined(CONFIG_NO_HZ_FULL) */ - if (!have_rcu_nocb_mask && need_rcu_nocb_mask) { + if (!cpumask_available(rcu_nocb_mask) && need_rcu_nocb_mask) { if (!zalloc_cpumask_var(&rcu_nocb_mask, GFP_KERNEL)) { pr_info("rcu_nocb_mask allocation failed, callback offloading disabled.\n"); return; } - have_rcu_nocb_mask = true; } - if (!have_rcu_nocb_mask) + if (!cpumask_available(rcu_nocb_mask)) return; #if defined(CONFIG_NO_HZ_FULL) @@ -2428,7 +2425,7 @@ static void __init rcu_organize_nocb_kthreads(struct rcu_state *rsp) struct rcu_data *rdp_leader = NULL; /* Suppress misguided gcc warn. */ struct rcu_data *rdp_prev = NULL; - if (!have_rcu_nocb_mask) + if (!cpumask_available(rcu_nocb_mask)) return; if (ls == -1) { ls = int_sqrt(nr_cpu_ids); diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index fbd56d6e575b..68fa19a5e7bd 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -422,11 +422,13 @@ void init_rcu_head(struct rcu_head *head) { debug_object_init(head, &rcuhead_debug_descr); } +EXPORT_SYMBOL_GPL(init_rcu_head); void destroy_rcu_head(struct rcu_head *head) { debug_object_free(head, &rcuhead_debug_descr); } +EXPORT_SYMBOL_GPL(destroy_rcu_head); static bool rcuhead_is_static_object(void *addr) { diff --git a/kernel/relay.c b/kernel/relay.c index 39a9dfc69486..c3029402f15c 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -611,7 +611,6 @@ free_bufs: kref_put(&chan->kref, relay_destroy_channel); mutex_unlock(&relay_channels_mutex); - kfree(chan); return NULL; } EXPORT_SYMBOL_GPL(relay_open); @@ -919,18 +918,18 @@ static int relay_file_mmap(struct file *filp, struct vm_area_struct *vma) * * Poll implemention. */ -static unsigned int relay_file_poll(struct file *filp, poll_table *wait) +static __poll_t relay_file_poll(struct file *filp, poll_table *wait) { - unsigned int mask = 0; + __poll_t mask = 0; struct rchan_buf *buf = filp->private_data; if (buf->finalized) - return POLLERR; + return EPOLLERR; if (filp->f_mode & FMODE_READ) { poll_wait(filp, &buf->read_wait, wait); if (!relay_buf_empty(buf)) - mask |= POLLIN | POLLRDNORM; + mask |= EPOLLIN | EPOLLRDNORM; } return mask; diff --git a/kernel/resource.c b/kernel/resource.c index 54ba6de3757c..e270b5048988 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -1022,6 +1022,7 @@ static void __init __reserve_region_with_split(struct resource *root, struct resource *conflict; struct resource *res = alloc_resource(GFP_ATOMIC); struct resource *next_res = NULL; + int type = resource_type(root); if (!res) return; @@ -1029,7 +1030,7 @@ static void __init __reserve_region_with_split(struct resource *root, res->name = name; res->start = start; res->end = end; - res->flags = IORESOURCE_BUSY; + res->flags = type | IORESOURCE_BUSY; res->desc = IORES_DESC_NONE; while (1) { @@ -1064,7 +1065,7 @@ static void __init __reserve_region_with_split(struct resource *root, next_res->name = name; next_res->start = conflict->end + 1; next_res->end = end; - next_res->flags = IORESOURCE_BUSY; + next_res->flags = type | IORESOURCE_BUSY; next_res->desc = IORES_DESC_NONE; } } else { @@ -1478,7 +1479,7 @@ void __devm_release_region(struct device *dev, struct resource *parent, EXPORT_SYMBOL(__devm_release_region); /* - * Called from init/main.c to reserve IO ports. + * Reserve I/O ports or memory based on "reserve=" kernel parameter. */ #define MAXRESERVE 4 static int __init reserve_setup(char *str) @@ -1489,26 +1490,38 @@ static int __init reserve_setup(char *str) for (;;) { unsigned int io_start, io_num; int x = reserved; + struct resource *parent; - if (get_option (&str, &io_start) != 2) + if (get_option(&str, &io_start) != 2) break; - if (get_option (&str, &io_num) == 0) + if (get_option(&str, &io_num) == 0) break; if (x < MAXRESERVE) { struct resource *res = reserve + x; + + /* + * If the region starts below 0x10000, we assume it's + * I/O port space; otherwise assume it's memory. + */ + if (io_start < 0x10000) { + res->flags = IORESOURCE_IO; + parent = &ioport_resource; + } else { + res->flags = IORESOURCE_MEM; + parent = &iomem_resource; + } res->name = "reserved"; res->start = io_start; res->end = io_start + io_num - 1; - res->flags = IORESOURCE_BUSY; + res->flags |= IORESOURCE_BUSY; res->desc = IORES_DESC_NONE; res->child = NULL; - if (request_resource(res->start >= 0x10000 ? &iomem_resource : &ioport_resource, res) == 0) + if (request_resource(parent, res) == 0) reserved = x+1; } } return 1; } - __setup("reserve=", reserve_setup); /* @@ -1563,17 +1576,17 @@ static int strict_iomem_checks; /* * check if an address is reserved in the iomem resource tree - * returns 1 if reserved, 0 if not reserved. + * returns true if reserved, false if not reserved. */ -int iomem_is_exclusive(u64 addr) +bool iomem_is_exclusive(u64 addr) { struct resource *p = &iomem_resource; - int err = 0; + bool err = false; loff_t l; int size = PAGE_SIZE; if (!strict_iomem_checks) - return 0; + return false; addr = addr & PAGE_MASK; @@ -1596,7 +1609,7 @@ int iomem_is_exclusive(u64 addr) continue; if (IS_ENABLED(CONFIG_IO_STRICT_DEVMEM) || p->flags & IORESOURCE_EXCLUSIVE) { - err = 1; + err = true; break; } } diff --git a/kernel/sched/autogroup.c b/kernel/sched/autogroup.c index a43df5193538..bb4b9fe026a1 100644 --- a/kernel/sched/autogroup.c +++ b/kernel/sched/autogroup.c @@ -1,13 +1,12 @@ // SPDX-License-Identifier: GPL-2.0 -#include "sched.h" - #include <linux/proc_fs.h> #include <linux/seq_file.h> -#include <linux/kallsyms.h> #include <linux/utsname.h> #include <linux/security.h> #include <linux/export.h> +#include "sched.h" + unsigned int __read_mostly sysctl_sched_autogroup_enabled = 1; static struct autogroup autogroup_default; static atomic_t autogroup_seq_nr; diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c index 2ddaec40956f..0926aef10dad 100644 --- a/kernel/sched/completion.c +++ b/kernel/sched/completion.c @@ -34,11 +34,6 @@ void complete(struct completion *x) spin_lock_irqsave(&x->wait.lock, flags); - /* - * Perform commit of crossrelease here. - */ - complete_release_commit(x); - if (x->done != UINT_MAX) x->done++; __wake_up_locked(&x->wait, TASK_NORMAL, 1); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 75554f366fd3..e7c535eee0a6 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -508,7 +508,8 @@ void resched_cpu(int cpu) unsigned long flags; raw_spin_lock_irqsave(&rq->lock, flags); - resched_curr(rq); + if (cpu_online(cpu) || cpu == smp_processor_id()) + resched_curr(rq); raw_spin_unlock_irqrestore(&rq->lock, flags); } @@ -1629,16 +1630,16 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags) #ifdef CONFIG_SMP if (cpu == rq->cpu) { - schedstat_inc(rq->ttwu_local); - schedstat_inc(p->se.statistics.nr_wakeups_local); + __schedstat_inc(rq->ttwu_local); + __schedstat_inc(p->se.statistics.nr_wakeups_local); } else { struct sched_domain *sd; - schedstat_inc(p->se.statistics.nr_wakeups_remote); + __schedstat_inc(p->se.statistics.nr_wakeups_remote); rcu_read_lock(); for_each_domain(rq->cpu, sd) { if (cpumask_test_cpu(cpu, sched_domain_span(sd))) { - schedstat_inc(sd->ttwu_wake_remote); + __schedstat_inc(sd->ttwu_wake_remote); break; } } @@ -1646,14 +1647,14 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags) } if (wake_flags & WF_MIGRATED) - schedstat_inc(p->se.statistics.nr_wakeups_migrate); + __schedstat_inc(p->se.statistics.nr_wakeups_migrate); #endif /* CONFIG_SMP */ - schedstat_inc(rq->ttwu_count); - schedstat_inc(p->se.statistics.nr_wakeups); + __schedstat_inc(rq->ttwu_count); + __schedstat_inc(p->se.statistics.nr_wakeups); if (wake_flags & WF_SYNC) - schedstat_inc(p->se.statistics.nr_wakeups_sync); + __schedstat_inc(p->se.statistics.nr_wakeups_sync); } static inline void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags) @@ -2045,7 +2046,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) * If the owning (remote) CPU is still in the middle of schedule() with * this task as prev, wait until its done referencing the task. * - * Pairs with the smp_store_release() in finish_lock_switch(). + * Pairs with the smp_store_release() in finish_task(). * * This ensures that tasks getting woken will be fully ordered against * their previous state and preserve Program Order. @@ -2056,7 +2057,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) p->state = TASK_WAKING; if (p->in_iowait) { - delayacct_blkio_end(); + delayacct_blkio_end(p); atomic_dec(&task_rq(p)->nr_iowait); } @@ -2069,7 +2070,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) #else /* CONFIG_SMP */ if (p->in_iowait) { - delayacct_blkio_end(); + delayacct_blkio_end(p); atomic_dec(&task_rq(p)->nr_iowait); } @@ -2122,7 +2123,7 @@ static void try_to_wake_up_local(struct task_struct *p, struct rq_flags *rf) if (!task_on_rq_queued(p)) { if (p->in_iowait) { - delayacct_blkio_end(); + delayacct_blkio_end(p); atomic_dec(&rq->nr_iowait); } ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_NOCLOCK); @@ -2460,6 +2461,7 @@ void wake_up_new_task(struct task_struct *p) * Use __set_task_cpu() to avoid calling sched_class::migrate_task_rq, * as we're not fully set-up yet. */ + p->recent_used_cpu = task_cpu(p); __set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0)); #endif rq = __task_rq_lock(p, &rf); @@ -2571,6 +2573,62 @@ fire_sched_out_preempt_notifiers(struct task_struct *curr, #endif /* CONFIG_PREEMPT_NOTIFIERS */ +static inline void prepare_task(struct task_struct *next) +{ +#ifdef CONFIG_SMP + /* + * Claim the task as running, we do this before switching to it + * such that any running task will have this set. + */ + next->on_cpu = 1; +#endif +} + +static inline void finish_task(struct task_struct *prev) +{ +#ifdef CONFIG_SMP + /* + * After ->on_cpu is cleared, the task can be moved to a different CPU. + * We must ensure this doesn't happen until the switch is completely + * finished. + * + * In particular, the load of prev->state in finish_task_switch() must + * happen before this. + * + * Pairs with the smp_cond_load_acquire() in try_to_wake_up(). + */ + smp_store_release(&prev->on_cpu, 0); +#endif +} + +static inline void +prepare_lock_switch(struct rq *rq, struct task_struct *next, struct rq_flags *rf) +{ + /* + * Since the runqueue lock will be released by the next + * task (which is an invalid locking op but in the case + * of the scheduler it's an obvious special-case), so we + * do an early lockdep release here: + */ + rq_unpin_lock(rq, rf); + spin_release(&rq->lock.dep_map, 1, _THIS_IP_); +#ifdef CONFIG_DEBUG_SPINLOCK + /* this is a valid case when another task releases the spinlock */ + rq->lock.owner = next; +#endif +} + +static inline void finish_lock_switch(struct rq *rq) +{ + /* + * If we are tracking spinlock dependencies then we have to + * fix up the runqueue lock - which gets 'carried over' from + * prev into current: + */ + spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_); + raw_spin_unlock_irq(&rq->lock); +} + /** * prepare_task_switch - prepare to switch tasks * @rq: the runqueue preparing to switch @@ -2591,7 +2649,7 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev, sched_info_switch(rq, prev, next); perf_event_task_sched_out(prev, next); fire_sched_out_preempt_notifiers(prev, next); - prepare_lock_switch(rq, next); + prepare_task(next); prepare_arch_switch(next); } @@ -2646,29 +2704,34 @@ static struct rq *finish_task_switch(struct task_struct *prev) * the scheduled task must drop that reference. * * We must observe prev->state before clearing prev->on_cpu (in - * finish_lock_switch), otherwise a concurrent wakeup can get prev + * finish_task), otherwise a concurrent wakeup can get prev * running on another CPU and we could rave with its RUNNING -> DEAD * transition, resulting in a double drop. */ prev_state = prev->state; vtime_task_switch(prev); perf_event_task_sched_in(prev, current); - /* - * The membarrier system call requires a full memory barrier - * after storing to rq->curr, before going back to user-space. - * - * TODO: This smp_mb__after_unlock_lock can go away if PPC end - * up adding a full barrier to switch_mm(), or we should figure - * out if a smp_mb__after_unlock_lock is really the proper API - * to use. - */ - smp_mb__after_unlock_lock(); - finish_lock_switch(rq, prev); + finish_task(prev); + finish_lock_switch(rq); finish_arch_post_lock_switch(); fire_sched_in_preempt_notifiers(current); - if (mm) + /* + * When switching through a kernel thread, the loop in + * membarrier_{private,global}_expedited() may have observed that + * kernel thread and not issued an IPI. It is therefore possible to + * schedule between user->kernel->user threads without passing though + * switch_mm(). Membarrier requires a barrier after storing to + * rq->curr, before returning to userspace, so provide them here: + * + * - a full memory barrier for {PRIVATE,GLOBAL}_EXPEDITED, implicitly + * provided by mmdrop(), + * - a sync_core for SYNC_CORE. + */ + if (mm) { + membarrier_mm_sync_core_before_usermode(mm); mmdrop(mm); + } if (unlikely(prev_state == TASK_DEAD)) { if (prev->sched_class->task_dead) prev->sched_class->task_dead(prev); @@ -2772,6 +2835,13 @@ context_switch(struct rq *rq, struct task_struct *prev, */ arch_start_context_switch(prev); + /* + * If mm is non-NULL, we pass through switch_mm(). If mm is + * NULL, we will pass through mmdrop() in finish_task_switch(). + * Both of these contain the full memory barrier required by + * membarrier after storing to rq->curr, before returning to + * user-space. + */ if (!mm) { next->active_mm = oldmm; mmgrab(oldmm); @@ -2786,14 +2856,7 @@ context_switch(struct rq *rq, struct task_struct *prev, rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP); - /* - * Since the runqueue lock will be released by the next - * task (which is an invalid locking op but in the case - * of the scheduler it's an obvious special-case), so we - * do an early lockdep release here: - */ - rq_unpin_lock(rq, rf); - spin_release(&rq->lock.dep_map, 1, _THIS_IP_); + prepare_lock_switch(rq, next, rf); /* Here we just switch the register state and the stack. */ switch_to(prev, next, prev); @@ -3308,6 +3371,9 @@ static void __sched notrace __schedule(bool preempt) * Make sure that signal_pending_state()->signal_pending() below * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE) * done by the caller to avoid the race with signal_wake_up(). + * + * The membarrier system call requires a full memory barrier + * after coming from user-space, before storing to rq->curr. */ rq_lock(rq, &rf); smp_mb__after_spinlock(); @@ -3355,17 +3421,16 @@ static void __sched notrace __schedule(bool preempt) /* * The membarrier system call requires each architecture * to have a full memory barrier after updating - * rq->curr, before returning to user-space. For TSO - * (e.g. x86), the architecture must provide its own - * barrier in switch_mm(). For weakly ordered machines - * for which spin_unlock() acts as a full memory - * barrier, finish_lock_switch() in common code takes - * care of this barrier. For weakly ordered machines for - * which spin_unlock() acts as a RELEASE barrier (only - * arm64 and PowerPC), arm64 has a full barrier in - * switch_to(), and PowerPC has - * smp_mb__after_unlock_lock() before - * finish_lock_switch(). + * rq->curr, before returning to user-space. + * + * Here are the schemes providing that barrier on the + * various architectures: + * - mm ? switch_mm() : mmdrop() for x86, s390, sparc, PowerPC. + * switch_mm() rely on membarrier_arch_switch_mm() on PowerPC. + * - finish_lock_switch() for weakly-ordered + * architectures where spin_unlock is a full barrier, + * - switch_to() for arm64 (weakly-ordered, spin_unlock + * is a RELEASE barrier), */ ++*switch_count; @@ -4040,8 +4105,7 @@ recheck: return -EINVAL; } - if (attr->sched_flags & - ~(SCHED_FLAG_RESET_ON_FORK | SCHED_FLAG_RECLAIM)) + if (attr->sched_flags & ~(SCHED_FLAG_ALL | SCHED_FLAG_SUGOV)) return -EINVAL; /* @@ -4108,6 +4172,9 @@ recheck: } if (user) { + if (attr->sched_flags & SCHED_FLAG_SUGOV) + return -EINVAL; + retval = security_task_setscheduler(p); if (retval) return retval; @@ -4163,7 +4230,8 @@ change: } #endif #ifdef CONFIG_SMP - if (dl_bandwidth_enabled() && dl_policy(policy)) { + if (dl_bandwidth_enabled() && dl_policy(policy) && + !(attr->sched_flags & SCHED_FLAG_SUGOV)) { cpumask_t *span = rq->rd->span; /* @@ -4293,6 +4361,11 @@ int sched_setattr(struct task_struct *p, const struct sched_attr *attr) } EXPORT_SYMBOL_GPL(sched_setattr); +int sched_setattr_nocheck(struct task_struct *p, const struct sched_attr *attr) +{ + return __sched_setscheduler(p, attr, false, true); +} + /** * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace. * @p: the task in question. @@ -4799,7 +4872,7 @@ SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len, ret = sched_getaffinity(pid, mask); if (ret == 0) { - size_t retlen = min_t(size_t, len, cpumask_size()); + unsigned int retlen = min(len, cpumask_size()); if (copy_to_user(user_mask_ptr, mask, retlen)) ret = -EFAULT; @@ -5097,17 +5170,6 @@ SYSCALL_DEFINE1(sched_get_priority_min, int, policy) return ret; } -/** - * sys_sched_rr_get_interval - return the default timeslice of a process. - * @pid: pid of the process. - * @interval: userspace pointer to the timeslice value. - * - * this syscall writes the default timeslice value of a given process - * into the user-space timespec buffer. A value of '0' means infinity. - * - * Return: On success, 0 and the timeslice is in @interval. Otherwise, - * an error code. - */ static int sched_rr_get_interval(pid_t pid, struct timespec64 *t) { struct task_struct *p; @@ -5144,6 +5206,17 @@ out_unlock: return retval; } +/** + * sys_sched_rr_get_interval - return the default timeslice of a process. + * @pid: pid of the process. + * @interval: userspace pointer to the timeslice value. + * + * this syscall writes the default timeslice value of a given process + * into the user-space timespec buffer. A value of '0' means infinity. + * + * Return: On success, 0 and the timeslice is in @interval. Otherwise, + * an error code. + */ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, struct timespec __user *, interval) { diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 2f52ec0f1539..7936f548e071 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -19,8 +19,6 @@ #include "sched.h" -#define SUGOV_KTHREAD_PRIORITY 50 - struct sugov_tunables { struct gov_attr_set attr_set; unsigned int rate_limit_us; @@ -60,7 +58,8 @@ struct sugov_cpu { u64 last_update; /* The fields below are only needed when sharing a policy. */ - unsigned long util; + unsigned long util_cfs; + unsigned long util_dl; unsigned long max; unsigned int flags; @@ -176,21 +175,28 @@ static unsigned int get_next_freq(struct sugov_policy *sg_policy, return cpufreq_driver_resolve_freq(policy, freq); } -static void sugov_get_util(unsigned long *util, unsigned long *max, int cpu) +static void sugov_get_util(struct sugov_cpu *sg_cpu) { - struct rq *rq = cpu_rq(cpu); - unsigned long cfs_max; + struct rq *rq = cpu_rq(sg_cpu->cpu); - cfs_max = arch_scale_cpu_capacity(NULL, cpu); + sg_cpu->max = arch_scale_cpu_capacity(NULL, sg_cpu->cpu); + sg_cpu->util_cfs = cpu_util_cfs(rq); + sg_cpu->util_dl = cpu_util_dl(rq); +} - *util = min(rq->cfs.avg.util_avg, cfs_max); - *max = cfs_max; +static unsigned long sugov_aggregate_util(struct sugov_cpu *sg_cpu) +{ + /* + * Ideally we would like to set util_dl as min/guaranteed freq and + * util_cfs + util_dl as requested freq. However, cpufreq is not yet + * ready for such an interface. So, we only do the latter for now. + */ + return min(sg_cpu->util_cfs + sg_cpu->util_dl, sg_cpu->max); } -static void sugov_set_iowait_boost(struct sugov_cpu *sg_cpu, u64 time, - unsigned int flags) +static void sugov_set_iowait_boost(struct sugov_cpu *sg_cpu, u64 time) { - if (flags & SCHED_CPUFREQ_IOWAIT) { + if (sg_cpu->flags & SCHED_CPUFREQ_IOWAIT) { if (sg_cpu->iowait_boost_pending) return; @@ -244,7 +250,7 @@ static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, unsigned long *util, #ifdef CONFIG_NO_HZ_COMMON static bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu) { - unsigned long idle_calls = tick_nohz_get_idle_calls(); + unsigned long idle_calls = tick_nohz_get_idle_calls_cpu(sg_cpu->cpu); bool ret = idle_calls == sg_cpu->saved_idle_calls; sg_cpu->saved_idle_calls = idle_calls; @@ -264,7 +270,7 @@ static void sugov_update_single(struct update_util_data *hook, u64 time, unsigned int next_f; bool busy; - sugov_set_iowait_boost(sg_cpu, time, flags); + sugov_set_iowait_boost(sg_cpu, time); sg_cpu->last_update = time; if (!sugov_should_update_freq(sg_policy, time)) @@ -272,10 +278,12 @@ static void sugov_update_single(struct update_util_data *hook, u64 time, busy = sugov_cpu_is_busy(sg_cpu); - if (flags & SCHED_CPUFREQ_RT_DL) { + if (flags & SCHED_CPUFREQ_RT) { next_f = policy->cpuinfo.max_freq; } else { - sugov_get_util(&util, &max, sg_cpu->cpu); + sugov_get_util(sg_cpu); + max = sg_cpu->max; + util = sugov_aggregate_util(sg_cpu); sugov_iowait_boost(sg_cpu, &util, &max); next_f = get_next_freq(sg_policy, util, max); /* @@ -305,23 +313,27 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time) s64 delta_ns; /* - * If the CPU utilization was last updated before the previous - * frequency update and the time elapsed between the last update - * of the CPU utilization and the last frequency update is long - * enough, don't take the CPU into account as it probably is - * idle now (and clear iowait_boost for it). + * If the CFS CPU utilization was last updated before the + * previous frequency update and the time elapsed between the + * last update of the CPU utilization and the last frequency + * update is long enough, reset iowait_boost and util_cfs, as + * they are now probably stale. However, still consider the + * CPU contribution if it has some DEADLINE utilization + * (util_dl). */ delta_ns = time - j_sg_cpu->last_update; if (delta_ns > TICK_NSEC) { j_sg_cpu->iowait_boost = 0; j_sg_cpu->iowait_boost_pending = false; - continue; + j_sg_cpu->util_cfs = 0; + if (j_sg_cpu->util_dl == 0) + continue; } - if (j_sg_cpu->flags & SCHED_CPUFREQ_RT_DL) + if (j_sg_cpu->flags & SCHED_CPUFREQ_RT) return policy->cpuinfo.max_freq; - j_util = j_sg_cpu->util; j_max = j_sg_cpu->max; + j_util = sugov_aggregate_util(j_sg_cpu); if (j_util * max > j_max * util) { util = j_util; max = j_max; @@ -338,22 +350,18 @@ static void sugov_update_shared(struct update_util_data *hook, u64 time, { struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util); struct sugov_policy *sg_policy = sg_cpu->sg_policy; - unsigned long util, max; unsigned int next_f; - sugov_get_util(&util, &max, sg_cpu->cpu); - raw_spin_lock(&sg_policy->update_lock); - sg_cpu->util = util; - sg_cpu->max = max; + sugov_get_util(sg_cpu); sg_cpu->flags = flags; - sugov_set_iowait_boost(sg_cpu, time, flags); + sugov_set_iowait_boost(sg_cpu, time); sg_cpu->last_update = time; if (sugov_should_update_freq(sg_policy, time)) { - if (flags & SCHED_CPUFREQ_RT_DL) + if (flags & SCHED_CPUFREQ_RT) next_f = sg_policy->policy->cpuinfo.max_freq; else next_f = sugov_next_freq_shared(sg_cpu, time); @@ -383,9 +391,9 @@ static void sugov_irq_work(struct irq_work *irq_work) sg_policy = container_of(irq_work, struct sugov_policy, irq_work); /* - * For RT and deadline tasks, the schedutil governor shoots the - * frequency to maximum. Special care must be taken to ensure that this - * kthread doesn't result in the same behavior. + * For RT tasks, the schedutil governor shoots the frequency to maximum. + * Special care must be taken to ensure that this kthread doesn't result + * in the same behavior. * * This is (mostly) guaranteed by the work_in_progress flag. The flag is * updated only at the end of the sugov_work() function and before that @@ -470,7 +478,20 @@ static void sugov_policy_free(struct sugov_policy *sg_policy) static int sugov_kthread_create(struct sugov_policy *sg_policy) { struct task_struct *thread; - struct sched_param param = { .sched_priority = MAX_USER_RT_PRIO / 2 }; + struct sched_attr attr = { + .size = sizeof(struct sched_attr), + .sched_policy = SCHED_DEADLINE, + .sched_flags = SCHED_FLAG_SUGOV, + .sched_nice = 0, + .sched_priority = 0, + /* + * Fake (unused) bandwidth; workaround to "fix" + * priority inheritance. + */ + .sched_runtime = 1000000, + .sched_deadline = 10000000, + .sched_period = 10000000, + }; struct cpufreq_policy *policy = sg_policy->policy; int ret; @@ -488,10 +509,10 @@ static int sugov_kthread_create(struct sugov_policy *sg_policy) return PTR_ERR(thread); } - ret = sched_setscheduler_nocheck(thread, SCHED_FIFO, ¶m); + ret = sched_setattr_nocheck(thread, &attr); if (ret) { kthread_stop(thread); - pr_warn("%s: failed to set SCHED_FIFO\n", __func__); + pr_warn("%s: failed to set SCHED_DEADLINE\n", __func__); return ret; } @@ -655,7 +676,7 @@ static int sugov_start(struct cpufreq_policy *policy) memset(sg_cpu, 0, sizeof(*sg_cpu)); sg_cpu->cpu = cpu; sg_cpu->sg_policy = sg_policy; - sg_cpu->flags = SCHED_CPUFREQ_RT; + sg_cpu->flags = 0; sg_cpu->iowait_boost_max = policy->cpuinfo.max_freq; } diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 2473736c7616..9df09782025c 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -78,7 +78,7 @@ static inline int dl_bw_cpus(int i) #endif static inline -void add_running_bw(u64 dl_bw, struct dl_rq *dl_rq) +void __add_running_bw(u64 dl_bw, struct dl_rq *dl_rq) { u64 old = dl_rq->running_bw; @@ -86,10 +86,12 @@ void add_running_bw(u64 dl_bw, struct dl_rq *dl_rq) dl_rq->running_bw += dl_bw; SCHED_WARN_ON(dl_rq->running_bw < old); /* overflow */ SCHED_WARN_ON(dl_rq->running_bw > dl_rq->this_bw); + /* kick cpufreq (see the comment in kernel/sched/sched.h). */ + cpufreq_update_util(rq_of_dl_rq(dl_rq), SCHED_CPUFREQ_DL); } static inline -void sub_running_bw(u64 dl_bw, struct dl_rq *dl_rq) +void __sub_running_bw(u64 dl_bw, struct dl_rq *dl_rq) { u64 old = dl_rq->running_bw; @@ -98,10 +100,12 @@ void sub_running_bw(u64 dl_bw, struct dl_rq *dl_rq) SCHED_WARN_ON(dl_rq->running_bw > old); /* underflow */ if (dl_rq->running_bw > old) dl_rq->running_bw = 0; + /* kick cpufreq (see the comment in kernel/sched/sched.h). */ + cpufreq_update_util(rq_of_dl_rq(dl_rq), SCHED_CPUFREQ_DL); } static inline -void add_rq_bw(u64 dl_bw, struct dl_rq *dl_rq) +void __add_rq_bw(u64 dl_bw, struct dl_rq *dl_rq) { u64 old = dl_rq->this_bw; @@ -111,7 +115,7 @@ void add_rq_bw(u64 dl_bw, struct dl_rq *dl_rq) } static inline -void sub_rq_bw(u64 dl_bw, struct dl_rq *dl_rq) +void __sub_rq_bw(u64 dl_bw, struct dl_rq *dl_rq) { u64 old = dl_rq->this_bw; @@ -123,16 +127,46 @@ void sub_rq_bw(u64 dl_bw, struct dl_rq *dl_rq) SCHED_WARN_ON(dl_rq->running_bw > dl_rq->this_bw); } +static inline +void add_rq_bw(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) +{ + if (!dl_entity_is_special(dl_se)) + __add_rq_bw(dl_se->dl_bw, dl_rq); +} + +static inline +void sub_rq_bw(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) +{ + if (!dl_entity_is_special(dl_se)) + __sub_rq_bw(dl_se->dl_bw, dl_rq); +} + +static inline +void add_running_bw(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) +{ + if (!dl_entity_is_special(dl_se)) + __add_running_bw(dl_se->dl_bw, dl_rq); +} + +static inline +void sub_running_bw(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) +{ + if (!dl_entity_is_special(dl_se)) + __sub_running_bw(dl_se->dl_bw, dl_rq); +} + void dl_change_utilization(struct task_struct *p, u64 new_bw) { struct rq *rq; + BUG_ON(p->dl.flags & SCHED_FLAG_SUGOV); + if (task_on_rq_queued(p)) return; rq = task_rq(p); if (p->dl.dl_non_contending) { - sub_running_bw(p->dl.dl_bw, &rq->dl); + sub_running_bw(&p->dl, &rq->dl); p->dl.dl_non_contending = 0; /* * If the timer handler is currently running and the @@ -144,8 +178,8 @@ void dl_change_utilization(struct task_struct *p, u64 new_bw) if (hrtimer_try_to_cancel(&p->dl.inactive_timer) == 1) put_task_struct(p); } - sub_rq_bw(p->dl.dl_bw, &rq->dl); - add_rq_bw(new_bw, &rq->dl); + __sub_rq_bw(p->dl.dl_bw, &rq->dl); + __add_rq_bw(new_bw, &rq->dl); } /* @@ -217,6 +251,9 @@ static void task_non_contending(struct task_struct *p) if (dl_se->dl_runtime == 0) return; + if (dl_entity_is_special(dl_se)) + return; + WARN_ON(hrtimer_active(&dl_se->inactive_timer)); WARN_ON(dl_se->dl_non_contending); @@ -236,12 +273,12 @@ static void task_non_contending(struct task_struct *p) */ if (zerolag_time < 0) { if (dl_task(p)) - sub_running_bw(dl_se->dl_bw, dl_rq); + sub_running_bw(dl_se, dl_rq); if (!dl_task(p) || p->state == TASK_DEAD) { struct dl_bw *dl_b = dl_bw_of(task_cpu(p)); if (p->state == TASK_DEAD) - sub_rq_bw(p->dl.dl_bw, &rq->dl); + sub_rq_bw(&p->dl, &rq->dl); raw_spin_lock(&dl_b->lock); __dl_sub(dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p))); __dl_clear_params(p); @@ -268,7 +305,7 @@ static void task_contending(struct sched_dl_entity *dl_se, int flags) return; if (flags & ENQUEUE_MIGRATED) - add_rq_bw(dl_se->dl_bw, dl_rq); + add_rq_bw(dl_se, dl_rq); if (dl_se->dl_non_contending) { dl_se->dl_non_contending = 0; @@ -289,7 +326,7 @@ static void task_contending(struct sched_dl_entity *dl_se, int flags) * when the "inactive timer" fired). * So, add it back. */ - add_running_bw(dl_se->dl_bw, dl_rq); + add_running_bw(dl_se, dl_rq); } } @@ -1114,7 +1151,9 @@ static void update_curr_dl(struct rq *rq) { struct task_struct *curr = rq->curr; struct sched_dl_entity *dl_se = &curr->dl; - u64 delta_exec; + u64 delta_exec, scaled_delta_exec; + int cpu = cpu_of(rq); + u64 now; if (!dl_task(curr) || !on_dl_rq(dl_se)) return; @@ -1127,34 +1166,58 @@ static void update_curr_dl(struct rq *rq) * natural solution, but the full ramifications of this * approach need further study. */ - delta_exec = rq_clock_task(rq) - curr->se.exec_start; + now = rq_clock_task(rq); + delta_exec = now - curr->se.exec_start; if (unlikely((s64)delta_exec <= 0)) { if (unlikely(dl_se->dl_yielded)) goto throttle; return; } - /* kick cpufreq (see the comment in kernel/sched/sched.h). */ - cpufreq_update_util(rq, SCHED_CPUFREQ_DL); - schedstat_set(curr->se.statistics.exec_max, max(curr->se.statistics.exec_max, delta_exec)); curr->se.sum_exec_runtime += delta_exec; account_group_exec_runtime(curr, delta_exec); - curr->se.exec_start = rq_clock_task(rq); + curr->se.exec_start = now; cgroup_account_cputime(curr, delta_exec); sched_rt_avg_update(rq, delta_exec); - if (unlikely(dl_se->flags & SCHED_FLAG_RECLAIM)) - delta_exec = grub_reclaim(delta_exec, rq, &curr->dl); - dl_se->runtime -= delta_exec; + if (dl_entity_is_special(dl_se)) + return; + + /* + * For tasks that participate in GRUB, we implement GRUB-PA: the + * spare reclaimed bandwidth is used to clock down frequency. + * + * For the others, we still need to scale reservation parameters + * according to current frequency and CPU maximum capacity. + */ + if (unlikely(dl_se->flags & SCHED_FLAG_RECLAIM)) { + scaled_delta_exec = grub_reclaim(delta_exec, + rq, + &curr->dl); + } else { + unsigned long scale_freq = arch_scale_freq_capacity(cpu); + unsigned long scale_cpu = arch_scale_cpu_capacity(NULL, cpu); + + scaled_delta_exec = cap_scale(delta_exec, scale_freq); + scaled_delta_exec = cap_scale(scaled_delta_exec, scale_cpu); + } + + dl_se->runtime -= scaled_delta_exec; throttle: if (dl_runtime_exceeded(dl_se) || dl_se->dl_yielded) { dl_se->dl_throttled = 1; + + /* If requested, inform the user about runtime overruns. */ + if (dl_runtime_exceeded(dl_se) && + (dl_se->flags & SCHED_FLAG_DL_OVERRUN)) + dl_se->dl_overrun = 1; + __dequeue_task_dl(rq, curr, 0); if (unlikely(dl_se->dl_boosted || !start_dl_timer(curr))) enqueue_task_dl(rq, curr, ENQUEUE_REPLENISH); @@ -1204,8 +1267,8 @@ static enum hrtimer_restart inactive_task_timer(struct hrtimer *timer) struct dl_bw *dl_b = dl_bw_of(task_cpu(p)); if (p->state == TASK_DEAD && dl_se->dl_non_contending) { - sub_running_bw(p->dl.dl_bw, dl_rq_of_se(&p->dl)); - sub_rq_bw(p->dl.dl_bw, dl_rq_of_se(&p->dl)); + sub_running_bw(&p->dl, dl_rq_of_se(&p->dl)); + sub_rq_bw(&p->dl, dl_rq_of_se(&p->dl)); dl_se->dl_non_contending = 0; } @@ -1222,7 +1285,7 @@ static enum hrtimer_restart inactive_task_timer(struct hrtimer *timer) sched_clock_tick(); update_rq_clock(rq); - sub_running_bw(dl_se->dl_bw, &rq->dl); + sub_running_bw(dl_se, &rq->dl); dl_se->dl_non_contending = 0; unlock: task_rq_unlock(rq, p, &rf); @@ -1416,8 +1479,8 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) dl_check_constrained_dl(&p->dl); if (p->on_rq == TASK_ON_RQ_MIGRATING || flags & ENQUEUE_RESTORE) { - add_rq_bw(p->dl.dl_bw, &rq->dl); - add_running_bw(p->dl.dl_bw, &rq->dl); + add_rq_bw(&p->dl, &rq->dl); + add_running_bw(&p->dl, &rq->dl); } /* @@ -1457,8 +1520,8 @@ static void dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags) __dequeue_task_dl(rq, p, flags); if (p->on_rq == TASK_ON_RQ_MIGRATING || flags & DEQUEUE_SAVE) { - sub_running_bw(p->dl.dl_bw, &rq->dl); - sub_rq_bw(p->dl.dl_bw, &rq->dl); + sub_running_bw(&p->dl, &rq->dl); + sub_rq_bw(&p->dl, &rq->dl); } /* @@ -1564,7 +1627,7 @@ static void migrate_task_rq_dl(struct task_struct *p) */ raw_spin_lock(&rq->lock); if (p->dl.dl_non_contending) { - sub_running_bw(p->dl.dl_bw, &rq->dl); + sub_running_bw(&p->dl, &rq->dl); p->dl.dl_non_contending = 0; /* * If the timer handler is currently running and the @@ -1576,7 +1639,7 @@ static void migrate_task_rq_dl(struct task_struct *p) if (hrtimer_try_to_cancel(&p->dl.inactive_timer) == 1) put_task_struct(p); } - sub_rq_bw(p->dl.dl_bw, &rq->dl); + sub_rq_bw(&p->dl, &rq->dl); raw_spin_unlock(&rq->lock); } @@ -2019,11 +2082,11 @@ retry: } deactivate_task(rq, next_task, 0); - sub_running_bw(next_task->dl.dl_bw, &rq->dl); - sub_rq_bw(next_task->dl.dl_bw, &rq->dl); + sub_running_bw(&next_task->dl, &rq->dl); + sub_rq_bw(&next_task->dl, &rq->dl); set_task_cpu(next_task, later_rq->cpu); - add_rq_bw(next_task->dl.dl_bw, &later_rq->dl); - add_running_bw(next_task->dl.dl_bw, &later_rq->dl); + add_rq_bw(&next_task->dl, &later_rq->dl); + add_running_bw(&next_task->dl, &later_rq->dl); activate_task(later_rq, next_task, 0); ret = 1; @@ -2111,11 +2174,11 @@ static void pull_dl_task(struct rq *this_rq) resched = true; deactivate_task(src_rq, p, 0); - sub_running_bw(p->dl.dl_bw, &src_rq->dl); - sub_rq_bw(p->dl.dl_bw, &src_rq->dl); + sub_running_bw(&p->dl, &src_rq->dl); + sub_rq_bw(&p->dl, &src_rq->dl); set_task_cpu(p, this_cpu); - add_rq_bw(p->dl.dl_bw, &this_rq->dl); - add_running_bw(p->dl.dl_bw, &this_rq->dl); + add_rq_bw(&p->dl, &this_rq->dl); + add_running_bw(&p->dl, &this_rq->dl); activate_task(this_rq, p, 0); dmin = p->dl.deadline; @@ -2224,7 +2287,7 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p) task_non_contending(p); if (!task_on_rq_queued(p)) - sub_rq_bw(p->dl.dl_bw, &rq->dl); + sub_rq_bw(&p->dl, &rq->dl); /* * We cannot use inactive_task_timer() to invoke sub_running_bw() @@ -2256,7 +2319,7 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p) /* If p is not queued we will update its parameters at next wakeup. */ if (!task_on_rq_queued(p)) { - add_rq_bw(p->dl.dl_bw, &rq->dl); + add_rq_bw(&p->dl, &rq->dl); return; } @@ -2435,6 +2498,9 @@ int sched_dl_overflow(struct task_struct *p, int policy, u64 new_bw = dl_policy(policy) ? to_ratio(period, runtime) : 0; int cpus, err = -1; + if (attr->sched_flags & SCHED_FLAG_SUGOV) + return 0; + /* !deadline task may carry old deadline bandwidth */ if (new_bw == p->dl.dl_bw && task_has_dl_policy(p)) return 0; @@ -2521,6 +2587,10 @@ void __getparam_dl(struct task_struct *p, struct sched_attr *attr) */ bool __checkparam_dl(const struct sched_attr *attr) { + /* special dl tasks don't actually use any parameter */ + if (attr->sched_flags & SCHED_FLAG_SUGOV) + return true; + /* deadline != 0 */ if (attr->sched_deadline == 0) return false; @@ -2566,6 +2636,7 @@ void __dl_clear_params(struct task_struct *p) dl_se->dl_throttled = 0; dl_se->dl_yielded = 0; dl_se->dl_non_contending = 0; + dl_se->dl_overrun = 0; } bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 4037e19bbca2..5eb3ffc9be84 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -871,7 +871,7 @@ update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se) likely(wait_start > prev_wait_start)) wait_start -= prev_wait_start; - schedstat_set(se->statistics.wait_start, wait_start); + __schedstat_set(se->statistics.wait_start, wait_start); } static inline void @@ -893,17 +893,17 @@ update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) * time stamp can be adjusted to accumulate wait time * prior to migration. */ - schedstat_set(se->statistics.wait_start, delta); + __schedstat_set(se->statistics.wait_start, delta); return; } trace_sched_stat_wait(p, delta); } - schedstat_set(se->statistics.wait_max, + __schedstat_set(se->statistics.wait_max, max(schedstat_val(se->statistics.wait_max), delta)); - schedstat_inc(se->statistics.wait_count); - schedstat_add(se->statistics.wait_sum, delta); - schedstat_set(se->statistics.wait_start, 0); + __schedstat_inc(se->statistics.wait_count); + __schedstat_add(se->statistics.wait_sum, delta); + __schedstat_set(se->statistics.wait_start, 0); } static inline void @@ -928,10 +928,10 @@ update_stats_enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) delta = 0; if (unlikely(delta > schedstat_val(se->statistics.sleep_max))) - schedstat_set(se->statistics.sleep_max, delta); + __schedstat_set(se->statistics.sleep_max, delta); - schedstat_set(se->statistics.sleep_start, 0); - schedstat_add(se->statistics.sum_sleep_runtime, delta); + __schedstat_set(se->statistics.sleep_start, 0); + __schedstat_add(se->statistics.sum_sleep_runtime, delta); if (tsk) { account_scheduler_latency(tsk, delta >> 10, 1); @@ -945,15 +945,15 @@ update_stats_enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) delta = 0; if (unlikely(delta > schedstat_val(se->statistics.block_max))) - schedstat_set(se->statistics.block_max, delta); + __schedstat_set(se->statistics.block_max, delta); - schedstat_set(se->statistics.block_start, 0); - schedstat_add(se->statistics.sum_sleep_runtime, delta); + __schedstat_set(se->statistics.block_start, 0); + __schedstat_add(se->statistics.sum_sleep_runtime, delta); if (tsk) { if (tsk->in_iowait) { - schedstat_add(se->statistics.iowait_sum, delta); - schedstat_inc(se->statistics.iowait_count); + __schedstat_add(se->statistics.iowait_sum, delta); + __schedstat_inc(se->statistics.iowait_count); trace_sched_stat_iowait(tsk, delta); } @@ -1012,10 +1012,10 @@ update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) struct task_struct *tsk = task_of(se); if (tsk->state & TASK_INTERRUPTIBLE) - schedstat_set(se->statistics.sleep_start, + __schedstat_set(se->statistics.sleep_start, rq_clock(rq_of(cfs_rq))); if (tsk->state & TASK_UNINTERRUPTIBLE) - schedstat_set(se->statistics.block_start, + __schedstat_set(se->statistics.block_start, rq_clock(rq_of(cfs_rq))); } } @@ -3020,9 +3020,7 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq) /* * There are a few boundary cases this might miss but it should * get called often enough that that should (hopefully) not be - * a real problem -- added to that it only calls on the local - * CPU, so if we enqueue remotely we'll miss an update, but - * the next tick/schedule should update. + * a real problem. * * It will not get called when we go idle, because the idle * thread is a different class (!fair), nor will the utilization @@ -3091,8 +3089,6 @@ static u32 __accumulate_pelt_segments(u64 periods, u32 d1, u32 d3) return c1 + c2 + c3; } -#define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT) - /* * Accumulate the three separate parts of the sum; d1 the remainder * of the last (incomplete) period, d2 the span of full periods and d3 @@ -3122,7 +3118,7 @@ accumulate_sum(u64 delta, int cpu, struct sched_avg *sa, u32 contrib = (u32)delta; /* p == 0 -> delta < 1024 */ u64 periods; - scale_freq = arch_scale_freq_capacity(NULL, cpu); + scale_freq = arch_scale_freq_capacity(cpu); scale_cpu = arch_scale_cpu_capacity(NULL, cpu); delta += sa->period_contrib; @@ -3413,9 +3409,9 @@ void set_task_rq_fair(struct sched_entity *se, * _IFF_ we look at the pure running and runnable sums. Because they * represent the very same entity, just at different points in the hierarchy. * - * - * Per the above update_tg_cfs_util() is trivial (and still 'wrong') and - * simply copies the running sum over. + * Per the above update_tg_cfs_util() is trivial and simply copies the running + * sum over (but still wrong, because the group entity and group rq do not have + * their PELT windows aligned). * * However, update_tg_cfs_runnable() is more complex. So we have: * @@ -3424,11 +3420,11 @@ void set_task_rq_fair(struct sched_entity *se, * And since, like util, the runnable part should be directly transferable, * the following would _appear_ to be the straight forward approach: * - * grq->avg.load_avg = grq->load.weight * grq->avg.running_avg (3) + * grq->avg.load_avg = grq->load.weight * grq->avg.runnable_avg (3) * * And per (1) we have: * - * ge->avg.running_avg == grq->avg.running_avg + * ge->avg.runnable_avg == grq->avg.runnable_avg * * Which gives: * @@ -3447,27 +3443,28 @@ void set_task_rq_fair(struct sched_entity *se, * to (shortly) return to us. This only works by keeping the weights as * integral part of the sum. We therefore cannot decompose as per (3). * - * OK, so what then? + * Another reason this doesn't work is that runnable isn't a 0-sum entity. + * Imagine a rq with 2 tasks that each are runnable 2/3 of the time. Then the + * rq itself is runnable anywhere between 2/3 and 1 depending on how the + * runnable section of these tasks overlap (or not). If they were to perfectly + * align the rq as a whole would be runnable 2/3 of the time. If however we + * always have at least 1 runnable task, the rq as a whole is always runnable. * + * So we'll have to approximate.. :/ * - * Another way to look at things is: + * Given the constraint: * - * grq->avg.load_avg = \Sum se->avg.load_avg + * ge->avg.running_sum <= ge->avg.runnable_sum <= LOAD_AVG_MAX * - * Therefore, per (2): + * We can construct a rule that adds runnable to a rq by assuming minimal + * overlap. * - * grq->avg.load_avg = \Sum se->load.weight * se->avg.runnable_avg + * On removal, we'll assume each task is equally runnable; which yields: * - * And the very thing we're propagating is a change in that sum (someone - * joined/left). So we can easily know the runnable change, which would be, per - * (2) the already tracked se->load_avg divided by the corresponding - * se->weight. + * grq->avg.runnable_sum = grq->avg.load_sum / grq->load.weight * - * Basically (4) but in differential form: + * XXX: only do this for the part of runnable > running ? * - * d(runnable_avg) += se->avg.load_avg / se->load.weight - * (5) - * ge->avg.load_avg += ge->load.weight * d(runnable_avg) */ static inline void @@ -3479,6 +3476,14 @@ update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq if (!delta) return; + /* + * The relation between sum and avg is: + * + * LOAD_AVG_MAX - 1024 + sa->period_contrib + * + * however, the PELT windows are not aligned between grq and gse. + */ + /* Set new sched_entity's utilization */ se->avg.util_avg = gcfs_rq->avg.util_avg; se->avg.util_sum = se->avg.util_avg * LOAD_AVG_MAX; @@ -3491,33 +3496,68 @@ update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq static inline void update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq) { - long runnable_sum = gcfs_rq->prop_runnable_sum; - long runnable_load_avg, load_avg; - s64 runnable_load_sum, load_sum; + long delta_avg, running_sum, runnable_sum = gcfs_rq->prop_runnable_sum; + unsigned long runnable_load_avg, load_avg; + u64 runnable_load_sum, load_sum = 0; + s64 delta_sum; if (!runnable_sum) return; gcfs_rq->prop_runnable_sum = 0; + if (runnable_sum >= 0) { + /* + * Add runnable; clip at LOAD_AVG_MAX. Reflects that until + * the CPU is saturated running == runnable. + */ + runnable_sum += se->avg.load_sum; + runnable_sum = min(runnable_sum, (long)LOAD_AVG_MAX); + } else { + /* + * Estimate the new unweighted runnable_sum of the gcfs_rq by + * assuming all tasks are equally runnable. + */ + if (scale_load_down(gcfs_rq->load.weight)) { + load_sum = div_s64(gcfs_rq->avg.load_sum, + scale_load_down(gcfs_rq->load.weight)); + } + + /* But make sure to not inflate se's runnable */ + runnable_sum = min(se->avg.load_sum, load_sum); + } + + /* + * runnable_sum can't be lower than running_sum + * As running sum is scale with cpu capacity wehreas the runnable sum + * is not we rescale running_sum 1st + */ + running_sum = se->avg.util_sum / + arch_scale_cpu_capacity(NULL, cpu_of(rq_of(cfs_rq))); + runnable_sum = max(runnable_sum, running_sum); + load_sum = (s64)se_weight(se) * runnable_sum; load_avg = div_s64(load_sum, LOAD_AVG_MAX); - add_positive(&se->avg.load_sum, runnable_sum); - add_positive(&se->avg.load_avg, load_avg); + delta_sum = load_sum - (s64)se_weight(se) * se->avg.load_sum; + delta_avg = load_avg - se->avg.load_avg; - add_positive(&cfs_rq->avg.load_avg, load_avg); - add_positive(&cfs_rq->avg.load_sum, load_sum); + se->avg.load_sum = runnable_sum; + se->avg.load_avg = load_avg; + add_positive(&cfs_rq->avg.load_avg, delta_avg); + add_positive(&cfs_rq->avg.load_sum, delta_sum); runnable_load_sum = (s64)se_runnable(se) * runnable_sum; runnable_load_avg = div_s64(runnable_load_sum, LOAD_AVG_MAX); + delta_sum = runnable_load_sum - se_weight(se) * se->avg.runnable_load_sum; + delta_avg = runnable_load_avg - se->avg.runnable_load_avg; - add_positive(&se->avg.runnable_load_sum, runnable_sum); - add_positive(&se->avg.runnable_load_avg, runnable_load_avg); + se->avg.runnable_load_sum = runnable_sum; + se->avg.runnable_load_avg = runnable_load_avg; if (se->on_rq) { - add_positive(&cfs_rq->avg.runnable_load_avg, runnable_load_avg); - add_positive(&cfs_rq->avg.runnable_load_sum, runnable_load_sum); + add_positive(&cfs_rq->avg.runnable_load_avg, delta_avg); + add_positive(&cfs_rq->avg.runnable_load_sum, delta_sum); } } @@ -4321,12 +4361,12 @@ static inline bool cfs_bandwidth_used(void) void cfs_bandwidth_usage_inc(void) { - static_key_slow_inc(&__cfs_bandwidth_used); + static_key_slow_inc_cpuslocked(&__cfs_bandwidth_used); } void cfs_bandwidth_usage_dec(void) { - static_key_slow_dec(&__cfs_bandwidth_used); + static_key_slow_dec_cpuslocked(&__cfs_bandwidth_used); } #else /* HAVE_JUMP_LABEL */ static bool cfs_bandwidth_used(void) @@ -5645,28 +5685,38 @@ static int wake_wide(struct task_struct *p) * soonest. For the purpose of speed we only consider the waking and previous * CPU. * - * wake_affine_idle() - only considers 'now', it check if the waking CPU is (or - * will be) idle. + * wake_affine_idle() - only considers 'now', it check if the waking CPU is + * cache-affine and is (or will be) idle. * * wake_affine_weight() - considers the weight to reflect the average * scheduling latency of the CPUs. This seems to work * for the overloaded case. */ - -static bool -wake_affine_idle(struct sched_domain *sd, struct task_struct *p, - int this_cpu, int prev_cpu, int sync) +static int +wake_affine_idle(int this_cpu, int prev_cpu, int sync) { - if (idle_cpu(this_cpu)) - return true; + /* + * If this_cpu is idle, it implies the wakeup is from interrupt + * context. Only allow the move if cache is shared. Otherwise an + * interrupt intensive workload could force all tasks onto one + * node depending on the IO topology or IRQ affinity settings. + * + * If the prev_cpu is idle and cache affine then avoid a migration. + * There is no guarantee that the cache hot data from an interrupt + * is more important than cache hot data on the prev_cpu and from + * a cpufreq perspective, it's better to have higher utilisation + * on one CPU. + */ + if (idle_cpu(this_cpu) && cpus_share_cache(this_cpu, prev_cpu)) + return idle_cpu(prev_cpu) ? prev_cpu : this_cpu; if (sync && cpu_rq(this_cpu)->nr_running == 1) - return true; + return this_cpu; - return false; + return nr_cpumask_bits; } -static bool +static int wake_affine_weight(struct sched_domain *sd, struct task_struct *p, int this_cpu, int prev_cpu, int sync) { @@ -5680,7 +5730,7 @@ wake_affine_weight(struct sched_domain *sd, struct task_struct *p, unsigned long current_load = task_h_load(current); if (current_load > this_eff_load) - return true; + return this_cpu; this_eff_load -= current_load; } @@ -5697,36 +5747,36 @@ wake_affine_weight(struct sched_domain *sd, struct task_struct *p, prev_eff_load *= 100 + (sd->imbalance_pct - 100) / 2; prev_eff_load *= capacity_of(this_cpu); - return this_eff_load <= prev_eff_load; + return this_eff_load <= prev_eff_load ? this_cpu : nr_cpumask_bits; } static int wake_affine(struct sched_domain *sd, struct task_struct *p, int prev_cpu, int sync) { int this_cpu = smp_processor_id(); - bool affine = false; + int target = nr_cpumask_bits; - if (sched_feat(WA_IDLE) && !affine) - affine = wake_affine_idle(sd, p, this_cpu, prev_cpu, sync); + if (sched_feat(WA_IDLE)) + target = wake_affine_idle(this_cpu, prev_cpu, sync); - if (sched_feat(WA_WEIGHT) && !affine) - affine = wake_affine_weight(sd, p, this_cpu, prev_cpu, sync); + if (sched_feat(WA_WEIGHT) && target == nr_cpumask_bits) + target = wake_affine_weight(sd, p, this_cpu, prev_cpu, sync); schedstat_inc(p->se.statistics.nr_wakeups_affine_attempts); - if (affine) { - schedstat_inc(sd->ttwu_move_affine); - schedstat_inc(p->se.statistics.nr_wakeups_affine); - } + if (target == nr_cpumask_bits) + return prev_cpu; - return affine; + schedstat_inc(sd->ttwu_move_affine); + schedstat_inc(p->se.statistics.nr_wakeups_affine); + return target; } -static inline int task_util(struct task_struct *p); -static int cpu_util_wake(int cpu, struct task_struct *p); +static inline unsigned long task_util(struct task_struct *p); +static unsigned long cpu_util_wake(int cpu, struct task_struct *p); static unsigned long capacity_spare_wake(int cpu, struct task_struct *p) { - return capacity_orig_of(cpu) - cpu_util_wake(cpu, p); + return max_t(long, capacity_of(cpu) - cpu_util_wake(cpu, p), 0); } /* @@ -5906,7 +5956,7 @@ find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this } } else if (shallowest_idle_cpu == -1) { load = weighted_cpuload(cpu_rq(i)); - if (load < min_load || (load == min_load && i == this_cpu)) { + if (load < min_load) { min_load = load; least_loaded_cpu = i; } @@ -6147,7 +6197,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t static int select_idle_sibling(struct task_struct *p, int prev, int target) { struct sched_domain *sd; - int i; + int i, recent_used_cpu; if (idle_cpu(target)) return target; @@ -6158,6 +6208,21 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) if (prev != target && cpus_share_cache(prev, target) && idle_cpu(prev)) return prev; + /* Check a recently used CPU as a potential idle candidate */ + recent_used_cpu = p->recent_used_cpu; + if (recent_used_cpu != prev && + recent_used_cpu != target && + cpus_share_cache(recent_used_cpu, target) && + idle_cpu(recent_used_cpu) && + cpumask_test_cpu(p->recent_used_cpu, &p->cpus_allowed)) { + /* + * Replace recent_used_cpu with prev as it is a potential + * candidate for the next wake. + */ + p->recent_used_cpu = prev; + return recent_used_cpu; + } + sd = rcu_dereference(per_cpu(sd_llc, target)); if (!sd) return target; @@ -6203,7 +6268,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) * capacity_orig) as it useful for predicting the capacity required after task * migrations (scheduler-driven DVFS). */ -static int cpu_util(int cpu) +static unsigned long cpu_util(int cpu) { unsigned long util = cpu_rq(cpu)->cfs.avg.util_avg; unsigned long capacity = capacity_orig_of(cpu); @@ -6211,7 +6276,7 @@ static int cpu_util(int cpu) return (util >= capacity) ? capacity : util; } -static inline int task_util(struct task_struct *p) +static inline unsigned long task_util(struct task_struct *p) { return p->se.avg.util_avg; } @@ -6220,7 +6285,7 @@ static inline int task_util(struct task_struct *p) * cpu_util_wake: Compute cpu utilization with any contributions from * the waking task p removed. */ -static int cpu_util_wake(int cpu, struct task_struct *p) +static unsigned long cpu_util_wake(int cpu, struct task_struct *p) { unsigned long util, capacity; @@ -6311,8 +6376,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f if (cpu == prev_cpu) goto pick_cpu; - if (wake_affine(affine_sd, p, prev_cpu, sync)) - new_cpu = cpu; + new_cpu = wake_affine(affine_sd, p, prev_cpu, sync); } if (sd && !(sd_flag & SD_BALANCE_FORK)) { @@ -6326,9 +6390,12 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f if (!sd) { pick_cpu: - if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */ + if (sd_flag & SD_BALANCE_WAKE) { /* XXX always ? */ new_cpu = select_idle_sibling(p, prev_cpu, new_cpu); + if (want_affine) + current->recent_used_cpu = cpu; + } } else { new_cpu = find_idlest_cpu(sd, p, cpu, prev_cpu, sd_flag); } @@ -6405,8 +6472,7 @@ static void task_dead_fair(struct task_struct *p) } #endif /* CONFIG_SMP */ -static unsigned long -wakeup_gran(struct sched_entity *curr, struct sched_entity *se) +static unsigned long wakeup_gran(struct sched_entity *se) { unsigned long gran = sysctl_sched_wakeup_granularity; @@ -6448,7 +6514,7 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se) if (vdiff <= 0) return -1; - gran = wakeup_gran(curr, se); + gran = wakeup_gran(se); if (vdiff > gran) return 1; diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c index dd7908743dab..5d0762633639 100644 --- a/kernel/sched/membarrier.c +++ b/kernel/sched/membarrier.c @@ -26,24 +26,110 @@ * Bitmask made from a "or" of all commands within enum membarrier_cmd, * except MEMBARRIER_CMD_QUERY. */ +#ifdef CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE +#define MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK \ + (MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE \ + | MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE) +#else +#define MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK 0 +#endif + #define MEMBARRIER_CMD_BITMASK \ - (MEMBARRIER_CMD_SHARED | MEMBARRIER_CMD_PRIVATE_EXPEDITED \ - | MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED) + (MEMBARRIER_CMD_GLOBAL | MEMBARRIER_CMD_GLOBAL_EXPEDITED \ + | MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED \ + | MEMBARRIER_CMD_PRIVATE_EXPEDITED \ + | MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED \ + | MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK) static void ipi_mb(void *info) { smp_mb(); /* IPIs should be serializing but paranoid. */ } -static int membarrier_private_expedited(void) +static int membarrier_global_expedited(void) { int cpu; bool fallback = false; cpumask_var_t tmpmask; - if (!(atomic_read(¤t->mm->membarrier_state) - & MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY)) - return -EPERM; + if (num_online_cpus() == 1) + return 0; + + /* + * Matches memory barriers around rq->curr modification in + * scheduler. + */ + smp_mb(); /* system call entry is not a mb. */ + + /* + * Expedited membarrier commands guarantee that they won't + * block, hence the GFP_NOWAIT allocation flag and fallback + * implementation. + */ + if (!zalloc_cpumask_var(&tmpmask, GFP_NOWAIT)) { + /* Fallback for OOM. */ + fallback = true; + } + + cpus_read_lock(); + for_each_online_cpu(cpu) { + struct task_struct *p; + + /* + * Skipping the current CPU is OK even through we can be + * migrated at any point. The current CPU, at the point + * where we read raw_smp_processor_id(), is ensured to + * be in program order with respect to the caller + * thread. Therefore, we can skip this CPU from the + * iteration. + */ + if (cpu == raw_smp_processor_id()) + continue; + rcu_read_lock(); + p = task_rcu_dereference(&cpu_rq(cpu)->curr); + if (p && p->mm && (atomic_read(&p->mm->membarrier_state) & + MEMBARRIER_STATE_GLOBAL_EXPEDITED)) { + if (!fallback) + __cpumask_set_cpu(cpu, tmpmask); + else + smp_call_function_single(cpu, ipi_mb, NULL, 1); + } + rcu_read_unlock(); + } + if (!fallback) { + preempt_disable(); + smp_call_function_many(tmpmask, ipi_mb, NULL, 1); + preempt_enable(); + free_cpumask_var(tmpmask); + } + cpus_read_unlock(); + + /* + * Memory barrier on the caller thread _after_ we finished + * waiting for the last IPI. Matches memory barriers around + * rq->curr modification in scheduler. + */ + smp_mb(); /* exit from system call is not a mb */ + return 0; +} + +static int membarrier_private_expedited(int flags) +{ + int cpu; + bool fallback = false; + cpumask_var_t tmpmask; + + if (flags & MEMBARRIER_FLAG_SYNC_CORE) { + if (!IS_ENABLED(CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE)) + return -EINVAL; + if (!(atomic_read(¤t->mm->membarrier_state) & + MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY)) + return -EPERM; + } else { + if (!(atomic_read(¤t->mm->membarrier_state) & + MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY)) + return -EPERM; + } if (num_online_cpus() == 1) return 0; @@ -89,7 +175,9 @@ static int membarrier_private_expedited(void) rcu_read_unlock(); } if (!fallback) { + preempt_disable(); smp_call_function_many(tmpmask, ipi_mb, NULL, 1); + preempt_enable(); free_cpumask_var(tmpmask); } cpus_read_unlock(); @@ -103,21 +191,69 @@ static int membarrier_private_expedited(void) return 0; } -static void membarrier_register_private_expedited(void) +static int membarrier_register_global_expedited(void) { struct task_struct *p = current; struct mm_struct *mm = p->mm; + if (atomic_read(&mm->membarrier_state) & + MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY) + return 0; + atomic_or(MEMBARRIER_STATE_GLOBAL_EXPEDITED, &mm->membarrier_state); + if (atomic_read(&mm->mm_users) == 1 && get_nr_threads(p) == 1) { + /* + * For single mm user, single threaded process, we can + * simply issue a memory barrier after setting + * MEMBARRIER_STATE_GLOBAL_EXPEDITED to guarantee that + * no memory access following registration is reordered + * before registration. + */ + smp_mb(); + } else { + /* + * For multi-mm user threads, we need to ensure all + * future scheduler executions will observe the new + * thread flag state for this mm. + */ + synchronize_sched(); + } + atomic_or(MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY, + &mm->membarrier_state); + return 0; +} + +static int membarrier_register_private_expedited(int flags) +{ + struct task_struct *p = current; + struct mm_struct *mm = p->mm; + int state = MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY; + + if (flags & MEMBARRIER_FLAG_SYNC_CORE) { + if (!IS_ENABLED(CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE)) + return -EINVAL; + state = MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY; + } + /* * We need to consider threads belonging to different thread * groups, which use the same mm. (CLONE_VM but not * CLONE_THREAD). */ - if (atomic_read(&mm->membarrier_state) - & MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY) - return; - atomic_or(MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY, - &mm->membarrier_state); + if (atomic_read(&mm->membarrier_state) & state) + return 0; + atomic_or(MEMBARRIER_STATE_PRIVATE_EXPEDITED, &mm->membarrier_state); + if (flags & MEMBARRIER_FLAG_SYNC_CORE) + atomic_or(MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE, + &mm->membarrier_state); + if (!(atomic_read(&mm->mm_users) == 1 && get_nr_threads(p) == 1)) { + /* + * Ensure all future scheduler executions will observe the + * new thread flag state for this process. + */ + synchronize_sched(); + } + atomic_or(state, &mm->membarrier_state); + return 0; } /** @@ -157,21 +293,28 @@ SYSCALL_DEFINE2(membarrier, int, cmd, int, flags) int cmd_mask = MEMBARRIER_CMD_BITMASK; if (tick_nohz_full_enabled()) - cmd_mask &= ~MEMBARRIER_CMD_SHARED; + cmd_mask &= ~MEMBARRIER_CMD_GLOBAL; return cmd_mask; } - case MEMBARRIER_CMD_SHARED: - /* MEMBARRIER_CMD_SHARED is not compatible with nohz_full. */ + case MEMBARRIER_CMD_GLOBAL: + /* MEMBARRIER_CMD_GLOBAL is not compatible with nohz_full. */ if (tick_nohz_full_enabled()) return -EINVAL; if (num_online_cpus() > 1) synchronize_sched(); return 0; + case MEMBARRIER_CMD_GLOBAL_EXPEDITED: + return membarrier_global_expedited(); + case MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED: + return membarrier_register_global_expedited(); case MEMBARRIER_CMD_PRIVATE_EXPEDITED: - return membarrier_private_expedited(); + return membarrier_private_expedited(0); case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED: - membarrier_register_private_expedited(); - return 0; + return membarrier_register_private_expedited(0); + case MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE: + return membarrier_private_expedited(MEMBARRIER_FLAG_SYNC_CORE); + case MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE: + return membarrier_register_private_expedited(MEMBARRIER_FLAG_SYNC_CORE); default: return -EINVAL; } diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 4056c19ca3f0..aad49451584e 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -951,11 +951,13 @@ static void update_curr_rt(struct rq *rq) struct task_struct *curr = rq->curr; struct sched_rt_entity *rt_se = &curr->rt; u64 delta_exec; + u64 now; if (curr->sched_class != &rt_sched_class) return; - delta_exec = rq_clock_task(rq) - curr->se.exec_start; + now = rq_clock_task(rq); + delta_exec = now - curr->se.exec_start; if (unlikely((s64)delta_exec <= 0)) return; @@ -968,7 +970,7 @@ static void update_curr_rt(struct rq *rq) curr->se.sum_exec_runtime += delta_exec; account_group_exec_runtime(curr, delta_exec); - curr->se.exec_start = rq_clock_task(rq); + curr->se.exec_start = now; cgroup_account_cputime(curr, delta_exec); sched_rt_avg_update(rq, delta_exec); @@ -1907,9 +1909,8 @@ static void push_rt_tasks(struct rq *rq) * the rt_loop_next will cause the iterator to perform another scan. * */ -static int rto_next_cpu(struct rq *rq) +static int rto_next_cpu(struct root_domain *rd) { - struct root_domain *rd = rq->rd; int next; int cpu; @@ -1985,19 +1986,24 @@ static void tell_cpu_to_push(struct rq *rq) * Otherwise it is finishing up and an ipi needs to be sent. */ if (rq->rd->rto_cpu < 0) - cpu = rto_next_cpu(rq); + cpu = rto_next_cpu(rq->rd); raw_spin_unlock(&rq->rd->rto_lock); rto_start_unlock(&rq->rd->rto_loop_start); - if (cpu >= 0) + if (cpu >= 0) { + /* Make sure the rd does not get freed while pushing */ + sched_get_rd(rq->rd); irq_work_queue_on(&rq->rd->rto_push_work, cpu); + } } /* Called from hardirq context */ void rto_push_irq_work_func(struct irq_work *work) { + struct root_domain *rd = + container_of(work, struct root_domain, rto_push_work); struct rq *rq; int cpu; @@ -2013,18 +2019,20 @@ void rto_push_irq_work_func(struct irq_work *work) raw_spin_unlock(&rq->lock); } - raw_spin_lock(&rq->rd->rto_lock); + raw_spin_lock(&rd->rto_lock); /* Pass the IPI to the next rt overloaded queue */ - cpu = rto_next_cpu(rq); + cpu = rto_next_cpu(rd); - raw_spin_unlock(&rq->rd->rto_lock); + raw_spin_unlock(&rd->rto_lock); - if (cpu < 0) + if (cpu < 0) { + sched_put_rd(rd); return; + } /* Try the next RT overloaded CPU */ - irq_work_queue_on(&rq->rd->rto_push_work, cpu); + irq_work_queue_on(&rd->rto_push_work, cpu); } #endif /* HAVE_RT_PUSH_IPI */ @@ -2034,8 +2042,9 @@ static void pull_rt_task(struct rq *this_rq) bool resched = false; struct task_struct *p; struct rq *src_rq; + int rt_overload_count = rt_overloaded(this_rq); - if (likely(!rt_overloaded(this_rq))) + if (likely(!rt_overload_count)) return; /* @@ -2044,6 +2053,11 @@ static void pull_rt_task(struct rq *this_rq) */ smp_rmb(); + /* If we are the only overloaded CPU do nothing */ + if (rt_overload_count == 1 && + cpumask_test_cpu(this_rq->cpu, this_rq->rd->rto_mask)) + return; + #ifdef HAVE_RT_PUSH_IPI if (sched_feat(RT_PUSH_IPI)) { tell_cpu_to_push(this_rq); @@ -2206,7 +2220,7 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p) if (p->nr_cpus_allowed > 1 && rq->rt.overloaded) queue_push_tasks(rq); #endif /* CONFIG_SMP */ - if (p->prio < rq->curr->prio) + if (p->prio < rq->curr->prio && cpu_online(cpu_of(rq))) resched_curr(rq); } } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index b19552a212de..fb5fc458547f 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -156,13 +156,39 @@ static inline int task_has_dl_policy(struct task_struct *p) return dl_policy(p->policy); } +#define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT) + +/* + * !! For sched_setattr_nocheck() (kernel) only !! + * + * This is actually gross. :( + * + * It is used to make schedutil kworker(s) higher priority than SCHED_DEADLINE + * tasks, but still be able to sleep. We need this on platforms that cannot + * atomically change clock frequency. Remove once fast switching will be + * available on such platforms. + * + * SUGOV stands for SchedUtil GOVernor. + */ +#define SCHED_FLAG_SUGOV 0x10000000 + +static inline bool dl_entity_is_special(struct sched_dl_entity *dl_se) +{ +#ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL + return unlikely(dl_se->flags & SCHED_FLAG_SUGOV); +#else + return false; +#endif +} + /* * Tells if entity @a should preempt entity @b. */ static inline bool dl_entity_preempt(struct sched_dl_entity *a, struct sched_dl_entity *b) { - return dl_time_before(a->deadline, b->deadline); + return dl_entity_is_special(a) || + dl_time_before(a->deadline, b->deadline); } /* @@ -665,6 +691,8 @@ extern struct mutex sched_domains_mutex; extern void init_defrootdomain(void); extern int sched_init_domains(const struct cpumask *cpu_map); extern void rq_attach_root(struct rq *rq, struct root_domain *rd); +extern void sched_get_rd(struct root_domain *rd); +extern void sched_put_rd(struct root_domain *rd); #ifdef HAVE_RT_PUSH_IPI extern void rto_push_irq_work_func(struct irq_work *work); @@ -1328,47 +1356,6 @@ static inline int task_on_rq_migrating(struct task_struct *p) # define finish_arch_post_lock_switch() do { } while (0) #endif -static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) -{ -#ifdef CONFIG_SMP - /* - * We can optimise this out completely for !SMP, because the - * SMP rebalancing from interrupt is the only thing that cares - * here. - */ - next->on_cpu = 1; -#endif -} - -static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) -{ -#ifdef CONFIG_SMP - /* - * After ->on_cpu is cleared, the task can be moved to a different CPU. - * We must ensure this doesn't happen until the switch is completely - * finished. - * - * In particular, the load of prev->state in finish_task_switch() must - * happen before this. - * - * Pairs with the smp_cond_load_acquire() in try_to_wake_up(). - */ - smp_store_release(&prev->on_cpu, 0); -#endif -#ifdef CONFIG_DEBUG_SPINLOCK - /* this is a valid case when another task releases the spinlock */ - rq->lock.owner = current; -#endif - /* - * If we are tracking spinlock dependencies then we have to - * fix up the runqueue lock - which gets 'carried over' from - * prev into current: - */ - spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_); - - raw_spin_unlock_irq(&rq->lock); -} - /* * wake flags */ @@ -1687,17 +1674,17 @@ static inline int hrtick_enabled(struct rq *rq) #endif /* CONFIG_SCHED_HRTICK */ -#ifdef CONFIG_SMP -extern void sched_avg_update(struct rq *rq); - #ifndef arch_scale_freq_capacity static __always_inline -unsigned long arch_scale_freq_capacity(struct sched_domain *sd, int cpu) +unsigned long arch_scale_freq_capacity(int cpu) { return SCHED_CAPACITY_SCALE; } #endif +#ifdef CONFIG_SMP +extern void sched_avg_update(struct rq *rq); + #ifndef arch_scale_cpu_capacity static __always_inline unsigned long arch_scale_cpu_capacity(struct sched_domain *sd, int cpu) @@ -1711,10 +1698,17 @@ unsigned long arch_scale_cpu_capacity(struct sched_domain *sd, int cpu) static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta) { - rq->rt_avg += rt_delta * arch_scale_freq_capacity(NULL, cpu_of(rq)); + rq->rt_avg += rt_delta * arch_scale_freq_capacity(cpu_of(rq)); sched_avg_update(rq); } #else +#ifndef arch_scale_cpu_capacity +static __always_inline +unsigned long arch_scale_cpu_capacity(void __always_unused *sd, int cpu) +{ + return SCHED_CAPACITY_SCALE; +} +#endif static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta) { } static inline void sched_avg_update(struct rq *rq) { } #endif @@ -2096,14 +2090,14 @@ DECLARE_PER_CPU(struct update_util_data *, cpufreq_update_util_data); * The way cpufreq is currently arranged requires it to evaluate the CPU * performance state (frequency/voltage) on a regular basis to prevent it from * being stuck in a completely inadequate performance level for too long. - * That is not guaranteed to happen if the updates are only triggered from CFS, - * though, because they may not be coming in if RT or deadline tasks are active - * all the time (or there are RT and DL tasks only). + * That is not guaranteed to happen if the updates are only triggered from CFS + * and DL, though, because they may not be coming in if only RT tasks are + * active all the time (or there are RT tasks only). * - * As a workaround for that issue, this function is called by the RT and DL - * sched classes to trigger extra cpufreq updates to prevent it from stalling, + * As a workaround for that issue, this function is called periodically by the + * RT sched class to trigger extra cpufreq updates to prevent it from stalling, * but that really is a band-aid. Going forward it should be replaced with - * solutions targeted more specifically at RT and DL tasks. + * solutions targeted more specifically at RT tasks. */ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) { @@ -2125,3 +2119,17 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {} #else /* arch_scale_freq_capacity */ #define arch_scale_freq_invariant() (false) #endif + +#ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL + +static inline unsigned long cpu_util_dl(struct rq *rq) +{ + return (rq->dl.running_bw * SCHED_CAPACITY_SCALE) >> BW_SHIFT; +} + +static inline unsigned long cpu_util_cfs(struct rq *rq) +{ + return rq->cfs.avg.util_avg; +} + +#endif diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h index baf500d12b7c..8e7b58de61e7 100644 --- a/kernel/sched/stats.h +++ b/kernel/sched/stats.h @@ -31,8 +31,11 @@ rq_sched_info_dequeued(struct rq *rq, unsigned long long delta) rq->rq_sched_info.run_delay += delta; } #define schedstat_enabled() static_branch_unlikely(&sched_schedstats) +#define __schedstat_inc(var) do { var++; } while (0) #define schedstat_inc(var) do { if (schedstat_enabled()) { var++; } } while (0) +#define __schedstat_add(var, amt) do { var += (amt); } while (0) #define schedstat_add(var, amt) do { if (schedstat_enabled()) { var += (amt); } } while (0) +#define __schedstat_set(var, val) do { var = (val); } while (0) #define schedstat_set(var, val) do { if (schedstat_enabled()) { var = (val); } } while (0) #define schedstat_val(var) (var) #define schedstat_val_or_zero(var) ((schedstat_enabled()) ? (var) : 0) @@ -48,8 +51,11 @@ static inline void rq_sched_info_depart(struct rq *rq, unsigned long long delta) {} #define schedstat_enabled() 0 +#define __schedstat_inc(var) do { } while (0) #define schedstat_inc(var) do { } while (0) +#define __schedstat_add(var, amt) do { } while (0) #define schedstat_add(var, amt) do { } while (0) +#define __schedstat_set(var, val) do { } while (0) #define schedstat_set(var, val) do { } while (0) #define schedstat_val(var) 0 #define schedstat_val_or_zero(var) 0 diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 034cbed7f88b..519b024f4e94 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -259,6 +259,19 @@ void rq_attach_root(struct rq *rq, struct root_domain *rd) call_rcu_sched(&old_rd->rcu, free_rootdomain); } +void sched_get_rd(struct root_domain *rd) +{ + atomic_inc(&rd->refcount); +} + +void sched_put_rd(struct root_domain *rd) +{ + if (!atomic_dec_and_test(&rd->refcount)) + return; + + call_rcu_sched(&rd->rcu, free_rootdomain); +} + static int init_rootdomain(struct root_domain *rd) { if (!zalloc_cpumask_var(&rd->span, GFP_KERNEL)) diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index 98feab7933c7..929ecb7d6b78 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c @@ -27,7 +27,7 @@ void add_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq wq_entry->flags &= ~WQ_FLAG_EXCLUSIVE; spin_lock_irqsave(&wq_head->lock, flags); - __add_wait_queue_entry_tail(wq_head, wq_entry); + __add_wait_queue(wq_head, wq_entry); spin_unlock_irqrestore(&wq_head->lock, flags); } EXPORT_SYMBOL(add_wait_queue); diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 1245b2338fff..dc77548167ef 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -515,7 +515,7 @@ void put_seccomp_filter(struct task_struct *tsk) static void seccomp_init_siginfo(siginfo_t *info, int syscall, int reason) { - memset(info, 0, sizeof(*info)); + clear_siginfo(info); info->si_signo = SIGSYS; info->si_code = SYS_SECCOMP; info->si_call_addr = (void __user *)KSTK_EIP(current); diff --git a/kernel/signal.c b/kernel/signal.c index 9558664bd9ec..c6e4c83dc090 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -40,6 +40,7 @@ #include <linux/cn_proc.h> #include <linux/compiler.h> #include <linux/posix-timers.h> +#include <linux/livepatch.h> #define CREATE_TRACE_POINTS #include <trace/events/signal.h> @@ -165,7 +166,8 @@ void recalc_sigpending_and_wake(struct task_struct *t) void recalc_sigpending(void) { - if (!recalc_sigpending_tsk(current) && !freezing(current)) + if (!recalc_sigpending_tsk(current) && !freezing(current) && + !klp_patch_pending(current)) clear_thread_flag(TIF_SIGPENDING); } @@ -549,6 +551,7 @@ still_pending: * a fast-pathed signal or we must have been * out of queue space. So zero out the info. */ + clear_siginfo(info); info->si_signo = sig; info->si_errno = 0; info->si_code = SI_USER; @@ -642,6 +645,9 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info) spin_unlock(&tsk->sighand->siglock); posixtimer_rearm(info); spin_lock(&tsk->sighand->siglock); + + /* Don't expose the si_sys_private value to userspace */ + info->si_sys_private = 0; } #endif return signr; @@ -1043,6 +1049,7 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t, list_add_tail(&q->list, &pending->list); switch ((unsigned long) info) { case (unsigned long) SEND_SIG_NOINFO: + clear_siginfo(&q->info); q->info.si_signo = sig; q->info.si_errno = 0; q->info.si_code = SI_USER; @@ -1051,6 +1058,7 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t, q->info.si_uid = from_kuid_munged(current_user_ns(), current_uid()); break; case (unsigned long) SEND_SIG_PRIV: + clear_siginfo(&q->info); q->info.si_signo = sig; q->info.si_errno = 0; q->info.si_code = SI_KERNEL; @@ -1485,6 +1493,129 @@ force_sigsegv(int sig, struct task_struct *p) return 0; } +int force_sig_fault(int sig, int code, void __user *addr + ___ARCH_SI_TRAPNO(int trapno) + ___ARCH_SI_IA64(int imm, unsigned int flags, unsigned long isr) + , struct task_struct *t) +{ + struct siginfo info; + + clear_siginfo(&info); + info.si_signo = sig; + info.si_errno = 0; + info.si_code = code; + info.si_addr = addr; +#ifdef __ARCH_SI_TRAPNO + info.si_trapno = trapno; +#endif +#ifdef __ia64__ + info.si_imm = imm; + info.si_flags = flags; + info.si_isr = isr; +#endif + return force_sig_info(info.si_signo, &info, t); +} + +int send_sig_fault(int sig, int code, void __user *addr + ___ARCH_SI_TRAPNO(int trapno) + ___ARCH_SI_IA64(int imm, unsigned int flags, unsigned long isr) + , struct task_struct *t) +{ + struct siginfo info; + + clear_siginfo(&info); + info.si_signo = sig; + info.si_errno = 0; + info.si_code = code; + info.si_addr = addr; +#ifdef __ARCH_SI_TRAPNO + info.si_trapno = trapno; +#endif +#ifdef __ia64__ + info.si_imm = imm; + info.si_flags = flags; + info.si_isr = isr; +#endif + return send_sig_info(info.si_signo, &info, t); +} + +#if defined(BUS_MCEERR_AO) && defined(BUS_MCEERR_AR) +int force_sig_mceerr(int code, void __user *addr, short lsb, struct task_struct *t) +{ + struct siginfo info; + + WARN_ON((code != BUS_MCEERR_AO) && (code != BUS_MCEERR_AR)); + clear_siginfo(&info); + info.si_signo = SIGBUS; + info.si_errno = 0; + info.si_code = code; + info.si_addr = addr; + info.si_addr_lsb = lsb; + return force_sig_info(info.si_signo, &info, t); +} + +int send_sig_mceerr(int code, void __user *addr, short lsb, struct task_struct *t) +{ + struct siginfo info; + + WARN_ON((code != BUS_MCEERR_AO) && (code != BUS_MCEERR_AR)); + clear_siginfo(&info); + info.si_signo = SIGBUS; + info.si_errno = 0; + info.si_code = code; + info.si_addr = addr; + info.si_addr_lsb = lsb; + return send_sig_info(info.si_signo, &info, t); +} +EXPORT_SYMBOL(send_sig_mceerr); +#endif + +#ifdef SEGV_BNDERR +int force_sig_bnderr(void __user *addr, void __user *lower, void __user *upper) +{ + struct siginfo info; + + clear_siginfo(&info); + info.si_signo = SIGSEGV; + info.si_errno = 0; + info.si_code = SEGV_BNDERR; + info.si_addr = addr; + info.si_lower = lower; + info.si_upper = upper; + return force_sig_info(info.si_signo, &info, current); +} +#endif + +#ifdef SEGV_PKUERR +int force_sig_pkuerr(void __user *addr, u32 pkey) +{ + struct siginfo info; + + clear_siginfo(&info); + info.si_signo = SIGSEGV; + info.si_errno = 0; + info.si_code = SEGV_PKUERR; + info.si_addr = addr; + info.si_pkey = pkey; + return force_sig_info(info.si_signo, &info, current); +} +#endif + +/* For the crazy architectures that include trap information in + * the errno field, instead of an actual errno value. + */ +int force_sig_ptrace_errno_trap(int errno, void __user *addr) +{ + struct siginfo info; + + clear_siginfo(&info); + info.si_signo = SIGTRAP; + info.si_errno = errno; + info.si_code = TRAP_HWBKPT; + info.si_addr = addr; + return force_sig_info(info.si_signo, &info, current); +} + int kill_pgrp(struct pid *pid, int sig, int priv) { int ret; @@ -1623,6 +1754,7 @@ bool do_notify_parent(struct task_struct *tsk, int sig) sig = SIGCHLD; } + clear_siginfo(&info); info.si_signo = sig; info.si_errno = 0; /* @@ -1717,6 +1849,7 @@ static void do_notify_parent_cldstop(struct task_struct *tsk, parent = tsk->real_parent; } + clear_siginfo(&info); info.si_signo = SIGCHLD; info.si_errno = 0; /* @@ -1929,7 +2062,7 @@ static void ptrace_do_notify(int signr, int exit_code, int why) { siginfo_t info; - memset(&info, 0, sizeof info); + clear_siginfo(&info); info.si_signo = signr; info.si_code = exit_code; info.si_pid = task_pid_vnr(current); @@ -2136,6 +2269,7 @@ static int ptrace_signal(int signr, siginfo_t *info) * have updated *info via PTRACE_SETSIGINFO. */ if (signr != info->si_signo) { + clear_siginfo(info); info->si_signo = signr; info->si_errno = 0; info->si_code = SI_USER; @@ -2688,9 +2822,7 @@ enum siginfo_layout siginfo_layout(int sig, int si_code) #endif [SIGCHLD] = { NSIGCHLD, SIL_CHLD }, [SIGPOLL] = { NSIGPOLL, SIL_POLL }, -#ifdef __ARCH_SIGSYS [SIGSYS] = { NSIGSYS, SIL_SYS }, -#endif }; if ((sig < ARRAY_SIZE(filter)) && (si_code <= filter[sig].limit)) layout = filter[sig].layout; @@ -2712,12 +2844,14 @@ enum siginfo_layout siginfo_layout(int sig, int si_code) if ((sig == SIGFPE) && (si_code == FPE_FIXME)) layout = SIL_FAULT; #endif +#ifdef BUS_FIXME + if ((sig == SIGBUS) && (si_code == BUS_FIXME)) + layout = SIL_FAULT; +#endif } return layout; } -#ifndef HAVE_ARCH_COPY_SIGINFO_TO_USER - int copy_siginfo_to_user(siginfo_t __user *to, const siginfo_t *from) { int err; @@ -2756,13 +2890,21 @@ int copy_siginfo_to_user(siginfo_t __user *to, const siginfo_t *from) #ifdef __ARCH_SI_TRAPNO err |= __put_user(from->si_trapno, &to->si_trapno); #endif -#ifdef BUS_MCEERR_AO +#ifdef __ia64__ + err |= __put_user(from->si_imm, &to->si_imm); + err |= __put_user(from->si_flags, &to->si_flags); + err |= __put_user(from->si_isr, &to->si_isr); +#endif /* * Other callers might not initialize the si_lsb field, * so check explicitly for the right codes here. */ - if (from->si_signo == SIGBUS && - (from->si_code == BUS_MCEERR_AR || from->si_code == BUS_MCEERR_AO)) +#ifdef BUS_MCEERR_AR + if (from->si_signo == SIGBUS && from->si_code == BUS_MCEERR_AR) + err |= __put_user(from->si_addr_lsb, &to->si_addr_lsb); +#endif +#ifdef BUS_MCEERR_AO + if (from->si_signo == SIGBUS && from->si_code == BUS_MCEERR_AO) err |= __put_user(from->si_addr_lsb, &to->si_addr_lsb); #endif #ifdef SEGV_BNDERR @@ -2788,18 +2930,185 @@ int copy_siginfo_to_user(siginfo_t __user *to, const siginfo_t *from) err |= __put_user(from->si_uid, &to->si_uid); err |= __put_user(from->si_ptr, &to->si_ptr); break; -#ifdef __ARCH_SIGSYS case SIL_SYS: err |= __put_user(from->si_call_addr, &to->si_call_addr); err |= __put_user(from->si_syscall, &to->si_syscall); err |= __put_user(from->si_arch, &to->si_arch); break; -#endif } return err; } +#ifdef CONFIG_COMPAT +int copy_siginfo_to_user32(struct compat_siginfo __user *to, + const struct siginfo *from) +#if defined(CONFIG_X86_X32_ABI) || defined(CONFIG_IA32_EMULATION) +{ + return __copy_siginfo_to_user32(to, from, in_x32_syscall()); +} +int __copy_siginfo_to_user32(struct compat_siginfo __user *to, + const struct siginfo *from, bool x32_ABI) +#endif +{ + struct compat_siginfo new; + memset(&new, 0, sizeof(new)); + + new.si_signo = from->si_signo; + new.si_errno = from->si_errno; + new.si_code = from->si_code; + switch(siginfo_layout(from->si_signo, from->si_code)) { + case SIL_KILL: + new.si_pid = from->si_pid; + new.si_uid = from->si_uid; + break; + case SIL_TIMER: + new.si_tid = from->si_tid; + new.si_overrun = from->si_overrun; + new.si_int = from->si_int; + break; + case SIL_POLL: + new.si_band = from->si_band; + new.si_fd = from->si_fd; + break; + case SIL_FAULT: + new.si_addr = ptr_to_compat(from->si_addr); +#ifdef __ARCH_SI_TRAPNO + new.si_trapno = from->si_trapno; +#endif +#ifdef BUS_MCEERR_AR + if ((from->si_signo == SIGBUS) && (from->si_code == BUS_MCEERR_AR)) + new.si_addr_lsb = from->si_addr_lsb; +#endif +#ifdef BUS_MCEERR_AO + if ((from->si_signo == SIGBUS) && (from->si_code == BUS_MCEERR_AO)) + new.si_addr_lsb = from->si_addr_lsb; +#endif +#ifdef SEGV_BNDERR + if ((from->si_signo == SIGSEGV) && + (from->si_code == SEGV_BNDERR)) { + new.si_lower = ptr_to_compat(from->si_lower); + new.si_upper = ptr_to_compat(from->si_upper); + } +#endif +#ifdef SEGV_PKUERR + if ((from->si_signo == SIGSEGV) && + (from->si_code == SEGV_PKUERR)) + new.si_pkey = from->si_pkey; +#endif + + break; + case SIL_CHLD: + new.si_pid = from->si_pid; + new.si_uid = from->si_uid; + new.si_status = from->si_status; +#ifdef CONFIG_X86_X32_ABI + if (x32_ABI) { + new._sifields._sigchld_x32._utime = from->si_utime; + new._sifields._sigchld_x32._stime = from->si_stime; + } else +#endif + { + new.si_utime = from->si_utime; + new.si_stime = from->si_stime; + } + break; + case SIL_RT: + new.si_pid = from->si_pid; + new.si_uid = from->si_uid; + new.si_int = from->si_int; + break; + case SIL_SYS: + new.si_call_addr = ptr_to_compat(from->si_call_addr); + new.si_syscall = from->si_syscall; + new.si_arch = from->si_arch; + break; + } + + if (copy_to_user(to, &new, sizeof(struct compat_siginfo))) + return -EFAULT; + + return 0; +} + +int copy_siginfo_from_user32(struct siginfo *to, + const struct compat_siginfo __user *ufrom) +{ + struct compat_siginfo from; + + if (copy_from_user(&from, ufrom, sizeof(struct compat_siginfo))) + return -EFAULT; + + clear_siginfo(to); + to->si_signo = from.si_signo; + to->si_errno = from.si_errno; + to->si_code = from.si_code; + switch(siginfo_layout(from.si_signo, from.si_code)) { + case SIL_KILL: + to->si_pid = from.si_pid; + to->si_uid = from.si_uid; + break; + case SIL_TIMER: + to->si_tid = from.si_tid; + to->si_overrun = from.si_overrun; + to->si_int = from.si_int; + break; + case SIL_POLL: + to->si_band = from.si_band; + to->si_fd = from.si_fd; + break; + case SIL_FAULT: + to->si_addr = compat_ptr(from.si_addr); +#ifdef __ARCH_SI_TRAPNO + to->si_trapno = from.si_trapno; +#endif +#ifdef BUS_MCEERR_AR + if ((from.si_signo == SIGBUS) && (from.si_code == BUS_MCEERR_AR)) + to->si_addr_lsb = from.si_addr_lsb; +#endif +#ifdef BUS_MCEER_AO + if ((from.si_signo == SIGBUS) && (from.si_code == BUS_MCEERR_AO)) + to->si_addr_lsb = from.si_addr_lsb; +#endif +#ifdef SEGV_BNDERR + if ((from.si_signo == SIGSEGV) && (from.si_code == SEGV_BNDERR)) { + to->si_lower = compat_ptr(from.si_lower); + to->si_upper = compat_ptr(from.si_upper); + } +#endif +#ifdef SEGV_PKUERR + if ((from.si_signo == SIGSEGV) && (from.si_code == SEGV_PKUERR)) + to->si_pkey = from.si_pkey; +#endif + break; + case SIL_CHLD: + to->si_pid = from.si_pid; + to->si_uid = from.si_uid; + to->si_status = from.si_status; +#ifdef CONFIG_X86_X32_ABI + if (in_x32_syscall()) { + to->si_utime = from._sifields._sigchld_x32._utime; + to->si_stime = from._sifields._sigchld_x32._stime; + } else #endif + { + to->si_utime = from.si_utime; + to->si_stime = from.si_stime; + } + break; + case SIL_RT: + to->si_pid = from.si_pid; + to->si_uid = from.si_uid; + to->si_int = from.si_int; + break; + case SIL_SYS: + to->si_call_addr = compat_ptr(from.si_call_addr); + to->si_syscall = from.si_syscall; + to->si_arch = from.si_arch; + break; + } + return 0; +} +#endif /* CONFIG_COMPAT */ /** * do_sigtimedwait - wait for queued signals specified in @which @@ -2937,6 +3246,7 @@ SYSCALL_DEFINE2(kill, pid_t, pid, int, sig) { struct siginfo info; + clear_siginfo(&info); info.si_signo = sig; info.si_errno = 0; info.si_code = SI_USER; @@ -2978,8 +3288,9 @@ do_send_specific(pid_t tgid, pid_t pid, int sig, struct siginfo *info) static int do_tkill(pid_t tgid, pid_t pid, int sig) { - struct siginfo info = {}; + struct siginfo info; + clear_siginfo(&info); info.si_signo = sig; info.si_errno = 0; info.si_code = SI_TKILL; @@ -3060,7 +3371,7 @@ COMPAT_SYSCALL_DEFINE3(rt_sigqueueinfo, int, sig, struct compat_siginfo __user *, uinfo) { - siginfo_t info = {}; + siginfo_t info; int ret = copy_siginfo_from_user32(&info, uinfo); if (unlikely(ret)) return ret; @@ -3104,7 +3415,7 @@ COMPAT_SYSCALL_DEFINE4(rt_tgsigqueueinfo, int, sig, struct compat_siginfo __user *, uinfo) { - siginfo_t info = {}; + siginfo_t info; if (copy_siginfo_from_user32(&info, uinfo)) return -EFAULT; @@ -3677,6 +3988,7 @@ void __init signals_init(void) /* If this check fails, the __ARCH_SI_PREAMBLE_SIZE value is wrong! */ BUILD_BUG_ON(__ARCH_SI_PREAMBLE_SIZE != offsetof(struct siginfo, _sifields._pad)); + BUILD_BUG_ON(sizeof(struct siginfo) != SI_MAX_SIZE); sigqueue_cachep = KMEM_CACHE(sigqueue, SLAB_PANIC); } @@ -3684,26 +3996,25 @@ void __init signals_init(void) #ifdef CONFIG_KGDB_KDB #include <linux/kdb.h> /* - * kdb_send_sig_info - Allows kdb to send signals without exposing + * kdb_send_sig - Allows kdb to send signals without exposing * signal internals. This function checks if the required locks are * available before calling the main signal code, to avoid kdb * deadlocks. */ -void -kdb_send_sig_info(struct task_struct *t, struct siginfo *info) +void kdb_send_sig(struct task_struct *t, int sig) { static struct task_struct *kdb_prev_t; - int sig, new_t; + int new_t, ret; if (!spin_trylock(&t->sighand->siglock)) { kdb_printf("Can't do kill command now.\n" "The sigmask lock is held somewhere else in " "kernel, try again later\n"); return; } - spin_unlock(&t->sighand->siglock); new_t = kdb_prev_t != t; kdb_prev_t = t; if (t->state != TASK_RUNNING && new_t) { + spin_unlock(&t->sighand->siglock); kdb_printf("Process is not RUNNING, sending a signal from " "kdb risks deadlock\n" "on the run queue locks. " @@ -3712,8 +4023,9 @@ kdb_send_sig_info(struct task_struct *t, struct siginfo *info) "the deadlock.\n"); return; } - sig = info->si_signo; - if (send_sig_info(sig, info, t)) + ret = send_signal(sig, SEND_SIG_PRIV, t, false); + spin_unlock(&t->sighand->siglock); + if (ret) kdb_printf("Fail to deliver Signal %d to process %d.\n", sig, t->pid); else diff --git a/kernel/softirq.c b/kernel/softirq.c index 2f5e87f1bae2..24d243ef8e71 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -665,7 +665,7 @@ static void run_ksoftirqd(unsigned int cpu) */ __do_softirq(); local_irq_enable(); - cond_resched_rcu_qs(); + cond_resched(); return; } local_irq_enable(); diff --git a/kernel/sys.c b/kernel/sys.c index 83ffd7dccf23..f2289de20e19 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -135,7 +135,7 @@ EXPORT_SYMBOL(overflowgid); */ int fs_overflowuid = DEFAULT_FS_OVERFLOWUID; -int fs_overflowgid = DEFAULT_FS_OVERFLOWUID; +int fs_overflowgid = DEFAULT_FS_OVERFLOWGID; EXPORT_SYMBOL(fs_overflowuid); EXPORT_SYMBOL(fs_overflowgid); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 557d46728577..f98f28c12020 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -218,6 +218,8 @@ static int proc_dointvec_minmax_coredump(struct ctl_table *table, int write, static int proc_dostring_coredump(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); #endif +static int proc_dopipe_max_size(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos); #ifdef CONFIG_MAGIC_SYSRQ /* Note: sysrq code uses it's own private copy */ @@ -1374,13 +1376,6 @@ static struct ctl_table vm_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, - { - .procname = "hugepages_treat_as_movable", - .data = &hugepages_treat_as_movable, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, { .procname = "nr_overcommit_hugepages", .data = NULL, @@ -1819,8 +1814,7 @@ static struct ctl_table fs_table[] = { .data = &pipe_max_size, .maxlen = sizeof(pipe_max_size), .mode = 0644, - .proc_handler = &pipe_proc_fn, - .extra1 = &pipe_min_size, + .proc_handler = proc_dopipe_max_size, }, { .procname = "pipe-user-pages-hard", @@ -2622,29 +2616,17 @@ int proc_douintvec_minmax(struct ctl_table *table, int write, do_proc_douintvec_minmax_conv, ¶m); } -struct do_proc_dopipe_max_size_conv_param { - unsigned int *min; -}; - static int do_proc_dopipe_max_size_conv(unsigned long *lvalp, unsigned int *valp, int write, void *data) { - struct do_proc_dopipe_max_size_conv_param *param = data; - if (write) { unsigned int val; - if (*lvalp > UINT_MAX) - return -EINVAL; - val = round_pipe_size(*lvalp); if (val == 0) return -EINVAL; - if (param->min && *param->min > val) - return -ERANGE; - *valp = val; } else { unsigned int val = *valp; @@ -2654,14 +2636,11 @@ static int do_proc_dopipe_max_size_conv(unsigned long *lvalp, return 0; } -int proc_dopipe_max_size(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) +static int proc_dopipe_max_size(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) { - struct do_proc_dopipe_max_size_conv_param param = { - .min = (unsigned int *) table->extra1, - }; return do_proc_douintvec(table, write, buffer, lenp, ppos, - do_proc_dopipe_max_size_conv, ¶m); + do_proc_dopipe_max_size_conv, NULL); } static void validate_coredump_safety(void) @@ -3167,12 +3146,6 @@ int proc_douintvec_minmax(struct ctl_table *table, int write, return -ENOSYS; } -int proc_dopipe_max_size(struct ctl_table *table, int write, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - return -ENOSYS; -} - int proc_dointvec_jiffies(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { @@ -3216,7 +3189,6 @@ EXPORT_SYMBOL(proc_douintvec); EXPORT_SYMBOL(proc_dointvec_jiffies); EXPORT_SYMBOL(proc_dointvec_minmax); EXPORT_SYMBOL_GPL(proc_douintvec_minmax); -EXPORT_SYMBOL_GPL(proc_dopipe_max_size); EXPORT_SYMBOL(proc_dointvec_userhz_jiffies); EXPORT_SYMBOL(proc_dointvec_ms_jiffies); EXPORT_SYMBOL(proc_dostring); diff --git a/kernel/taskstats.c b/kernel/taskstats.c index 4559e914452b..4e62a4a8fa91 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -194,11 +194,7 @@ static int fill_stats_for_pid(pid_t pid, struct taskstats *stats) { struct task_struct *tsk; - rcu_read_lock(); - tsk = find_task_by_vpid(pid); - if (tsk) - get_task_struct(tsk); - rcu_read_unlock(); + tsk = find_get_task_by_vpid(pid); if (!tsk) return -ESRCH; fill_stats(current_user_ns(), task_active_pid_ns(current), tsk, stats); diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index e776fc8cc1df..f6b5f19223d6 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig @@ -95,6 +95,7 @@ config NO_HZ_FULL select RCU_NOCB_CPU select VIRT_CPU_ACCOUNTING_GEN select IRQ_WORK + select CPU_ISOLATION help Adaptively try to shutdown the tick whenever possible, even when the CPU is running tasks. Typically this requires running a single diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index d32520840fde..23788100e214 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -37,7 +37,6 @@ #include <linux/hrtimer.h> #include <linux/notifier.h> #include <linux/syscalls.h> -#include <linux/kallsyms.h> #include <linux/interrupt.h> #include <linux/tick.h> #include <linux/seq_file.h> @@ -60,6 +59,15 @@ #include "tick-internal.h" /* + * Masks for selecting the soft and hard context timers from + * cpu_base->active + */ +#define MASK_SHIFT (HRTIMER_BASE_MONOTONIC_SOFT) +#define HRTIMER_ACTIVE_HARD ((1U << MASK_SHIFT) - 1) +#define HRTIMER_ACTIVE_SOFT (HRTIMER_ACTIVE_HARD << MASK_SHIFT) +#define HRTIMER_ACTIVE_ALL (HRTIMER_ACTIVE_SOFT | HRTIMER_ACTIVE_HARD) + +/* * The timer bases: * * There are more clockids than hrtimer bases. Thus, we index @@ -70,7 +78,6 @@ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) = { .lock = __RAW_SPIN_LOCK_UNLOCKED(hrtimer_bases.lock), - .seq = SEQCNT_ZERO(hrtimer_bases.seq), .clock_base = { { @@ -93,6 +100,26 @@ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) = .clockid = CLOCK_TAI, .get_time = &ktime_get_clocktai, }, + { + .index = HRTIMER_BASE_MONOTONIC_SOFT, + .clockid = CLOCK_MONOTONIC, + .get_time = &ktime_get, + }, + { + .index = HRTIMER_BASE_REALTIME_SOFT, + .clockid = CLOCK_REALTIME, + .get_time = &ktime_get_real, + }, + { + .index = HRTIMER_BASE_BOOTTIME_SOFT, + .clockid = CLOCK_BOOTTIME, + .get_time = &ktime_get_boottime, + }, + { + .index = HRTIMER_BASE_TAI_SOFT, + .clockid = CLOCK_TAI, + .get_time = &ktime_get_clocktai, + }, } }; @@ -118,7 +145,6 @@ static const int hrtimer_clock_to_base_table[MAX_CLOCKS] = { * timer->base->cpu_base */ static struct hrtimer_cpu_base migration_cpu_base = { - .seq = SEQCNT_ZERO(migration_cpu_base), .clock_base = { { .cpu_base = &migration_cpu_base, }, }, }; @@ -156,45 +182,33 @@ struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer, } /* - * With HIGHRES=y we do not migrate the timer when it is expiring - * before the next event on the target cpu because we cannot reprogram - * the target cpu hardware and we would cause it to fire late. + * We do not migrate the timer when it is expiring before the next + * event on the target cpu. When high resolution is enabled, we cannot + * reprogram the target cpu hardware and we would cause it to fire + * late. To keep it simple, we handle the high resolution enabled and + * disabled case similar. * * Called with cpu_base->lock of target cpu held. */ static int hrtimer_check_target(struct hrtimer *timer, struct hrtimer_clock_base *new_base) { -#ifdef CONFIG_HIGH_RES_TIMERS ktime_t expires; - if (!new_base->cpu_base->hres_active) - return 0; - expires = ktime_sub(hrtimer_get_expires(timer), new_base->offset); - return expires <= new_base->cpu_base->expires_next; -#else - return 0; -#endif + return expires < new_base->cpu_base->expires_next; } -#ifdef CONFIG_NO_HZ_COMMON -static inline -struct hrtimer_cpu_base *get_target_base(struct hrtimer_cpu_base *base, - int pinned) -{ - if (pinned || !base->migration_enabled) - return base; - return &per_cpu(hrtimer_bases, get_nohz_timer_target()); -} -#else static inline struct hrtimer_cpu_base *get_target_base(struct hrtimer_cpu_base *base, int pinned) { +#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) + if (static_branch_likely(&timers_migration_enabled) && !pinned) + return &per_cpu(hrtimer_bases, get_nohz_timer_target()); +#endif return base; } -#endif /* * We switch the timer base to a power-optimized selected CPU target, @@ -396,7 +410,8 @@ static inline void debug_hrtimer_init(struct hrtimer *timer) debug_object_init(timer, &hrtimer_debug_descr); } -static inline void debug_hrtimer_activate(struct hrtimer *timer) +static inline void debug_hrtimer_activate(struct hrtimer *timer, + enum hrtimer_mode mode) { debug_object_activate(timer, &hrtimer_debug_descr); } @@ -429,8 +444,10 @@ void destroy_hrtimer_on_stack(struct hrtimer *timer) EXPORT_SYMBOL_GPL(destroy_hrtimer_on_stack); #else + static inline void debug_hrtimer_init(struct hrtimer *timer) { } -static inline void debug_hrtimer_activate(struct hrtimer *timer) { } +static inline void debug_hrtimer_activate(struct hrtimer *timer, + enum hrtimer_mode mode) { } static inline void debug_hrtimer_deactivate(struct hrtimer *timer) { } #endif @@ -442,10 +459,11 @@ debug_init(struct hrtimer *timer, clockid_t clockid, trace_hrtimer_init(timer, clockid, mode); } -static inline void debug_activate(struct hrtimer *timer) +static inline void debug_activate(struct hrtimer *timer, + enum hrtimer_mode mode) { - debug_hrtimer_activate(timer); - trace_hrtimer_start(timer); + debug_hrtimer_activate(timer, mode); + trace_hrtimer_start(timer, mode); } static inline void debug_deactivate(struct hrtimer *timer) @@ -454,35 +472,43 @@ static inline void debug_deactivate(struct hrtimer *timer) trace_hrtimer_cancel(timer); } -#if defined(CONFIG_NO_HZ_COMMON) || defined(CONFIG_HIGH_RES_TIMERS) -static inline void hrtimer_update_next_timer(struct hrtimer_cpu_base *cpu_base, - struct hrtimer *timer) +static struct hrtimer_clock_base * +__next_base(struct hrtimer_cpu_base *cpu_base, unsigned int *active) { -#ifdef CONFIG_HIGH_RES_TIMERS - cpu_base->next_timer = timer; -#endif + unsigned int idx; + + if (!*active) + return NULL; + + idx = __ffs(*active); + *active &= ~(1U << idx); + + return &cpu_base->clock_base[idx]; } -static ktime_t __hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base) +#define for_each_active_base(base, cpu_base, active) \ + while ((base = __next_base((cpu_base), &(active)))) + +static ktime_t __hrtimer_next_event_base(struct hrtimer_cpu_base *cpu_base, + unsigned int active, + ktime_t expires_next) { - struct hrtimer_clock_base *base = cpu_base->clock_base; - unsigned int active = cpu_base->active_bases; - ktime_t expires, expires_next = KTIME_MAX; + struct hrtimer_clock_base *base; + ktime_t expires; - hrtimer_update_next_timer(cpu_base, NULL); - for (; active; base++, active >>= 1) { + for_each_active_base(base, cpu_base, active) { struct timerqueue_node *next; struct hrtimer *timer; - if (!(active & 0x01)) - continue; - next = timerqueue_getnext(&base->active); timer = container_of(next, struct hrtimer, node); expires = ktime_sub(hrtimer_get_expires(timer), base->offset); if (expires < expires_next) { expires_next = expires; - hrtimer_update_next_timer(cpu_base, timer); + if (timer->is_soft) + cpu_base->softirq_next_timer = timer; + else + cpu_base->next_timer = timer; } } /* @@ -494,7 +520,47 @@ static ktime_t __hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base) expires_next = 0; return expires_next; } -#endif + +/* + * Recomputes cpu_base::*next_timer and returns the earliest expires_next but + * does not set cpu_base::*expires_next, that is done by hrtimer_reprogram. + * + * When a softirq is pending, we can ignore the HRTIMER_ACTIVE_SOFT bases, + * those timers will get run whenever the softirq gets handled, at the end of + * hrtimer_run_softirq(), hrtimer_update_softirq_timer() will re-add these bases. + * + * Therefore softirq values are those from the HRTIMER_ACTIVE_SOFT clock bases. + * The !softirq values are the minima across HRTIMER_ACTIVE_ALL, unless an actual + * softirq is pending, in which case they're the minima of HRTIMER_ACTIVE_HARD. + * + * @active_mask must be one of: + * - HRTIMER_ACTIVE_ALL, + * - HRTIMER_ACTIVE_SOFT, or + * - HRTIMER_ACTIVE_HARD. + */ +static ktime_t +__hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base, unsigned int active_mask) +{ + unsigned int active; + struct hrtimer *next_timer = NULL; + ktime_t expires_next = KTIME_MAX; + + if (!cpu_base->softirq_activated && (active_mask & HRTIMER_ACTIVE_SOFT)) { + active = cpu_base->active_bases & HRTIMER_ACTIVE_SOFT; + cpu_base->softirq_next_timer = NULL; + expires_next = __hrtimer_next_event_base(cpu_base, active, KTIME_MAX); + + next_timer = cpu_base->softirq_next_timer; + } + + if (active_mask & HRTIMER_ACTIVE_HARD) { + active = cpu_base->active_bases & HRTIMER_ACTIVE_HARD; + cpu_base->next_timer = next_timer; + expires_next = __hrtimer_next_event_base(cpu_base, active, expires_next); + } + + return expires_next; +} static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base) { @@ -502,36 +568,14 @@ static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base) ktime_t *offs_boot = &base->clock_base[HRTIMER_BASE_BOOTTIME].offset; ktime_t *offs_tai = &base->clock_base[HRTIMER_BASE_TAI].offset; - return ktime_get_update_offsets_now(&base->clock_was_set_seq, + ktime_t now = ktime_get_update_offsets_now(&base->clock_was_set_seq, offs_real, offs_boot, offs_tai); -} -/* High resolution timer related functions */ -#ifdef CONFIG_HIGH_RES_TIMERS - -/* - * High resolution timer enabled ? - */ -static bool hrtimer_hres_enabled __read_mostly = true; -unsigned int hrtimer_resolution __read_mostly = LOW_RES_NSEC; -EXPORT_SYMBOL_GPL(hrtimer_resolution); - -/* - * Enable / Disable high resolution mode - */ -static int __init setup_hrtimer_hres(char *str) -{ - return (kstrtobool(str, &hrtimer_hres_enabled) == 0); -} - -__setup("highres=", setup_hrtimer_hres); + base->clock_base[HRTIMER_BASE_REALTIME_SOFT].offset = *offs_real; + base->clock_base[HRTIMER_BASE_BOOTTIME_SOFT].offset = *offs_boot; + base->clock_base[HRTIMER_BASE_TAI_SOFT].offset = *offs_tai; -/* - * hrtimer_high_res_enabled - query, if the highres mode is enabled - */ -static inline int hrtimer_is_hres_enabled(void) -{ - return hrtimer_hres_enabled; + return now; } /* @@ -539,7 +583,8 @@ static inline int hrtimer_is_hres_enabled(void) */ static inline int __hrtimer_hres_active(struct hrtimer_cpu_base *cpu_base) { - return cpu_base->hres_active; + return IS_ENABLED(CONFIG_HIGH_RES_TIMERS) ? + cpu_base->hres_active : 0; } static inline int hrtimer_hres_active(void) @@ -557,10 +602,23 @@ hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal) { ktime_t expires_next; - if (!cpu_base->hres_active) - return; + /* + * Find the current next expiration time. + */ + expires_next = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_ALL); - expires_next = __hrtimer_get_next_event(cpu_base); + if (cpu_base->next_timer && cpu_base->next_timer->is_soft) { + /* + * When the softirq is activated, hrtimer has to be + * programmed with the first hard hrtimer because soft + * timer interrupt could occur too late. + */ + if (cpu_base->softirq_activated) + expires_next = __hrtimer_get_next_event(cpu_base, + HRTIMER_ACTIVE_HARD); + else + cpu_base->softirq_expires_next = expires_next; + } if (skip_equal && expires_next == cpu_base->expires_next) return; @@ -568,6 +626,9 @@ hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal) cpu_base->expires_next = expires_next; /* + * If hres is not active, hardware does not have to be + * reprogrammed yet. + * * If a hang was detected in the last timer interrupt then we * leave the hang delay active in the hardware. We want the * system to make progress. That also prevents the following @@ -581,81 +642,38 @@ hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal) * set. So we'd effectivly block all timers until the T2 event * fires. */ - if (cpu_base->hang_detected) + if (!__hrtimer_hres_active(cpu_base) || cpu_base->hang_detected) return; tick_program_event(cpu_base->expires_next, 1); } +/* High resolution timer related functions */ +#ifdef CONFIG_HIGH_RES_TIMERS + /* - * When a timer is enqueued and expires earlier than the already enqueued - * timers, we have to check, whether it expires earlier than the timer for - * which the clock event device was armed. - * - * Called with interrupts disabled and base->cpu_base.lock held + * High resolution timer enabled ? */ -static void hrtimer_reprogram(struct hrtimer *timer, - struct hrtimer_clock_base *base) -{ - struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); - ktime_t expires = ktime_sub(hrtimer_get_expires(timer), base->offset); - - WARN_ON_ONCE(hrtimer_get_expires_tv64(timer) < 0); - - /* - * If the timer is not on the current cpu, we cannot reprogram - * the other cpus clock event device. - */ - if (base->cpu_base != cpu_base) - return; - - /* - * If the hrtimer interrupt is running, then it will - * reevaluate the clock bases and reprogram the clock event - * device. The callbacks are always executed in hard interrupt - * context so we don't need an extra check for a running - * callback. - */ - if (cpu_base->in_hrtirq) - return; - - /* - * CLOCK_REALTIME timer might be requested with an absolute - * expiry time which is less than base->offset. Set it to 0. - */ - if (expires < 0) - expires = 0; - - if (expires >= cpu_base->expires_next) - return; - - /* Update the pointer to the next expiring timer */ - cpu_base->next_timer = timer; - - /* - * If a hang was detected in the last timer interrupt then we - * do not schedule a timer which is earlier than the expiry - * which we enforced in the hang detection. We want the system - * to make progress. - */ - if (cpu_base->hang_detected) - return; +static bool hrtimer_hres_enabled __read_mostly = true; +unsigned int hrtimer_resolution __read_mostly = LOW_RES_NSEC; +EXPORT_SYMBOL_GPL(hrtimer_resolution); - /* - * Program the timer hardware. We enforce the expiry for - * events which are already in the past. - */ - cpu_base->expires_next = expires; - tick_program_event(expires, 1); +/* + * Enable / Disable high resolution mode + */ +static int __init setup_hrtimer_hres(char *str) +{ + return (kstrtobool(str, &hrtimer_hres_enabled) == 0); } +__setup("highres=", setup_hrtimer_hres); + /* - * Initialize the high resolution related parts of cpu_base + * hrtimer_high_res_enabled - query, if the highres mode is enabled */ -static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) +static inline int hrtimer_is_hres_enabled(void) { - base->expires_next = KTIME_MAX; - base->hres_active = 0; + return hrtimer_hres_enabled; } /* @@ -667,7 +685,7 @@ static void retrigger_next_event(void *arg) { struct hrtimer_cpu_base *base = this_cpu_ptr(&hrtimer_bases); - if (!base->hres_active) + if (!__hrtimer_hres_active(base)) return; raw_spin_lock(&base->lock); @@ -714,23 +732,102 @@ void clock_was_set_delayed(void) #else -static inline int __hrtimer_hres_active(struct hrtimer_cpu_base *b) { return 0; } -static inline int hrtimer_hres_active(void) { return 0; } static inline int hrtimer_is_hres_enabled(void) { return 0; } static inline void hrtimer_switch_to_hres(void) { } -static inline void -hrtimer_force_reprogram(struct hrtimer_cpu_base *base, int skip_equal) { } -static inline int hrtimer_reprogram(struct hrtimer *timer, - struct hrtimer_clock_base *base) -{ - return 0; -} -static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) { } static inline void retrigger_next_event(void *arg) { } #endif /* CONFIG_HIGH_RES_TIMERS */ /* + * When a timer is enqueued and expires earlier than the already enqueued + * timers, we have to check, whether it expires earlier than the timer for + * which the clock event device was armed. + * + * Called with interrupts disabled and base->cpu_base.lock held + */ +static void hrtimer_reprogram(struct hrtimer *timer, bool reprogram) +{ + struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); + struct hrtimer_clock_base *base = timer->base; + ktime_t expires = ktime_sub(hrtimer_get_expires(timer), base->offset); + + WARN_ON_ONCE(hrtimer_get_expires_tv64(timer) < 0); + + /* + * CLOCK_REALTIME timer might be requested with an absolute + * expiry time which is less than base->offset. Set it to 0. + */ + if (expires < 0) + expires = 0; + + if (timer->is_soft) { + /* + * soft hrtimer could be started on a remote CPU. In this + * case softirq_expires_next needs to be updated on the + * remote CPU. The soft hrtimer will not expire before the + * first hard hrtimer on the remote CPU - + * hrtimer_check_target() prevents this case. + */ + struct hrtimer_cpu_base *timer_cpu_base = base->cpu_base; + + if (timer_cpu_base->softirq_activated) + return; + + if (!ktime_before(expires, timer_cpu_base->softirq_expires_next)) + return; + + timer_cpu_base->softirq_next_timer = timer; + timer_cpu_base->softirq_expires_next = expires; + + if (!ktime_before(expires, timer_cpu_base->expires_next) || + !reprogram) + return; + } + + /* + * If the timer is not on the current cpu, we cannot reprogram + * the other cpus clock event device. + */ + if (base->cpu_base != cpu_base) + return; + + /* + * If the hrtimer interrupt is running, then it will + * reevaluate the clock bases and reprogram the clock event + * device. The callbacks are always executed in hard interrupt + * context so we don't need an extra check for a running + * callback. + */ + if (cpu_base->in_hrtirq) + return; + + if (expires >= cpu_base->expires_next) + return; + + /* Update the pointer to the next expiring timer */ + cpu_base->next_timer = timer; + cpu_base->expires_next = expires; + + /* + * If hres is not active, hardware does not have to be + * programmed yet. + * + * If a hang was detected in the last timer interrupt then we + * do not schedule a timer which is earlier than the expiry + * which we enforced in the hang detection. We want the system + * to make progress. + */ + if (!__hrtimer_hres_active(cpu_base) || cpu_base->hang_detected) + return; + + /* + * Program the timer hardware. We enforce the expiry for + * events which are already in the past. + */ + tick_program_event(expires, 1); +} + +/* * Clock realtime was set * * Change the offset of the realtime clock vs. the monotonic @@ -835,9 +932,10 @@ EXPORT_SYMBOL_GPL(hrtimer_forward); * Returns 1 when the new timer is the leftmost timer in the tree. */ static int enqueue_hrtimer(struct hrtimer *timer, - struct hrtimer_clock_base *base) + struct hrtimer_clock_base *base, + enum hrtimer_mode mode) { - debug_activate(timer); + debug_activate(timer, mode); base->cpu_base->active_bases |= 1 << base->index; @@ -870,7 +968,6 @@ static void __remove_hrtimer(struct hrtimer *timer, if (!timerqueue_del(&base->active, &timer->node)) cpu_base->active_bases &= ~(1 << base->index); -#ifdef CONFIG_HIGH_RES_TIMERS /* * Note: If reprogram is false we do not update * cpu_base->next_timer. This happens when we remove the first @@ -881,7 +978,6 @@ static void __remove_hrtimer(struct hrtimer *timer, */ if (reprogram && timer == cpu_base->next_timer) hrtimer_force_reprogram(cpu_base, 1); -#endif } /* @@ -930,22 +1026,36 @@ static inline ktime_t hrtimer_update_lowres(struct hrtimer *timer, ktime_t tim, return tim; } -/** - * hrtimer_start_range_ns - (re)start an hrtimer on the current CPU - * @timer: the timer to be added - * @tim: expiry time - * @delta_ns: "slack" range for the timer - * @mode: expiry mode: absolute (HRTIMER_MODE_ABS) or - * relative (HRTIMER_MODE_REL) - */ -void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, - u64 delta_ns, const enum hrtimer_mode mode) +static void +hrtimer_update_softirq_timer(struct hrtimer_cpu_base *cpu_base, bool reprogram) { - struct hrtimer_clock_base *base, *new_base; - unsigned long flags; - int leftmost; + ktime_t expires; - base = lock_hrtimer_base(timer, &flags); + /* + * Find the next SOFT expiration. + */ + expires = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_SOFT); + + /* + * reprogramming needs to be triggered, even if the next soft + * hrtimer expires at the same time than the next hard + * hrtimer. cpu_base->softirq_expires_next needs to be updated! + */ + if (expires == KTIME_MAX) + return; + + /* + * cpu_base->*next_timer is recomputed by __hrtimer_get_next_event() + * cpu_base->*expires_next is only set by hrtimer_reprogram() + */ + hrtimer_reprogram(cpu_base->softirq_next_timer, reprogram); +} + +static int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, + u64 delta_ns, const enum hrtimer_mode mode, + struct hrtimer_clock_base *base) +{ + struct hrtimer_clock_base *new_base; /* Remove an active timer from the queue: */ remove_hrtimer(timer, base, true); @@ -960,21 +1070,35 @@ void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, /* Switch the timer base, if necessary: */ new_base = switch_hrtimer_base(timer, base, mode & HRTIMER_MODE_PINNED); - leftmost = enqueue_hrtimer(timer, new_base); - if (!leftmost) - goto unlock; + return enqueue_hrtimer(timer, new_base, mode); +} + +/** + * hrtimer_start_range_ns - (re)start an hrtimer + * @timer: the timer to be added + * @tim: expiry time + * @delta_ns: "slack" range for the timer + * @mode: timer mode: absolute (HRTIMER_MODE_ABS) or + * relative (HRTIMER_MODE_REL), and pinned (HRTIMER_MODE_PINNED); + * softirq based mode is considered for debug purpose only! + */ +void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, + u64 delta_ns, const enum hrtimer_mode mode) +{ + struct hrtimer_clock_base *base; + unsigned long flags; + + /* + * Check whether the HRTIMER_MODE_SOFT bit and hrtimer.is_soft + * match. + */ + WARN_ON_ONCE(!(mode & HRTIMER_MODE_SOFT) ^ !timer->is_soft); + + base = lock_hrtimer_base(timer, &flags); + + if (__hrtimer_start_range_ns(timer, tim, delta_ns, mode, base)) + hrtimer_reprogram(timer, true); - if (!hrtimer_is_hres_active(timer)) { - /* - * Kick to reschedule the next tick to handle the new timer - * on dynticks target. - */ - if (new_base->cpu_base->nohz_active) - wake_up_nohz_cpu(new_base->cpu_base->cpu); - } else { - hrtimer_reprogram(timer, new_base); - } -unlock: unlock_hrtimer_base(timer, &flags); } EXPORT_SYMBOL_GPL(hrtimer_start_range_ns); @@ -1072,7 +1196,7 @@ u64 hrtimer_get_next_event(void) raw_spin_lock_irqsave(&cpu_base->lock, flags); if (!__hrtimer_hres_active(cpu_base)) - expires = __hrtimer_get_next_event(cpu_base); + expires = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_ALL); raw_spin_unlock_irqrestore(&cpu_base->lock, flags); @@ -1095,17 +1219,24 @@ static inline int hrtimer_clockid_to_base(clockid_t clock_id) static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id, enum hrtimer_mode mode) { + bool softtimer = !!(mode & HRTIMER_MODE_SOFT); + int base = softtimer ? HRTIMER_MAX_CLOCK_BASES / 2 : 0; struct hrtimer_cpu_base *cpu_base; - int base; memset(timer, 0, sizeof(struct hrtimer)); cpu_base = raw_cpu_ptr(&hrtimer_bases); - if (clock_id == CLOCK_REALTIME && mode != HRTIMER_MODE_ABS) + /* + * POSIX magic: Relative CLOCK_REALTIME timers are not affected by + * clock modifications, so they needs to become CLOCK_MONOTONIC to + * ensure POSIX compliance. + */ + if (clock_id == CLOCK_REALTIME && mode & HRTIMER_MODE_REL) clock_id = CLOCK_MONOTONIC; - base = hrtimer_clockid_to_base(clock_id); + base += hrtimer_clockid_to_base(clock_id); + timer->is_soft = softtimer; timer->base = &cpu_base->clock_base[base]; timerqueue_init(&timer->node); } @@ -1114,7 +1245,13 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id, * hrtimer_init - initialize a timer to the given clock * @timer: the timer to be initialized * @clock_id: the clock to be used - * @mode: timer mode abs/rel + * @mode: The modes which are relevant for intitialization: + * HRTIMER_MODE_ABS, HRTIMER_MODE_REL, HRTIMER_MODE_ABS_SOFT, + * HRTIMER_MODE_REL_SOFT + * + * The PINNED variants of the above can be handed in, + * but the PINNED bit is ignored as pinning happens + * when the hrtimer is started */ void hrtimer_init(struct hrtimer *timer, clockid_t clock_id, enum hrtimer_mode mode) @@ -1133,19 +1270,19 @@ EXPORT_SYMBOL_GPL(hrtimer_init); */ bool hrtimer_active(const struct hrtimer *timer) { - struct hrtimer_cpu_base *cpu_base; + struct hrtimer_clock_base *base; unsigned int seq; do { - cpu_base = READ_ONCE(timer->base->cpu_base); - seq = raw_read_seqcount_begin(&cpu_base->seq); + base = READ_ONCE(timer->base); + seq = raw_read_seqcount_begin(&base->seq); if (timer->state != HRTIMER_STATE_INACTIVE || - cpu_base->running == timer) + base->running == timer) return true; - } while (read_seqcount_retry(&cpu_base->seq, seq) || - cpu_base != READ_ONCE(timer->base->cpu_base)); + } while (read_seqcount_retry(&base->seq, seq) || + base != READ_ONCE(timer->base)); return false; } @@ -1171,7 +1308,8 @@ EXPORT_SYMBOL_GPL(hrtimer_active); static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base, struct hrtimer_clock_base *base, - struct hrtimer *timer, ktime_t *now) + struct hrtimer *timer, ktime_t *now, + unsigned long flags) { enum hrtimer_restart (*fn)(struct hrtimer *); int restart; @@ -1179,16 +1317,16 @@ static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base, lockdep_assert_held(&cpu_base->lock); debug_deactivate(timer); - cpu_base->running = timer; + base->running = timer; /* * Separate the ->running assignment from the ->state assignment. * * As with a regular write barrier, this ensures the read side in - * hrtimer_active() cannot observe cpu_base->running == NULL && + * hrtimer_active() cannot observe base->running == NULL && * timer->state == INACTIVE. */ - raw_write_seqcount_barrier(&cpu_base->seq); + raw_write_seqcount_barrier(&base->seq); __remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE, 0); fn = timer->function; @@ -1202,15 +1340,15 @@ static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base, timer->is_rel = false; /* - * Because we run timers from hardirq context, there is no chance - * they get migrated to another cpu, therefore its safe to unlock - * the timer base. + * The timer is marked as running in the CPU base, so it is + * protected against migration to a different CPU even if the lock + * is dropped. */ - raw_spin_unlock(&cpu_base->lock); + raw_spin_unlock_irqrestore(&cpu_base->lock, flags); trace_hrtimer_expire_entry(timer, now); restart = fn(timer); trace_hrtimer_expire_exit(timer); - raw_spin_lock(&cpu_base->lock); + raw_spin_lock_irq(&cpu_base->lock); /* * Note: We clear the running state after enqueue_hrtimer and @@ -1223,33 +1361,31 @@ static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base, */ if (restart != HRTIMER_NORESTART && !(timer->state & HRTIMER_STATE_ENQUEUED)) - enqueue_hrtimer(timer, base); + enqueue_hrtimer(timer, base, HRTIMER_MODE_ABS); /* * Separate the ->running assignment from the ->state assignment. * * As with a regular write barrier, this ensures the read side in - * hrtimer_active() cannot observe cpu_base->running == NULL && + * hrtimer_active() cannot observe base->running.timer == NULL && * timer->state == INACTIVE. */ - raw_write_seqcount_barrier(&cpu_base->seq); + raw_write_seqcount_barrier(&base->seq); - WARN_ON_ONCE(cpu_base->running != timer); - cpu_base->running = NULL; + WARN_ON_ONCE(base->running != timer); + base->running = NULL; } -static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now) +static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now, + unsigned long flags, unsigned int active_mask) { - struct hrtimer_clock_base *base = cpu_base->clock_base; - unsigned int active = cpu_base->active_bases; + struct hrtimer_clock_base *base; + unsigned int active = cpu_base->active_bases & active_mask; - for (; active; base++, active >>= 1) { + for_each_active_base(base, cpu_base, active) { struct timerqueue_node *node; ktime_t basenow; - if (!(active & 0x01)) - continue; - basenow = ktime_add(now, base->offset); while ((node = timerqueue_getnext(&base->active))) { @@ -1272,11 +1408,28 @@ static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now) if (basenow < hrtimer_get_softexpires_tv64(timer)) break; - __run_hrtimer(cpu_base, base, timer, &basenow); + __run_hrtimer(cpu_base, base, timer, &basenow, flags); } } } +static __latent_entropy void hrtimer_run_softirq(struct softirq_action *h) +{ + struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); + unsigned long flags; + ktime_t now; + + raw_spin_lock_irqsave(&cpu_base->lock, flags); + + now = hrtimer_update_base(cpu_base); + __hrtimer_run_queues(cpu_base, now, flags, HRTIMER_ACTIVE_SOFT); + + cpu_base->softirq_activated = 0; + hrtimer_update_softirq_timer(cpu_base, true); + + raw_spin_unlock_irqrestore(&cpu_base->lock, flags); +} + #ifdef CONFIG_HIGH_RES_TIMERS /* @@ -1287,13 +1440,14 @@ void hrtimer_interrupt(struct clock_event_device *dev) { struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); ktime_t expires_next, now, entry_time, delta; + unsigned long flags; int retries = 0; BUG_ON(!cpu_base->hres_active); cpu_base->nr_events++; dev->next_event = KTIME_MAX; - raw_spin_lock(&cpu_base->lock); + raw_spin_lock_irqsave(&cpu_base->lock, flags); entry_time = now = hrtimer_update_base(cpu_base); retry: cpu_base->in_hrtirq = 1; @@ -1306,17 +1460,23 @@ retry: */ cpu_base->expires_next = KTIME_MAX; - __hrtimer_run_queues(cpu_base, now); + if (!ktime_before(now, cpu_base->softirq_expires_next)) { + cpu_base->softirq_expires_next = KTIME_MAX; + cpu_base->softirq_activated = 1; + raise_softirq_irqoff(HRTIMER_SOFTIRQ); + } + + __hrtimer_run_queues(cpu_base, now, flags, HRTIMER_ACTIVE_HARD); /* Reevaluate the clock bases for the next expiry */ - expires_next = __hrtimer_get_next_event(cpu_base); + expires_next = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_ALL); /* * Store the new expiry value so the migration code can verify * against it. */ cpu_base->expires_next = expires_next; cpu_base->in_hrtirq = 0; - raw_spin_unlock(&cpu_base->lock); + raw_spin_unlock_irqrestore(&cpu_base->lock, flags); /* Reprogramming necessary ? */ if (!tick_program_event(expires_next, 0)) { @@ -1337,7 +1497,7 @@ retry: * Acquire base lock for updating the offsets and retrieving * the current time. */ - raw_spin_lock(&cpu_base->lock); + raw_spin_lock_irqsave(&cpu_base->lock, flags); now = hrtimer_update_base(cpu_base); cpu_base->nr_retries++; if (++retries < 3) @@ -1350,7 +1510,8 @@ retry: */ cpu_base->nr_hangs++; cpu_base->hang_detected = 1; - raw_spin_unlock(&cpu_base->lock); + raw_spin_unlock_irqrestore(&cpu_base->lock, flags); + delta = ktime_sub(now, entry_time); if ((unsigned int)delta > cpu_base->max_hang_time) cpu_base->max_hang_time = (unsigned int) delta; @@ -1392,6 +1553,7 @@ static inline void __hrtimer_peek_ahead_timers(void) { } void hrtimer_run_queues(void) { struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); + unsigned long flags; ktime_t now; if (__hrtimer_hres_active(cpu_base)) @@ -1409,10 +1571,17 @@ void hrtimer_run_queues(void) return; } - raw_spin_lock(&cpu_base->lock); + raw_spin_lock_irqsave(&cpu_base->lock, flags); now = hrtimer_update_base(cpu_base); - __hrtimer_run_queues(cpu_base, now); - raw_spin_unlock(&cpu_base->lock); + + if (!ktime_before(now, cpu_base->softirq_expires_next)) { + cpu_base->softirq_expires_next = KTIME_MAX; + cpu_base->softirq_activated = 1; + raise_softirq_irqoff(HRTIMER_SOFTIRQ); + } + + __hrtimer_run_queues(cpu_base, now, flags, HRTIMER_ACTIVE_HARD); + raw_spin_unlock_irqrestore(&cpu_base->lock, flags); } /* @@ -1590,7 +1759,13 @@ int hrtimers_prepare_cpu(unsigned int cpu) } cpu_base->cpu = cpu; - hrtimer_init_hres(cpu_base); + cpu_base->active_bases = 0; + cpu_base->hres_active = 0; + cpu_base->hang_detected = 0; + cpu_base->next_timer = NULL; + cpu_base->softirq_next_timer = NULL; + cpu_base->expires_next = KTIME_MAX; + cpu_base->softirq_expires_next = KTIME_MAX; return 0; } @@ -1622,7 +1797,7 @@ static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base, * sort out already expired timers and reprogram the * event device. */ - enqueue_hrtimer(timer, new_base); + enqueue_hrtimer(timer, new_base, HRTIMER_MODE_ABS); } } @@ -1634,6 +1809,12 @@ int hrtimers_dead_cpu(unsigned int scpu) BUG_ON(cpu_online(scpu)); tick_cancel_sched_timer(scpu); + /* + * this BH disable ensures that raise_softirq_irqoff() does + * not wakeup ksoftirqd (and acquire the pi-lock) while + * holding the cpu_base lock + */ + local_bh_disable(); local_irq_disable(); old_base = &per_cpu(hrtimer_bases, scpu); new_base = this_cpu_ptr(&hrtimer_bases); @@ -1649,12 +1830,19 @@ int hrtimers_dead_cpu(unsigned int scpu) &new_base->clock_base[i]); } + /* + * The migration might have changed the first expiring softirq + * timer on this CPU. Update it. + */ + hrtimer_update_softirq_timer(new_base, false); + raw_spin_unlock(&old_base->lock); raw_spin_unlock(&new_base->lock); /* Check, if we got expired work to do */ __hrtimer_peek_ahead_timers(); local_irq_enable(); + local_bh_enable(); return 0; } @@ -1663,18 +1851,19 @@ int hrtimers_dead_cpu(unsigned int scpu) void __init hrtimers_init(void) { hrtimers_prepare_cpu(smp_processor_id()); + open_softirq(HRTIMER_SOFTIRQ, hrtimer_run_softirq); } /** * schedule_hrtimeout_range_clock - sleep until timeout * @expires: timeout value (ktime_t) * @delta: slack in expires timeout (ktime_t) - * @mode: timer mode, HRTIMER_MODE_ABS or HRTIMER_MODE_REL - * @clock: timer clock, CLOCK_MONOTONIC or CLOCK_REALTIME + * @mode: timer mode + * @clock_id: timer clock to be used */ int __sched schedule_hrtimeout_range_clock(ktime_t *expires, u64 delta, - const enum hrtimer_mode mode, int clock) + const enum hrtimer_mode mode, clockid_t clock_id) { struct hrtimer_sleeper t; @@ -1695,7 +1884,7 @@ schedule_hrtimeout_range_clock(ktime_t *expires, u64 delta, return -EINTR; } - hrtimer_init_on_stack(&t.timer, clock, mode); + hrtimer_init_on_stack(&t.timer, clock_id, mode); hrtimer_set_expires_range_ns(&t.timer, *expires, delta); hrtimer_init_sleeper(&t, current); @@ -1717,7 +1906,7 @@ schedule_hrtimeout_range_clock(ktime_t *expires, u64 delta, * schedule_hrtimeout_range - sleep until timeout * @expires: timeout value (ktime_t) * @delta: slack in expires timeout (ktime_t) - * @mode: timer mode, HRTIMER_MODE_ABS or HRTIMER_MODE_REL + * @mode: timer mode * * Make the current task sleep until the given expiry time has * elapsed. The routine will return immediately unless @@ -1756,7 +1945,7 @@ EXPORT_SYMBOL_GPL(schedule_hrtimeout_range); /** * schedule_hrtimeout - sleep until timeout * @expires: timeout value (ktime_t) - * @mode: timer mode, HRTIMER_MODE_ABS or HRTIMER_MODE_REL + * @mode: timer mode * * Make the current task sleep until the given expiry time has * elapsed. The routine will return immediately unless diff --git a/kernel/time/posix-clock.c b/kernel/time/posix-clock.c index 17cdc554c9fe..fe56c4e06c51 100644 --- a/kernel/time/posix-clock.c +++ b/kernel/time/posix-clock.c @@ -68,13 +68,13 @@ static ssize_t posix_clock_read(struct file *fp, char __user *buf, return err; } -static unsigned int posix_clock_poll(struct file *fp, poll_table *wait) +static __poll_t posix_clock_poll(struct file *fp, poll_table *wait) { struct posix_clock *clk = get_posix_clock(fp); - unsigned int result = 0; + __poll_t result = 0; if (!clk) - return POLLERR; + return EPOLLERR; if (clk->ops.poll) result = clk->ops.poll(clk, fp, wait); @@ -216,7 +216,7 @@ struct posix_clock_desc { static int get_clock_desc(const clockid_t id, struct posix_clock_desc *cd) { - struct file *fp = fget(CLOCKID_TO_FD(id)); + struct file *fp = fget(clockid_to_fd(id)); int err = -EINVAL; if (!fp) diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 1f27887aa194..2541bd89f20e 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -14,6 +14,7 @@ #include <linux/tick.h> #include <linux/workqueue.h> #include <linux/compat.h> +#include <linux/sched/deadline.h> #include "posix-timers.h" @@ -791,6 +792,14 @@ check_timers_list(struct list_head *timers, return 0; } +static inline void check_dl_overrun(struct task_struct *tsk) +{ + if (tsk->dl.dl_overrun) { + tsk->dl.dl_overrun = 0; + __group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk); + } +} + /* * Check for any per-thread CPU timers that have fired and move them off * the tsk->cpu_timers[N] list onto the firing list. Here we update the @@ -804,6 +813,9 @@ static void check_thread_timers(struct task_struct *tsk, u64 expires; unsigned long soft; + if (dl_task(tsk)) + check_dl_overrun(tsk); + /* * If cputime_expires is zero, then there are no active * per thread CPU timers. @@ -906,6 +918,9 @@ static void check_process_timers(struct task_struct *tsk, struct task_cputime cputime; unsigned long soft; + if (dl_task(tsk)) + check_dl_overrun(tsk); + /* * If cputimer is not running, then there are no active * process wide timers (POSIX 1.b, itimers, RLIMIT_CPU). @@ -1111,6 +1126,9 @@ static inline int fastpath_timer_check(struct task_struct *tsk) return 1; } + if (dl_task(tsk) && tsk->dl.dl_overrun) + return 1; + return 0; } @@ -1189,9 +1207,8 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx, u64 now; WARN_ON_ONCE(clock_idx == CPUCLOCK_SCHED); - cpu_timer_sample_group(clock_idx, tsk, &now); - if (oldval) { + if (oldval && cpu_timer_sample_group(clock_idx, tsk, &now) != -EINVAL) { /* * We are setting itimer. The *oldval is absolute and we update * it to be relative, *newval argument is relative and we update @@ -1363,8 +1380,8 @@ static long posix_cpu_nsleep_restart(struct restart_block *restart_block) return do_cpu_nanosleep(which_clock, TIMER_ABSTIME, &t); } -#define PROCESS_CLOCK MAKE_PROCESS_CPUCLOCK(0, CPUCLOCK_SCHED) -#define THREAD_CLOCK MAKE_THREAD_CPUCLOCK(0, CPUCLOCK_SCHED) +#define PROCESS_CLOCK make_process_cpuclock(0, CPUCLOCK_SCHED) +#define THREAD_CLOCK make_thread_cpuclock(0, CPUCLOCK_SCHED) static int process_cpu_clock_getres(const clockid_t which_clock, struct timespec64 *tp) diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 13d6881f908b..75043046914e 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -434,17 +434,22 @@ static struct pid *good_sigevent(sigevent_t * event) { struct task_struct *rtn = current->group_leader; - if ((event->sigev_notify & SIGEV_THREAD_ID ) && - (!(rtn = find_task_by_vpid(event->sigev_notify_thread_id)) || - !same_thread_group(rtn, current) || - (event->sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_SIGNAL)) + switch (event->sigev_notify) { + case SIGEV_SIGNAL | SIGEV_THREAD_ID: + rtn = find_task_by_vpid(event->sigev_notify_thread_id); + if (!rtn || !same_thread_group(rtn, current)) + return NULL; + /* FALLTHRU */ + case SIGEV_SIGNAL: + case SIGEV_THREAD: + if (event->sigev_signo <= 0 || event->sigev_signo > SIGRTMAX) + return NULL; + /* FALLTHRU */ + case SIGEV_NONE: + return task_pid(rtn); + default: return NULL; - - if (((event->sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE) && - ((event->sigev_signo <= 0) || (event->sigev_signo > SIGRTMAX))) - return NULL; - - return task_pid(rtn); + } } static struct k_itimer * alloc_posix_timer(void) @@ -457,7 +462,7 @@ static struct k_itimer * alloc_posix_timer(void) kmem_cache_free(posix_timers_cache, tmr); return NULL; } - memset(&tmr->sigq->info, 0, sizeof(siginfo_t)); + clear_siginfo(&tmr->sigq->info); return tmr; } @@ -669,7 +674,7 @@ void common_timer_get(struct k_itimer *timr, struct itimerspec64 *cur_setting) struct timespec64 ts64; bool sig_none; - sig_none = (timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE; + sig_none = timr->it_sigev_notify == SIGEV_NONE; iv = timr->it_interval; /* interval timer ? */ @@ -856,7 +861,7 @@ int common_timer_set(struct k_itimer *timr, int flags, timr->it_interval = timespec64_to_ktime(new_setting->it_interval); expires = timespec64_to_ktime(new_setting->it_value); - sigev_none = (timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE; + sigev_none = timr->it_sigev_notify == SIGEV_NONE; kc->timer_arm(timr, expires, flags & TIMER_ABSTIME, sigev_none); timr->it_active = !sigev_none; diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index f8e1845aa464..e277284c2831 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -150,16 +150,15 @@ static inline void tick_nohz_init(void) { } #ifdef CONFIG_NO_HZ_COMMON extern unsigned long tick_nohz_active; -#else +extern void timers_update_nohz(void); +# ifdef CONFIG_SMP +extern struct static_key_false timers_migration_enabled; +# endif +#else /* CONFIG_NO_HZ_COMMON */ +static inline void timers_update_nohz(void) { } #define tick_nohz_active (0) #endif -#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) -extern void timers_update_migration(bool update_nohz); -#else -static inline void timers_update_migration(bool update_nohz) { } -#endif - DECLARE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases); extern u64 get_next_timer_interrupt(unsigned long basej, u64 basem); diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 99578f06c8d4..29a5733eff83 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -650,6 +650,11 @@ static void tick_nohz_restart(struct tick_sched *ts, ktime_t now) ts->next_tick = 0; } +static inline bool local_timer_softirq_pending(void) +{ + return local_softirq_pending() & TIMER_SOFTIRQ; +} + static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, ktime_t now, int cpu) { @@ -666,8 +671,18 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, } while (read_seqretry(&jiffies_lock, seq)); ts->last_jiffies = basejiff; - if (rcu_needs_cpu(basemono, &next_rcu) || - arch_needs_cpu() || irq_work_needs_cpu()) { + /* + * Keep the periodic tick, when RCU, architecture or irq_work + * requests it. + * Aside of that check whether the local timer softirq is + * pending. If so its a bad idea to call get_next_timer_interrupt() + * because there is an already expired timer, so it will request + * immeditate expiry, which rearms the hardware timer with a + * minimal delta which brings us back to this place + * immediately. Lather, rinse and repeat... + */ + if (rcu_needs_cpu(basemono, &next_rcu) || arch_needs_cpu() || + irq_work_needs_cpu() || local_timer_softirq_pending()) { next_tick = basemono + TICK_NSEC; } else { /* @@ -986,6 +1001,19 @@ ktime_t tick_nohz_get_sleep_length(void) } /** + * tick_nohz_get_idle_calls_cpu - return the current idle calls counter value + * for a particular CPU. + * + * Called from the schedutil frequency scaling governor in scheduler context. + */ +unsigned long tick_nohz_get_idle_calls_cpu(int cpu) +{ + struct tick_sched *ts = tick_get_tick_sched(cpu); + + return ts->idle_calls; +} + +/** * tick_nohz_get_idle_calls - return the current idle calls counter value * * Called from the schedutil frequency scaling governor in scheduler context. @@ -1079,7 +1107,7 @@ static inline void tick_nohz_activate(struct tick_sched *ts, int mode) ts->nohz_mode = mode; /* One update is enough */ if (!test_and_set_bit(0, &tick_nohz_active)) - timers_update_migration(true); + timers_update_nohz(); } /** diff --git a/kernel/time/timer.c b/kernel/time/timer.c index ffebcf878fba..48150ab42de9 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -200,8 +200,6 @@ struct timer_base { unsigned long clk; unsigned long next_expiry; unsigned int cpu; - bool migration_enabled; - bool nohz_active; bool is_idle; bool must_forward_clk; DECLARE_BITMAP(pending_map, WHEEL_SIZE); @@ -210,45 +208,64 @@ struct timer_base { static DEFINE_PER_CPU(struct timer_base, timer_bases[NR_BASES]); -#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) +#ifdef CONFIG_NO_HZ_COMMON + +static DEFINE_STATIC_KEY_FALSE(timers_nohz_active); +static DEFINE_MUTEX(timer_keys_mutex); + +static void timer_update_keys(struct work_struct *work); +static DECLARE_WORK(timer_update_work, timer_update_keys); + +#ifdef CONFIG_SMP unsigned int sysctl_timer_migration = 1; -void timers_update_migration(bool update_nohz) +DEFINE_STATIC_KEY_FALSE(timers_migration_enabled); + +static void timers_update_migration(void) { - bool on = sysctl_timer_migration && tick_nohz_active; - unsigned int cpu; + if (sysctl_timer_migration && tick_nohz_active) + static_branch_enable(&timers_migration_enabled); + else + static_branch_disable(&timers_migration_enabled); +} +#else +static inline void timers_update_migration(void) { } +#endif /* !CONFIG_SMP */ - /* Avoid the loop, if nothing to update */ - if (this_cpu_read(timer_bases[BASE_STD].migration_enabled) == on) - return; +static void timer_update_keys(struct work_struct *work) +{ + mutex_lock(&timer_keys_mutex); + timers_update_migration(); + static_branch_enable(&timers_nohz_active); + mutex_unlock(&timer_keys_mutex); +} - for_each_possible_cpu(cpu) { - per_cpu(timer_bases[BASE_STD].migration_enabled, cpu) = on; - per_cpu(timer_bases[BASE_DEF].migration_enabled, cpu) = on; - per_cpu(hrtimer_bases.migration_enabled, cpu) = on; - if (!update_nohz) - continue; - per_cpu(timer_bases[BASE_STD].nohz_active, cpu) = true; - per_cpu(timer_bases[BASE_DEF].nohz_active, cpu) = true; - per_cpu(hrtimer_bases.nohz_active, cpu) = true; - } +void timers_update_nohz(void) +{ + schedule_work(&timer_update_work); } int timer_migration_handler(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { - static DEFINE_MUTEX(mutex); int ret; - mutex_lock(&mutex); + mutex_lock(&timer_keys_mutex); ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); if (!ret && write) - timers_update_migration(false); - mutex_unlock(&mutex); + timers_update_migration(); + mutex_unlock(&timer_keys_mutex); return ret; } -#endif + +static inline bool is_timers_nohz_active(void) +{ + return static_branch_unlikely(&timers_nohz_active); +} +#else +static inline bool is_timers_nohz_active(void) { return false; } +#endif /* NO_HZ_COMMON */ static unsigned long round_jiffies_common(unsigned long j, int cpu, bool force_up) @@ -534,7 +551,7 @@ __internal_add_timer(struct timer_base *base, struct timer_list *timer) static void trigger_dyntick_cpu(struct timer_base *base, struct timer_list *timer) { - if (!IS_ENABLED(CONFIG_NO_HZ_COMMON) || !base->nohz_active) + if (!is_timers_nohz_active()) return; /* @@ -823,11 +840,10 @@ static inline struct timer_base *get_timer_cpu_base(u32 tflags, u32 cpu) struct timer_base *base = per_cpu_ptr(&timer_bases[BASE_STD], cpu); /* - * If the timer is deferrable and nohz is active then we need to use - * the deferrable base. + * If the timer is deferrable and NO_HZ_COMMON is set then we need + * to use the deferrable base. */ - if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && base->nohz_active && - (tflags & TIMER_DEFERRABLE)) + if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && (tflags & TIMER_DEFERRABLE)) base = per_cpu_ptr(&timer_bases[BASE_DEF], cpu); return base; } @@ -837,11 +853,10 @@ static inline struct timer_base *get_timer_this_cpu_base(u32 tflags) struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]); /* - * If the timer is deferrable and nohz is active then we need to use - * the deferrable base. + * If the timer is deferrable and NO_HZ_COMMON is set then we need + * to use the deferrable base. */ - if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && base->nohz_active && - (tflags & TIMER_DEFERRABLE)) + if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && (tflags & TIMER_DEFERRABLE)) base = this_cpu_ptr(&timer_bases[BASE_DEF]); return base; } @@ -851,21 +866,20 @@ static inline struct timer_base *get_timer_base(u32 tflags) return get_timer_cpu_base(tflags, tflags & TIMER_CPUMASK); } -#ifdef CONFIG_NO_HZ_COMMON static inline struct timer_base * get_target_base(struct timer_base *base, unsigned tflags) { -#ifdef CONFIG_SMP - if ((tflags & TIMER_PINNED) || !base->migration_enabled) - return get_timer_this_cpu_base(tflags); - return get_timer_cpu_base(tflags, get_nohz_timer_target()); -#else - return get_timer_this_cpu_base(tflags); +#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) + if (static_branch_likely(&timers_migration_enabled) && + !(tflags & TIMER_PINNED)) + return get_timer_cpu_base(tflags, get_nohz_timer_target()); #endif + return get_timer_this_cpu_base(tflags); } static inline void forward_timer_base(struct timer_base *base) { +#ifdef CONFIG_NO_HZ_COMMON unsigned long jnow; /* @@ -889,16 +903,8 @@ static inline void forward_timer_base(struct timer_base *base) base->clk = jnow; else base->clk = base->next_expiry; -} -#else -static inline struct timer_base * -get_target_base(struct timer_base *base, unsigned tflags) -{ - return get_timer_this_cpu_base(tflags); -} - -static inline void forward_timer_base(struct timer_base *base) { } #endif +} /* @@ -1009,8 +1015,6 @@ __mod_timer(struct timer_list *timer, unsigned long expires, unsigned int option if (!ret && (options & MOD_TIMER_PENDING_ONLY)) goto out_unlock; - debug_activate(timer, expires); - new_base = get_target_base(base, timer->flags); if (base != new_base) { @@ -1034,6 +1038,8 @@ __mod_timer(struct timer_list *timer, unsigned long expires, unsigned int option } } + debug_activate(timer, expires); + timer->expires = expires; /* * If 'idx' was calculated above and the base time did not advance @@ -1684,7 +1690,7 @@ static __latent_entropy void run_timer_softirq(struct softirq_action *h) base->must_forward_clk = false; __run_timers(base); - if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && base->nohz_active) + if (IS_ENABLED(CONFIG_NO_HZ_COMMON)) __run_timers(this_cpu_ptr(&timer_bases[BASE_DEF])); } @@ -1698,7 +1704,7 @@ void run_local_timers(void) hrtimer_run_queues(); /* Raise the softirq only if required. */ if (time_before(jiffies, base->clk)) { - if (!IS_ENABLED(CONFIG_NO_HZ_COMMON) || !base->nohz_active) + if (!IS_ENABLED(CONFIG_NO_HZ_COMMON)) return; /* CPU is awake, so check the deferrable base. */ base++; @@ -1855,6 +1861,21 @@ static void migrate_timer_list(struct timer_base *new_base, struct hlist_head *h } } +int timers_prepare_cpu(unsigned int cpu) +{ + struct timer_base *base; + int b; + + for (b = 0; b < NR_BASES; b++) { + base = per_cpu_ptr(&timer_bases[b], cpu); + base->clk = jiffies; + base->next_expiry = base->clk + NEXT_TIMER_MAX_DELTA; + base->is_idle = false; + base->must_forward_clk = true; + } + return 0; +} + int timers_dead_cpu(unsigned int cpu) { struct timer_base *old_base; diff --git a/kernel/torture.c b/kernel/torture.c index 637e172835d8..37b94012a3f8 100644 --- a/kernel/torture.c +++ b/kernel/torture.c @@ -47,6 +47,7 @@ #include <linux/ktime.h> #include <asm/byteorder.h> #include <linux/torture.h> +#include "rcu/rcu.h" MODULE_LICENSE("GPL"); MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com>"); @@ -60,7 +61,6 @@ static bool verbose; #define FULLSTOP_RMMOD 2 /* Normal rmmod of torture. */ static int fullstop = FULLSTOP_RMMOD; static DEFINE_MUTEX(fullstop_mutex); -static int *torture_runnable; #ifdef CONFIG_HOTPLUG_CPU @@ -500,7 +500,7 @@ static int torture_shutdown(void *arg) torture_shutdown_hook(); else VERBOSE_TOROUT_STRING("No torture_shutdown_hook(), skipping."); - ftrace_dump(DUMP_ALL); + rcu_ftrace_dump(DUMP_ALL); kernel_power_off(); /* Shut down the system. */ return 0; } @@ -572,17 +572,19 @@ static int stutter; */ void stutter_wait(const char *title) { + int spt; + cond_resched_rcu_qs(); - while (READ_ONCE(stutter_pause_test) || - (torture_runnable && !READ_ONCE(*torture_runnable))) { - if (stutter_pause_test) - if (READ_ONCE(stutter_pause_test) == 1) - schedule_timeout_interruptible(1); - else - while (READ_ONCE(stutter_pause_test)) - cond_resched(); - else + spt = READ_ONCE(stutter_pause_test); + for (; spt; spt = READ_ONCE(stutter_pause_test)) { + if (spt == 1) { + schedule_timeout_interruptible(1); + } else if (spt == 2) { + while (READ_ONCE(stutter_pause_test)) + cond_resched(); + } else { schedule_timeout_interruptible(round_jiffies_relative(HZ)); + } torture_shutdown_absorb(title); } } @@ -596,17 +598,15 @@ static int torture_stutter(void *arg) { VERBOSE_TOROUT_STRING("torture_stutter task started"); do { - if (!torture_must_stop()) { - if (stutter > 1) { - schedule_timeout_interruptible(stutter - 1); - WRITE_ONCE(stutter_pause_test, 2); - } - schedule_timeout_interruptible(1); + if (!torture_must_stop() && stutter > 1) { WRITE_ONCE(stutter_pause_test, 1); + schedule_timeout_interruptible(stutter - 1); + WRITE_ONCE(stutter_pause_test, 2); + schedule_timeout_interruptible(1); } + WRITE_ONCE(stutter_pause_test, 0); if (!torture_must_stop()) schedule_timeout_interruptible(stutter); - WRITE_ONCE(stutter_pause_test, 0); torture_shutdown_absorb("torture_stutter"); } while (!torture_must_stop()); torture_kthread_stopping("torture_stutter"); @@ -647,7 +647,7 @@ static void torture_stutter_cleanup(void) * The runnable parameter points to a flag that controls whether or not * the test is currently runnable. If there is no such flag, pass in NULL. */ -bool torture_init_begin(char *ttype, bool v, int *runnable) +bool torture_init_begin(char *ttype, bool v) { mutex_lock(&fullstop_mutex); if (torture_type != NULL) { @@ -659,7 +659,6 @@ bool torture_init_begin(char *ttype, bool v, int *runnable) } torture_type = ttype; verbose = v; - torture_runnable = runnable; fullstop = FULLSTOP_DONTSTOP; return true; } diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index af7dad126c13..0b249e2f0c3c 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -164,6 +164,7 @@ config PREEMPTIRQ_EVENTS bool "Enable trace events for preempt and irq disable/enable" select TRACE_IRQFLAGS depends on DEBUG_PREEMPT || !PROVE_LOCKING + depends on TRACING default n help Enable tracing of disable and enable events for preemption and irqs. @@ -354,7 +355,7 @@ config PROFILE_ANNOTATED_BRANCHES on if you need to profile the system's use of these macros. config PROFILE_ALL_BRANCHES - bool "Profile all if conditionals" + bool "Profile all if conditionals" if !FORTIFY_SOURCE select TRACE_BRANCH_PROFILING help This tracer profiles all branch conditions. Every if () @@ -529,6 +530,15 @@ config FUNCTION_PROFILER If in doubt, say N. +config BPF_KPROBE_OVERRIDE + bool "Enable BPF programs to override a kprobed function" + depends on BPF_EVENTS + depends on FUNCTION_ERROR_INJECTION + default n + help + Allows BPF to override the execution of a probed function and + set a different return value. This is used for error injection. + config FTRACE_MCOUNT_RECORD def_bool y depends on DYNAMIC_FTRACE diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 206e0e2ace53..987d9a9ae283 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -591,7 +591,7 @@ static int __blk_trace_setup(struct request_queue *q, char *name, dev_t dev, return ret; if (copy_to_user(arg, &buts, sizeof(buts))) { - blk_trace_remove(q); + __blk_trace_remove(q); return -EFAULT; } return 0; @@ -637,7 +637,7 @@ static int compat_blk_trace_setup(struct request_queue *q, char *name, return ret; if (copy_to_user(arg, &buts.name, ARRAY_SIZE(buts.name))) { - blk_trace_remove(q); + __blk_trace_remove(q); return -EFAULT; } @@ -872,7 +872,7 @@ static void blk_add_trace_rq_complete(void *ignore, struct request *rq, * **/ static void blk_add_trace_bio(struct request_queue *q, struct bio *bio, - u32 what, int error, union kernfs_node_id *cgid) + u32 what, int error) { struct blk_trace *bt = q->blk_trace; @@ -880,22 +880,21 @@ static void blk_add_trace_bio(struct request_queue *q, struct bio *bio, return; __blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size, - bio_op(bio), bio->bi_opf, what, error, 0, NULL, cgid); + bio_op(bio), bio->bi_opf, what, error, 0, NULL, + blk_trace_bio_get_cgid(q, bio)); } static void blk_add_trace_bio_bounce(void *ignore, struct request_queue *q, struct bio *bio) { - blk_add_trace_bio(q, bio, BLK_TA_BOUNCE, 0, - blk_trace_bio_get_cgid(q, bio)); + blk_add_trace_bio(q, bio, BLK_TA_BOUNCE, 0); } static void blk_add_trace_bio_complete(void *ignore, struct request_queue *q, struct bio *bio, int error) { - blk_add_trace_bio(q, bio, BLK_TA_COMPLETE, error, - blk_trace_bio_get_cgid(q, bio)); + blk_add_trace_bio(q, bio, BLK_TA_COMPLETE, error); } static void blk_add_trace_bio_backmerge(void *ignore, @@ -903,8 +902,7 @@ static void blk_add_trace_bio_backmerge(void *ignore, struct request *rq, struct bio *bio) { - blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE, 0, - blk_trace_bio_get_cgid(q, bio)); + blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE, 0); } static void blk_add_trace_bio_frontmerge(void *ignore, @@ -912,15 +910,13 @@ static void blk_add_trace_bio_frontmerge(void *ignore, struct request *rq, struct bio *bio) { - blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE, 0, - blk_trace_bio_get_cgid(q, bio)); + blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE, 0); } static void blk_add_trace_bio_queue(void *ignore, struct request_queue *q, struct bio *bio) { - blk_add_trace_bio(q, bio, BLK_TA_QUEUE, 0, - blk_trace_bio_get_cgid(q, bio)); + blk_add_trace_bio(q, bio, BLK_TA_QUEUE, 0); } static void blk_add_trace_getrq(void *ignore, @@ -928,8 +924,7 @@ static void blk_add_trace_getrq(void *ignore, struct bio *bio, int rw) { if (bio) - blk_add_trace_bio(q, bio, BLK_TA_GETRQ, 0, - blk_trace_bio_get_cgid(q, bio)); + blk_add_trace_bio(q, bio, BLK_TA_GETRQ, 0); else { struct blk_trace *bt = q->blk_trace; @@ -945,8 +940,7 @@ static void blk_add_trace_sleeprq(void *ignore, struct bio *bio, int rw) { if (bio) - blk_add_trace_bio(q, bio, BLK_TA_SLEEPRQ, 0, - blk_trace_bio_get_cgid(q, bio)); + blk_add_trace_bio(q, bio, BLK_TA_SLEEPRQ, 0); else { struct blk_trace *bt = q->blk_trace; diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 27d1f4ffa3de..fc2838ac8b78 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -13,6 +13,10 @@ #include <linux/filter.h> #include <linux/uaccess.h> #include <linux/ctype.h> +#include <linux/kprobes.h> +#include <linux/error-injection.h> + +#include "trace_probe.h" #include "trace.h" u64 bpf_get_stackid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); @@ -76,6 +80,23 @@ unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx) } EXPORT_SYMBOL_GPL(trace_call_bpf); +#ifdef CONFIG_BPF_KPROBE_OVERRIDE +BPF_CALL_2(bpf_override_return, struct pt_regs *, regs, unsigned long, rc) +{ + regs_set_return_value(regs, rc); + override_function_with_return(regs); + return 0; +} + +static const struct bpf_func_proto bpf_override_return_proto = { + .func = bpf_override_return, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, +}; +#endif + BPF_CALL_3(bpf_probe_read, void *, dst, u32, size, const void *, unsafe_ptr) { int ret; @@ -224,7 +245,7 @@ BPF_CALL_5(bpf_trace_printk, char *, fmt, u32, fmt_size, u64, arg1, */ #define __BPF_TP_EMIT() __BPF_ARG3_TP() #define __BPF_TP(...) \ - __trace_printk(1 /* Fake ip will not be printed. */, \ + __trace_printk(0 /* Fake ip */, \ fmt, ##__VA_ARGS__) #define __BPF_ARG1_TP(...) \ @@ -343,14 +364,13 @@ static const struct bpf_func_proto bpf_perf_event_read_value_proto = { .arg4_type = ARG_CONST_SIZE, }; -static DEFINE_PER_CPU(struct perf_sample_data, bpf_sd); +static DEFINE_PER_CPU(struct perf_sample_data, bpf_trace_sd); static __always_inline u64 __bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map, - u64 flags, struct perf_raw_record *raw) + u64 flags, struct perf_sample_data *sd) { struct bpf_array *array = container_of(map, struct bpf_array, map); - struct perf_sample_data *sd = this_cpu_ptr(&bpf_sd); unsigned int cpu = smp_processor_id(); u64 index = flags & BPF_F_INDEX_MASK; struct bpf_event_entry *ee; @@ -373,8 +393,6 @@ __bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map, if (unlikely(event->oncpu != cpu)) return -EOPNOTSUPP; - perf_sample_data_init(sd, 0, 0); - sd->raw = raw; perf_event_output(event, sd, regs); return 0; } @@ -382,6 +400,7 @@ __bpf_perf_event_output(struct pt_regs *regs, struct bpf_map *map, BPF_CALL_5(bpf_perf_event_output, struct pt_regs *, regs, struct bpf_map *, map, u64, flags, void *, data, u64, size) { + struct perf_sample_data *sd = this_cpu_ptr(&bpf_trace_sd); struct perf_raw_record raw = { .frag = { .size = size, @@ -392,7 +411,10 @@ BPF_CALL_5(bpf_perf_event_output, struct pt_regs *, regs, struct bpf_map *, map, if (unlikely(flags & ~(BPF_F_INDEX_MASK))) return -EINVAL; - return __bpf_perf_event_output(regs, map, flags, &raw); + perf_sample_data_init(sd, 0, 0); + sd->raw = &raw; + + return __bpf_perf_event_output(regs, map, flags, sd); } static const struct bpf_func_proto bpf_perf_event_output_proto = { @@ -407,10 +429,12 @@ static const struct bpf_func_proto bpf_perf_event_output_proto = { }; static DEFINE_PER_CPU(struct pt_regs, bpf_pt_regs); +static DEFINE_PER_CPU(struct perf_sample_data, bpf_misc_sd); u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size, void *ctx, u64 ctx_size, bpf_ctx_copy_t ctx_copy) { + struct perf_sample_data *sd = this_cpu_ptr(&bpf_misc_sd); struct pt_regs *regs = this_cpu_ptr(&bpf_pt_regs); struct perf_raw_frag frag = { .copy = ctx_copy, @@ -428,8 +452,10 @@ u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size, }; perf_fetch_caller_regs(regs); + perf_sample_data_init(sd, 0, 0); + sd->raw = &raw; - return __bpf_perf_event_output(regs, map, flags, &raw); + return __bpf_perf_event_output(regs, map, flags, sd); } BPF_CALL_0(bpf_get_current_task) @@ -551,6 +577,10 @@ static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func return &bpf_get_stackid_proto; case BPF_FUNC_perf_event_read_value: return &bpf_perf_event_read_value_proto; +#ifdef CONFIG_BPF_KPROBE_OVERRIDE + case BPF_FUNC_override_return: + return &bpf_override_return_proto; +#endif default: return tracing_func_proto(func_id); } @@ -759,6 +789,8 @@ const struct bpf_prog_ops perf_event_prog_ops = { static DEFINE_MUTEX(bpf_event_mutex); +#define BPF_TRACE_MAX_PROGS 64 + int perf_event_attach_bpf_prog(struct perf_event *event, struct bpf_prog *prog) { @@ -766,12 +798,27 @@ int perf_event_attach_bpf_prog(struct perf_event *event, struct bpf_prog_array *new_array; int ret = -EEXIST; + /* + * Kprobe override only works if they are on the function entry, + * and only if they are on the opt-in list. + */ + if (prog->kprobe_override && + (!trace_kprobe_on_func_entry(event->tp_event) || + !trace_kprobe_error_injectable(event->tp_event))) + return -EINVAL; + mutex_lock(&bpf_event_mutex); if (event->prog) goto unlock; old_array = event->tp_event->prog_array; + if (old_array && + bpf_prog_array_length(old_array) >= BPF_TRACE_MAX_PROGS) { + ret = -E2BIG; + goto unlock; + } + ret = bpf_prog_array_copy(old_array, NULL, prog, &new_array); if (ret < 0) goto unlock; @@ -812,3 +859,26 @@ void perf_event_detach_bpf_prog(struct perf_event *event) unlock: mutex_unlock(&bpf_event_mutex); } + +int perf_event_query_prog_array(struct perf_event *event, void __user *info) +{ + struct perf_event_query_bpf __user *uquery = info; + struct perf_event_query_bpf query = {}; + int ret; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + if (event->attr.type != PERF_TYPE_TRACEPOINT) + return -EINVAL; + if (copy_from_user(&query, uquery, sizeof(query))) + return -EFAULT; + + mutex_lock(&bpf_event_mutex); + ret = bpf_prog_array_copy_info(event->tp_event->prog_array, + uquery->ids, + query.ids_len, + &uquery->prog_cnt); + mutex_unlock(&bpf_event_mutex); + + return ret; +} diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index ccdf3664e4a9..eac9ce2c57a2 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1119,15 +1119,11 @@ static struct ftrace_ops global_ops = { }; /* - * This is used by __kernel_text_address() to return true if the - * address is on a dynamically allocated trampoline that would - * not return true for either core_kernel_text() or - * is_module_text_address(). + * Used by the stack undwinder to know about dynamic ftrace trampolines. */ -bool is_ftrace_trampoline(unsigned long addr) +struct ftrace_ops *ftrace_ops_trampoline(unsigned long addr) { - struct ftrace_ops *op; - bool ret = false; + struct ftrace_ops *op = NULL; /* * Some of the ops may be dynamically allocated, @@ -1144,15 +1140,24 @@ bool is_ftrace_trampoline(unsigned long addr) if (op->trampoline && op->trampoline_size) if (addr >= op->trampoline && addr < op->trampoline + op->trampoline_size) { - ret = true; - goto out; + preempt_enable_notrace(); + return op; } } while_for_each_ftrace_op(op); - - out: preempt_enable_notrace(); - return ret; + return NULL; +} + +/* + * This is used by __kernel_text_address() to return true if the + * address is on a dynamically allocated trampoline that would + * not return true for either core_kernel_text() or + * is_module_text_address(). + */ +bool is_ftrace_trampoline(unsigned long addr) +{ + return ftrace_ops_trampoline(addr) != NULL; } struct ftrace_page { @@ -4451,7 +4456,6 @@ unregister_ftrace_function_probe_func(char *glob, struct trace_array *tr, func_g.type = filter_parse_regex(glob, strlen(glob), &func_g.search, ¬); func_g.len = strlen(func_g.search); - func_g.search = glob; /* we do not support '!' for function probes */ if (WARN_ON(not)) @@ -5010,7 +5014,6 @@ int ftrace_regex_release(struct inode *inode, struct file *file) parser = &iter->parser; if (trace_parser_loaded(parser)) { - parser->buffer[parser->idx] = 0; ftrace_match_records(iter->hash, parser->buffer, parser->idx); } @@ -5324,7 +5327,6 @@ ftrace_graph_release(struct inode *inode, struct file *file) parser = &fgd->parser; if (trace_parser_loaded((parser))) { - parser->buffer[parser->idx] = 0; ret = ftrace_graph_set_hash(fgd->new_hash, parser->buffer); } diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 91874a95060d..dcf1c4dd3efe 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -280,6 +280,8 @@ EXPORT_SYMBOL_GPL(ring_buffer_event_data); /* Missed count stored at end */ #define RB_MISSED_STORED (1 << 30) +#define RB_MISSED_FLAGS (RB_MISSED_EVENTS|RB_MISSED_STORED) + struct buffer_data_page { u64 time_stamp; /* page time stamp */ local_t commit; /* write committed index */ @@ -331,7 +333,9 @@ static void rb_init_page(struct buffer_data_page *bpage) */ size_t ring_buffer_page_len(void *page) { - return local_read(&((struct buffer_data_page *)page)->commit) + struct buffer_data_page *bpage = page; + + return (local_read(&bpage->commit) & ~RB_MISSED_FLAGS) + BUF_PAGE_HDR_SIZE; } @@ -623,10 +627,10 @@ int ring_buffer_wait(struct ring_buffer *buffer, int cpu, bool full) * as data is added to any of the @buffer's cpu buffers. Otherwise * it will wait for data to be added to a specific cpu buffer. * - * Returns POLLIN | POLLRDNORM if data exists in the buffers, + * Returns EPOLLIN | EPOLLRDNORM if data exists in the buffers, * zero otherwise. */ -int ring_buffer_poll_wait(struct ring_buffer *buffer, int cpu, +__poll_t ring_buffer_poll_wait(struct ring_buffer *buffer, int cpu, struct file *filp, poll_table *poll_table) { struct ring_buffer_per_cpu *cpu_buffer; @@ -661,7 +665,7 @@ int ring_buffer_poll_wait(struct ring_buffer *buffer, int cpu, if ((cpu == RING_BUFFER_ALL_CPUS && !ring_buffer_empty(buffer)) || (cpu != RING_BUFFER_ALL_CPUS && !ring_buffer_empty_cpu(buffer, cpu))) - return POLLIN | POLLRDNORM; + return EPOLLIN | EPOLLRDNORM; return 0; } @@ -1799,12 +1803,6 @@ void ring_buffer_change_overwrite(struct ring_buffer *buffer, int val) } EXPORT_SYMBOL_GPL(ring_buffer_change_overwrite); -static __always_inline void * -__rb_data_page_index(struct buffer_data_page *bpage, unsigned index) -{ - return bpage->data + index; -} - static __always_inline void *__rb_page_index(struct buffer_page *bpage, unsigned index) { return bpage->page->data + index; @@ -2536,29 +2534,58 @@ rb_wakeups(struct ring_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer) * The lock and unlock are done within a preempt disable section. * The current_context per_cpu variable can only be modified * by the current task between lock and unlock. But it can - * be modified more than once via an interrupt. There are four - * different contexts that we need to consider. + * be modified more than once via an interrupt. To pass this + * information from the lock to the unlock without having to + * access the 'in_interrupt()' functions again (which do show + * a bit of overhead in something as critical as function tracing, + * we use a bitmask trick. + * + * bit 0 = NMI context + * bit 1 = IRQ context + * bit 2 = SoftIRQ context + * bit 3 = normal context. + * + * This works because this is the order of contexts that can + * preempt other contexts. A SoftIRQ never preempts an IRQ + * context. + * + * When the context is determined, the corresponding bit is + * checked and set (if it was set, then a recursion of that context + * happened). + * + * On unlock, we need to clear this bit. To do so, just subtract + * 1 from the current_context and AND it to itself. * - * Normal context. - * SoftIRQ context - * IRQ context - * NMI context + * (binary) + * 101 - 1 = 100 + * 101 & 100 = 100 (clearing bit zero) * - * If for some reason the ring buffer starts to recurse, we - * only allow that to happen at most 4 times (one for each - * context). If it happens 5 times, then we consider this a - * recusive loop and do not let it go further. + * 1010 - 1 = 1001 + * 1010 & 1001 = 1000 (clearing bit 1) + * + * The least significant bit can be cleared this way, and it + * just so happens that it is the same bit corresponding to + * the current context. */ static __always_inline int trace_recursive_lock(struct ring_buffer_per_cpu *cpu_buffer) { - if (cpu_buffer->current_context >= 4) + unsigned int val = cpu_buffer->current_context; + unsigned long pc = preempt_count(); + int bit; + + if (!(pc & (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET))) + bit = RB_CTX_NORMAL; + else + bit = pc & NMI_MASK ? RB_CTX_NMI : + pc & HARDIRQ_MASK ? RB_CTX_IRQ : RB_CTX_SOFTIRQ; + + if (unlikely(val & (1 << bit))) return 1; - cpu_buffer->current_context++; - /* Interrupts must see this update */ - barrier(); + val |= (1 << bit); + cpu_buffer->current_context = val; return 0; } @@ -2566,9 +2593,7 @@ trace_recursive_lock(struct ring_buffer_per_cpu *cpu_buffer) static __always_inline void trace_recursive_unlock(struct ring_buffer_per_cpu *cpu_buffer) { - /* Don't let the dec leak out */ - barrier(); - cpu_buffer->current_context--; + cpu_buffer->current_context &= cpu_buffer->current_context - 1; } /** @@ -4406,8 +4431,13 @@ void ring_buffer_free_read_page(struct ring_buffer *buffer, int cpu, void *data) { struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu]; struct buffer_data_page *bpage = data; + struct page *page = virt_to_page(bpage); unsigned long flags; + /* If the page is still in use someplace else, we can't reuse it */ + if (page_ref_count(page) > 1) + goto out; + local_irq_save(flags); arch_spin_lock(&cpu_buffer->lock); @@ -4419,6 +4449,7 @@ void ring_buffer_free_read_page(struct ring_buffer *buffer, int cpu, void *data) arch_spin_unlock(&cpu_buffer->lock); local_irq_restore(flags); + out: free_page((unsigned long)bpage); } EXPORT_SYMBOL_GPL(ring_buffer_free_read_page); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 73e67b68c53b..20a2300ae4e8 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -362,7 +362,7 @@ trace_ignore_this_task(struct trace_pid_list *filtered_pids, struct task_struct } /** - * trace_pid_filter_add_remove - Add or remove a task from a pid_list + * trace_pid_filter_add_remove_task - Add or remove a task from a pid_list * @pid_list: The list to modify * @self: The current task for fork or NULL for exit * @task: The task to add or remove @@ -530,8 +530,6 @@ int trace_pid_write(struct trace_pid_list *filtered_pids, ubuf += ret; cnt -= ret; - parser.buffer[parser.idx] = 0; - ret = -EINVAL; if (kstrtoul(parser.buffer, 0, &val)) break; @@ -925,7 +923,7 @@ static void tracing_snapshot_instance(struct trace_array *tr) } /** - * trace_snapshot - take a snapshot of the current buffer. + * tracing_snapshot - take a snapshot of the current buffer. * * This causes a swap between the snapshot buffer and the current live * tracing buffer. You can use this to take snapshots of the live @@ -1004,9 +1002,9 @@ int tracing_alloc_snapshot(void) EXPORT_SYMBOL_GPL(tracing_alloc_snapshot); /** - * trace_snapshot_alloc - allocate and take a snapshot of the current buffer. + * tracing_snapshot_alloc - allocate and take a snapshot of the current buffer. * - * This is similar to trace_snapshot(), but it will allocate the + * This is similar to tracing_snapshot(), but it will allocate the * snapshot buffer if it isn't already allocated. Use this only * where it is safe to sleep, as the allocation may sleep. * @@ -1236,18 +1234,18 @@ int trace_get_user(struct trace_parser *parser, const char __user *ubuf, cnt--; } + parser->idx = 0; + /* only spaces were written */ - if (isspace(ch)) { + if (isspace(ch) || !ch) { *ppos += read; ret = read; goto out; } - - parser->idx = 0; } /* read the non-space input */ - while (cnt && !isspace(ch)) { + while (cnt && !isspace(ch) && ch) { if (parser->idx < parser->size - 1) parser->buffer[parser->idx++] = ch; else { @@ -1262,12 +1260,14 @@ int trace_get_user(struct trace_parser *parser, const char __user *ubuf, } /* We either got finished input or we have to wait for another call. */ - if (isspace(ch)) { + if (isspace(ch) || !ch) { parser->buffer[parser->idx] = 0; parser->cont = false; } else if (parser->idx < parser->size - 1) { parser->cont = true; parser->buffer[parser->idx++] = ch; + /* Make sure the parsed string always terminates with '\0'. */ + parser->buffer[parser->idx] = 0; } else { ret = -EINVAL; goto out; @@ -1303,7 +1303,7 @@ unsigned long __read_mostly tracing_thresh; /* * Copy the new maximum trace into the separate maximum-trace * structure. (this way the maximum trace is permanently saved, - * for later retrieval via /sys/kernel/debug/tracing/latency_trace) + * for later retrieval via /sys/kernel/tracing/tracing_max_latency) */ static void __update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) @@ -2374,6 +2374,15 @@ void trace_event_buffer_commit(struct trace_event_buffer *fbuffer) } EXPORT_SYMBOL_GPL(trace_event_buffer_commit); +/* + * Skip 3: + * + * trace_buffer_unlock_commit_regs() + * trace_event_buffer_commit() + * trace_event_raw_event_xxx() +*/ +# define STACK_SKIP 3 + void trace_buffer_unlock_commit_regs(struct trace_array *tr, struct ring_buffer *buffer, struct ring_buffer_event *event, @@ -2383,16 +2392,12 @@ void trace_buffer_unlock_commit_regs(struct trace_array *tr, __buffer_unlock_commit(buffer, event); /* - * If regs is not set, then skip the following callers: - * trace_buffer_unlock_commit_regs - * event_trigger_unlock_commit - * trace_event_buffer_commit - * trace_event_raw_event_sched_switch + * If regs is not set, then skip the necessary functions. * Note, we can still get here via blktrace, wakeup tracer * and mmiotrace, but that's ok if they lose a function or - * two. They are that meaningful. + * two. They are not that meaningful. */ - ftrace_trace_stack(tr, buffer, flags, regs ? 0 : 4, pc, regs); + ftrace_trace_stack(tr, buffer, flags, regs ? 0 : STACK_SKIP, pc, regs); ftrace_trace_userstack(buffer, flags, pc); } @@ -2415,7 +2420,7 @@ trace_process_export(struct trace_export *export, entry = ring_buffer_event_data(event); size = ring_buffer_event_length(event); - export->write(entry, size); + export->write(export, entry, size); } static DEFINE_MUTEX(ftrace_export_lock); @@ -2579,11 +2584,13 @@ static void __ftrace_trace_stack(struct ring_buffer *buffer, trace.skip = skip; /* - * Add two, for this function and the call to save_stack_trace() + * Add one, for this function and the call to save_stack_trace() * If regs is set, then these functions will not be in the way. */ +#ifndef CONFIG_UNWINDER_ORC if (!regs) - trace.skip += 2; + trace.skip++; +#endif /* * Since events can happen in NMIs there's no safe way to @@ -2682,17 +2689,6 @@ void __trace_stack(struct trace_array *tr, unsigned long flags, int skip, if (unlikely(in_nmi())) return; - /* - * It is possible that a function is being traced in a - * location that RCU is not watching. A call to - * rcu_irq_enter() will make sure that it is, but there's - * a few internal rcu functions that could be traced - * where that wont work either. In those cases, we just - * do nothing. - */ - if (unlikely(rcu_irq_enter_disabled())) - return; - rcu_irq_enter_irqson(); __ftrace_trace_stack(buffer, flags, skip, pc, NULL); rcu_irq_exit_irqson(); @@ -2711,11 +2707,10 @@ void trace_dump_stack(int skip) local_save_flags(flags); - /* - * Skip 3 more, seems to get us at the caller of - * this function. - */ - skip += 3; +#ifndef CONFIG_UNWINDER_ORC + /* Skip 1 to skip this function. */ + skip++; +#endif __ftrace_trace_stack(global_trace.trace_buffer.buffer, flags, skip, preempt_count(), NULL); } @@ -4178,37 +4173,30 @@ static const struct file_operations show_traces_fops = { .llseek = seq_lseek, }; -/* - * The tracer itself will not take this lock, but still we want - * to provide a consistent cpumask to user-space: - */ -static DEFINE_MUTEX(tracing_cpumask_update_lock); - -/* - * Temporary storage for the character representation of the - * CPU bitmask (and one more byte for the newline): - */ -static char mask_str[NR_CPUS + 1]; - static ssize_t tracing_cpumask_read(struct file *filp, char __user *ubuf, size_t count, loff_t *ppos) { struct trace_array *tr = file_inode(filp)->i_private; + char *mask_str; int len; - mutex_lock(&tracing_cpumask_update_lock); + len = snprintf(NULL, 0, "%*pb\n", + cpumask_pr_args(tr->tracing_cpumask)) + 1; + mask_str = kmalloc(len, GFP_KERNEL); + if (!mask_str) + return -ENOMEM; - len = snprintf(mask_str, count, "%*pb\n", + len = snprintf(mask_str, len, "%*pb\n", cpumask_pr_args(tr->tracing_cpumask)); if (len >= count) { count = -EINVAL; goto out_err; } - count = simple_read_from_buffer(ubuf, count, ppos, mask_str, NR_CPUS+1); + count = simple_read_from_buffer(ubuf, count, ppos, mask_str, len); out_err: - mutex_unlock(&tracing_cpumask_update_lock); + kfree(mask_str); return count; } @@ -4228,8 +4216,6 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf, if (err) goto err_unlock; - mutex_lock(&tracing_cpumask_update_lock); - local_irq_disable(); arch_spin_lock(&tr->max_lock); for_each_tracing_cpu(cpu) { @@ -4252,8 +4238,6 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf, local_irq_enable(); cpumask_copy(tr->tracing_cpumask, tracing_cpumask_new); - - mutex_unlock(&tracing_cpumask_update_lock); free_cpumask_var(tracing_cpumask_new); return count; @@ -5632,26 +5616,26 @@ static int tracing_release_pipe(struct inode *inode, struct file *file) return 0; } -static unsigned int +static __poll_t trace_poll(struct trace_iterator *iter, struct file *filp, poll_table *poll_table) { struct trace_array *tr = iter->tr; /* Iterators are static, they should be filled or empty */ if (trace_buffer_iter(iter, iter->cpu_file)) - return POLLIN | POLLRDNORM; + return EPOLLIN | EPOLLRDNORM; if (tr->trace_flags & TRACE_ITER_BLOCK) /* * Always select as readable when in blocking mode */ - return POLLIN | POLLRDNORM; + return EPOLLIN | EPOLLRDNORM; else return ring_buffer_poll_wait(iter->trace_buffer->buffer, iter->cpu_file, filp, poll_table); } -static unsigned int +static __poll_t tracing_poll_pipe(struct file *filp, poll_table *poll_table) { struct trace_iterator *iter = filp->private_data; @@ -6605,7 +6589,7 @@ static int tracing_buffers_open(struct inode *inode, struct file *filp) return ret; } -static unsigned int +static __poll_t tracing_buffers_poll(struct file *filp, poll_table *poll_table) { struct ftrace_buffer_info *info = filp->private_data; @@ -6780,7 +6764,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, .spd_release = buffer_spd_release, }; struct buffer_ref *ref; - int entries, size, i; + int entries, i; ssize_t ret = 0; #ifdef CONFIG_TRACER_MAX_TRACE @@ -6834,14 +6818,6 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, break; } - /* - * zero out any left over data, this is going to - * user land. - */ - size = ring_buffer_page_len(ref->page); - if (size < PAGE_SIZE) - memset(ref->page + size, 0, PAGE_SIZE - size); - page = virt_to_page(ref->page); spd.pages[i] = page; @@ -7599,6 +7575,7 @@ allocate_trace_buffer(struct trace_array *tr, struct trace_buffer *buf, int size buf->data = alloc_percpu(struct trace_array_cpu); if (!buf->data) { ring_buffer_free(buf->buffer); + buf->buffer = NULL; return -ENOMEM; } @@ -7622,7 +7599,9 @@ static int allocate_trace_buffers(struct trace_array *tr, int size) allocate_snapshot ? size : 1); if (WARN_ON(ret)) { ring_buffer_free(tr->trace_buffer.buffer); + tr->trace_buffer.buffer = NULL; free_percpu(tr->trace_buffer.data); + tr->trace_buffer.data = NULL; return -ENOMEM; } tr->allocated_snapshot = allocate_snapshot; diff --git a/kernel/trace/trace_benchmark.c b/kernel/trace/trace_benchmark.c index 79f838a75077..22fee766081b 100644 --- a/kernel/trace/trace_benchmark.c +++ b/kernel/trace/trace_benchmark.c @@ -165,7 +165,7 @@ static int benchmark_event_kthread(void *arg) * this thread will never voluntarily schedule which would * block synchronize_rcu_tasks() indefinitely. */ - cond_resched_rcu_qs(); + cond_resched(); } return 0; diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index ec0f9aa4e151..05c7172c6667 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -885,8 +885,6 @@ ftrace_event_write(struct file *file, const char __user *ubuf, if (*parser.buffer == '!') set = 0; - parser.buffer[parser.idx] = 0; - ret = ftrace_set_clr_event(tr, parser.buffer + !set, set); if (ret) goto out_put; @@ -2213,6 +2211,7 @@ void trace_event_eval_update(struct trace_eval_map **map, int len) { struct trace_event_call *call, *p; const char *last_system = NULL; + bool first = false; int last_i; int i; @@ -2220,15 +2219,28 @@ void trace_event_eval_update(struct trace_eval_map **map, int len) list_for_each_entry_safe(call, p, &ftrace_events, list) { /* events are usually grouped together with systems */ if (!last_system || call->class->system != last_system) { + first = true; last_i = 0; last_system = call->class->system; } + /* + * Since calls are grouped by systems, the likelyhood that the + * next call in the iteration belongs to the same system as the + * previous call is high. As an optimization, we skip seaching + * for a map[] that matches the call's system if the last call + * was from the same system. That's what last_i is for. If the + * call has the same system as the previous call, then last_i + * will be the index of the first map[] that has a matching + * system. + */ for (i = last_i; i < len; i++) { if (call->class->system == map[i]->system) { /* Save the first system if need be */ - if (!last_i) + if (first) { last_i = i; + first = false; + } update_event_printk(call, map[i]); } } diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 61e7f0678d33..a764aec3c9a1 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -400,7 +400,6 @@ enum regex_type filter_parse_regex(char *buff, int len, char **search, int *not) for (i = 0; i < len; i++) { if (buff[i] == '*') { if (!i) { - *search = buff + 1; type = MATCH_END_ONLY; } else if (i == len - 1) { if (type == MATCH_END_ONLY) @@ -410,14 +409,14 @@ enum regex_type filter_parse_regex(char *buff, int len, char **search, int *not) buff[i] = 0; break; } else { /* pattern continues, use full glob */ - type = MATCH_GLOB; - break; + return MATCH_GLOB; } } else if (strchr("[?\\", buff[i])) { - type = MATCH_GLOB; - break; + return MATCH_GLOB; } } + if (buff[0] == '*') + *search = buff + 1; return type; } diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index f2ac9d44f6c4..87411482a46f 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c @@ -1123,13 +1123,22 @@ static __init int register_trigger_snapshot_cmd(void) { return 0; } #endif /* CONFIG_TRACER_SNAPSHOT */ #ifdef CONFIG_STACKTRACE +#ifdef CONFIG_UNWINDER_ORC +/* Skip 2: + * event_triggers_post_call() + * trace_event_raw_event_xxx() + */ +# define STACK_SKIP 2 +#else /* - * Skip 3: + * Skip 4: * stacktrace_trigger() * event_triggers_post_call() + * trace_event_buffer_commit() * trace_event_raw_event_xxx() */ -#define STACK_SKIP 3 +#define STACK_SKIP 4 +#endif static void stacktrace_trigger(struct event_trigger_data *data, void *rec) diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index 27f7ad12c4b1..b611cd36e22d 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -154,6 +154,24 @@ function_trace_call(unsigned long ip, unsigned long parent_ip, preempt_enable_notrace(); } +#ifdef CONFIG_UNWINDER_ORC +/* + * Skip 2: + * + * function_stack_trace_call() + * ftrace_call() + */ +#define STACK_SKIP 2 +#else +/* + * Skip 3: + * __trace_stack() + * function_stack_trace_call() + * ftrace_call() + */ +#define STACK_SKIP 3 +#endif + static void function_stack_trace_call(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *op, struct pt_regs *pt_regs) @@ -180,15 +198,7 @@ function_stack_trace_call(unsigned long ip, unsigned long parent_ip, if (likely(disabled == 1)) { pc = preempt_count(); trace_function(tr, ip, parent_ip, flags, pc); - /* - * skip over 5 funcs: - * __ftrace_trace_stack, - * __trace_stack, - * function_stack_trace_call - * ftrace_list_func - * ftrace_call - */ - __trace_stack(tr, flags, 5, pc); + __trace_stack(tr, flags, STACK_SKIP, pc); } atomic_dec(&data->disabled); @@ -367,14 +377,27 @@ ftrace_traceoff(unsigned long ip, unsigned long parent_ip, tracer_tracing_off(tr); } +#ifdef CONFIG_UNWINDER_ORC /* - * Skip 4: + * Skip 3: + * + * function_trace_probe_call() + * ftrace_ops_assist_func() + * ftrace_call() + */ +#define FTRACE_STACK_SKIP 3 +#else +/* + * Skip 5: + * + * __trace_stack() * ftrace_stacktrace() * function_trace_probe_call() - * ftrace_ops_list_func() + * ftrace_ops_assist_func() * ftrace_call() */ -#define STACK_SKIP 4 +#define FTRACE_STACK_SKIP 5 +#endif static __always_inline void trace_stack(struct trace_array *tr) { @@ -384,7 +407,7 @@ static __always_inline void trace_stack(struct trace_array *tr) local_save_flags(flags); pc = preempt_count(); - __trace_stack(tr, flags, STACK_SKIP, pc); + __trace_stack(tr, flags, FTRACE_STACK_SKIP, pc); } static void diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 492700c5fb4d..1fad24acd444 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -21,6 +21,7 @@ #include <linux/module.h> #include <linux/uaccess.h> #include <linux/rculist.h> +#include <linux/error-injection.h> #include "trace_probe.h" @@ -42,7 +43,6 @@ struct trace_kprobe { (offsetof(struct trace_kprobe, tp.args) + \ (sizeof(struct probe_arg) * (n))) - static nokprobe_inline bool trace_kprobe_is_return(struct trace_kprobe *tk) { return tk->rp.handler != NULL; @@ -87,6 +87,30 @@ static nokprobe_inline unsigned long trace_kprobe_nhit(struct trace_kprobe *tk) return nhit; } +bool trace_kprobe_on_func_entry(struct trace_event_call *call) +{ + struct trace_kprobe *tk = (struct trace_kprobe *)call->data; + + return kprobe_on_func_entry(tk->rp.kp.addr, + tk->rp.kp.addr ? NULL : tk->rp.kp.symbol_name, + tk->rp.kp.addr ? 0 : tk->rp.kp.offset); +} + +bool trace_kprobe_error_injectable(struct trace_event_call *call) +{ + struct trace_kprobe *tk = (struct trace_kprobe *)call->data; + unsigned long addr; + + if (tk->symbol) { + addr = (unsigned long) + kallsyms_lookup_name(trace_kprobe_symbol(tk)); + addr += tk->rp.kp.offset; + } else { + addr = (unsigned long)tk->rp.kp.addr; + } + return within_error_injection_list(addr); +} + static int register_kprobe_event(struct trace_kprobe *tk); static int unregister_kprobe_event(struct trace_kprobe *tk); @@ -1170,7 +1194,7 @@ static int kretprobe_event_define_fields(struct trace_event_call *event_call) #ifdef CONFIG_PERF_EVENTS /* Kprobe profile handler */ -static void +static int kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs) { struct trace_event_call *call = &tk->tp.call; @@ -1179,12 +1203,31 @@ kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs) int size, __size, dsize; int rctx; - if (bpf_prog_array_valid(call) && !trace_call_bpf(call, regs)) - return; + if (bpf_prog_array_valid(call)) { + unsigned long orig_ip = instruction_pointer(regs); + int ret; + + ret = trace_call_bpf(call, regs); + + /* + * We need to check and see if we modified the pc of the + * pt_regs, and if so clear the kprobe and return 1 so that we + * don't do the single stepping. + * The ftrace kprobe handler leaves it up to us to re-enable + * preemption here before returning if we've modified the ip. + */ + if (orig_ip != instruction_pointer(regs)) { + reset_current_kprobe(); + preempt_enable_no_resched(); + return 1; + } + if (!ret) + return 0; + } head = this_cpu_ptr(call->perf_events); if (hlist_empty(head)) - return; + return 0; dsize = __get_data_size(&tk->tp, regs); __size = sizeof(*entry) + tk->tp.size + dsize; @@ -1193,13 +1236,14 @@ kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs) entry = perf_trace_buf_alloc(size, NULL, &rctx); if (!entry) - return; + return 0; entry->ip = (unsigned long)tk->rp.kp.addr; memset(&entry[1], 0, dsize); store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize); perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs, head, NULL); + return 0; } NOKPROBE_SYMBOL(kprobe_perf_func); @@ -1275,6 +1319,7 @@ static int kprobe_register(struct trace_event_call *event, static int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs) { struct trace_kprobe *tk = container_of(kp, struct trace_kprobe, rp.kp); + int ret = 0; raw_cpu_inc(*tk->nhit); @@ -1282,9 +1327,9 @@ static int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs) kprobe_trace_func(tk, regs); #ifdef CONFIG_PERF_EVENTS if (tk->tp.flags & TP_FLAG_PROFILE) - kprobe_perf_func(tk, regs); + ret = kprobe_perf_func(tk, regs); #endif - return 0; /* We don't tweek kernel, so just return 0 */ + return ret; } NOKPROBE_SYMBOL(kprobe_dispatcher); diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index fb66e3eaa192..e101c5bb9eda 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -252,6 +252,8 @@ struct symbol_cache; unsigned long update_symbol_cache(struct symbol_cache *sc); void free_symbol_cache(struct symbol_cache *sc); struct symbol_cache *alloc_symbol_cache(const char *sym, long offset); +bool trace_kprobe_on_func_entry(struct trace_event_call *call); +bool trace_kprobe_error_injectable(struct trace_event_call *call); #else /* uprobes do not support symbol fetch methods */ #define fetch_symbol_u8 NULL @@ -277,6 +279,16 @@ alloc_symbol_cache(const char *sym, long offset) { return NULL; } + +static inline bool trace_kprobe_on_func_entry(struct trace_event_call *call) +{ + return false; +} + +static inline bool trace_kprobe_error_injectable(struct trace_event_call *call) +{ + return false; +} #endif /* CONFIG_KPROBE_EVENTS */ struct probe_arg { diff --git a/kernel/trace/trace_selftest_dynamic.c b/kernel/trace/trace_selftest_dynamic.c index 8cda06a10d66..c364cf777e1a 100644 --- a/kernel/trace/trace_selftest_dynamic.c +++ b/kernel/trace/trace_selftest_dynamic.c @@ -1,13 +1,14 @@ // SPDX-License-Identifier: GPL-2.0 +#include <linux/compiler.h> #include "trace.h" -int DYN_FTRACE_TEST_NAME(void) +noinline __noclone int DYN_FTRACE_TEST_NAME(void) { /* used to call mcount */ return 0; } -int DYN_FTRACE_TEST_NAME2(void) +noinline __noclone int DYN_FTRACE_TEST_NAME2(void) { /* used to call mcount */ return 0; diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 734accc02418..3c7bfc4bf5e9 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -209,6 +209,10 @@ stack_trace_call(unsigned long ip, unsigned long parent_ip, if (__this_cpu_read(disable_stack_tracer) != 1) goto out; + /* If rcu is not watching, then save stack trace can fail */ + if (!rcu_is_watching()) + goto out; + ip += MCOUNT_INSN_SIZE; check_stack(ip, &stack); diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 40592e7b3568..268029ae1be6 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -608,7 +608,7 @@ static int probes_seq_show(struct seq_file *m, void *v) /* Don't print "0x (null)" when offset is 0 */ if (tu->offset) { - seq_printf(m, "0x%p", (void *)tu->offset); + seq_printf(m, "0x%px", (void *)tu->offset); } else { switch (sizeof(void *)) { case 4: diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index 685c50ae6300..671b13457387 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -212,11 +212,10 @@ static int tracepoint_add_func(struct tracepoint *tp, } /* - * rcu_assign_pointer has a smp_wmb() which makes sure that the new - * probe callbacks array is consistent before setting a pointer to it. - * This array is referenced by __DO_TRACE from - * include/linux/tracepoints.h. A matching smp_read_barrier_depends() - * is used. + * rcu_assign_pointer has as smp_store_release() which makes sure + * that the new probe callbacks array is consistent before setting + * a pointer to it. This array is referenced by __DO_TRACE from + * include/linux/tracepoint.h using rcu_dereference_sched(). */ rcu_assign_pointer(tp->funcs, tp_funcs); if (!static_key_enabled(&tp->key)) diff --git a/kernel/uid16.c b/kernel/uid16.c index ce74a4901d2b..ef1da2a5f9bd 100644 --- a/kernel/uid16.c +++ b/kernel/uid16.c @@ -192,6 +192,7 @@ SYSCALL_DEFINE2(setgroups16, int, gidsetsize, old_gid_t __user *, grouplist) return retval; } + groups_sort(group_info); retval = set_current_groups(group_info); put_group_info(group_info); diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 8fdb710bfdd7..017044c26233 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -38,7 +38,6 @@ #include <linux/hardirq.h> #include <linux/mempolicy.h> #include <linux/freezer.h> -#include <linux/kallsyms.h> #include <linux/debug_locks.h> #include <linux/lockdep.h> #include <linux/idr.h> @@ -48,6 +47,8 @@ #include <linux/nodemask.h> #include <linux/moduleparam.h> #include <linux/uaccess.h> +#include <linux/sched/isolation.h> +#include <linux/nmi.h> #include "workqueue_internal.h" @@ -1634,7 +1635,7 @@ static void worker_enter_idle(struct worker *worker) mod_timer(&pool->idle_timer, jiffies + IDLE_WORKER_TIMEOUT); /* - * Sanity check nr_running. Because wq_unbind_fn() releases + * Sanity check nr_running. Because unbind_workers() releases * pool->lock between setting %WORKER_UNBOUND and zapping * nr_running, the warning may trigger spuriously. Check iff * unbind is not in progress. @@ -2135,7 +2136,7 @@ __acquires(&pool->lock) * stop_machine. At the same time, report a quiescent RCU state so * the same condition doesn't freeze RCU. */ - cond_resched_rcu_qs(); + cond_resched(); spin_lock_irq(&pool->lock); @@ -3806,6 +3807,7 @@ int apply_workqueue_attrs(struct workqueue_struct *wq, return ret; } +EXPORT_SYMBOL_GPL(apply_workqueue_attrs); /** * wq_update_unbound_numa - update NUMA affinity of a wq for CPU hot[un]plug @@ -3939,6 +3941,37 @@ static int wq_clamp_max_active(int max_active, unsigned int flags, return clamp_val(max_active, 1, lim); } +/* + * Workqueues which may be used during memory reclaim should have a rescuer + * to guarantee forward progress. + */ +static int init_rescuer(struct workqueue_struct *wq) +{ + struct worker *rescuer; + int ret; + + if (!(wq->flags & WQ_MEM_RECLAIM)) + return 0; + + rescuer = alloc_worker(NUMA_NO_NODE); + if (!rescuer) + return -ENOMEM; + + rescuer->rescue_wq = wq; + rescuer->task = kthread_create(rescuer_thread, rescuer, "%s", wq->name); + ret = PTR_ERR_OR_ZERO(rescuer->task); + if (ret) { + kfree(rescuer); + return ret; + } + + wq->rescuer = rescuer; + kthread_bind_mask(rescuer->task, cpu_possible_mask); + wake_up_process(rescuer->task); + + return 0; +} + struct workqueue_struct *__alloc_workqueue_key(const char *fmt, unsigned int flags, int max_active, @@ -4001,29 +4034,8 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt, if (alloc_and_link_pwqs(wq) < 0) goto err_free_wq; - /* - * Workqueues which may be used during memory reclaim should - * have a rescuer to guarantee forward progress. - */ - if (flags & WQ_MEM_RECLAIM) { - struct worker *rescuer; - - rescuer = alloc_worker(NUMA_NO_NODE); - if (!rescuer) - goto err_destroy; - - rescuer->rescue_wq = wq; - rescuer->task = kthread_create(rescuer_thread, rescuer, "%s", - wq->name); - if (IS_ERR(rescuer->task)) { - kfree(rescuer); - goto err_destroy; - } - - wq->rescuer = rescuer; - kthread_bind_mask(rescuer->task, cpu_possible_mask); - wake_up_process(rescuer->task); - } + if (wq_online && init_rescuer(wq) < 0) + goto err_destroy; if ((wq->flags & WQ_SYSFS) && workqueue_sysfs_register(wq)) goto err_destroy; @@ -4463,6 +4475,12 @@ void show_workqueue_state(void) if (pwq->nr_active || !list_empty(&pwq->delayed_works)) show_pwq(pwq); spin_unlock_irqrestore(&pwq->pool->lock, flags); + /* + * We could be printing a lot from atomic context, e.g. + * sysrq-t -> show_workqueue_state(). Avoid triggering + * hard lockup. + */ + touch_nmi_watchdog(); } } @@ -4490,6 +4508,12 @@ void show_workqueue_state(void) pr_cont("\n"); next_pool: spin_unlock_irqrestore(&pool->lock, flags); + /* + * We could be printing a lot from atomic context, e.g. + * sysrq-t -> show_workqueue_state(). Avoid triggering + * hard lockup. + */ + touch_nmi_watchdog(); } rcu_read_unlock_sched(); @@ -4510,9 +4534,8 @@ void show_workqueue_state(void) * cpu comes back online. */ -static void wq_unbind_fn(struct work_struct *work) +static void unbind_workers(int cpu) { - int cpu = smp_processor_id(); struct worker_pool *pool; struct worker *worker; @@ -4589,16 +4612,6 @@ static void rebind_workers(struct worker_pool *pool) spin_lock_irq(&pool->lock); - /* - * XXX: CPU hotplug notifiers are weird and can call DOWN_FAILED - * w/o preceding DOWN_PREPARE. Work around it. CPU hotplug is - * being reworked and this can go away in time. - */ - if (!(pool->flags & POOL_DISASSOCIATED)) { - spin_unlock_irq(&pool->lock); - return; - } - pool->flags &= ~POOL_DISASSOCIATED; for_each_pool_worker(worker, pool) { @@ -4709,12 +4722,13 @@ int workqueue_online_cpu(unsigned int cpu) int workqueue_offline_cpu(unsigned int cpu) { - struct work_struct unbind_work; struct workqueue_struct *wq; /* unbinding per-cpu workers should happen on the local CPU */ - INIT_WORK_ONSTACK(&unbind_work, wq_unbind_fn); - queue_work_on(cpu, system_highpri_wq, &unbind_work); + if (WARN_ON(cpu != smp_processor_id())) + return -1; + + unbind_workers(cpu); /* update NUMA affinity of unbound workqueues */ mutex_lock(&wq_pool_mutex); @@ -4722,9 +4736,6 @@ int workqueue_offline_cpu(unsigned int cpu) wq_update_unbound_numa(wq, cpu, false); mutex_unlock(&wq_pool_mutex); - /* wait for per-cpu unbinding to finish */ - flush_work(&unbind_work); - destroy_work_on_stack(&unbind_work); return 0; } @@ -4957,6 +4968,10 @@ int workqueue_set_unbound_cpumask(cpumask_var_t cpumask) if (!zalloc_cpumask_var(&saved_cpumask, GFP_KERNEL)) return -ENOMEM; + /* + * Not excluding isolated cpus on purpose. + * If the user wishes to include them, we allow that. + */ cpumask_and(cpumask, cpumask, cpu_possible_mask); if (!cpumask_empty(cpumask)) { apply_wqattrs_lock(); @@ -5555,7 +5570,7 @@ int __init workqueue_init_early(void) WARN_ON(__alignof__(struct pool_workqueue) < __alignof__(long long)); BUG_ON(!alloc_cpumask_var(&wq_unbound_cpumask, GFP_KERNEL)); - cpumask_copy(wq_unbound_cpumask, cpu_possible_mask); + cpumask_copy(wq_unbound_cpumask, housekeeping_cpumask(HK_FLAG_DOMAIN)); pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC); @@ -5638,6 +5653,8 @@ int __init workqueue_init(void) * archs such as power and arm64. As per-cpu pools created * previously could be missing node hint and unbound pools NUMA * affinity, fix them up. + * + * Also, while iterating workqueues, create rescuers if requested. */ wq_numa_init(); @@ -5649,8 +5666,12 @@ int __init workqueue_init(void) } } - list_for_each_entry(wq, &workqueues, list) + list_for_each_entry(wq, &workqueues, list) { wq_update_unbound_numa(wq, smp_processor_id(), true); + WARN(init_rescuer(wq), + "workqueue: failed to create early rescuer for %s", + wq->name); + } mutex_unlock(&wq_pool_mutex); |