From 9dd957485d7d896ec18d8e2f9dd410efe71eca34 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 3 Jul 2017 00:42:43 -0400 Subject: ipc, kernel, mm: annotate ->poll() instances Signed-off-by: Al Viro --- kernel/events/core.c | 4 ++-- kernel/printk/printk.c | 4 ++-- kernel/relay.c | 4 ++-- kernel/time/posix-clock.c | 4 ++-- kernel/trace/trace.c | 6 +++--- 5 files changed, 11 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 16beab4767e1..857c40d98d2c 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -4511,11 +4511,11 @@ perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) return ret; } -static unsigned int perf_poll(struct file *file, poll_table *wait) +static __poll_t perf_poll(struct file *file, poll_table *wait) { struct perf_event *event = file->private_data; struct ring_buffer *rb; - unsigned int events = POLLHUP; + __poll_t events = POLLHUP; poll_wait(file, &event->waitq, wait); diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 5d81206a572d..8aa27be96012 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -920,10 +920,10 @@ static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence) return ret; } -static unsigned int devkmsg_poll(struct file *file, poll_table *wait) +static __poll_t devkmsg_poll(struct file *file, poll_table *wait) { struct devkmsg_user *user = file->private_data; - int ret = 0; + __poll_t ret = 0; if (!user) return POLLERR|POLLNVAL; diff --git a/kernel/relay.c b/kernel/relay.c index 39a9dfc69486..41280033a4c5 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -919,9 +919,9 @@ static int relay_file_mmap(struct file *filp, struct vm_area_struct *vma) * * Poll implemention. */ -static unsigned int relay_file_poll(struct file *filp, poll_table *wait) +static __poll_t relay_file_poll(struct file *filp, poll_table *wait) { - unsigned int mask = 0; + __poll_t mask = 0; struct rchan_buf *buf = filp->private_data; if (buf->finalized) diff --git a/kernel/time/posix-clock.c b/kernel/time/posix-clock.c index 17cdc554c9fe..36b04e7fcf62 100644 --- a/kernel/time/posix-clock.c +++ b/kernel/time/posix-clock.c @@ -68,10 +68,10 @@ static ssize_t posix_clock_read(struct file *fp, char __user *buf, return err; } -static unsigned int posix_clock_poll(struct file *fp, poll_table *wait) +static __poll_t posix_clock_poll(struct file *fp, poll_table *wait) { struct posix_clock *clk = get_posix_clock(fp); - unsigned int result = 0; + __poll_t result = 0; if (!clk) return POLLERR; diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 73e67b68c53b..1e2a45e87b93 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -5632,7 +5632,7 @@ static int tracing_release_pipe(struct inode *inode, struct file *file) return 0; } -static unsigned int +static __poll_t trace_poll(struct trace_iterator *iter, struct file *filp, poll_table *poll_table) { struct trace_array *tr = iter->tr; @@ -5651,7 +5651,7 @@ trace_poll(struct trace_iterator *iter, struct file *filp, poll_table *poll_tabl filp, poll_table); } -static unsigned int +static __poll_t tracing_poll_pipe(struct file *filp, poll_table *poll_table) { struct trace_iterator *iter = filp->private_data; @@ -6605,7 +6605,7 @@ static int tracing_buffers_open(struct inode *inode, struct file *filp) return ret; } -static unsigned int +static __poll_t tracing_buffers_poll(struct file *filp, poll_table *poll_table) { struct ftrace_buffer_info *info = filp->private_data; -- cgit v1.2.3 From 
ecf927000ce3265e9871c79d43c10ceed8bd61c9 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 16 Jul 2017 22:11:54 -0400 Subject: ring_buffer_poll_wait() return value used as return value of ->poll() Signed-off-by: Al Viro --- include/linux/ring_buffer.h | 2 +- kernel/trace/ring_buffer.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h index 289e4d54e3e0..7d9eb39fa76a 100644 --- a/include/linux/ring_buffer.h +++ b/include/linux/ring_buffer.h @@ -96,7 +96,7 @@ __ring_buffer_alloc(unsigned long size, unsigned flags, struct lock_class_key *k }) int ring_buffer_wait(struct ring_buffer *buffer, int cpu, bool full); -int ring_buffer_poll_wait(struct ring_buffer *buffer, int cpu, +__poll_t ring_buffer_poll_wait(struct ring_buffer *buffer, int cpu, struct file *filp, poll_table *poll_table); diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 91874a95060d..d24d48713ef3 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -626,7 +626,7 @@ int ring_buffer_wait(struct ring_buffer *buffer, int cpu, bool full) * Returns POLLIN | POLLRDNORM if data exists in the buffers, * zero otherwise. */ -int ring_buffer_poll_wait(struct ring_buffer *buffer, int cpu, +__poll_t ring_buffer_poll_wait(struct ring_buffer *buffer, int cpu, struct file *filp, poll_table *poll_table) { struct ring_buffer_per_cpu *cpu_buffer; -- cgit v1.2.3 From f06eae831f0c1fc5b982ea200daf552810e1dd55 Mon Sep 17 00:00:00 2001 From: Tycho Andersen Date: Wed, 11 Oct 2017 09:39:20 -0600 Subject: seccomp: hoist out filter resolving logic Hoist out the nth filter resolving logic that ptrace uses into a new function. We'll use this in the next patch to implement the new PTRACE_SECCOMP_GET_FILTER_FLAGS command. Signed-off-by: Tycho Andersen CC: Kees Cook CC: Andy Lutomirski CC: Oleg Nesterov Signed-off-by: Kees Cook --- kernel/seccomp.c | 77 +++++++++++++++++++++++++++++++++----------------------- 1 file changed, 45 insertions(+), 32 deletions(-) (limited to 'kernel') diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 5f0dfb2abb8d..99bddaf79076 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -978,49 +978,68 @@ long prctl_set_seccomp(unsigned long seccomp_mode, char __user *filter) } #if defined(CONFIG_SECCOMP_FILTER) && defined(CONFIG_CHECKPOINT_RESTORE) -long seccomp_get_filter(struct task_struct *task, unsigned long filter_off, - void __user *data) +static struct seccomp_filter *get_nth_filter(struct task_struct *task, + unsigned long filter_off) { - struct seccomp_filter *filter; - struct sock_fprog_kern *fprog; - long ret; - unsigned long count = 0; - - if (!capable(CAP_SYS_ADMIN) || - current->seccomp.mode != SECCOMP_MODE_DISABLED) { - return -EACCES; - } + struct seccomp_filter *orig, *filter; + unsigned long count; + /* + * Note: this is only correct because the caller should be the (ptrace) + * tracer of the task, otherwise lock_task_sighand is needed. 
+ */ spin_lock_irq(&task->sighand->siglock); + if (task->seccomp.mode != SECCOMP_MODE_FILTER) { - ret = -EINVAL; - goto out; + spin_unlock_irq(&task->sighand->siglock); + return ERR_PTR(-EINVAL); } - filter = task->seccomp.filter; - while (filter) { - filter = filter->prev; + orig = task->seccomp.filter; + __get_seccomp_filter(orig); + spin_unlock_irq(&task->sighand->siglock); + + count = 0; + for (filter = orig; filter; filter = filter->prev) count++; - } if (filter_off >= count) { - ret = -ENOENT; + filter = ERR_PTR(-ENOENT); goto out; } - count -= filter_off; - filter = task->seccomp.filter; - while (filter && count > 1) { - filter = filter->prev; + count -= filter_off; + for (filter = orig; filter && count > 1; filter = filter->prev) count--; - } if (WARN_ON(count != 1 || !filter)) { - /* The filter tree shouldn't shrink while we're using it. */ - ret = -ENOENT; + filter = ERR_PTR(-ENOENT); goto out; } + __get_seccomp_filter(filter); + +out: + __put_seccomp_filter(orig); + return filter; +} + +long seccomp_get_filter(struct task_struct *task, unsigned long filter_off, + void __user *data) +{ + struct seccomp_filter *filter; + struct sock_fprog_kern *fprog; + long ret; + + if (!capable(CAP_SYS_ADMIN) || + current->seccomp.mode != SECCOMP_MODE_DISABLED) { + return -EACCES; + } + + filter = get_nth_filter(task, filter_off); + if (IS_ERR(filter)) + return PTR_ERR(filter); + fprog = filter->prog->orig_prog; if (!fprog) { /* This must be a new non-cBPF filter, since we save @@ -1035,17 +1054,11 @@ long seccomp_get_filter(struct task_struct *task, unsigned long filter_off, if (!data) goto out; - __get_seccomp_filter(filter); - spin_unlock_irq(&task->sighand->siglock); - if (copy_to_user(data, fprog->filter, bpf_classic_proglen(fprog))) ret = -EFAULT; - __put_seccomp_filter(filter); - return ret; - out: - spin_unlock_irq(&task->sighand->siglock); + __put_seccomp_filter(filter); return ret; } #endif -- cgit v1.2.3 From 26500475ac1b499d8636ff281311d633909f5d20 Mon Sep 17 00:00:00 2001 From: Tycho Andersen Date: Wed, 11 Oct 2017 09:39:21 -0600 Subject: ptrace, seccomp: add support for retrieving seccomp metadata With the new SECCOMP_FILTER_FLAG_LOG, we need to be able to extract these flags for checkpoint restore, since they describe the state of a filter. So, let's add PTRACE_SECCOMP_GET_METADATA, similar to ..._GET_FILTER, which returns the metadata of the nth filter (right now, just the flags). Hopefully this will be future proof, and new per-filter metadata can be added to this struct. 
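A minimal tracer-side sketch (illustrative, not part of the patch: the tracer is assumed to be already attached to pid, headers and full error handling are elided; per the kernel side, the ptrace addr argument carries the size of the struct and data points at it):

	struct seccomp_metadata md = { .filter_off = 0 };

	if (ptrace(PTRACE_SECCOMP_GET_METADATA, pid, sizeof(md), &md) < 0)
		perror("PTRACE_SECCOMP_GET_METADATA");
	else if (md.flags & SECCOMP_FILTER_FLAG_LOG)
		printf("filter %lu was installed with FLAG_LOG\n", md.filter_off);

On success the call returns the number of bytes copied (the smaller of the caller's size and the kernel's struct size), which is what lets old and new userspace keep working as fields are appended to the struct.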
Signed-off-by: Tycho Andersen CC: Kees Cook CC: Andy Lutomirski CC: Oleg Nesterov Signed-off-by: Kees Cook --- include/linux/seccomp.h | 8 ++++++++ include/uapi/linux/ptrace.h | 6 ++++++ kernel/ptrace.c | 4 ++++ kernel/seccomp.c | 33 +++++++++++++++++++++++++++++++++ 4 files changed, 51 insertions(+) (limited to 'kernel') diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h index 10f25f7e4304..c723a5c4e3ff 100644 --- a/include/linux/seccomp.h +++ b/include/linux/seccomp.h @@ -95,11 +95,19 @@ static inline void get_seccomp_filter(struct task_struct *tsk) #if defined(CONFIG_SECCOMP_FILTER) && defined(CONFIG_CHECKPOINT_RESTORE) extern long seccomp_get_filter(struct task_struct *task, unsigned long filter_off, void __user *data); +extern long seccomp_get_metadata(struct task_struct *task, + unsigned long filter_off, void __user *data); #else static inline long seccomp_get_filter(struct task_struct *task, unsigned long n, void __user *data) { return -EINVAL; } +static inline long seccomp_get_metadata(struct task_struct *task, + unsigned long filter_off, + void __user *data) +{ + return -EINVAL; +} #endif /* CONFIG_SECCOMP_FILTER && CONFIG_CHECKPOINT_RESTORE */ #endif /* _LINUX_SECCOMP_H */ diff --git a/include/uapi/linux/ptrace.h b/include/uapi/linux/ptrace.h index e3939e00980b..e46d82b91166 100644 --- a/include/uapi/linux/ptrace.h +++ b/include/uapi/linux/ptrace.h @@ -66,6 +66,12 @@ struct ptrace_peeksiginfo_args { #define PTRACE_SETSIGMASK 0x420b #define PTRACE_SECCOMP_GET_FILTER 0x420c +#define PTRACE_SECCOMP_GET_METADATA 0x420d + +struct seccomp_metadata { + unsigned long filter_off; /* Input: which filter */ + unsigned int flags; /* Output: filter's flags */ +}; /* Read signals from a shared (process wide) queue */ #define PTRACE_PEEKSIGINFO_SHARED (1 << 0) diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 84b1367935e4..58291e9f3276 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -1092,6 +1092,10 @@ int ptrace_request(struct task_struct *child, long request, ret = seccomp_get_filter(child, addr, datavp); break; + case PTRACE_SECCOMP_GET_METADATA: + ret = seccomp_get_metadata(child, addr, datavp); + break; + default: break; } diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 99bddaf79076..61bd9dc260c8 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -1061,6 +1061,39 @@ out: __put_seccomp_filter(filter); return ret; } + +long seccomp_get_metadata(struct task_struct *task, + unsigned long size, void __user *data) +{ + long ret; + struct seccomp_filter *filter; + struct seccomp_metadata kmd = {}; + + if (!capable(CAP_SYS_ADMIN) || + current->seccomp.mode != SECCOMP_MODE_DISABLED) { + return -EACCES; + } + + size = min_t(unsigned long, size, sizeof(kmd)); + + if (copy_from_user(&kmd, data, size)) + return -EFAULT; + + filter = get_nth_filter(task, kmd.filter_off); + if (IS_ERR(filter)) + return PTR_ERR(filter); + + memset(&kmd, 0, sizeof(kmd)); + if (filter->log) + kmd.flags |= SECCOMP_FILTER_FLAG_LOG; + + ret = size; + if (copy_to_user(data, &kmd, size)) + ret = -EFAULT; + + __put_seccomp_filter(filter); + return ret; +} #endif #ifdef CONFIG_SYSCTL -- cgit v1.2.3 From 12a3cc8424fe1237aaeb982dec4f0914ddd22f3e Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 30 Nov 2017 21:31:35 -0800 Subject: bpf: fix stack state printing in verifier log fix incorrect stack state prints in print_verifier_state() Fixes: 638f5b90d460 ("bpf: reduce verifier memory consumption") Signed-off-by: Alexei Starovoitov Acked-by: John Fastabend Acked-by: Daniel Borkmann 
Signed-off-by: Daniel Borkmann --- kernel/bpf/verifier.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index d4593571c404..71a9429fdbb5 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -279,7 +279,7 @@ static void print_verifier_state(struct bpf_verifier_env *env, for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) { if (state->stack[i].slot_type[0] == STACK_SPILL) verbose(env, " fp%d=%s", - -MAX_BPF_STACK + i * BPF_REG_SIZE, + (-i - 1) * BPF_REG_SIZE, reg_type_str[state->stack[i].spilled_ptr.type]); } verbose(env, "\n"); -- cgit v1.2.3 From 4e92024a48ecbd06fba3ccfb2174abd3d2f54a83 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 30 Nov 2017 21:31:36 -0800 Subject: bpf: print liveness info to verifier log let verifier print register and stack liveness information into verifier log Signed-off-by: Alexei Starovoitov Acked-by: John Fastabend Acked-by: Daniel Borkmann Signed-off-by: Daniel Borkmann --- kernel/bpf/verifier.c | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 71a9429fdbb5..f7229390c279 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -216,6 +216,17 @@ static const char * const reg_type_str[] = { [PTR_TO_PACKET_END] = "pkt_end", }; +static void print_liveness(struct bpf_verifier_env *env, + enum bpf_reg_liveness live) +{ + if (live & (REG_LIVE_READ | REG_LIVE_WRITTEN)) + verbose(env, "_"); + if (live & REG_LIVE_READ) + verbose(env, "r"); + if (live & REG_LIVE_WRITTEN) + verbose(env, "w"); +} + static void print_verifier_state(struct bpf_verifier_env *env, struct bpf_verifier_state *state) { @@ -228,7 +239,9 @@ static void print_verifier_state(struct bpf_verifier_env *env, t = reg->type; if (t == NOT_INIT) continue; - verbose(env, " R%d=%s", i, reg_type_str[t]); + verbose(env, " R%d", i); + print_liveness(env, reg->live); + verbose(env, "=%s", reg_type_str[t]); if ((t == SCALAR_VALUE || t == PTR_TO_STACK) && tnum_is_const(reg->var_off)) { /* reg->off should be 0 for SCALAR_VALUE */ @@ -277,10 +290,13 @@ static void print_verifier_state(struct bpf_verifier_env *env, } } for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) { - if (state->stack[i].slot_type[0] == STACK_SPILL) - verbose(env, " fp%d=%s", - (-i - 1) * BPF_REG_SIZE, + if (state->stack[i].slot_type[0] == STACK_SPILL) { + verbose(env, " fp%d", + (-i - 1) * BPF_REG_SIZE); + print_liveness(env, state->stack[i].spilled_ptr.live); + verbose(env, "=%s", reg_type_str[state->stack[i].spilled_ptr.type]); + } } verbose(env, "\n"); } -- cgit v1.2.3 From 19ceb4178d31e543479d75e20c2f9df08f16632f Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 30 Nov 2017 21:31:37 -0800 Subject: bpf: don't mark FP reg as uninit when verifier hits an internal bug don't mark register R10==FP as uninit, since it's read only register and it's not technically correct to let verifier run further, since it may assume that R10 has valid auxiliary state. While developing subsequent patches this issue was discovered, though the code eventually changed that aux reg state doesn't have pointers any more it is still safer to avoid clearing readonly register. 
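For background (not part of the patch): R10 is the BPF frame pointer, the one read-only register, and every stack slot is addressed through it. An illustrative spill/fill pair, written with the insn macros from include/linux/filter.h:

	BPF_STX_MEM(BPF_DW, BPF_REG_FP, BPF_REG_1, -8), /* *(u64 *)(fp - 8) = r1 */
	BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_FP, -8), /* r2 = *(u64 *)(fp - 8) */

Wiping R10's state on an internal verifier bug would make even such a basic sequence look invalid, so the error paths above now clear only R0-R9 (regno < BPF_REG_FP) and leave the frame pointer alone.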
Signed-off-by: Alexei Starovoitov Acked-by: John Fastabend Acked-by: Daniel Borkmann Signed-off-by: Daniel Borkmann --- kernel/bpf/verifier.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index f7229390c279..14ad7c6e806a 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -584,8 +584,8 @@ static void mark_reg_unknown(struct bpf_verifier_env *env, { if (WARN_ON(regno >= MAX_BPF_REG)) { verbose(env, "mark_reg_unknown(regs, %u)\n", regno); - /* Something bad happened, let's kill all regs */ - for (regno = 0; regno < MAX_BPF_REG; regno++) + /* Something bad happened, let's kill all regs except FP */ + for (regno = 0; regno < BPF_REG_FP; regno++) __mark_reg_not_init(regs + regno); return; } @@ -603,8 +603,8 @@ static void mark_reg_not_init(struct bpf_verifier_env *env, { if (WARN_ON(regno >= MAX_BPF_REG)) { verbose(env, "mark_reg_not_init(regs, %u)\n", regno); - /* Something bad happened, let's kill all regs */ - for (regno = 0; regno < MAX_BPF_REG; regno++) + /* Something bad happened, let's kill all regs except FP */ + for (regno = 0; regno < BPF_REG_FP; regno++) __mark_reg_not_init(regs + regno); return; } -- cgit v1.2.3 From 2f18f62ee1648b2c93e6ab4a58d548010b0a67e4 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 30 Nov 2017 21:31:38 -0800 Subject: bpf: improve verifier liveness marks registers with pointers filled from stack were missing live_written marks which caused liveness propagation to unnecessary mark more registers as live_read and miss state pruning opportunities later on. before after bpf_lb-DLB_L3.o 2285 2270 bpf_lb-DLB_L4.o 3723 3682 bpf_lb-DUNKNOWN.o 1110 1110 bpf_lxc-DDROP_ALL.o 27954 27876 bpf_lxc-DUNKNOWN.o 38954 38780 bpf_netdev.o 16943 16937 bpf_overlay.o 7929 7929 Signed-off-by: Alexei Starovoitov Acked-by: John Fastabend Acked-by: Daniel Borkmann Signed-off-by: Daniel Borkmann --- kernel/bpf/verifier.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 14ad7c6e806a..46ff4e5b3fb7 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -795,6 +795,11 @@ static int check_stack_read(struct bpf_verifier_env *env, if (value_regno >= 0) { /* restore register state from stack */ state->regs[value_regno] = state->stack[spi].spilled_ptr; + /* mark reg as written since spilled pointer state likely + * has its liveness marks cleared by is_state_visited() + * which resets stack/reg liveness for state transitions + */ + state->regs[value_regno].live |= REG_LIVE_WRITTEN; mark_stack_slot_read(state, spi); } return 0; -- cgit v1.2.3 From 3bf15921c58df982f9b15d64754c483785bf66f3 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 30 Nov 2017 21:31:39 -0800 Subject: bpf: improve JEQ/JNE path walking verifier knows how to trim paths that are known not to be taken at run-time when register containing run-time constant is compared with another constant. It was done only for JEQ comparison. Extend it to include JNE as well. More cases can be added in the future. 
before after bpf_lb-DLB_L3.o 2270 2051 bpf_lb-DLB_L4.o 3682 3287 bpf_lb-DUNKNOWN.o 1110 1080 bpf_lxc-DDROP_ALL.o 27876 24980 bpf_lxc-DUNKNOWN.o 38780 34308 bpf_netdev.o 16937 15404 bpf_overlay.o 7929 7191 Signed-off-by: Alexei Starovoitov Acked-by: John Fastabend Acked-by: Daniel Borkmann Signed-off-by: Daniel Borkmann --- kernel/bpf/verifier.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 46ff4e5b3fb7..afe9a1a0a5fe 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2955,8 +2955,9 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, if (BPF_SRC(insn->code) == BPF_K && (opcode == BPF_JEQ || opcode == BPF_JNE) && dst_reg->type == SCALAR_VALUE && - tnum_equals_const(dst_reg->var_off, insn->imm)) { - if (opcode == BPF_JEQ) { + tnum_is_const(dst_reg->var_off)) { + if ((opcode == BPF_JEQ && dst_reg->var_off.value == insn->imm) || + (opcode == BPF_JNE && dst_reg->var_off.value != insn->imm)) { /* if (imm == imm) goto pc+off; * only follow the goto, ignore fall-through */ -- cgit v1.2.3 From 914cb781ee1a35f4c7a5173a668d6ba2c4734b91 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 30 Nov 2017 21:31:40 -0800 Subject: bpf: cleanup register_is_null() don't pass large struct bpf_reg_state by value. Instead pass it by pointer. Signed-off-by: Alexei Starovoitov Acked-by: John Fastabend Acked-by: Daniel Borkmann Signed-off-by: Daniel Borkmann --- kernel/bpf/verifier.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index afe9a1a0a5fe..7afa92e9b409 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1265,9 +1265,9 @@ static int check_xadd(struct bpf_verifier_env *env, int insn_idx, struct bpf_ins } /* Does this register contain a constant zero? 
*/ -static bool register_is_null(struct bpf_reg_state reg) +static bool register_is_null(struct bpf_reg_state *reg) { - return reg.type == SCALAR_VALUE && tnum_equals_const(reg.var_off, 0); + return reg->type == SCALAR_VALUE && tnum_equals_const(reg->var_off, 0); } /* when register 'regno' is passed into function that will read 'access_size' @@ -1280,31 +1280,31 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno, int access_size, bool zero_size_allowed, struct bpf_call_arg_meta *meta) { + struct bpf_reg_state *reg = cur_regs(env) + regno; struct bpf_verifier_state *state = env->cur_state; - struct bpf_reg_state *regs = state->regs; int off, i, slot, spi; - if (regs[regno].type != PTR_TO_STACK) { + if (reg->type != PTR_TO_STACK) { /* Allow zero-byte read from NULL, regardless of pointer type */ if (zero_size_allowed && access_size == 0 && - register_is_null(regs[regno])) + register_is_null(reg)) return 0; verbose(env, "R%d type=%s expected=%s\n", regno, - reg_type_str[regs[regno].type], + reg_type_str[reg->type], reg_type_str[PTR_TO_STACK]); return -EACCES; } /* Only allow fixed-offset stack reads */ - if (!tnum_is_const(regs[regno].var_off)) { + if (!tnum_is_const(reg->var_off)) { char tn_buf[48]; - tnum_strn(tn_buf, sizeof(tn_buf), regs[regno].var_off); + tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); verbose(env, "invalid variable stack read R%d var_off=%s\n", regno, tn_buf); } - off = regs[regno].off + regs[regno].var_off.value; + off = reg->off + reg->var_off.value; if (off >= 0 || off < -MAX_BPF_STACK || off + access_size > 0 || access_size < 0 || (access_size == 0 && !zero_size_allowed)) { verbose(env, "invalid stack type R%d off=%d access_size=%d\n", @@ -1412,7 +1412,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, * passed in as argument, it's a SCALAR_VALUE type. Final test * happens during stack boundary checking. */ - if (register_is_null(*reg) && + if (register_is_null(reg) && arg_type == ARG_PTR_TO_MEM_OR_NULL) /* final test in check_stack_boundary() */; else if (!type_is_pkt_pointer(type) && -- cgit v1.2.3 From 43347d56c8d9dd732cee2f8efd384ad21dd1f6c4 Mon Sep 17 00:00:00 2001 From: Miroslav Benes Date: Wed, 15 Nov 2017 14:50:13 +0100 Subject: livepatch: send a fake signal to all blocking tasks Live patching consistency model is of LEAVE_PATCHED_SET and SWITCH_THREAD. This means that all tasks in the system have to be marked one by one as safe to call a new patched function. Safe means when a task is not (sleeping) in a set of patched functions. That is, no patched function is on the task's stack. Another clearly safe place is the boundary between kernel and userspace. The patching waits for all tasks to get outside of the patched set or to cross the boundary. The transition is completed afterwards. The problem is that a task can block the transition for quite a long time, if not forever. It could sleep in a set of patched functions, for example. Luckily we can force the task to leave the set by sending it a fake signal, that is a signal with no data in signal pending structures (no handler, no sign of proper signal delivered). Suspend/freezer use this to freeze the tasks as well. The task gets TIF_SIGPENDING set and is woken up (if it has been sleeping in the kernel before) or kicked by rescheduling IPI (if it was running on other CPU). This causes the task to go to kernel/userspace boundary where the signal would be handled and the task would be marked as safe in terms of live patching. 
There are tasks which are not affected by this technique though. The fake signal is not sent to kthreads. They should be handled differently. They can be woken up so they leave the patched set and their TIF_PATCH_PENDING can be cleared thanks to stack checking. For the sake of completeness, if the task is in TASK_RUNNING state but not currently running on some CPU it doesn't get the IPI, but it would eventually handle the signal anyway. Second, if the task runs in the kernel (in TASK_RUNNING state) it gets the IPI, but the signal is not handled on return from the interrupt. It would be handled on return to the userspace in the future when the fake signal is sent again. Stack checking deals with these cases in a better way. If the task was sleeping in a syscall it would be woken by our fake signal, it would check if TIF_SIGPENDING is set (by calling signal_pending() predicate) and return ERESTART* or EINTR. Syscalls with ERESTART* return values are restarted in case of the fake signal (see do_signal()). EINTR is propagated back to the userspace program. This could disturb the program, but... * each process dealing with signals should react accordingly to EINTR return values. * syscalls returning EINTR happen to be quite common situation in the system even if no fake signal is sent. * freezer sends the fake signal and does not deal with EINTR anyhow. Thus EINTR values are returned when the system is resumed. The very safe marking is done in architectures' "entry" on syscall and interrupt/exception exit paths, and in a stack checking functions of livepatch. TIF_PATCH_PENDING is cleared and the next recalc_sigpending() drops TIF_SIGPENDING. In connection with this, also call klp_update_patch_state() before do_signal(), so that recalc_sigpending() in dequeue_signal() can clear TIF_PATCH_PENDING immediately and thus prevent a double call of do_signal(). Note that the fake signal is not sent to stopped/traced tasks. Such task prevents the patching to finish till it continues again (is not traced anymore). Last, sending the fake signal is not automatic. It is done only when admin requests it by writing 1 to signal sysfs attribute in livepatch sysfs directory. Signed-off-by: Miroslav Benes Cc: Oleg Nesterov Cc: Michael Ellerman Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Andy Lutomirski Cc: linuxppc-dev@lists.ozlabs.org Cc: x86@kernel.org Acked-by: Michael Ellerman (powerpc) Signed-off-by: Jiri Kosina --- Documentation/ABI/testing/sysfs-kernel-livepatch | 12 +++++++ Documentation/livepatch/livepatch.txt | 11 +++++-- arch/powerpc/kernel/signal.c | 6 ++-- arch/x86/entry/common.c | 6 ++-- kernel/livepatch/core.c | 30 +++++++++++++++++ kernel/livepatch/transition.c | 41 ++++++++++++++++++++++++ kernel/livepatch/transition.h | 1 + kernel/signal.c | 4 ++- 8 files changed, 102 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/Documentation/ABI/testing/sysfs-kernel-livepatch b/Documentation/ABI/testing/sysfs-kernel-livepatch index d5d39748382f..3bb9d5bc1ce3 100644 --- a/Documentation/ABI/testing/sysfs-kernel-livepatch +++ b/Documentation/ABI/testing/sysfs-kernel-livepatch @@ -33,6 +33,18 @@ Description: An attribute which indicates whether the patch is currently in transition. +What: /sys/kernel/livepatch//signal +Date: Nov 2017 +KernelVersion: 4.15.0 +Contact: live-patching@vger.kernel.org +Description: + A writable attribute that allows administrator to affect the + course of an existing transition. Writing 1 sends a fake + signal to all remaining blocking tasks. 
The fake signal + means that no proper signal is delivered (there is no data in + signal pending structures). Tasks are interrupted or woken up, + and forced to change their patched state. + What: /sys/kernel/livepatch// Date: Nov 2014 KernelVersion: 3.19.0 diff --git a/Documentation/livepatch/livepatch.txt b/Documentation/livepatch/livepatch.txt index ecdb18104ab0..9bcdef277a36 100644 --- a/Documentation/livepatch/livepatch.txt +++ b/Documentation/livepatch/livepatch.txt @@ -176,8 +176,12 @@ If a patch is in transition, this file shows 0 to indicate the task is unpatched and 1 to indicate it's patched. Otherwise, if no patch is in transition, it shows -1. Any tasks which are blocking the transition can be signaled with SIGSTOP and SIGCONT to force them to change their -patched state. - +patched state. This may be harmful to the system though. +/sys/kernel/livepatch//signal attribute provides a better alternative. +Writing 1 to the attribute sends a fake signal to all remaining blocking +tasks. No proper signal is actually delivered (there is no data in signal +pending structures). Tasks are interrupted or woken up, and forced to change +their patched state. 3.1 Adding consistency model support to new architectures --------------------------------------------------------- @@ -435,6 +439,9 @@ Information about the registered patches can be found under /sys/kernel/livepatch. The patches could be enabled and disabled by writing there. +/sys/kernel/livepatch//signal attribute allows administrator to affect a +patching operation. + See Documentation/ABI/testing/sysfs-kernel-livepatch for more details. diff --git a/arch/powerpc/kernel/signal.c b/arch/powerpc/kernel/signal.c index e9436c5e1e09..bf9c4e7792d1 100644 --- a/arch/powerpc/kernel/signal.c +++ b/arch/powerpc/kernel/signal.c @@ -153,6 +153,9 @@ void do_notify_resume(struct pt_regs *regs, unsigned long thread_info_flags) if (thread_info_flags & _TIF_UPROBE) uprobe_notify_resume(regs); + if (thread_info_flags & _TIF_PATCH_PENDING) + klp_update_patch_state(current); + if (thread_info_flags & _TIF_SIGPENDING) { BUG_ON(regs != current->thread.regs); do_signal(current); @@ -163,9 +166,6 @@ void do_notify_resume(struct pt_regs *regs, unsigned long thread_info_flags) tracehook_notify_resume(regs); } - if (thread_info_flags & _TIF_PATCH_PENDING) - klp_update_patch_state(current); - user_enter(); } diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index d7d3cc24baf4..1e3883e45687 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -153,6 +153,9 @@ static void exit_to_usermode_loop(struct pt_regs *regs, u32 cached_flags) if (cached_flags & _TIF_UPROBE) uprobe_notify_resume(regs); + if (cached_flags & _TIF_PATCH_PENDING) + klp_update_patch_state(current); + /* deal with pending signal delivery */ if (cached_flags & _TIF_SIGPENDING) do_signal(regs); @@ -165,9 +168,6 @@ static void exit_to_usermode_loop(struct pt_regs *regs, u32 cached_flags) if (cached_flags & _TIF_USER_RETURN_NOTIFY) fire_user_return_notifiers(); - if (cached_flags & _TIF_PATCH_PENDING) - klp_update_patch_state(current); - /* Disable IRQs and retry */ local_irq_disable(); diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index de9e45dca70f..88766bd91803 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c @@ -454,6 +454,7 @@ EXPORT_SYMBOL_GPL(klp_enable_patch); * /sys/kernel/livepatch/ * /sys/kernel/livepatch//enabled * /sys/kernel/livepatch//transition + * /sys/kernel/livepatch//signal * /sys/kernel/livepatch// * 
/sys/kernel/livepatch/// */ @@ -528,11 +529,40 @@ static ssize_t transition_show(struct kobject *kobj, patch == klp_transition_patch); } +static ssize_t signal_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct klp_patch *patch; + int ret; + bool val; + + patch = container_of(kobj, struct klp_patch, kobj); + + /* + * klp_mutex lock is not grabbed here intentionally. It is not really + * needed. The race window is harmless and grabbing the lock would only + * hold the action back. + */ + if (patch != klp_transition_patch) + return -EINVAL; + + ret = kstrtobool(buf, &val); + if (ret) + return ret; + + if (val) + klp_send_signals(); + + return count; +} + static struct kobj_attribute enabled_kobj_attr = __ATTR_RW(enabled); static struct kobj_attribute transition_kobj_attr = __ATTR_RO(transition); +static struct kobj_attribute signal_kobj_attr = __ATTR_WO(signal); static struct attribute *klp_patch_attrs[] = { &enabled_kobj_attr.attr, &transition_kobj_attr.attr, + &signal_kobj_attr.attr, NULL }; diff --git a/kernel/livepatch/transition.c b/kernel/livepatch/transition.c index 56add6327736..edcfcb8ebb2d 100644 --- a/kernel/livepatch/transition.c +++ b/kernel/livepatch/transition.c @@ -608,3 +608,44 @@ void klp_copy_process(struct task_struct *child) /* TIF_PATCH_PENDING gets copied in setup_thread_stack() */ } + +/* + * Sends a fake signal to all non-kthread tasks with TIF_PATCH_PENDING set. + * Kthreads with TIF_PATCH_PENDING set are woken up. Only admin can request this + * action currently. + */ +void klp_send_signals(void) +{ + struct task_struct *g, *task; + + pr_notice("signaling remaining tasks\n"); + + read_lock(&tasklist_lock); + for_each_process_thread(g, task) { + if (!klp_patch_pending(task)) + continue; + + /* + * There is a small race here. We could see TIF_PATCH_PENDING + * set and decide to wake up a kthread or send a fake signal. + * Meanwhile the task could migrate itself and the action + * would be meaningless. It is not serious though. + */ + if (task->flags & PF_KTHREAD) { + /* + * Wake up a kthread which sleeps interruptedly and + * still has not been migrated. + */ + wake_up_state(task, TASK_INTERRUPTIBLE); + } else { + /* + * Send fake signal to all non-kthread tasks which are + * still not migrated. 
+ */ + spin_lock_irq(&task->sighand->siglock); + signal_wake_up(task, 0); + spin_unlock_irq(&task->sighand->siglock); + } + } + read_unlock(&tasklist_lock); +} diff --git a/kernel/livepatch/transition.h b/kernel/livepatch/transition.h index 0f6e27c481f9..40522795a5f6 100644 --- a/kernel/livepatch/transition.h +++ b/kernel/livepatch/transition.h @@ -11,5 +11,6 @@ void klp_cancel_transition(void); void klp_start_transition(void); void klp_try_complete_transition(void); void klp_reverse_transition(void); +void klp_send_signals(void); #endif /* _LIVEPATCH_TRANSITION_H */ diff --git a/kernel/signal.c b/kernel/signal.c index 8dcd8825b2de..186143b06d00 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -40,6 +40,7 @@ #include #include #include +#include #define CREATE_TRACE_POINTS #include @@ -163,7 +164,8 @@ void recalc_sigpending_and_wake(struct task_struct *t) void recalc_sigpending(void) { - if (!recalc_sigpending_tsk(current) && !freezing(current)) + if (!recalc_sigpending_tsk(current) && !freezing(current) && + !klp_patch_pending(current)) clear_thread_flag(TIF_SIGPENDING); } -- cgit v1.2.3 From c99a2be790b07752d8cc694434d3450afd4c5a00 Mon Sep 17 00:00:00 2001 From: Miroslav Benes Date: Wed, 22 Nov 2017 11:29:21 +0100 Subject: livepatch: force transition to finish If a task sleeps in a set of patched functions uninterruptedly, it could block the whole transition indefinitely. Thus it may be useful to clear its TIF_PATCH_PENDING to allow the process to finish. Admin can do that now by writing to force sysfs attribute in livepatch sysfs directory. TIF_PATCH_PENDING is then cleared for all tasks and the transition can finish successfully. Important note! Administrator should not use this feature without a clearance from a patch distributor. It must be checked that by doing so the consistency model guarantees are not violated. Removal (rmmod) of patch modules is permanently disabled when the feature is used. It cannot be guaranteed there is no task sleeping in such module. Signed-off-by: Miroslav Benes Acked-by: Josh Poimboeuf Reviewed-by: Petr Mladek Signed-off-by: Jiri Kosina --- Documentation/ABI/testing/sysfs-kernel-livepatch | 14 +++++++++ Documentation/livepatch/livepatch.txt | 18 ++++++++++-- kernel/livepatch/core.c | 30 ++++++++++++++++++++ kernel/livepatch/transition.c | 36 ++++++++++++++++++++++-- kernel/livepatch/transition.h | 1 + 5 files changed, 95 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/Documentation/ABI/testing/sysfs-kernel-livepatch b/Documentation/ABI/testing/sysfs-kernel-livepatch index 3bb9d5bc1ce3..dac7e1e62a8b 100644 --- a/Documentation/ABI/testing/sysfs-kernel-livepatch +++ b/Documentation/ABI/testing/sysfs-kernel-livepatch @@ -45,6 +45,20 @@ Description: signal pending structures). Tasks are interrupted or woken up, and forced to change their patched state. +What: /sys/kernel/livepatch//force +Date: Nov 2017 +KernelVersion: 4.15.0 +Contact: live-patching@vger.kernel.org +Description: + A writable attribute that allows administrator to affect the + course of an existing transition. Writing 1 clears + TIF_PATCH_PENDING flag of all tasks and thus forces the tasks to + the patched or unpatched state. Administrator should not + use this feature without a clearance from a patch + distributor. Removal (rmmod) of patch modules is permanently + disabled when the feature is used. See + Documentation/livepatch/livepatch.txt for more information. 
+ What: /sys/kernel/livepatch// Date: Nov 2014 KernelVersion: 3.19.0 diff --git a/Documentation/livepatch/livepatch.txt b/Documentation/livepatch/livepatch.txt index 9bcdef277a36..896ba8941702 100644 --- a/Documentation/livepatch/livepatch.txt +++ b/Documentation/livepatch/livepatch.txt @@ -183,6 +183,20 @@ tasks. No proper signal is actually delivered (there is no data in signal pending structures). Tasks are interrupted or woken up, and forced to change their patched state. +Administrator can also affect a transition through +/sys/kernel/livepatch//force attribute. Writing 1 there clears +TIF_PATCH_PENDING flag of all tasks and thus forces the tasks to the patched +state. Important note! The force attribute is intended for cases when the +transition gets stuck for a long time because of a blocking task. Administrator +is expected to collect all necessary data (namely stack traces of such blocking +tasks) and request a clearance from a patch distributor to force the transition. +Unauthorized usage may cause harm to the system. It depends on the nature of the +patch, which functions are (un)patched, and which functions the blocking tasks +are sleeping in (/proc//stack may help here). Removal (rmmod) of patch +modules is permanently disabled when the force feature is used. It cannot be +guaranteed there is no task sleeping in such module. It implies unbounded +reference count if a patch module is disabled and enabled in a loop. + 3.1 Adding consistency model support to new architectures --------------------------------------------------------- @@ -439,8 +453,8 @@ Information about the registered patches can be found under /sys/kernel/livepatch. The patches could be enabled and disabled by writing there. -/sys/kernel/livepatch//signal attribute allows administrator to affect a -patching operation. +/sys/kernel/livepatch//signal and /sys/kernel/livepatch//force +attributes allow administrator to affect a patching operation. See Documentation/ABI/testing/sysfs-kernel-livepatch for more details. diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index 88766bd91803..1c3c9b27c916 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c @@ -455,6 +455,7 @@ EXPORT_SYMBOL_GPL(klp_enable_patch); * /sys/kernel/livepatch//enabled * /sys/kernel/livepatch//transition * /sys/kernel/livepatch//signal + * /sys/kernel/livepatch//force * /sys/kernel/livepatch// * /sys/kernel/livepatch/// */ @@ -556,13 +557,42 @@ static ssize_t signal_store(struct kobject *kobj, struct kobj_attribute *attr, return count; } +static ssize_t force_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t count) +{ + struct klp_patch *patch; + int ret; + bool val; + + patch = container_of(kobj, struct klp_patch, kobj); + + /* + * klp_mutex lock is not grabbed here intentionally. It is not really + * needed. The race window is harmless and grabbing the lock would only + * hold the action back. 
+ */ + if (patch != klp_transition_patch) + return -EINVAL; + + ret = kstrtobool(buf, &val); + if (ret) + return ret; + + if (val) + klp_force_transition(); + + return count; +} + static struct kobj_attribute enabled_kobj_attr = __ATTR_RW(enabled); static struct kobj_attribute transition_kobj_attr = __ATTR_RO(transition); static struct kobj_attribute signal_kobj_attr = __ATTR_WO(signal); +static struct kobj_attribute force_kobj_attr = __ATTR_WO(force); static struct attribute *klp_patch_attrs[] = { &enabled_kobj_attr.attr, &transition_kobj_attr.attr, &signal_kobj_attr.attr, + &force_kobj_attr.attr, NULL }; diff --git a/kernel/livepatch/transition.c b/kernel/livepatch/transition.c index edcfcb8ebb2d..be5bfa533ee8 100644 --- a/kernel/livepatch/transition.c +++ b/kernel/livepatch/transition.c @@ -33,6 +33,8 @@ struct klp_patch *klp_transition_patch; static int klp_target_state = KLP_UNDEFINED; +static bool klp_forced = false; + /* * This work can be performed periodically to finish patching or unpatching any * "straggler" tasks which failed to transition in the first attempt. @@ -146,9 +148,12 @@ done: /* * See complementary comment in __klp_enable_patch() for why we * keep the module reference for immediate patches. + * + * klp_forced or immediate_func set implies unbounded increase of + * module's ref count if the module is disabled/enabled in a loop. */ - if (!klp_transition_patch->immediate && !immediate_func && - klp_target_state == KLP_UNPATCHED) { + if (!klp_forced && !klp_transition_patch->immediate && + !immediate_func && klp_target_state == KLP_UNPATCHED) { module_put(klp_transition_patch->mod); } @@ -649,3 +654,30 @@ void klp_send_signals(void) } read_unlock(&tasklist_lock); } + +/* + * Drop TIF_PATCH_PENDING of all tasks on admin's request. This forces an + * existing transition to finish. + * + * NOTE: klp_update_patch_state(task) requires the task to be inactive or + * 'current'. This is not the case here and the consistency model could be + * broken. Administrator, who is the only one to execute the + * klp_force_transitions(), has to be aware of this. + */ +void klp_force_transition(void) +{ + struct task_struct *g, *task; + unsigned int cpu; + + pr_warn("forcing remaining tasks to the patched state\n"); + + read_lock(&tasklist_lock); + for_each_process_thread(g, task) + klp_update_patch_state(task); + read_unlock(&tasklist_lock); + + for_each_possible_cpu(cpu) + klp_update_patch_state(idle_task(cpu)); + + klp_forced = true; +} diff --git a/kernel/livepatch/transition.h b/kernel/livepatch/transition.h index 40522795a5f6..f9d0bc016067 100644 --- a/kernel/livepatch/transition.h +++ b/kernel/livepatch/transition.h @@ -12,5 +12,6 @@ void klp_start_transition(void); void klp_try_complete_transition(void); void klp_reverse_transition(void); void klp_send_signals(void); +void klp_force_transition(void); #endif /* _LIVEPATCH_TRANSITION_H */ -- cgit v1.2.3 From 156baec39732f025dc778e00da95fc10d6e45885 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 7 Dec 2017 09:40:38 -0800 Subject: rcu: Export init_rcu_head() and destroy_rcu_head() to GPL modules Use of init_rcu_head() and destroy_rcu_head() from modules results in the following build-time error with CONFIG_DEBUG_OBJECTS_RCU_HEAD=y: ERROR: "init_rcu_head" [drivers/scsi/scsi_mod.ko] undefined! ERROR: "destroy_rcu_head" [drivers/scsi/scsi_mod.ko] undefined! This commit therefore adds EXPORT_SYMBOL_GPL() for each to allow them to be used by GPL-licensed kernel modules. 
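A minimal sketch of the pattern this unblocks in a GPL module (hypothetical structure, modeled on the scsi_mod usage behind the build errors; relevant when an object embedding an rcu_head can be torn down without ever going through call_rcu()):

	struct my_obj {
		struct rcu_head rcu;
		/* ... payload ... */
	};

	obj = kzalloc(sizeof(*obj), GFP_KERNEL);
	init_rcu_head(&obj->rcu);	/* announce the head to debugobjects */
	/* ... use obj ... */

	/* error path that never queued the object via call_rcu(): */
	destroy_rcu_head(&obj->rcu);	/* retract it before the plain kfree() */
	kfree(obj);

With CONFIG_DEBUG_OBJECTS_RCU_HEAD=n both helpers are already static inline no-ops, so only the debug configuration needed the exports.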
Reported-by: Bart Van Assche Reported-by: Stephen Rothwell Signed-off-by: Paul E. McKenney Signed-off-by: Martin K. Petersen --- kernel/rcu/update.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index fbd56d6e575b..68fa19a5e7bd 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -422,11 +422,13 @@ void init_rcu_head(struct rcu_head *head) { debug_object_init(head, &rcuhead_debug_descr); } +EXPORT_SYMBOL_GPL(init_rcu_head); void destroy_rcu_head(struct rcu_head *head) { debug_object_free(head, &rcuhead_debug_descr); } +EXPORT_SYMBOL_GPL(destroy_rcu_head); static bool rcuhead_is_static_object(void *addr) { -- cgit v1.2.3 From 79ee842891595293be37c5aed0e75b4630166c5a Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Fri, 8 Dec 2017 11:56:08 +0900 Subject: sched/autogroup: remove unneeded kallsyms include Autogroup does not seem to use any of kallsyms functions/defines. Link: http://lkml.kernel.org/r/20171208025616.16267-2-sergey.senozhatsky@gmail.com To: Andrew Morton To: Michal Hocko To: Rafael Wysocki To: Len Brown To: Bjorn Helgaas To: Vlastimil Babka To: Tejun Heo To: Lai Jiangshan To: Thomas Gleixner To: Fengguang Wu Cc: Steven Rostedt Cc: LKML Cc: linux-pm@vger.kernel.org Cc: linux-pci@vger.kernel.org Cc: linux-mm@kvack.org Signed-off-by: Sergey Senozhatsky Acked-by: Peter Zijlstra (Intel) Signed-off-by: Petr Mladek --- kernel/sched/autogroup.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/autogroup.c b/kernel/sched/autogroup.c index a43df5193538..0786227a3f48 100644 --- a/kernel/sched/autogroup.c +++ b/kernel/sched/autogroup.c @@ -3,7 +3,6 @@ #include #include -#include #include #include #include -- cgit v1.2.3 From 25493e5fba2f7b8cdade29d0fc8945114ee7732b Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Fri, 8 Dec 2017 17:24:22 +0900 Subject: sched/autogroup: move sched.h include Move local "sched.h" include to the bottom. sched.h defines several macros that are getting redefined in ARCH-specific code, for instance, finish_arch_post_lock_switch() and prepare_arch_switch(), so we need ARCH-specific definitions to come in first. Link: http://lkml.kernel.org/r/20171208082422.5021-1-sergey.senozhatsky@gmail.com To: Martin Schwidefsky Cc: Steven Rostedt Cc: LKML Cc: linux-pm@vger.kernel.org Cc: linux-pci@vger.kernel.org Cc: linux-mm@kvack.org Suggested-by: Martin Schwidefsky Signed-off-by: Sergey Senozhatsky Acked-by: Peter Zijlstra (Intel) Signed-off-by: Petr Mladek --- kernel/sched/autogroup.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/autogroup.c b/kernel/sched/autogroup.c index 0786227a3f48..bb4b9fe026a1 100644 --- a/kernel/sched/autogroup.c +++ b/kernel/sched/autogroup.c @@ -1,12 +1,12 @@ // SPDX-License-Identifier: GPL-2.0 -#include "sched.h" - #include #include #include #include #include +#include "sched.h" + unsigned int __read_mostly sysctl_sched_autogroup_enabled = 1; static struct autogroup autogroup_default; static atomic_t autogroup_seq_nr; -- cgit v1.2.3 From f371b304f12e31fe30207c41ca7754564e0ea4dc Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Mon, 11 Dec 2017 11:39:02 -0800 Subject: bpf/tracing: allow user space to query prog array on the same tp Commit e87c6bc3852b ("bpf: permit multiple bpf attachments for a single perf event") added support to attach multiple bpf programs to a single perf event. 
Although this provides flexibility, users may want to know what other bpf programs attached to the same tp interface. Besides getting visibility for the underlying bpf system, such information may also help consolidate multiple bpf programs, understand potential performance issues due to a large array, and debug (e.g., one bpf program which overwrites return code may impact subsequent program results). Commit 2541517c32be ("tracing, perf: Implement BPF programs attached to kprobes") utilized the existing perf ioctl interface and added the command PERF_EVENT_IOC_SET_BPF to attach a bpf program to a tracepoint. This patch adds a new ioctl command, given a perf event fd, to query the bpf program array attached to the same perf tracepoint event. The new uapi ioctl command: PERF_EVENT_IOC_QUERY_BPF The new uapi/linux/perf_event.h structure: struct perf_event_query_bpf { __u32 ids_len; __u32 prog_cnt; __u32 ids[0]; }; User space provides buffer "ids" for kernel to copy to. When returning from the kernel, the number of available programs in the array is set in "prog_cnt". The usage: struct perf_event_query_bpf *query = malloc(sizeof(*query) + sizeof(u32) * ids_len); query.ids_len = ids_len; err = ioctl(pmu_efd, PERF_EVENT_IOC_QUERY_BPF, query); if (err == 0) { /* query.prog_cnt is the number of available progs, * number of progs in ids: (ids_len == 0) ? 0 : query.prog_cnt */ } else if (errno == ENOSPC) { /* query.ids_len number of progs copied, * query.prog_cnt is the number of available progs */ } else { /* other errors */ } Signed-off-by: Yonghong Song Acked-by: Peter Zijlstra (Intel) Acked-by: Alexei Starovoitov Signed-off-by: Alexei Starovoitov --- include/linux/bpf.h | 4 ++++ include/uapi/linux/perf_event.h | 22 ++++++++++++++++++++++ kernel/bpf/core.c | 21 +++++++++++++++++++++ kernel/events/core.c | 3 +++ kernel/trace/bpf_trace.c | 23 +++++++++++++++++++++++ 5 files changed, 73 insertions(+) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index e55e4255a210..f812ac508e9f 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -254,6 +254,7 @@ typedef unsigned long (*bpf_ctx_copy_t)(void *dst, const void *src, u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size, void *ctx, u64 ctx_size, bpf_ctx_copy_t ctx_copy); +int bpf_event_query_prog_array(struct perf_event *event, void __user *info); int bpf_prog_test_run_xdp(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr); @@ -285,6 +286,9 @@ int bpf_prog_array_copy_to_user(struct bpf_prog_array __rcu *progs, void bpf_prog_array_delete_safe(struct bpf_prog_array __rcu *progs, struct bpf_prog *old_prog); +int bpf_prog_array_copy_info(struct bpf_prog_array __rcu *array, + __u32 __user *prog_ids, u32 request_cnt, + __u32 __user *prog_cnt); int bpf_prog_array_copy(struct bpf_prog_array __rcu *old_array, struct bpf_prog *exclude_prog, struct bpf_prog *include_prog, diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index b9a4953018ed..769533696483 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -418,6 +418,27 @@ struct perf_event_attr { __u16 __reserved_2; /* align to __u64 */ }; +/* + * Structure used by below PERF_EVENT_IOC_QUERY_BPF command + * to query bpf programs attached to the same perf tracepoint + * as the given perf event. 
+ */ +struct perf_event_query_bpf { + /* + * The below ids array length + */ + __u32 ids_len; + /* + * Set by the kernel to indicate the number of + * available programs + */ + __u32 prog_cnt; + /* + * User provided buffer to store program ids + */ + __u32 ids[0]; +}; + #define perf_flags(attr) (*(&(attr)->read_format + 1)) /* @@ -433,6 +454,7 @@ struct perf_event_attr { #define PERF_EVENT_IOC_ID _IOR('$', 7, __u64 *) #define PERF_EVENT_IOC_SET_BPF _IOW('$', 8, __u32) #define PERF_EVENT_IOC_PAUSE_OUTPUT _IOW('$', 9, __u32) +#define PERF_EVENT_IOC_QUERY_BPF _IOWR('$', 10, struct perf_event_query_bpf *) enum perf_event_ioc_flags { PERF_IOC_FLAG_GROUP = 1U << 0, diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 86b50aa26ee8..b16c6f8f42b6 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1462,6 +1462,8 @@ int bpf_prog_array_copy_to_user(struct bpf_prog_array __rcu *progs, rcu_read_lock(); prog = rcu_dereference(progs)->progs; for (; *prog; prog++) { + if (*prog == &dummy_bpf_prog.prog) + continue; id = (*prog)->aux->id; if (copy_to_user(prog_ids + i, &id, sizeof(id))) { rcu_read_unlock(); @@ -1545,6 +1547,25 @@ int bpf_prog_array_copy(struct bpf_prog_array __rcu *old_array, return 0; } +int bpf_prog_array_copy_info(struct bpf_prog_array __rcu *array, + __u32 __user *prog_ids, u32 request_cnt, + __u32 __user *prog_cnt) +{ + u32 cnt = 0; + + if (array) + cnt = bpf_prog_array_length(array); + + if (copy_to_user(prog_cnt, &cnt, sizeof(cnt))) + return -EFAULT; + + /* return early if user requested only program count or nothing to copy */ + if (!request_cnt || !cnt) + return 0; + + return bpf_prog_array_copy_to_user(array, prog_ids, request_cnt); +} + static void bpf_prog_free_deferred(struct work_struct *work) { struct bpf_prog_aux *aux; diff --git a/kernel/events/core.c b/kernel/events/core.c index 16beab4767e1..f10609e539d4 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -4723,6 +4723,9 @@ static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned lon rcu_read_unlock(); return 0; } + + case PERF_EVENT_IOC_QUERY_BPF: + return bpf_event_query_prog_array(event, (void __user *)arg); default: return -ENOTTY; } diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 0ce99c379c30..b143f2a05aff 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -820,3 +820,26 @@ void perf_event_detach_bpf_prog(struct perf_event *event) unlock: mutex_unlock(&bpf_event_mutex); } + +int bpf_event_query_prog_array(struct perf_event *event, void __user *info) +{ + struct perf_event_query_bpf __user *uquery = info; + struct perf_event_query_bpf query = {}; + int ret; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + if (event->attr.type != PERF_TYPE_TRACEPOINT) + return -EINVAL; + if (copy_from_user(&query, uquery, sizeof(query))) + return -EFAULT; + + mutex_lock(&bpf_event_mutex); + ret = bpf_prog_array_copy_info(event->tp_event->prog_array, + uquery->ids, + query.ids_len, + &uquery->prog_cnt); + mutex_unlock(&bpf_event_mutex); + + return ret; +} -- cgit v1.2.3 From 92ace9991da08827e809c2d120108a96a281e7fc Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 11 Dec 2017 11:36:46 -0500 Subject: add infrastructure for tagging functions as error injectable Using BPF we can override kprob'ed functions and return arbitrary values. Obviously this can be a bit unsafe, so make this feature opt-in for functions. Simply tag a function with KPROBE_ERROR_INJECT_SYMBOL in order to give BPF access to that function for error injection purposes. 
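A minimal sketch of opting a function in (hypothetical function; BPF_ALLOW_ERROR_INJECTION() is the macro the include/linux/bpf.h hunk below defines, and it expands to nothing unless CONFIG_BPF_KPROBE_OVERRIDE=y):

	/* the return value of this function may be overridden from BPF */
	int my_driver_prepare(struct device *dev)
	{
		/* ... real work ... */
		return 0;
	}
	BPF_ALLOW_ERROR_INJECTION(my_driver_prepare);

The macro plants the function's address in the _kprobe_error_inject_list section; populate_kprobe_error_injection_list() later resolves each entry to a [start, end) text range via kallsyms, which is what within_kprobe_error_injection_list() consults before an override is permitted.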
Signed-off-by: Josef Bacik Acked-by: Ingo Molnar Signed-off-by: Alexei Starovoitov --- include/asm-generic/vmlinux.lds.h | 10 +++ include/linux/bpf.h | 11 +++ include/linux/kprobes.h | 1 + include/linux/module.h | 5 ++ kernel/kprobes.c | 163 ++++++++++++++++++++++++++++++++++++++ kernel/module.c | 6 +- 6 files changed, 195 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index ee8b707d9fa9..a2e8582d094a 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -136,6 +136,15 @@ #define KPROBE_BLACKLIST() #endif +#ifdef CONFIG_BPF_KPROBE_OVERRIDE +#define ERROR_INJECT_LIST() . = ALIGN(8); \ + VMLINUX_SYMBOL(__start_kprobe_error_inject_list) = .; \ + KEEP(*(_kprobe_error_inject_list)) \ + VMLINUX_SYMBOL(__stop_kprobe_error_inject_list) = .; +#else +#define ERROR_INJECT_LIST() +#endif + #ifdef CONFIG_EVENT_TRACING #define FTRACE_EVENTS() . = ALIGN(8); \ VMLINUX_SYMBOL(__start_ftrace_events) = .; \ @@ -564,6 +573,7 @@ FTRACE_EVENTS() \ TRACE_SYSCALLS() \ KPROBE_BLACKLIST() \ + ERROR_INJECT_LIST() \ MEM_DISCARD(init.rodata) \ CLK_OF_TABLES() \ RESERVEDMEM_OF_TABLES() \ diff --git a/include/linux/bpf.h b/include/linux/bpf.h index f812ac508e9f..93e15b9d80c7 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -580,4 +580,15 @@ extern const struct bpf_func_proto bpf_sock_map_update_proto; void bpf_user_rnd_init_once(void); u64 bpf_user_rnd_u32(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); +#if defined(__KERNEL__) && !defined(__ASSEMBLY__) +#ifdef CONFIG_BPF_KPROBE_OVERRIDE +#define BPF_ALLOW_ERROR_INJECTION(fname) \ +static unsigned long __used \ + __attribute__((__section__("_kprobe_error_inject_list"))) \ + _eil_addr_##fname = (unsigned long)fname; +#else +#define BPF_ALLOW_ERROR_INJECTION(fname) +#endif +#endif + #endif /* _LINUX_BPF_H */ diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h index 9440a2fc8893..963fd364f3d6 100644 --- a/include/linux/kprobes.h +++ b/include/linux/kprobes.h @@ -271,6 +271,7 @@ extern bool arch_kprobe_on_func_entry(unsigned long offset); extern bool kprobe_on_func_entry(kprobe_opcode_t *addr, const char *sym, unsigned long offset); extern bool within_kprobe_blacklist(unsigned long addr); +extern bool within_kprobe_error_injection_list(unsigned long addr); struct kprobe_insn_cache { struct mutex mutex; diff --git a/include/linux/module.h b/include/linux/module.h index c69b49abe877..548fa09fa806 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -475,6 +475,11 @@ struct module { ctor_fn_t *ctors; unsigned int num_ctors; #endif + +#ifdef CONFIG_BPF_KPROBE_OVERRIDE + unsigned int num_kprobe_ei_funcs; + unsigned long *kprobe_ei_funcs; +#endif } ____cacheline_aligned __randomize_layout; #ifndef MODULE_ARCH_INIT #define MODULE_ARCH_INIT {} diff --git a/kernel/kprobes.c b/kernel/kprobes.c index da2ccf142358..b4aab48ad258 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -83,6 +83,16 @@ static raw_spinlock_t *kretprobe_table_lock_ptr(unsigned long hash) return &(kretprobe_table_locks[hash].lock); } +/* List of symbols that can be overriden for error injection. 
*/ +static LIST_HEAD(kprobe_error_injection_list); +static DEFINE_MUTEX(kprobe_ei_mutex); +struct kprobe_ei_entry { + struct list_head list; + unsigned long start_addr; + unsigned long end_addr; + void *priv; +}; + /* Blacklist -- list of struct kprobe_blacklist_entry */ static LIST_HEAD(kprobe_blacklist); @@ -1394,6 +1404,17 @@ bool within_kprobe_blacklist(unsigned long addr) return false; } +bool within_kprobe_error_injection_list(unsigned long addr) +{ + struct kprobe_ei_entry *ent; + + list_for_each_entry(ent, &kprobe_error_injection_list, list) { + if (addr >= ent->start_addr && addr < ent->end_addr) + return true; + } + return false; +} + /* * If we have a symbol_name argument, look it up and add the offset field * to it. This way, we can specify a relative address to a symbol. @@ -2168,6 +2189,86 @@ static int __init populate_kprobe_blacklist(unsigned long *start, return 0; } +#ifdef CONFIG_BPF_KPROBE_OVERRIDE +/* Markers of the _kprobe_error_inject_list section */ +extern unsigned long __start_kprobe_error_inject_list[]; +extern unsigned long __stop_kprobe_error_inject_list[]; + +/* + * Lookup and populate the kprobe_error_injection_list. + * + * For safety reasons we only allow certain functions to be overriden with + * bpf_error_injection, so we need to populate the list of the symbols that have + * been marked as safe for overriding. + */ +static void populate_kprobe_error_injection_list(unsigned long *start, + unsigned long *end, + void *priv) +{ + unsigned long *iter; + struct kprobe_ei_entry *ent; + unsigned long entry, offset = 0, size = 0; + + mutex_lock(&kprobe_ei_mutex); + for (iter = start; iter < end; iter++) { + entry = arch_deref_entry_point((void *)*iter); + + if (!kernel_text_address(entry) || + !kallsyms_lookup_size_offset(entry, &size, &offset)) { + pr_err("Failed to find error inject entry at %p\n", + (void *)entry); + continue; + } + + ent = kmalloc(sizeof(*ent), GFP_KERNEL); + if (!ent) + break; + ent->start_addr = entry; + ent->end_addr = entry + size; + ent->priv = priv; + INIT_LIST_HEAD(&ent->list); + list_add_tail(&ent->list, &kprobe_error_injection_list); + } + mutex_unlock(&kprobe_ei_mutex); +} + +static void __init populate_kernel_kprobe_ei_list(void) +{ + populate_kprobe_error_injection_list(__start_kprobe_error_inject_list, + __stop_kprobe_error_inject_list, + NULL); +} + +static void module_load_kprobe_ei_list(struct module *mod) +{ + if (!mod->num_kprobe_ei_funcs) + return; + populate_kprobe_error_injection_list(mod->kprobe_ei_funcs, + mod->kprobe_ei_funcs + + mod->num_kprobe_ei_funcs, mod); +} + +static void module_unload_kprobe_ei_list(struct module *mod) +{ + struct kprobe_ei_entry *ent, *n; + if (!mod->num_kprobe_ei_funcs) + return; + + mutex_lock(&kprobe_ei_mutex); + list_for_each_entry_safe(ent, n, &kprobe_error_injection_list, list) { + if (ent->priv == mod) { + list_del_init(&ent->list); + kfree(ent); + } + } + mutex_unlock(&kprobe_ei_mutex); +} +#else +static inline void __init populate_kernel_kprobe_ei_list(void) {} +static inline void module_load_kprobe_ei_list(struct module *m) {} +static inline void module_unload_kprobe_ei_list(struct module *m) {} +#endif + /* Module notifier call back, checking kprobes on the module */ static int kprobes_module_callback(struct notifier_block *nb, unsigned long val, void *data) @@ -2178,6 +2279,11 @@ static int kprobes_module_callback(struct notifier_block *nb, unsigned int i; int checkcore = (val == MODULE_STATE_GOING); + if (val == MODULE_STATE_COMING) + module_load_kprobe_ei_list(mod); + else if (val 
== MODULE_STATE_GOING)
+		module_unload_kprobe_ei_list(mod);
+
 	if (val != MODULE_STATE_GOING && val != MODULE_STATE_LIVE)
 		return NOTIFY_DONE;
 
@@ -2240,6 +2346,8 @@ static int __init init_kprobes(void)
 		pr_err("Please take care of using kprobes.\n");
 	}
 
+	populate_kernel_kprobe_ei_list();
+
 	if (kretprobe_blacklist_size) {
 		/* lookup the function address from its name */
 		for (i = 0; kretprobe_blacklist[i].name != NULL; i++) {
@@ -2407,6 +2515,56 @@ static const struct file_operations debugfs_kprobe_blacklist_ops = {
 	.release	= seq_release,
 };
 
+/*
+ * kprobes/error_injection_list -- shows which functions can be overridden
+ * for error injection.
+ */
+static void *kprobe_ei_seq_start(struct seq_file *m, loff_t *pos)
+{
+	mutex_lock(&kprobe_ei_mutex);
+	return seq_list_start(&kprobe_error_injection_list, *pos);
+}
+
+static void kprobe_ei_seq_stop(struct seq_file *m, void *v)
+{
+	mutex_unlock(&kprobe_ei_mutex);
+}
+
+static void *kprobe_ei_seq_next(struct seq_file *m, void *v, loff_t *pos)
+{
+	return seq_list_next(v, &kprobe_error_injection_list, pos);
+}
+
+static int kprobe_ei_seq_show(struct seq_file *m, void *v)
+{
+	char buffer[KSYM_SYMBOL_LEN];
+	struct kprobe_ei_entry *ent =
+		list_entry(v, struct kprobe_ei_entry, list);
+
+	sprint_symbol(buffer, ent->start_addr);
+	seq_printf(m, "%s\n", buffer);
+	return 0;
+}
+
+static const struct seq_operations kprobe_ei_seq_ops = {
+	.start = kprobe_ei_seq_start,
+	.next  = kprobe_ei_seq_next,
+	.stop  = kprobe_ei_seq_stop,
+	.show  = kprobe_ei_seq_show,
+};
+
+static int kprobe_ei_open(struct inode *inode, struct file *filp)
+{
+	return seq_open(filp, &kprobe_ei_seq_ops);
+}
+
+static const struct file_operations debugfs_kprobe_ei_ops = {
+	.open    = kprobe_ei_open,
+	.read    = seq_read,
+	.llseek  = seq_lseek,
+	.release = seq_release,
+};
+
 static void arm_all_kprobes(void)
 {
 	struct hlist_head *head;
@@ -2548,6 +2706,11 @@ static int __init debugfs_kprobe_init(void)
 	if (!file)
 		goto error;
 
+	file = debugfs_create_file("error_injection_list", 0444, dir, NULL,
+				  &debugfs_kprobe_ei_ops);
+	if (!file)
+		goto error;
+
 	return 0;
 
 error:
diff --git a/kernel/module.c b/kernel/module.c
index dea01ac9cb74..bd695bfdc5c4 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -3118,7 +3118,11 @@ static int find_module_sections(struct module *mod, struct load_info *info)
 					     sizeof(*mod->ftrace_callsites),
 					     &mod->num_ftrace_callsites);
 #endif
-
+#ifdef CONFIG_BPF_KPROBE_OVERRIDE
+	mod->kprobe_ei_funcs = section_objs(info, "_kprobe_error_inject_list",
+					    sizeof(*mod->kprobe_ei_funcs),
+					    &mod->num_kprobe_ei_funcs);
+#endif
 	mod->extable = section_objs(info, "__ex_table",
 				    sizeof(*mod->extable), &mod->num_exentries);
--
cgit v1.2.3


From 9802d86585db91655c7d1929a4f6bbe0952ea88e Mon Sep 17 00:00:00 2001
From: Josef Bacik
Date: Mon, 11 Dec 2017 11:36:48 -0500
Subject: bpf: add a bpf_override_function helper

Error injection is sloppy and very ad-hoc. BPF could fill this niche
perfectly with its kprobe functionality. We could make sure errors are
only triggered in specific call chains that we care about, in very
specific situations. Accomplish this with the bpf_override_function
helper. This will modify the probed function's return value to the
specified value and set the PC to an override function that simply
returns, bypassing the originally probed function. This gives us a nice
clean way to implement systematic error injection for all of our code
paths.
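A hedged sketch of the consumer side (editor's illustration, not part
of the patch): a samples/bpf-style program, built with clang -target
bpf against kernel headers, that forces the hypothetical
my_subsys_prepare() from the earlier sketch to fail. The helper stub
follows the usual samples/bpf convention; BPF_FUNC_override_return is
the UAPI enumerator added by this patch:

#include <uapi/linux/bpf.h>
#include <linux/ptrace.h>

#define SEC(name) __attribute__((section(name), used))

/* helper stub in the samples/bpf style */
static int (*bpf_override_return)(void *ctx, unsigned long rc) =
	(void *) BPF_FUNC_override_return;

SEC("kprobe/my_subsys_prepare")
int inject_enomem(struct pt_regs *ctx)
{
	/* make the probed function appear to return -ENOMEM without running it */
	bpf_override_return(ctx, (unsigned long)-12 /* -ENOMEM */);
	return 0;
}

char _license[] SEC("license") = "GPL";	/* the helper is gpl_only */

Attaching such a program is only accepted when the kprobe is
ftrace-based and the target function is on the error injection list;
perf_event_attach_bpf_prog() below returns -EINVAL otherwise.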
Acked-by: Alexei Starovoitov Acked-by: Ingo Molnar Signed-off-by: Josef Bacik Signed-off-by: Alexei Starovoitov --- arch/Kconfig | 3 +++ arch/x86/Kconfig | 1 + arch/x86/include/asm/kprobes.h | 4 +++ arch/x86/include/asm/ptrace.h | 5 ++++ arch/x86/kernel/kprobes/ftrace.c | 14 ++++++++++ include/linux/filter.h | 3 ++- include/linux/trace_events.h | 1 + include/uapi/linux/bpf.h | 7 ++++- kernel/bpf/core.c | 3 +++ kernel/bpf/verifier.c | 2 ++ kernel/events/core.c | 7 +++++ kernel/trace/Kconfig | 11 ++++++++ kernel/trace/bpf_trace.c | 35 +++++++++++++++++++++++++ kernel/trace/trace_kprobe.c | 55 +++++++++++++++++++++++++++++++++++----- kernel/trace/trace_probe.h | 12 +++++++++ 15 files changed, 154 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/arch/Kconfig b/arch/Kconfig index 400b9e1b2f27..d3f4aaf9cb7a 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -196,6 +196,9 @@ config HAVE_OPTPROBES config HAVE_KPROBES_ON_FTRACE bool +config HAVE_KPROBE_OVERRIDE + bool + config HAVE_NMI bool diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 8eed3f94bfc7..04d66e6fa447 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -154,6 +154,7 @@ config X86 select HAVE_KERNEL_XZ select HAVE_KPROBES select HAVE_KPROBES_ON_FTRACE + select HAVE_KPROBE_OVERRIDE select HAVE_KRETPROBES select HAVE_KVM select HAVE_LIVEPATCH if X86_64 diff --git a/arch/x86/include/asm/kprobes.h b/arch/x86/include/asm/kprobes.h index 9f2e3102e0bb..36abb23a7a35 100644 --- a/arch/x86/include/asm/kprobes.h +++ b/arch/x86/include/asm/kprobes.h @@ -67,6 +67,10 @@ extern const int kretprobe_blacklist_size; void arch_remove_kprobe(struct kprobe *p); asmlinkage void kretprobe_trampoline(void); +#ifdef CONFIG_KPROBES_ON_FTRACE +extern void arch_ftrace_kprobe_override_function(struct pt_regs *regs); +#endif + /* Architecture specific copy of original instruction*/ struct arch_specific_insn { /* copy of the original instruction */ diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h index 14131dd06b29..6de1fd3d0097 100644 --- a/arch/x86/include/asm/ptrace.h +++ b/arch/x86/include/asm/ptrace.h @@ -109,6 +109,11 @@ static inline unsigned long regs_return_value(struct pt_regs *regs) return regs->ax; } +static inline void regs_set_return_value(struct pt_regs *regs, unsigned long rc) +{ + regs->ax = rc; +} + /* * user_mode(regs) determines whether a register set came from user * mode. On x86_32, this is true if V8086 mode was enabled OR if the diff --git a/arch/x86/kernel/kprobes/ftrace.c b/arch/x86/kernel/kprobes/ftrace.c index 8dc0161cec8f..1ea748d682fd 100644 --- a/arch/x86/kernel/kprobes/ftrace.c +++ b/arch/x86/kernel/kprobes/ftrace.c @@ -97,3 +97,17 @@ int arch_prepare_kprobe_ftrace(struct kprobe *p) p->ainsn.boostable = false; return 0; } + +asmlinkage void override_func(void); +asm( + ".type override_func, @function\n" + "override_func:\n" + " ret\n" + ".size override_func, .-override_func\n" +); + +void arch_ftrace_kprobe_override_function(struct pt_regs *regs) +{ + regs->ip = (unsigned long)&override_func; +} +NOKPROBE_SYMBOL(arch_ftrace_kprobe_override_function); diff --git a/include/linux/filter.h b/include/linux/filter.h index 0062302e1285..5feb441d3dd9 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -458,7 +458,8 @@ struct bpf_prog { locked:1, /* Program image locked? */ gpl_compatible:1, /* Is filter GPL compatible? */ cb_access:1, /* Is control block accessed? */ - dst_needed:1; /* Do we need dst entry? */ + dst_needed:1, /* Do we need dst entry? 
*/ + kprobe_override:1; /* Do we override a kprobe? */ enum bpf_prog_type type; /* Type of BPF program */ u32 len; /* Number of filter blocks */ u32 jited_len; /* Size of jited insns in bytes */ diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index af44e7c2d577..5fea451f6e28 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -528,6 +528,7 @@ do { \ struct perf_event; DECLARE_PER_CPU(struct pt_regs, perf_trace_regs); +DECLARE_PER_CPU(int, bpf_kprobe_override); extern int perf_trace_init(struct perf_event *event); extern void perf_trace_destroy(struct perf_event *event); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 80d62e88590c..595bda120cfb 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -677,6 +677,10 @@ union bpf_attr { * @buf: buf to fill * @buf_size: size of the buf * Return : 0 on success or negative error code + * + * int bpf_override_return(pt_regs, rc) + * @pt_regs: pointer to struct pt_regs + * @rc: the return value to set */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -736,7 +740,8 @@ union bpf_attr { FN(xdp_adjust_meta), \ FN(perf_event_read_value), \ FN(perf_prog_read_value), \ - FN(getsockopt), + FN(getsockopt), \ + FN(override_return), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index b16c6f8f42b6..d32bebf4f2de 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1320,6 +1320,9 @@ EVAL4(PROG_NAME_LIST, 416, 448, 480, 512) bool bpf_prog_array_compatible(struct bpf_array *array, const struct bpf_prog *fp) { + if (fp->kprobe_override) + return false; + if (!array->owner_prog_type) { /* There's no owner yet where we could check for * compatibility. diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 7afa92e9b409..e807bda7fe29 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -4413,6 +4413,8 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) prog->dst_needed = 1; if (insn->imm == BPF_FUNC_get_prandom_u32) bpf_user_rnd_init_once(); + if (insn->imm == BPF_FUNC_override_return) + prog->kprobe_override = 1; if (insn->imm == BPF_FUNC_tail_call) { /* If we tail call into other programs, we * cannot make any assumptions since they can diff --git a/kernel/events/core.c b/kernel/events/core.c index f10609e539d4..5857c500721b 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -8080,6 +8080,13 @@ static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd) return -EINVAL; } + /* Kprobe override only works for kprobes, not uprobes. */ + if (prog->kprobe_override && + !(event->tp_event->flags & TRACE_EVENT_FL_KPROBE)) { + bpf_prog_put(prog); + return -EINVAL; + } + if (is_tracepoint || is_syscall_tp) { int off = trace_event_get_offsets(event->tp_event); diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index af7dad126c13..3e6fd580fe7f 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -529,6 +529,17 @@ config FUNCTION_PROFILER If in doubt, say N. +config BPF_KPROBE_OVERRIDE + bool "Enable BPF programs to override a kprobed function" + depends on BPF_EVENTS + depends on KPROBES_ON_FTRACE + depends on HAVE_KPROBE_OVERRIDE + depends on DYNAMIC_FTRACE_WITH_REGS + default n + help + Allows BPF to override the execution of a probed function and + set a different return value. This is used for error injection. 
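The help text above is the user-visible summary. As an editor's sketch
(not part of the patch), the userspace wiring might look like this,
assuming pfd is a perf event fd for the kprobe event, prog_fd a loaded
BPF program fd, installed UAPI headers that include this series, and
CAP_SYS_ADMIN for the query command introduced earlier:

#include <stdio.h>
#include <stdlib.h>
#include <sys/ioctl.h>
#include <linux/perf_event.h>

static int attach_and_query(int pfd, int prog_fd)
{
	struct perf_event_query_bpf *query;

	/* attach the override program and enable the event */
	if (ioctl(pfd, PERF_EVENT_IOC_SET_BPF, prog_fd) ||
	    ioctl(pfd, PERF_EVENT_IOC_ENABLE, 0))
		return -1;

	/* read back the ids of the attached programs */
	query = calloc(1, sizeof(*query) + 16 * sizeof(__u32));
	if (!query)
		return -1;
	query->ids_len = 16;	/* capacity of query->ids[] */
	if (ioctl(pfd, PERF_EVENT_IOC_QUERY_BPF, query) == 0)
		printf("%u program(s) attached\n", query->prog_cnt);
	free(query);
	return 0;
}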
+ config FTRACE_MCOUNT_RECORD def_bool y depends on DYNAMIC_FTRACE diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index b143f2a05aff..e009b7ecf473 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -13,6 +13,10 @@ #include #include #include +#include +#include + +#include "trace_probe.h" #include "trace.h" u64 bpf_get_stackid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); @@ -76,6 +80,24 @@ unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx) } EXPORT_SYMBOL_GPL(trace_call_bpf); +#ifdef CONFIG_BPF_KPROBE_OVERRIDE +BPF_CALL_2(bpf_override_return, struct pt_regs *, regs, unsigned long, rc) +{ + __this_cpu_write(bpf_kprobe_override, 1); + regs_set_return_value(regs, rc); + arch_ftrace_kprobe_override_function(regs); + return 0; +} + +static const struct bpf_func_proto bpf_override_return_proto = { + .func = bpf_override_return, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, +}; +#endif + BPF_CALL_3(bpf_probe_read, void *, dst, u32, size, const void *, unsafe_ptr) { int ret; @@ -551,6 +573,10 @@ static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func return &bpf_get_stackid_proto; case BPF_FUNC_perf_event_read_value: return &bpf_perf_event_read_value_proto; +#ifdef CONFIG_BPF_KPROBE_OVERRIDE + case BPF_FUNC_override_return: + return &bpf_override_return_proto; +#endif default: return tracing_func_proto(func_id); } @@ -768,6 +794,15 @@ int perf_event_attach_bpf_prog(struct perf_event *event, struct bpf_prog_array *new_array; int ret = -EEXIST; + /* + * Kprobe override only works for ftrace based kprobes, and only if they + * are on the opt-in list. + */ + if (prog->kprobe_override && + (!trace_kprobe_ftrace(event->tp_event) || + !trace_kprobe_error_injectable(event->tp_event))) + return -EINVAL; + mutex_lock(&bpf_event_mutex); if (event->prog) diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 492700c5fb4d..5db849809a56 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -42,6 +42,7 @@ struct trace_kprobe { (offsetof(struct trace_kprobe, tp.args) + \ (sizeof(struct probe_arg) * (n))) +DEFINE_PER_CPU(int, bpf_kprobe_override); static nokprobe_inline bool trace_kprobe_is_return(struct trace_kprobe *tk) { @@ -87,6 +88,27 @@ static nokprobe_inline unsigned long trace_kprobe_nhit(struct trace_kprobe *tk) return nhit; } +int trace_kprobe_ftrace(struct trace_event_call *call) +{ + struct trace_kprobe *tk = (struct trace_kprobe *)call->data; + return kprobe_ftrace(&tk->rp.kp); +} + +int trace_kprobe_error_injectable(struct trace_event_call *call) +{ + struct trace_kprobe *tk = (struct trace_kprobe *)call->data; + unsigned long addr; + + if (tk->symbol) { + addr = (unsigned long) + kallsyms_lookup_name(trace_kprobe_symbol(tk)); + addr += tk->rp.kp.offset; + } else { + addr = (unsigned long)tk->rp.kp.addr; + } + return within_kprobe_error_injection_list(addr); +} + static int register_kprobe_event(struct trace_kprobe *tk); static int unregister_kprobe_event(struct trace_kprobe *tk); @@ -1170,7 +1192,7 @@ static int kretprobe_event_define_fields(struct trace_event_call *event_call) #ifdef CONFIG_PERF_EVENTS /* Kprobe profile handler */ -static void +static int kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs) { struct trace_event_call *call = &tk->tp.call; @@ -1179,12 +1201,29 @@ kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs) int size, __size, dsize; int rctx; - if 
(bpf_prog_array_valid(call) && !trace_call_bpf(call, regs)) - return; + if (bpf_prog_array_valid(call)) { + int ret; + + ret = trace_call_bpf(call, regs); + + /* + * We need to check and see if we modified the pc of the + * pt_regs, and if so clear the kprobe and return 1 so that we + * don't do the instruction skipping. Also reset our state so + * we are clean the next pass through. + */ + if (__this_cpu_read(bpf_kprobe_override)) { + __this_cpu_write(bpf_kprobe_override, 0); + reset_current_kprobe(); + return 1; + } + if (!ret) + return 0; + } head = this_cpu_ptr(call->perf_events); if (hlist_empty(head)) - return; + return 0; dsize = __get_data_size(&tk->tp, regs); __size = sizeof(*entry) + tk->tp.size + dsize; @@ -1193,13 +1232,14 @@ kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs) entry = perf_trace_buf_alloc(size, NULL, &rctx); if (!entry) - return; + return 0; entry->ip = (unsigned long)tk->rp.kp.addr; memset(&entry[1], 0, dsize); store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize); perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs, head, NULL); + return 0; } NOKPROBE_SYMBOL(kprobe_perf_func); @@ -1275,6 +1315,7 @@ static int kprobe_register(struct trace_event_call *event, static int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs) { struct trace_kprobe *tk = container_of(kp, struct trace_kprobe, rp.kp); + int ret = 0; raw_cpu_inc(*tk->nhit); @@ -1282,9 +1323,9 @@ static int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs) kprobe_trace_func(tk, regs); #ifdef CONFIG_PERF_EVENTS if (tk->tp.flags & TP_FLAG_PROFILE) - kprobe_perf_func(tk, regs); + ret = kprobe_perf_func(tk, regs); #endif - return 0; /* We don't tweek kernel, so just return 0 */ + return ret; } NOKPROBE_SYMBOL(kprobe_dispatcher); diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index fb66e3eaa192..5e54d748c84c 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -252,6 +252,8 @@ struct symbol_cache; unsigned long update_symbol_cache(struct symbol_cache *sc); void free_symbol_cache(struct symbol_cache *sc); struct symbol_cache *alloc_symbol_cache(const char *sym, long offset); +int trace_kprobe_ftrace(struct trace_event_call *call); +int trace_kprobe_error_injectable(struct trace_event_call *call); #else /* uprobes do not support symbol fetch methods */ #define fetch_symbol_u8 NULL @@ -277,6 +279,16 @@ alloc_symbol_cache(const char *sym, long offset) { return NULL; } + +static inline int trace_kprobe_ftrace(struct trace_event_call *call) +{ + return 0; +} + +static inline int trace_kprobe_error_injectable(struct trace_event_call *call) +{ + return 0; +} #endif /* CONFIG_KPROBE_EVENTS */ struct probe_arg { -- cgit v1.2.3 From f4e2298e63d24bb7f5cf0f56f72867973cb7e652 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Wed, 13 Dec 2017 10:35:37 -0800 Subject: bpf/tracing: fix kernel/events/core.c compilation error Commit f371b304f12e ("bpf/tracing: allow user space to query prog array on the same tp") introduced a perf ioctl command to query prog array attached to the same perf tracepoint. The commit introduced a compilation error under certain config conditions, e.g., (1). CONFIG_BPF_SYSCALL is not defined, or (2). CONFIG_TRACING is defined but neither CONFIG_UPROBE_EVENTS nor CONFIG_KPROBE_EVENTS is defined. 
Error message: kernel/events/core.o: In function `perf_ioctl': core.c:(.text+0x98c4): undefined reference to `bpf_event_query_prog_array' This patch fixed this error by guarding the real definition under CONFIG_BPF_EVENTS and provided static inline dummy function if CONFIG_BPF_EVENTS was not defined. It renamed the function from bpf_event_query_prog_array to perf_event_query_prog_array and moved the definition from linux/bpf.h to linux/trace_events.h so the definition is in proximity to other prog_array related functions. Fixes: f371b304f12e ("bpf/tracing: allow user space to query prog array on the same tp") Reported-by: Stephen Rothwell Signed-off-by: Yonghong Song Signed-off-by: Daniel Borkmann --- include/linux/bpf.h | 1 - include/linux/trace_events.h | 6 ++++++ kernel/events/core.c | 2 +- kernel/trace/bpf_trace.c | 2 +- 4 files changed, 8 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 93e15b9d80c7..54dc7cae2949 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -254,7 +254,6 @@ typedef unsigned long (*bpf_ctx_copy_t)(void *dst, const void *src, u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size, void *ctx, u64 ctx_size, bpf_ctx_copy_t ctx_copy); -int bpf_event_query_prog_array(struct perf_event *event, void __user *info); int bpf_prog_test_run_xdp(struct bpf_prog *prog, const union bpf_attr *kattr, union bpf_attr __user *uattr); diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 5fea451f6e28..8a1442c4e513 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -467,6 +467,7 @@ trace_trigger_soft_disabled(struct trace_event_file *file) unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx); int perf_event_attach_bpf_prog(struct perf_event *event, struct bpf_prog *prog); void perf_event_detach_bpf_prog(struct perf_event *event); +int perf_event_query_prog_array(struct perf_event *event, void __user *info); #else static inline unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx) { @@ -481,6 +482,11 @@ perf_event_attach_bpf_prog(struct perf_event *event, struct bpf_prog *prog) static inline void perf_event_detach_bpf_prog(struct perf_event *event) { } +static inline int +perf_event_query_prog_array(struct perf_event *event, void __user *info) +{ + return -EOPNOTSUPP; +} #endif enum { diff --git a/kernel/events/core.c b/kernel/events/core.c index 5857c500721b..34fda6aa4606 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -4725,7 +4725,7 @@ static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned lon } case PERF_EVENT_IOC_QUERY_BPF: - return bpf_event_query_prog_array(event, (void __user *)arg); + return perf_event_query_prog_array(event, (void __user *)arg); default: return -ENOTTY; } diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index e009b7ecf473..c5dd60cc0751 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -856,7 +856,7 @@ unlock: mutex_unlock(&bpf_event_mutex); } -int bpf_event_query_prog_array(struct perf_event *event, void __user *info) +int perf_event_query_prog_array(struct perf_event *event, void __user *info) { struct perf_event_query_bpf __user *uquery = info; struct perf_event_query_bpf query = {}; -- cgit v1.2.3 From 8b2770a4e1c7a837e1889d4f00026e999da5faa0 Mon Sep 17 00:00:00 2001 From: Wolffhardt Schwabe Date: Thu, 14 Dec 2017 19:13:57 +0100 Subject: fix typo in assignment of fs default overflow gid The patch 
remains without practical effect since both macros carry identical
values. Still, it might become a problem in the future if (for whatever
reason) the default overflow uid and gid differ. The
DEFAULT_FS_OVERFLOWGID macro was previously unused.

Signed-off-by: Wolffhardt Schwabe
Signed-off-by: Anatoliy Cherepantsev
Signed-off-by: Eric W. Biederman
---
 kernel/sys.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sys.c b/kernel/sys.c
index 83ffd7dccf23..f2289de20e19 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -135,7 +135,7 @@ EXPORT_SYMBOL(overflowgid);
  */
 
 int fs_overflowuid = DEFAULT_FS_OVERFLOWUID;
-int fs_overflowgid = DEFAULT_FS_OVERFLOWUID;
+int fs_overflowgid = DEFAULT_FS_OVERFLOWGID;
 
 EXPORT_SYMBOL(fs_overflowuid);
 EXPORT_SYMBOL(fs_overflowgid);
--
cgit v1.2.3


From cc8b0b92a1699bc32f7fec71daa2bfc90de43a4d Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov
Date: Thu, 14 Dec 2017 17:55:05 -0800
Subject: bpf: introduce function calls (function boundaries)

Allow arbitrary function calls from one bpf function to another bpf
function.

Since the beginning of bpf all bpf programs were represented as a
single function and program authors were forced to use always_inline
for all functions in their C code. That was causing llvm to
unnecessarily inflate the code size and forcing developers to move
code to header files with little code reuse.

With a bit of additional complexity teach the verifier to recognize
arbitrary function calls from one bpf function to another as long as
all of the functions are presented to the verifier as a single bpf
program.

New program layout:

r6 = r1    // some code
..
r1 = ..    // arg1
r2 = ..    // arg2
call pc+1  // function call pc-relative
exit
.. = r1    // access arg1
.. = r2    // access arg2
..
call pc+20 // second level of function call
...

It allows for better optimized code and finally allows to introduce
the core bpf libraries that can be reused in different projects, since
programs are no longer limited by a single elf file. With function
calls bpf can be compiled into multiple .o files.

This patch is the first step. It detects programs that contain
multiple functions and checks that calls between them are valid. It
splits the sequence of bpf instructions (one program) into a set of
bpf functions that call each other. Calls to only known functions are
allowed. In the future the verifier may allow calls to unresolved
functions and will do dynamic linking. This logic supports statically
linked bpf functions only.

Such function boundary detection could have been done as part of
control flow graph building in check_cfg(), but it's cleaner to
separate function boundary detection vs control flow checks within a
subprogram (function) into logically independent steps. Follow-up
patches may split check_cfg() further, but not check_subprogs().

Allow bpf-to-bpf calls for root only, and only for non-hw-offloaded
programs. These restrictions can be relaxed in the future.
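As a hedged illustration of what this enables at the C level (the names
are made up, and it assumes an LLVM new enough to emit BPF_PSEUDO_CALL
relocations plus a root, non-offloaded loader, per the restrictions
above):

#include <linux/bpf.h>	/* UAPI: struct __sk_buff */

/* noinline keeps clamp_len() a separate bpf function instead of the
 * always_inline workaround described above. */
static __attribute__((noinline)) int clamp_len(int len)
{
	return len > 256 ? 256 : len;
}

int classifier(struct __sk_buff *skb)
{
	/* compiles to BPF_JMP|BPF_CALL with src_reg == BPF_PSEUDO_CALL
	 * and a pc-relative imm, which check_subprogs() validates */
	return clamp_len(skb->len);
}

At this stage the verifier only detects the function boundaries and
checks the calls; full cross-function verification arrives in the next
patch.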
Signed-off-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: Daniel Borkmann --- include/linux/bpf_verifier.h | 5 +- include/uapi/linux/bpf.h | 6 ++ kernel/bpf/disasm.c | 8 ++- kernel/bpf/verifier.c | 141 ++++++++++++++++++++++++++++++++++++++++++- 4 files changed, 155 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index c561b986bab0..91a583bb3fa7 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -141,6 +141,8 @@ struct bpf_ext_analyzer_ops { int insn_idx, int prev_insn_idx); }; +#define BPF_MAX_SUBPROGS 256 + /* single container for all structs * one verifier_env per bpf_check() call */ @@ -159,8 +161,9 @@ struct bpf_verifier_env { bool allow_ptr_leaks; bool seen_direct_write; struct bpf_insn_aux_data *insn_aux_data; /* array of per-insn state */ - struct bpf_verifer_log log; + u32 subprog_starts[BPF_MAX_SUBPROGS]; + u32 subprog_cnt; }; static inline struct bpf_reg_state *cur_regs(struct bpf_verifier_env *env) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 595bda120cfb..d01f1cb3cfc0 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -197,8 +197,14 @@ enum bpf_attach_type { */ #define BPF_F_STRICT_ALIGNMENT (1U << 0) +/* when bpf_ldimm64->src_reg == BPF_PSEUDO_MAP_FD, bpf_ldimm64->imm == fd */ #define BPF_PSEUDO_MAP_FD 1 +/* when bpf_call->src_reg == BPF_PSEUDO_CALL, bpf_call->imm == pc-relative + * offset to another bpf function + */ +#define BPF_PSEUDO_CALL 1 + /* flags for BPF_MAP_UPDATE_ELEM command */ #define BPF_ANY 0 /* create new element or update existing */ #define BPF_NOEXIST 1 /* create new element if it didn't exist */ diff --git a/kernel/bpf/disasm.c b/kernel/bpf/disasm.c index e682850c9715..883f88fa5bfc 100644 --- a/kernel/bpf/disasm.c +++ b/kernel/bpf/disasm.c @@ -189,8 +189,12 @@ void print_bpf_insn(bpf_insn_print_cb verbose, struct bpf_verifier_env *env, u8 opcode = BPF_OP(insn->code); if (opcode == BPF_CALL) { - verbose(env, "(%02x) call %s#%d\n", insn->code, - func_id_name(insn->imm), insn->imm); + if (insn->src_reg == BPF_PSEUDO_CALL) + verbose(env, "(%02x) call pc%+d\n", insn->code, + insn->imm); + else + verbose(env, "(%02x) call %s#%d\n", insn->code, + func_id_name(insn->imm), insn->imm); } else if (insn->code == (BPF_JMP | BPF_JA)) { verbose(env, "(%02x) goto pc%+d\n", insn->code, insn->off); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index e807bda7fe29..1d0f7ff0b9a9 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -20,6 +20,8 @@ #include #include #include +#include +#include #include "disasm.h" @@ -636,6 +638,113 @@ enum reg_arg_type { DST_OP_NO_MARK /* same as above, check only, don't mark */ }; +static int cmp_subprogs(const void *a, const void *b) +{ + return *(int *)a - *(int *)b; +} + +static int find_subprog(struct bpf_verifier_env *env, int off) +{ + u32 *p; + + p = bsearch(&off, env->subprog_starts, env->subprog_cnt, + sizeof(env->subprog_starts[0]), cmp_subprogs); + if (!p) + return -ENOENT; + return p - env->subprog_starts; + +} + +static int add_subprog(struct bpf_verifier_env *env, int off) +{ + int insn_cnt = env->prog->len; + int ret; + + if (off >= insn_cnt || off < 0) { + verbose(env, "call to invalid destination\n"); + return -EINVAL; + } + ret = find_subprog(env, off); + if (ret >= 0) + return 0; + if (env->subprog_cnt >= BPF_MAX_SUBPROGS) { + verbose(env, "too many subprograms\n"); + return -E2BIG; + } + env->subprog_starts[env->subprog_cnt++] = 
off; + sort(env->subprog_starts, env->subprog_cnt, + sizeof(env->subprog_starts[0]), cmp_subprogs, NULL); + return 0; +} + +static int check_subprogs(struct bpf_verifier_env *env) +{ + int i, ret, subprog_start, subprog_end, off, cur_subprog = 0; + struct bpf_insn *insn = env->prog->insnsi; + int insn_cnt = env->prog->len; + + /* determine subprog starts. The end is one before the next starts */ + for (i = 0; i < insn_cnt; i++) { + if (insn[i].code != (BPF_JMP | BPF_CALL)) + continue; + if (insn[i].src_reg != BPF_PSEUDO_CALL) + continue; + if (!env->allow_ptr_leaks) { + verbose(env, "function calls to other bpf functions are allowed for root only\n"); + return -EPERM; + } + if (bpf_prog_is_dev_bound(env->prog->aux)) { + verbose(env, "funcation calls in offloaded programs are not supported yet\n"); + return -EINVAL; + } + ret = add_subprog(env, i + insn[i].imm + 1); + if (ret < 0) + return ret; + } + + if (env->log.level > 1) + for (i = 0; i < env->subprog_cnt; i++) + verbose(env, "func#%d @%d\n", i, env->subprog_starts[i]); + + /* now check that all jumps are within the same subprog */ + subprog_start = 0; + if (env->subprog_cnt == cur_subprog) + subprog_end = insn_cnt; + else + subprog_end = env->subprog_starts[cur_subprog++]; + for (i = 0; i < insn_cnt; i++) { + u8 code = insn[i].code; + + if (BPF_CLASS(code) != BPF_JMP) + goto next; + if (BPF_OP(code) == BPF_EXIT || BPF_OP(code) == BPF_CALL) + goto next; + off = i + insn[i].off + 1; + if (off < subprog_start || off >= subprog_end) { + verbose(env, "jump out of range from insn %d to %d\n", i, off); + return -EINVAL; + } +next: + if (i == subprog_end - 1) { + /* to avoid fall-through from one subprog into another + * the last insn of the subprog should be either exit + * or unconditional jump back + */ + if (code != (BPF_JMP | BPF_EXIT) && + code != (BPF_JMP | BPF_JA)) { + verbose(env, "last insn is not an exit or jmp\n"); + return -EINVAL; + } + subprog_start = subprog_end; + if (env->subprog_cnt == cur_subprog) + subprog_end = insn_cnt; + else + subprog_end = env->subprog_starts[cur_subprog++]; + } + } + return 0; +} + static void mark_reg_read(const struct bpf_verifier_state *state, u32 regno) { struct bpf_verifier_state *parent = state->parent; @@ -3284,6 +3393,10 @@ static int check_cfg(struct bpf_verifier_env *env) int ret = 0; int i, t; + ret = check_subprogs(env); + if (ret < 0) + return ret; + insn_state = kcalloc(insn_cnt, sizeof(int), GFP_KERNEL); if (!insn_state) return -ENOMEM; @@ -3316,6 +3429,14 @@ peek_stack: goto err_free; if (t + 1 < insn_cnt) env->explored_states[t + 1] = STATE_LIST_MARK; + if (insns[t].src_reg == BPF_PSEUDO_CALL) { + env->explored_states[t] = STATE_LIST_MARK; + ret = push_insn(t, t + insns[t].imm + 1, BRANCH, env); + if (ret == 1) + goto peek_stack; + else if (ret < 0) + goto err_free; + } } else if (opcode == BPF_JA) { if (BPF_SRC(insns[t].code) != BPF_K) { ret = -EINVAL; @@ -4245,6 +4366,19 @@ static int adjust_insn_aux_data(struct bpf_verifier_env *env, u32 prog_len, return 0; } +static void adjust_subprog_starts(struct bpf_verifier_env *env, u32 off, u32 len) +{ + int i; + + if (len == 1) + return; + for (i = 0; i < env->subprog_cnt; i++) { + if (env->subprog_starts[i] < off) + continue; + env->subprog_starts[i] += len - 1; + } +} + static struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 off, const struct bpf_insn *patch, u32 len) { @@ -4255,6 +4389,7 @@ static struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 of return NULL; if (adjust_insn_aux_data(env, 
new_prog->len, off, len))
 		return NULL;
+	adjust_subprog_starts(env, off, len);
 	return new_prog;
 }
 
@@ -4408,6 +4543,8 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
 	for (i = 0; i < insn_cnt; i++, insn++) {
 		if (insn->code != (BPF_JMP | BPF_CALL))
 			continue;
+		if (insn->src_reg == BPF_PSEUDO_CALL)
+			continue;
 
 		if (insn->imm == BPF_FUNC_get_route_realm)
 			prog->dst_needed = 1;
@@ -4589,12 +4726,12 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr)
 	if (!env->explored_states)
 		goto skip_full_check;
 
+	env->allow_ptr_leaks = capable(CAP_SYS_ADMIN);
+
 	ret = check_cfg(env);
 	if (ret < 0)
 		goto skip_full_check;
 
-	env->allow_ptr_leaks = capable(CAP_SYS_ADMIN);
-
 	ret = do_check(env);
 	if (env->cur_state) {
 		free_verifier_state(env->cur_state, true);
--
cgit v1.2.3


From f4d7e40a5b7157e1329c3c5b10f60d8289fc2941 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov
Date: Thu, 14 Dec 2017 17:55:06 -0800
Subject: bpf: introduce function calls (verification)

Allow arbitrary function calls from one bpf function to another bpf
function. To recognize such a set of bpf functions the verifier:

1. runs control flow analysis to detect function boundaries
2. proceeds with verification of all functions starting from the
   main (root) function. It recognizes that the stack of the caller
   can be accessed by the callee (if the caller passed a pointer to
   its stack to the callee) and the callee can store map_value and
   other pointers into the stack of the caller.
3. keeps track of the stack_depth of each function to make sure that
   total stack depth is still less than 512 bytes
4. disallows pointers to the callee stack to be stored into the caller
   stack, since they will be invalid as soon as the callee returns
5. to reuse all of the existing state_pruning logic each function call
   is considered to be an independent call from the verifier point of
   view. The verifier pretends to inline all function calls it sees.
   It stores the callsite instruction index as part of the state to
   make sure that two calls to the same callee from two different
   places in the caller will be different from the state pruning point
   of view
6. more safety checks are added to liveness analysis

Implementation details:
. struct bpf_verifier_state now consists of all stack frames that led
  to this function
. struct bpf_func_state represents one stack frame. It consists of
  registers in the given frame and its stack
. propagate_liveness() logic had a premature optimization where
  mark_reg_read() and mark_stack_slot_read() were manually inlined
  with a loop iterating over parents for each register or stack slot.
  Undo this optimization to reuse the more complex mark_*_read() logic
. skip_callee() logic is not necessary from a safety point of view,
  but without it mark_*_read() markings become too conservative, since
  after returning from the function call a read of r6-r9 will
  incorrectly propagate the read marks into the callee, causing
  inefficient pruning later
. mark_*_read() logic is now aware of control flow, which makes it
  more complex. In the future the plan is to rewrite liveness to be
  hierarchical, so that liveness can be done within a basic block only
  and control flow will be responsible for propagation of liveness
  information along the cfg and between calls
. tail_calls and ld_abs insns are not allowed in programs with
  bpf-to-bpf calls
. returning stack pointers to the caller or storing them into the
  stack frame of the caller is not allowed

Testing:
. no difference in cilium processed_insn numbers
.
large number of tests follows in next patches Signed-off-by: Alexei Starovoitov Acked-by: John Fastabend Acked-by: Daniel Borkmann Signed-off-by: Daniel Borkmann --- include/linux/bpf_verifier.h | 36 ++- kernel/bpf/verifier.c | 702 +++++++++++++++++++++++++++++++++---------- 2 files changed, 585 insertions(+), 153 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 91a583bb3fa7..1f23408024ee 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -76,6 +76,14 @@ struct bpf_reg_state { s64 smax_value; /* maximum possible (s64)value */ u64 umin_value; /* minimum possible (u64)value */ u64 umax_value; /* maximum possible (u64)value */ + /* Inside the callee two registers can be both PTR_TO_STACK like + * R1=fp-8 and R2=fp-8, but one of them points to this function stack + * while another to the caller's stack. To differentiate them 'frameno' + * is used which is an index in bpf_verifier_state->frame[] array + * pointing to bpf_func_state. + * This field must be second to last, for states_equal() reasons. + */ + u32 frameno; /* This field must be last, for states_equal() reasons. */ enum bpf_reg_liveness live; }; @@ -96,13 +104,34 @@ struct bpf_stack_state { /* state of the program: * type of all registers and stack info */ -struct bpf_verifier_state { +struct bpf_func_state { struct bpf_reg_state regs[MAX_BPF_REG]; struct bpf_verifier_state *parent; + /* index of call instruction that called into this func */ + int callsite; + /* stack frame number of this function state from pov of + * enclosing bpf_verifier_state. + * 0 = main function, 1 = first callee. + */ + u32 frameno; + /* subprog number == index within subprog_stack_depth + * zero == main subprog + */ + u32 subprogno; + + /* should be second to last. 
See copy_func_state() */ int allocated_stack; struct bpf_stack_state *stack; }; +#define MAX_CALL_FRAMES 8 +struct bpf_verifier_state { + /* call stack tracking */ + struct bpf_func_state *frame[MAX_CALL_FRAMES]; + struct bpf_verifier_state *parent; + u32 curframe; +}; + /* linked list of verifier states used to prune search */ struct bpf_verifier_state_list { struct bpf_verifier_state state; @@ -163,12 +192,15 @@ struct bpf_verifier_env { struct bpf_insn_aux_data *insn_aux_data; /* array of per-insn state */ struct bpf_verifer_log log; u32 subprog_starts[BPF_MAX_SUBPROGS]; + u16 subprog_stack_depth[BPF_MAX_SUBPROGS + 1]; u32 subprog_cnt; }; static inline struct bpf_reg_state *cur_regs(struct bpf_verifier_env *env) { - return env->cur_state->regs; + struct bpf_verifier_state *cur = env->cur_state; + + return cur->frame[cur->curframe]->regs; } #if defined(CONFIG_NET) && defined(CONFIG_BPF_SYSCALL) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 1d0f7ff0b9a9..f6e09d84a96f 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -229,13 +229,23 @@ static void print_liveness(struct bpf_verifier_env *env, verbose(env, "w"); } +static struct bpf_func_state *func(struct bpf_verifier_env *env, + const struct bpf_reg_state *reg) +{ + struct bpf_verifier_state *cur = env->cur_state; + + return cur->frame[reg->frameno]; +} + static void print_verifier_state(struct bpf_verifier_env *env, - struct bpf_verifier_state *state) + const struct bpf_func_state *state) { - struct bpf_reg_state *reg; + const struct bpf_reg_state *reg; enum bpf_reg_type t; int i; + if (state->frameno) + verbose(env, " frame%d:", state->frameno); for (i = 0; i < MAX_BPF_REG; i++) { reg = &state->regs[i]; t = reg->type; @@ -248,6 +258,8 @@ static void print_verifier_state(struct bpf_verifier_env *env, tnum_is_const(reg->var_off)) { /* reg->off should be 0 for SCALAR_VALUE */ verbose(env, "%lld", reg->var_off.value + reg->off); + if (t == PTR_TO_STACK) + verbose(env, ",call_%d", func(env, reg)->callsite); } else { verbose(env, "(id=%d", reg->id); if (t != SCALAR_VALUE) @@ -303,8 +315,8 @@ static void print_verifier_state(struct bpf_verifier_env *env, verbose(env, "\n"); } -static int copy_stack_state(struct bpf_verifier_state *dst, - const struct bpf_verifier_state *src) +static int copy_stack_state(struct bpf_func_state *dst, + const struct bpf_func_state *src) { if (!src->stack) return 0; @@ -320,13 +332,13 @@ static int copy_stack_state(struct bpf_verifier_state *dst, /* do_check() starts with zero-sized stack in struct bpf_verifier_state to * make it consume minimal amount of memory. check_stack_write() access from - * the program calls into realloc_verifier_state() to grow the stack size. + * the program calls into realloc_func_state() to grow the stack size. * Note there is a non-zero 'parent' pointer inside bpf_verifier_state * which this function copies over. 
It points to previous bpf_verifier_state * which is never reallocated */ -static int realloc_verifier_state(struct bpf_verifier_state *state, int size, - bool copy_old) +static int realloc_func_state(struct bpf_func_state *state, int size, + bool copy_old) { u32 old_size = state->allocated_stack; struct bpf_stack_state *new_stack; @@ -359,10 +371,21 @@ static int realloc_verifier_state(struct bpf_verifier_state *state, int size, return 0; } +static void free_func_state(struct bpf_func_state *state) +{ + kfree(state->stack); + kfree(state); +} + static void free_verifier_state(struct bpf_verifier_state *state, bool free_self) { - kfree(state->stack); + int i; + + for (i = 0; i <= state->curframe; i++) { + free_func_state(state->frame[i]); + state->frame[i] = NULL; + } if (free_self) kfree(state); } @@ -370,18 +393,46 @@ static void free_verifier_state(struct bpf_verifier_state *state, /* copy verifier state from src to dst growing dst stack space * when necessary to accommodate larger src stack */ -static int copy_verifier_state(struct bpf_verifier_state *dst, - const struct bpf_verifier_state *src) +static int copy_func_state(struct bpf_func_state *dst, + const struct bpf_func_state *src) { int err; - err = realloc_verifier_state(dst, src->allocated_stack, false); + err = realloc_func_state(dst, src->allocated_stack, false); if (err) return err; - memcpy(dst, src, offsetof(struct bpf_verifier_state, allocated_stack)); + memcpy(dst, src, offsetof(struct bpf_func_state, allocated_stack)); return copy_stack_state(dst, src); } +static int copy_verifier_state(struct bpf_verifier_state *dst_state, + const struct bpf_verifier_state *src) +{ + struct bpf_func_state *dst; + int i, err; + + /* if dst has more stack frames then src frame, free them */ + for (i = src->curframe + 1; i <= dst_state->curframe; i++) { + free_func_state(dst_state->frame[i]); + dst_state->frame[i] = NULL; + } + dst_state->curframe = src->curframe; + dst_state->parent = src->parent; + for (i = 0; i <= src->curframe; i++) { + dst = dst_state->frame[i]; + if (!dst) { + dst = kzalloc(sizeof(*dst), GFP_KERNEL); + if (!dst) + return -ENOMEM; + dst_state->frame[i] = dst; + } + err = copy_func_state(dst, src->frame[i]); + if (err) + return err; + } + return 0; +} + static int pop_stack(struct bpf_verifier_env *env, int *prev_insn_idx, int *insn_idx) { @@ -443,6 +494,10 @@ err: static const int caller_saved[CALLER_SAVED_REGS] = { BPF_REG_0, BPF_REG_1, BPF_REG_2, BPF_REG_3, BPF_REG_4, BPF_REG_5 }; +#define CALLEE_SAVED_REGS 5 +static const int callee_saved[CALLEE_SAVED_REGS] = { + BPF_REG_6, BPF_REG_7, BPF_REG_8, BPF_REG_9 +}; static void __mark_reg_not_init(struct bpf_reg_state *reg); @@ -578,6 +633,7 @@ static void __mark_reg_unknown(struct bpf_reg_state *reg) reg->id = 0; reg->off = 0; reg->var_off = tnum_unknown; + reg->frameno = 0; __mark_reg_unbounded(reg); } @@ -614,8 +670,9 @@ static void mark_reg_not_init(struct bpf_verifier_env *env, } static void init_reg_state(struct bpf_verifier_env *env, - struct bpf_reg_state *regs) + struct bpf_func_state *state) { + struct bpf_reg_state *regs = state->regs; int i; for (i = 0; i < MAX_BPF_REG; i++) { @@ -626,12 +683,24 @@ static void init_reg_state(struct bpf_verifier_env *env, /* frame pointer */ regs[BPF_REG_FP].type = PTR_TO_STACK; mark_reg_known_zero(env, regs, BPF_REG_FP); + regs[BPF_REG_FP].frameno = state->frameno; /* 1st arg to a function */ regs[BPF_REG_1].type = PTR_TO_CTX; mark_reg_known_zero(env, regs, BPF_REG_1); } +#define BPF_MAIN_FUNC (-1) +static void 
init_func_state(struct bpf_verifier_env *env, + struct bpf_func_state *state, + int callsite, int frameno, int subprogno) +{ + state->callsite = callsite; + state->frameno = frameno; + state->subprogno = subprogno; + init_reg_state(env, state); +} + enum reg_arg_type { SRC_OP, /* register is used as source operand */ DST_OP, /* register is used as destination operand */ @@ -745,29 +814,86 @@ next: return 0; } -static void mark_reg_read(const struct bpf_verifier_state *state, u32 regno) +struct bpf_verifier_state *skip_callee(struct bpf_verifier_env *env, + const struct bpf_verifier_state *state, + struct bpf_verifier_state *parent, + u32 regno) { - struct bpf_verifier_state *parent = state->parent; + struct bpf_verifier_state *tmp = NULL; + + /* 'parent' could be a state of caller and + * 'state' could be a state of callee. In such case + * parent->curframe < state->curframe + * and it's ok for r1 - r5 registers + * + * 'parent' could be a callee's state after it bpf_exit-ed. + * In such case parent->curframe > state->curframe + * and it's ok for r0 only + */ + if (parent->curframe == state->curframe || + (parent->curframe < state->curframe && + regno >= BPF_REG_1 && regno <= BPF_REG_5) || + (parent->curframe > state->curframe && + regno == BPF_REG_0)) + return parent; + + if (parent->curframe > state->curframe && + regno >= BPF_REG_6) { + /* for callee saved regs we have to skip the whole chain + * of states that belong to callee and mark as LIVE_READ + * the registers before the call + */ + tmp = parent; + while (tmp && tmp->curframe != state->curframe) { + tmp = tmp->parent; + } + if (!tmp) + goto bug; + parent = tmp; + } else { + goto bug; + } + return parent; +bug: + verbose(env, "verifier bug regno %d tmp %p\n", regno, tmp); + verbose(env, "regno %d parent frame %d current frame %d\n", + regno, parent->curframe, state->curframe); + return 0; +} + +static int mark_reg_read(struct bpf_verifier_env *env, + const struct bpf_verifier_state *state, + struct bpf_verifier_state *parent, + u32 regno) +{ + bool writes = parent == state->parent; /* Observe write marks */ if (regno == BPF_REG_FP) /* We don't need to worry about FP liveness because it's read-only */ - return; + return 0; while (parent) { /* if read wasn't screened by an earlier write ... */ - if (state->regs[regno].live & REG_LIVE_WRITTEN) + if (writes && state->frame[state->curframe]->regs[regno].live & REG_LIVE_WRITTEN) break; + parent = skip_callee(env, state, parent, regno); + if (!parent) + return -EFAULT; /* ... 
then we depend on parent's value */ - parent->regs[regno].live |= REG_LIVE_READ; + parent->frame[parent->curframe]->regs[regno].live |= REG_LIVE_READ; state = parent; parent = state->parent; + writes = true; } + return 0; } static int check_reg_arg(struct bpf_verifier_env *env, u32 regno, enum reg_arg_type t) { - struct bpf_reg_state *regs = env->cur_state->regs; + struct bpf_verifier_state *vstate = env->cur_state; + struct bpf_func_state *state = vstate->frame[vstate->curframe]; + struct bpf_reg_state *regs = state->regs; if (regno >= MAX_BPF_REG) { verbose(env, "R%d is invalid\n", regno); @@ -780,7 +906,7 @@ static int check_reg_arg(struct bpf_verifier_env *env, u32 regno, verbose(env, "R%d !read_ok\n", regno); return -EACCES; } - mark_reg_read(env->cur_state, regno); + return mark_reg_read(env, vstate, vstate->parent, regno); } else { /* check whether register used as dest operand can be written to */ if (regno == BPF_REG_FP) { @@ -815,13 +941,15 @@ static bool is_spillable_regtype(enum bpf_reg_type type) * stack boundary and alignment are checked in check_mem_access() */ static int check_stack_write(struct bpf_verifier_env *env, - struct bpf_verifier_state *state, int off, - int size, int value_regno) + struct bpf_func_state *state, /* func where register points to */ + int off, int size, int value_regno) { + struct bpf_func_state *cur; /* state of the current function */ int i, slot = -off - 1, spi = slot / BPF_REG_SIZE, err; + enum bpf_reg_type type; - err = realloc_verifier_state(state, round_up(slot + 1, BPF_REG_SIZE), - true); + err = realloc_func_state(state, round_up(slot + 1, BPF_REG_SIZE), + true); if (err) return err; /* caller checked that off % size == 0 and -MAX_BPF_STACK <= off < 0, @@ -834,8 +962,9 @@ static int check_stack_write(struct bpf_verifier_env *env, return -EACCES; } + cur = env->cur_state->frame[env->cur_state->curframe]; if (value_regno >= 0 && - is_spillable_regtype(state->regs[value_regno].type)) { + is_spillable_regtype((type = cur->regs[value_regno].type))) { /* register containing pointer is being spilled into stack */ if (size != BPF_REG_SIZE) { @@ -843,8 +972,13 @@ static int check_stack_write(struct bpf_verifier_env *env, return -EACCES; } + if (state != cur && type == PTR_TO_STACK) { + verbose(env, "cannot spill pointers to stack into stack frame of the caller\n"); + return -EINVAL; + } + /* save register state */ - state->stack[spi].spilled_ptr = state->regs[value_regno]; + state->stack[spi].spilled_ptr = cur->regs[value_regno]; state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN; for (i = 0; i < BPF_REG_SIZE; i++) @@ -860,34 +994,68 @@ static int check_stack_write(struct bpf_verifier_env *env, return 0; } -static void mark_stack_slot_read(const struct bpf_verifier_state *state, int slot) +/* registers of every function are unique and mark_reg_read() propagates + * the liveness in the following cases: + * - from callee into caller for R1 - R5 that were used as arguments + * - from caller into callee for R0 that used as result of the call + * - from caller to the same caller skipping states of the callee for R6 - R9, + * since R6 - R9 are callee saved by implicit function prologue and + * caller's R6 != callee's R6, so when we propagate liveness up to + * parent states we need to skip callee states for R6 - R9. 
+ * + * stack slot marking is different, since stacks of caller and callee are + * accessible in both (since caller can pass a pointer to caller's stack to + * callee which can pass it to another function), hence mark_stack_slot_read() + * has to propagate the stack liveness to all parent states at given frame number. + * Consider code: + * f1() { + * ptr = fp - 8; + * *ptr = ctx; + * call f2 { + * .. = *ptr; + * } + * .. = *ptr; + * } + * First *ptr is reading from f1's stack and mark_stack_slot_read() has + * to mark liveness at the f1's frame and not f2's frame. + * Second *ptr is also reading from f1's stack and mark_stack_slot_read() has + * to propagate liveness to f2 states at f1's frame level and further into + * f1 states at f1's frame level until write into that stack slot + */ +static void mark_stack_slot_read(struct bpf_verifier_env *env, + const struct bpf_verifier_state *state, + struct bpf_verifier_state *parent, + int slot, int frameno) { - struct bpf_verifier_state *parent = state->parent; + bool writes = parent == state->parent; /* Observe write marks */ while (parent) { /* if read wasn't screened by an earlier write ... */ - if (state->stack[slot].spilled_ptr.live & REG_LIVE_WRITTEN) + if (writes && state->frame[frameno]->stack[slot].spilled_ptr.live & REG_LIVE_WRITTEN) break; /* ... then we depend on parent's value */ - parent->stack[slot].spilled_ptr.live |= REG_LIVE_READ; + parent->frame[frameno]->stack[slot].spilled_ptr.live |= REG_LIVE_READ; state = parent; parent = state->parent; + writes = true; } } static int check_stack_read(struct bpf_verifier_env *env, - struct bpf_verifier_state *state, int off, int size, - int value_regno) + struct bpf_func_state *reg_state /* func where register points to */, + int off, int size, int value_regno) { + struct bpf_verifier_state *vstate = env->cur_state; + struct bpf_func_state *state = vstate->frame[vstate->curframe]; int i, slot = -off - 1, spi = slot / BPF_REG_SIZE; u8 *stype; - if (state->allocated_stack <= slot) { + if (reg_state->allocated_stack <= slot) { verbose(env, "invalid read from stack off %d+0 size %d\n", off, size); return -EACCES; } - stype = state->stack[spi].slot_type; + stype = reg_state->stack[spi].slot_type; if (stype[0] == STACK_SPILL) { if (size != BPF_REG_SIZE) { @@ -903,13 +1071,14 @@ static int check_stack_read(struct bpf_verifier_env *env, if (value_regno >= 0) { /* restore register state from stack */ - state->regs[value_regno] = state->stack[spi].spilled_ptr; + state->regs[value_regno] = reg_state->stack[spi].spilled_ptr; /* mark reg as written since spilled pointer state likely * has its liveness marks cleared by is_state_visited() * which resets stack/reg liveness for state transitions */ state->regs[value_regno].live |= REG_LIVE_WRITTEN; - mark_stack_slot_read(state, spi); + mark_stack_slot_read(env, vstate, vstate->parent, spi, + reg_state->frameno); } return 0; } else { @@ -947,7 +1116,8 @@ static int __check_map_access(struct bpf_verifier_env *env, u32 regno, int off, static int check_map_access(struct bpf_verifier_env *env, u32 regno, int off, int size, bool zero_size_allowed) { - struct bpf_verifier_state *state = env->cur_state; + struct bpf_verifier_state *vstate = env->cur_state; + struct bpf_func_state *state = vstate->frame[vstate->curframe]; struct bpf_reg_state *reg = &state->regs[regno]; int err; @@ -1197,6 +1367,39 @@ static int check_ptr_alignment(struct bpf_verifier_env *env, strict); } +static int update_stack_depth(struct bpf_verifier_env *env, + const struct bpf_func_state 
*func, + int off) +{ + u16 stack = env->subprog_stack_depth[func->subprogno], total = 0; + struct bpf_verifier_state *cur = env->cur_state; + int i; + + if (stack >= -off) + return 0; + + /* update known max for given subprogram */ + env->subprog_stack_depth[func->subprogno] = -off; + + /* compute the total for current call chain */ + for (i = 0; i <= cur->curframe; i++) { + u32 depth = env->subprog_stack_depth[cur->frame[i]->subprogno]; + + /* round up to 32-bytes, since this is granularity + * of interpreter stack sizes + */ + depth = round_up(depth, 32); + total += depth; + } + + if (total > MAX_BPF_STACK) { + verbose(env, "combined stack size of %d calls is %d. Too large\n", + cur->curframe, total); + return -EACCES; + } + return 0; +} + /* check whether memory at (regno + off) is accessible for t = (read | write) * if t==write, value_regno is a register which value is stored into memory * if t==read, value_regno is a register which will receive the value from memory @@ -1207,9 +1410,9 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn int bpf_size, enum bpf_access_type t, int value_regno) { - struct bpf_verifier_state *state = env->cur_state; struct bpf_reg_state *regs = cur_regs(env); struct bpf_reg_state *reg = regs + regno; + struct bpf_func_state *state; int size, err = 0; size = bpf_size_to_bytes(bpf_size); @@ -1298,8 +1501,10 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn return -EACCES; } - if (env->prog->aux->stack_depth < -off) - env->prog->aux->stack_depth = -off; + state = func(env, reg); + err = update_stack_depth(env, state, off); + if (err) + return err; if (t == BPF_WRITE) err = check_stack_write(env, state, off, size, @@ -1390,7 +1595,7 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno, struct bpf_call_arg_meta *meta) { struct bpf_reg_state *reg = cur_regs(env) + regno; - struct bpf_verifier_state *state = env->cur_state; + struct bpf_func_state *state = func(env, reg); int off, i, slot, spi; if (reg->type != PTR_TO_STACK) { @@ -1421,9 +1626,6 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno, return -EACCES; } - if (env->prog->aux->stack_depth < -off) - env->prog->aux->stack_depth = -off; - if (meta && meta->raw_mode) { meta->access_size = access_size; meta->regno = regno; @@ -1441,7 +1643,7 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno, return -EACCES; } } - return 0; + return update_stack_depth(env, state, off); } static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, @@ -1694,6 +1896,10 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env, case BPF_FUNC_tail_call: if (map->map_type != BPF_MAP_TYPE_PROG_ARRAY) goto error; + if (env->subprog_cnt) { + verbose(env, "tail_calls are not allowed in programs with bpf-to-bpf calls\n"); + return -EINVAL; + } break; case BPF_FUNC_perf_event_read: case BPF_FUNC_perf_event_output: @@ -1755,9 +1961,9 @@ static int check_raw_mode(const struct bpf_func_proto *fn) /* Packet data might have moved, any old PTR_TO_PACKET[_META,_END] * are now invalid, so turn them into unknown SCALAR_VALUE. 
*/ -static void clear_all_pkt_pointers(struct bpf_verifier_env *env) +static void __clear_all_pkt_pointers(struct bpf_verifier_env *env, + struct bpf_func_state *state) { - struct bpf_verifier_state *state = env->cur_state; struct bpf_reg_state *regs = state->regs, *reg; int i; @@ -1774,7 +1980,121 @@ static void clear_all_pkt_pointers(struct bpf_verifier_env *env) } } -static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx) +static void clear_all_pkt_pointers(struct bpf_verifier_env *env) +{ + struct bpf_verifier_state *vstate = env->cur_state; + int i; + + for (i = 0; i <= vstate->curframe; i++) + __clear_all_pkt_pointers(env, vstate->frame[i]); +} + +static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn, + int *insn_idx) +{ + struct bpf_verifier_state *state = env->cur_state; + struct bpf_func_state *caller, *callee; + int i, subprog, target_insn; + + if (state->curframe >= MAX_CALL_FRAMES) { + verbose(env, "the call stack of %d frames is too deep\n", + state->curframe); + return -E2BIG; + } + + target_insn = *insn_idx + insn->imm; + subprog = find_subprog(env, target_insn + 1); + if (subprog < 0) { + verbose(env, "verifier bug. No program starts at insn %d\n", + target_insn + 1); + return -EFAULT; + } + + caller = state->frame[state->curframe]; + if (state->frame[state->curframe + 1]) { + verbose(env, "verifier bug. Frame %d already allocated\n", + state->curframe + 1); + return -EFAULT; + } + + callee = kzalloc(sizeof(*callee), GFP_KERNEL); + if (!callee) + return -ENOMEM; + state->frame[state->curframe + 1] = callee; + + /* callee cannot access r0, r6 - r9 for reading and has to write + * into its own stack before reading from it. + * callee can read/write into caller's stack + */ + init_func_state(env, callee, + /* remember the callsite, it will be used by bpf_exit */ + *insn_idx /* callsite */, + state->curframe + 1 /* frameno within this callchain */, + subprog + 1 /* subprog number within this prog */); + + /* copy r1 - r5 args that callee can access */ + for (i = BPF_REG_1; i <= BPF_REG_5; i++) + callee->regs[i] = caller->regs[i]; + + /* after the call regsiters r0 - r5 were scratched */ + for (i = 0; i < CALLER_SAVED_REGS; i++) { + mark_reg_not_init(env, caller->regs, caller_saved[i]); + check_reg_arg(env, caller_saved[i], DST_OP_NO_MARK); + } + + /* only increment it after check_reg_arg() finished */ + state->curframe++; + + /* and go analyze first insn of the callee */ + *insn_idx = target_insn; + + if (env->log.level) { + verbose(env, "caller:\n"); + print_verifier_state(env, caller); + verbose(env, "callee:\n"); + print_verifier_state(env, callee); + } + return 0; +} + +static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx) +{ + struct bpf_verifier_state *state = env->cur_state; + struct bpf_func_state *caller, *callee; + struct bpf_reg_state *r0; + + callee = state->frame[state->curframe]; + r0 = &callee->regs[BPF_REG_0]; + if (r0->type == PTR_TO_STACK) { + /* technically it's ok to return caller's stack pointer + * (or caller's caller's pointer) back to the caller, + * since these pointers are valid. 
Only current stack + * pointer will be invalid as soon as function exits, + * but let's be conservative + */ + verbose(env, "cannot return stack pointer to the caller\n"); + return -EINVAL; + } + + state->curframe--; + caller = state->frame[state->curframe]; + /* return to the caller whatever r0 had in the callee */ + caller->regs[BPF_REG_0] = *r0; + + *insn_idx = callee->callsite + 1; + if (env->log.level) { + verbose(env, "returning from callee:\n"); + print_verifier_state(env, callee); + verbose(env, "to caller at %d:\n", *insn_idx); + print_verifier_state(env, caller); + } + /* clear everything in the callee */ + free_func_state(callee); + state->frame[state->curframe + 1] = NULL; + return 0; +} + +static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn_idx) { const struct bpf_func_proto *fn = NULL; struct bpf_reg_state *regs; @@ -1934,7 +2254,9 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, const struct bpf_reg_state *ptr_reg, const struct bpf_reg_state *off_reg) { - struct bpf_reg_state *regs = cur_regs(env), *dst_reg; + struct bpf_verifier_state *vstate = env->cur_state; + struct bpf_func_state *state = vstate->frame[vstate->curframe]; + struct bpf_reg_state *regs = state->regs, *dst_reg; bool known = tnum_is_const(off_reg->var_off); s64 smin_val = off_reg->smin_value, smax_val = off_reg->smax_value, smin_ptr = ptr_reg->smin_value, smax_ptr = ptr_reg->smax_value; @@ -1946,13 +2268,13 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, dst_reg = ®s[dst]; if (WARN_ON_ONCE(known && (smin_val != smax_val))) { - print_verifier_state(env, env->cur_state); + print_verifier_state(env, state); verbose(env, "verifier internal error: known but bad sbounds\n"); return -EINVAL; } if (WARN_ON_ONCE(known && (umin_val != umax_val))) { - print_verifier_state(env, env->cur_state); + print_verifier_state(env, state); verbose(env, "verifier internal error: known but bad ubounds\n"); return -EINVAL; @@ -2354,7 +2676,9 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, static int adjust_reg_min_max_vals(struct bpf_verifier_env *env, struct bpf_insn *insn) { - struct bpf_reg_state *regs = cur_regs(env), *dst_reg, *src_reg; + struct bpf_verifier_state *vstate = env->cur_state; + struct bpf_func_state *state = vstate->frame[vstate->curframe]; + struct bpf_reg_state *regs = state->regs, *dst_reg, *src_reg; struct bpf_reg_state *ptr_reg = NULL, off_reg = {0}; u8 opcode = BPF_OP(insn->code); int rc; @@ -2428,12 +2752,12 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env, /* Got here implies adding two SCALAR_VALUEs */ if (WARN_ON_ONCE(ptr_reg)) { - print_verifier_state(env, env->cur_state); + print_verifier_state(env, state); verbose(env, "verifier internal error: unexpected ptr_reg\n"); return -EINVAL; } if (WARN_ON(!src_reg)) { - print_verifier_state(env, env->cur_state); + print_verifier_state(env, state); verbose(env, "verifier internal error: no src_reg\n"); return -EINVAL; } @@ -2587,14 +2911,15 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) return 0; } -static void find_good_pkt_pointers(struct bpf_verifier_state *state, +static void find_good_pkt_pointers(struct bpf_verifier_state *vstate, struct bpf_reg_state *dst_reg, enum bpf_reg_type type, bool range_right_open) { + struct bpf_func_state *state = vstate->frame[vstate->curframe]; struct bpf_reg_state *regs = state->regs, *reg; u16 new_range; - int i; + int i, j; if (dst_reg->off < 0 || (dst_reg->off == 0 && 
range_right_open)) @@ -2664,12 +2989,15 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *state, /* keep the maximum range already checked */ regs[i].range = max(regs[i].range, new_range); - for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) { - if (state->stack[i].slot_type[0] != STACK_SPILL) - continue; - reg = &state->stack[i].spilled_ptr; - if (reg->type == type && reg->id == dst_reg->id) - reg->range = max(reg->range, new_range); + for (j = 0; j <= vstate->curframe; j++) { + state = vstate->frame[j]; + for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) { + if (state->stack[i].slot_type[0] != STACK_SPILL) + continue; + reg = &state->stack[i].spilled_ptr; + if (reg->type == type && reg->id == dst_reg->id) + reg->range = max(reg->range, new_range); + } } } @@ -2907,20 +3235,24 @@ static void mark_map_reg(struct bpf_reg_state *regs, u32 regno, u32 id, /* The logic is similar to find_good_pkt_pointers(), both could eventually * be folded together at some point. */ -static void mark_map_regs(struct bpf_verifier_state *state, u32 regno, +static void mark_map_regs(struct bpf_verifier_state *vstate, u32 regno, bool is_null) { + struct bpf_func_state *state = vstate->frame[vstate->curframe]; struct bpf_reg_state *regs = state->regs; u32 id = regs[regno].id; - int i; + int i, j; for (i = 0; i < MAX_BPF_REG; i++) mark_map_reg(regs, i, id, is_null); - for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) { - if (state->stack[i].slot_type[0] != STACK_SPILL) - continue; - mark_map_reg(&state->stack[i].spilled_ptr, 0, id, is_null); + for (j = 0; j <= vstate->curframe; j++) { + state = vstate->frame[j]; + for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) { + if (state->stack[i].slot_type[0] != STACK_SPILL) + continue; + mark_map_reg(&state->stack[i].spilled_ptr, 0, id, is_null); + } } } @@ -3020,8 +3352,10 @@ static bool try_match_pkt_pointers(const struct bpf_insn *insn, static int check_cond_jmp_op(struct bpf_verifier_env *env, struct bpf_insn *insn, int *insn_idx) { - struct bpf_verifier_state *other_branch, *this_branch = env->cur_state; - struct bpf_reg_state *regs = this_branch->regs, *dst_reg; + struct bpf_verifier_state *this_branch = env->cur_state; + struct bpf_verifier_state *other_branch; + struct bpf_reg_state *regs = this_branch->frame[this_branch->curframe]->regs; + struct bpf_reg_state *dst_reg, *other_branch_regs; u8 opcode = BPF_OP(insn->code); int err; @@ -3084,6 +3418,7 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, other_branch = push_stack(env, *insn_idx + insn->off + 1, *insn_idx); if (!other_branch) return -EFAULT; + other_branch_regs = other_branch->frame[other_branch->curframe]->regs; /* detect if we are comparing against a constant value so we can adjust * our min/max values for our dst register. 
@@ -3096,22 +3431,22 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, if (dst_reg->type == SCALAR_VALUE && regs[insn->src_reg].type == SCALAR_VALUE) { if (tnum_is_const(regs[insn->src_reg].var_off)) - reg_set_min_max(&other_branch->regs[insn->dst_reg], + reg_set_min_max(&other_branch_regs[insn->dst_reg], dst_reg, regs[insn->src_reg].var_off.value, opcode); else if (tnum_is_const(dst_reg->var_off)) - reg_set_min_max_inv(&other_branch->regs[insn->src_reg], + reg_set_min_max_inv(&other_branch_regs[insn->src_reg], ®s[insn->src_reg], dst_reg->var_off.value, opcode); else if (opcode == BPF_JEQ || opcode == BPF_JNE) /* Comparing for equality, we can combine knowledge */ - reg_combine_min_max(&other_branch->regs[insn->src_reg], - &other_branch->regs[insn->dst_reg], + reg_combine_min_max(&other_branch_regs[insn->src_reg], + &other_branch_regs[insn->dst_reg], ®s[insn->src_reg], ®s[insn->dst_reg], opcode); } } else if (dst_reg->type == SCALAR_VALUE) { - reg_set_min_max(&other_branch->regs[insn->dst_reg], + reg_set_min_max(&other_branch_regs[insn->dst_reg], dst_reg, insn->imm, opcode); } @@ -3132,7 +3467,7 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, return -EACCES; } if (env->log.level) - print_verifier_state(env, this_branch); + print_verifier_state(env, this_branch->frame[this_branch->curframe]); return 0; } @@ -3217,6 +3552,18 @@ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn) return -EINVAL; } + if (env->subprog_cnt) { + /* when program has LD_ABS insn JITs and interpreter assume + * that r1 == ctx == skb which is not the case for callees + * that can have arbitrary arguments. It's problematic + * for main prog as well since JITs would need to analyze + * all functions in order to make proper register save/restore + * decisions in the main prog. 
Hence disallow LD_ABS with calls + */ + verbose(env, "BPF_LD_[ABS|IND] instructions cannot be mixed with bpf-to-bpf calls\n"); + return -EINVAL; + } + if (insn->dst_reg != BPF_REG_0 || insn->off != 0 || BPF_SIZE(insn->code) == BPF_DW || (mode == BPF_ABS && insn->src_reg != BPF_REG_0)) { @@ -3555,11 +3902,21 @@ static bool check_ids(u32 old_id, u32 cur_id, struct idpair *idmap) static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur, struct idpair *idmap) { + bool equal; + if (!(rold->live & REG_LIVE_READ)) /* explored state didn't use this */ return true; - if (memcmp(rold, rcur, offsetof(struct bpf_reg_state, live)) == 0) + equal = memcmp(rold, rcur, offsetof(struct bpf_reg_state, frameno)) == 0; + + if (rold->type == PTR_TO_STACK) + /* two stack pointers are equal only if they're pointing to + * the same stack frame, since fp-8 in foo != fp-8 in bar + */ + return equal && rold->frameno == rcur->frameno; + + if (equal) return true; if (rold->type == NOT_INIT) @@ -3632,7 +3989,6 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur, tnum_in(rold->var_off, rcur->var_off); case PTR_TO_CTX: case CONST_PTR_TO_MAP: - case PTR_TO_STACK: case PTR_TO_PACKET_END: /* Only valid matches are exact, which memcmp() above * would have accepted @@ -3647,8 +4003,8 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur, return false; } -static bool stacksafe(struct bpf_verifier_state *old, - struct bpf_verifier_state *cur, +static bool stacksafe(struct bpf_func_state *old, + struct bpf_func_state *cur, struct idpair *idmap) { int i, spi; @@ -3724,9 +4080,8 @@ static bool stacksafe(struct bpf_verifier_state *old, * whereas register type in current state is meaningful, it means that * the current state will reach 'bpf_exit' instruction safely */ -static bool states_equal(struct bpf_verifier_env *env, - struct bpf_verifier_state *old, - struct bpf_verifier_state *cur) +static bool func_states_equal(struct bpf_func_state *old, + struct bpf_func_state *cur) { struct idpair *idmap; bool ret = false; @@ -3750,71 +4105,76 @@ out_free: return ret; } +static bool states_equal(struct bpf_verifier_env *env, + struct bpf_verifier_state *old, + struct bpf_verifier_state *cur) +{ + int i; + + if (old->curframe != cur->curframe) + return false; + + /* for states to be equal callsites have to be the same + * and all frame states need to be equivalent + */ + for (i = 0; i <= old->curframe; i++) { + if (old->frame[i]->callsite != cur->frame[i]->callsite) + return false; + if (!func_states_equal(old->frame[i], cur->frame[i])) + return false; + } + return true; +} + /* A write screens off any subsequent reads; but write marks come from the - * straight-line code between a state and its parent. When we arrive at a - * jump target (in the first iteration of the propagate_liveness() loop), - * we didn't arrive by the straight-line code, so read marks in state must - * propagate to parent regardless of state's write marks. + * straight-line code between a state and its parent. When we arrive at an + * equivalent state (jump target or such) we didn't arrive by the straight-line + * code, so read marks in the state must propagate to the parent regardless + * of the state's write marks. That's what 'parent == state->parent' comparison + * in mark_reg_read() and mark_stack_slot_read() is for. 
*/ -static bool do_propagate_liveness(const struct bpf_verifier_state *state, - struct bpf_verifier_state *parent) +static int propagate_liveness(struct bpf_verifier_env *env, + const struct bpf_verifier_state *vstate, + struct bpf_verifier_state *vparent) { - bool writes = parent == state->parent; /* Observe write marks */ - bool touched = false; /* any changes made? */ - int i; + int i, frame, err = 0; + struct bpf_func_state *state, *parent; - if (!parent) - return touched; + if (vparent->curframe != vstate->curframe) { + WARN(1, "propagate_live: parent frame %d current frame %d\n", + vparent->curframe, vstate->curframe); + return -EFAULT; + } /* Propagate read liveness of registers... */ BUILD_BUG_ON(BPF_REG_FP + 1 != MAX_BPF_REG); /* We don't need to worry about FP liveness because it's read-only */ for (i = 0; i < BPF_REG_FP; i++) { - if (parent->regs[i].live & REG_LIVE_READ) + if (vparent->frame[vparent->curframe]->regs[i].live & REG_LIVE_READ) continue; - if (writes && (state->regs[i].live & REG_LIVE_WRITTEN)) - continue; - if (state->regs[i].live & REG_LIVE_READ) { - parent->regs[i].live |= REG_LIVE_READ; - touched = true; + if (vstate->frame[vstate->curframe]->regs[i].live & REG_LIVE_READ) { + err = mark_reg_read(env, vstate, vparent, i); + if (err) + return err; } } + /* ... and stack slots */ - for (i = 0; i < state->allocated_stack / BPF_REG_SIZE && - i < parent->allocated_stack / BPF_REG_SIZE; i++) { - if (parent->stack[i].slot_type[0] != STACK_SPILL) - continue; - if (state->stack[i].slot_type[0] != STACK_SPILL) - continue; - if (parent->stack[i].spilled_ptr.live & REG_LIVE_READ) - continue; - if (writes && - (state->stack[i].spilled_ptr.live & REG_LIVE_WRITTEN)) - continue; - if (state->stack[i].spilled_ptr.live & REG_LIVE_READ) { - parent->stack[i].spilled_ptr.live |= REG_LIVE_READ; - touched = true; + for (frame = 0; frame <= vstate->curframe; frame++) { + state = vstate->frame[frame]; + parent = vparent->frame[frame]; + for (i = 0; i < state->allocated_stack / BPF_REG_SIZE && + i < parent->allocated_stack / BPF_REG_SIZE; i++) { + if (parent->stack[i].slot_type[0] != STACK_SPILL) + continue; + if (state->stack[i].slot_type[0] != STACK_SPILL) + continue; + if (parent->stack[i].spilled_ptr.live & REG_LIVE_READ) + continue; + if (state->stack[i].spilled_ptr.live & REG_LIVE_READ) + mark_stack_slot_read(env, vstate, vparent, i, frame); } } - return touched; -} - -/* "parent" is "a state from which we reach the current state", but initially - * it is not the state->parent (i.e. "the state whose straight-line code leads - * to the current state"), instead it is the state that happened to arrive at - * a (prunable) equivalent of the current state. See comment above - * do_propagate_liveness() for consequences of this. - * This function is just a more efficient way of calling mark_reg_read() or - * mark_stack_slot_read() on each reg in "parent" that is read in "state", - * though it requires that parent != state->parent in the call arguments. 
- */ -static void propagate_liveness(const struct bpf_verifier_state *state, - struct bpf_verifier_state *parent) -{ - while (do_propagate_liveness(state, parent)) { - /* Something changed, so we need to feed those changes onward */ - state = parent; - parent = state->parent; - } + return err; } static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) @@ -3822,7 +4182,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) struct bpf_verifier_state_list *new_sl; struct bpf_verifier_state_list *sl; struct bpf_verifier_state *cur = env->cur_state; - int i, err; + int i, j, err; sl = env->explored_states[insn_idx]; if (!sl) @@ -3843,7 +4203,9 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) * they'll be immediately forgotten as we're pruning * this state and will pop a new one. */ - propagate_liveness(&sl->state, cur); + err = propagate_liveness(env, &sl->state, cur); + if (err) + return err; return 1; } sl = sl->next; @@ -3851,9 +4213,10 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) /* there were no equivalent states, remember current one. * technically the current state is not proven to be safe yet, - * but it will either reach bpf_exit (which means it's safe) or - * it will be rejected. Since there are no loops, we won't be - * seeing this 'insn_idx' instruction again on the way to bpf_exit + * but it will either reach outer most bpf_exit (which means it's safe) + * or it will be rejected. Since there are no loops, we won't be + * seeing this tuple (frame[0].callsite, frame[1].callsite, .. insn_idx) + * again on the way to bpf_exit */ new_sl = kzalloc(sizeof(struct bpf_verifier_state_list), GFP_KERNEL); if (!new_sl) @@ -3877,10 +4240,16 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) * explored_states can get read marks.) 
*/ for (i = 0; i < BPF_REG_FP; i++) - cur->regs[i].live = REG_LIVE_NONE; - for (i = 0; i < cur->allocated_stack / BPF_REG_SIZE; i++) - if (cur->stack[i].slot_type[0] == STACK_SPILL) - cur->stack[i].spilled_ptr.live = REG_LIVE_NONE; + cur->frame[cur->curframe]->regs[i].live = REG_LIVE_NONE; + + /* all stack frames are accessible from callee, clear them all */ + for (j = 0; j <= cur->curframe; j++) { + struct bpf_func_state *frame = cur->frame[j]; + + for (i = 0; i < frame->allocated_stack / BPF_REG_SIZE; i++) + if (frame->stack[i].slot_type[0] == STACK_SPILL) + frame->stack[i].spilled_ptr.live = REG_LIVE_NONE; + } return 0; } @@ -3898,7 +4267,7 @@ static int do_check(struct bpf_verifier_env *env) struct bpf_verifier_state *state; struct bpf_insn *insns = env->prog->insnsi; struct bpf_reg_state *regs; - int insn_cnt = env->prog->len; + int insn_cnt = env->prog->len, i; int insn_idx, prev_insn_idx = 0; int insn_processed = 0; bool do_print_state = false; @@ -3906,9 +4275,18 @@ static int do_check(struct bpf_verifier_env *env) state = kzalloc(sizeof(struct bpf_verifier_state), GFP_KERNEL); if (!state) return -ENOMEM; - env->cur_state = state; - init_reg_state(env, state->regs); + state->curframe = 0; state->parent = NULL; + state->frame[0] = kzalloc(sizeof(struct bpf_func_state), GFP_KERNEL); + if (!state->frame[0]) { + kfree(state); + return -ENOMEM; + } + env->cur_state = state; + init_func_state(env, state->frame[0], + BPF_MAIN_FUNC /* callsite */, + 0 /* frameno */, + 0 /* subprogno, zero == main subprog */); insn_idx = 0; for (;;) { struct bpf_insn *insn; @@ -3955,7 +4333,7 @@ static int do_check(struct bpf_verifier_env *env) else verbose(env, "\nfrom %d to %d:", prev_insn_idx, insn_idx); - print_verifier_state(env, state); + print_verifier_state(env, state->frame[state->curframe]); do_print_state = false; } @@ -4088,13 +4466,17 @@ static int do_check(struct bpf_verifier_env *env) if (opcode == BPF_CALL) { if (BPF_SRC(insn->code) != BPF_K || insn->off != 0 || - insn->src_reg != BPF_REG_0 || + (insn->src_reg != BPF_REG_0 && + insn->src_reg != BPF_PSEUDO_CALL) || insn->dst_reg != BPF_REG_0) { verbose(env, "BPF_CALL uses reserved fields\n"); return -EINVAL; } - err = check_call(env, insn->imm, insn_idx); + if (insn->src_reg == BPF_PSEUDO_CALL) + err = check_func_call(env, insn, &insn_idx); + else + err = check_helper_call(env, insn->imm, insn_idx); if (err) return err; @@ -4119,6 +4501,16 @@ static int do_check(struct bpf_verifier_env *env) return -EINVAL; } + if (state->curframe) { + /* exit from nested function */ + prev_insn_idx = insn_idx; + err = prepare_func_exit(env, &insn_idx); + if (err) + return err; + do_print_state = true; + continue; + } + /* eBPF calling convetion is such that R0 is used * to return the value from eBPF program. 
* Make sure that it's readable at this time @@ -4179,8 +4571,16 @@ process_bpf_exit: insn_idx++; } - verbose(env, "processed %d insns, stack depth %d\n", insn_processed, - env->prog->aux->stack_depth); + verbose(env, "processed %d insns, stack depth ", insn_processed); + for (i = 0; i < env->subprog_cnt + 1; i++) { + u32 depth = env->subprog_stack_depth[i]; + + verbose(env, "%d", depth); + if (i + 1 < env->subprog_cnt + 1) + verbose(env, "+"); + } + verbose(env, "\n"); + env->prog->aux->stack_depth = env->subprog_stack_depth[0]; return 0; } -- cgit v1.2.3 From cc2b14d51053eb055c06f45e1a5cdbfcf2b79e94 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 14 Dec 2017 17:55:08 -0800 Subject: bpf: teach verifier to recognize zero initialized stack programs with function calls are often passing various pointers via stack. When all calls are inlined llvm flattens stack accesses and optimizes away extra branches. When functions are not inlined it becomes the job of the verifier to recognize zero initialized stack to avoid exploring paths that program will not take. The following program would fail otherwise: ptr = &buffer_on_stack; *ptr = 0; ... func_call(.., ptr, ...) { if (..) *ptr = bpf_map_lookup(); } ... if (*ptr != 0) { // Access (*ptr)->field is valid. // Without stack_zero tracking such (*ptr)->field access // will be rejected } since stack slots are no longer uniform invalid | spill | misc add liveness marking to all slots, but do it in 8 byte chunks. So if nothing was read or written in [fp-16, fp-9] range it will be marked as LIVE_NONE. If any byte in that range was read, it will be marked LIVE_READ and stacksafe() check will perform byte-by-byte verification. If all bytes in the range were written the slot will be marked as LIVE_WRITTEN. This significantly speeds up state equality comparison and reduces total number of states processed. 
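To make the per-byte bookkeeping concrete, here is a hedged userspace model of the marking rules just described. Only the STACK_* names, the 8-byte slot granularity and the full-write/partial-write distinction come from the patch; the struct layout, the flag value, the helper and the driver are invented for illustration:

	#include <stdbool.h>
	#include <stdio.h>

	enum slot_byte { STACK_INVALID, STACK_SPILL, STACK_MISC, STACK_ZERO };

	#define BPF_REG_SIZE		8
	#define REG_LIVE_WRITTEN	2	/* invented flag value */

	struct slot {
		enum slot_byte type[BPF_REG_SIZE];	/* one mark per byte */
		int live;
	};

	/* regular (non-spill) write of 'size' bytes into one stack slot */
	static void mark_write(struct slot *s, int size, bool src_is_zero)
	{
		enum slot_byte t = src_is_zero ? STACK_ZERO : STACK_MISC;
		int i;

		/* only a full 8-byte write screens off parent reads; a
		 * partial write must not stop read propagation too soon
		 */
		if (size == BPF_REG_SIZE)
			s->live |= REG_LIVE_WRITTEN;
		for (i = 0; i < size; i++)
			s->type[i] = t;
	}

	int main(void)
	{
		struct slot s = { .live = 0 };

		mark_write(&s, 4, true);	/* *(u32 *)(fp - 8) = 0  */
		printf("byte0=%d live=%d\n", s.type[0], s.live);
		mark_write(&s, 8, false);	/* *(u64 *)(fp - 8) = rX */
		printf("byte0=%d live=%d\n", s.type[0], s.live);
		return 0;
	}

With that bookkeeping in mind, the before/after numbers: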
before after bpf_lb-DLB_L3.o 2051 2003 bpf_lb-DLB_L4.o 3287 3164 bpf_lb-DUNKNOWN.o 1080 1080 bpf_lxc-DDROP_ALL.o 24980 12361 bpf_lxc-DUNKNOWN.o 34308 16605 bpf_netdev.o 15404 10962 bpf_overlay.o 7191 6679 Signed-off-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: Daniel Borkmann --- include/linux/bpf_verifier.h | 3 +- kernel/bpf/verifier.c | 129 +++++++++++++++++++++++++++++++++---------- 2 files changed, 103 insertions(+), 29 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 1f23408024ee..585d4e17ea88 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -91,7 +91,8 @@ struct bpf_reg_state { enum bpf_stack_slot_type { STACK_INVALID, /* nothing was stored in this stack slot */ STACK_SPILL, /* register spilled into stack */ - STACK_MISC /* BPF program wrote some data into this slot */ + STACK_MISC, /* BPF program wrote some data into this slot */ + STACK_ZERO, /* BPF program wrote constant zero */ }; #define BPF_REG_SIZE 8 /* size of eBPF register in bytes */ diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index f6e09d84a96f..df1ded3faf1d 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -311,6 +311,8 @@ static void print_verifier_state(struct bpf_verifier_env *env, verbose(env, "=%s", reg_type_str[state->stack[i].spilled_ptr.type]); } + if (state->stack[i].slot_type[0] == STACK_ZERO) + verbose(env, " fp%d=0", (-i - 1) * BPF_REG_SIZE); } verbose(env, "\n"); } @@ -522,6 +524,13 @@ static void __mark_reg_known_zero(struct bpf_reg_state *reg) __mark_reg_known(reg, 0); } +static void __mark_reg_const_zero(struct bpf_reg_state *reg) +{ + __mark_reg_known(reg, 0); + reg->off = 0; + reg->type = SCALAR_VALUE; +} + static void mark_reg_known_zero(struct bpf_verifier_env *env, struct bpf_reg_state *regs, u32 regno) { @@ -937,6 +946,12 @@ static bool is_spillable_regtype(enum bpf_reg_type type) } } +/* Does this register contain a constant zero? */ +static bool register_is_null(struct bpf_reg_state *reg) +{ + return reg->type == SCALAR_VALUE && tnum_equals_const(reg->var_off, 0); +} + /* check_stack_read/write functions track spill/fill of registers, * stack boundary and alignment are checked in check_mem_access() */ @@ -984,12 +999,30 @@ static int check_stack_write(struct bpf_verifier_env *env, for (i = 0; i < BPF_REG_SIZE; i++) state->stack[spi].slot_type[i] = STACK_SPILL; } else { + u8 type = STACK_MISC; + /* regular write of data into stack */ state->stack[spi].spilled_ptr = (struct bpf_reg_state) {}; + /* only mark the slot as written if all 8 bytes were written + * otherwise read propagation may incorrectly stop too soon + * when stack slots are partially written. 
+ * This heuristic means that read propagation will be + * conservative, since it will add reg_live_read marks + * to stack slots all the way to first state when programs + * writes+reads less than 8 bytes + */ + if (size == BPF_REG_SIZE) + state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN; + + /* when we zero initialize stack slots mark them as such */ + if (value_regno >= 0 && + register_is_null(&cur->regs[value_regno])) + type = STACK_ZERO; + for (i = 0; i < size; i++) state->stack[spi].slot_type[(slot - i) % BPF_REG_SIZE] = - STACK_MISC; + type; } return 0; } @@ -1030,6 +1063,14 @@ static void mark_stack_slot_read(struct bpf_verifier_env *env, bool writes = parent == state->parent; /* Observe write marks */ while (parent) { + if (parent->frame[frameno]->allocated_stack <= slot * BPF_REG_SIZE) + /* since LIVE_WRITTEN mark is only done for full 8-byte + * write the read marks are conservative and parent + * state may not even have the stack allocated. In such case + * end the propagation, since the loop reached beginning + * of the function + */ + break; /* if read wasn't screened by an earlier write ... */ if (writes && state->frame[frameno]->stack[slot].spilled_ptr.live & REG_LIVE_WRITTEN) break; @@ -1077,21 +1118,38 @@ static int check_stack_read(struct bpf_verifier_env *env, * which resets stack/reg liveness for state transitions */ state->regs[value_regno].live |= REG_LIVE_WRITTEN; - mark_stack_slot_read(env, vstate, vstate->parent, spi, - reg_state->frameno); } + mark_stack_slot_read(env, vstate, vstate->parent, spi, + reg_state->frameno); return 0; } else { + int zeros = 0; + for (i = 0; i < size; i++) { - if (stype[(slot - i) % BPF_REG_SIZE] != STACK_MISC) { - verbose(env, "invalid read from stack off %d+%d size %d\n", - off, i, size); - return -EACCES; + if (stype[(slot - i) % BPF_REG_SIZE] == STACK_MISC) + continue; + if (stype[(slot - i) % BPF_REG_SIZE] == STACK_ZERO) { + zeros++; + continue; } + verbose(env, "invalid read from stack off %d+%d size %d\n", + off, i, size); + return -EACCES; + } + mark_stack_slot_read(env, vstate, vstate->parent, spi, + reg_state->frameno); + if (value_regno >= 0) { + if (zeros == size) { + /* any size read into register is zero extended, + * so the whole register == const_zero + */ + __mark_reg_const_zero(&state->regs[value_regno]); + } else { + /* have read misc data from the stack */ + mark_reg_unknown(env, state->regs, value_regno); + } + state->regs[value_regno].live |= REG_LIVE_WRITTEN; } - if (value_regno >= 0) - /* have read misc data from the stack */ - mark_reg_unknown(env, state->regs, value_regno); return 0; } } @@ -1578,12 +1636,6 @@ static int check_xadd(struct bpf_verifier_env *env, int insn_idx, struct bpf_ins BPF_SIZE(insn->code), BPF_WRITE, -1); } -/* Does this register contain a constant zero? */ -static bool register_is_null(struct bpf_reg_state *reg) -{ - return reg->type == SCALAR_VALUE && tnum_equals_const(reg->var_off, 0); -} - /* when register 'regno' is passed into function that will read 'access_size' * bytes from that pointer, make sure that it's within stack boundary * and all elements of stack are initialized. 
@@ -1633,15 +1685,30 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno, } for (i = 0; i < access_size; i++) { + u8 *stype; + slot = -(off + i) - 1; spi = slot / BPF_REG_SIZE; - if (state->allocated_stack <= slot || - state->stack[spi].slot_type[slot % BPF_REG_SIZE] != - STACK_MISC) { - verbose(env, "invalid indirect read from stack off %d+%d size %d\n", - off, i, access_size); - return -EACCES; + if (state->allocated_stack <= slot) + goto err; + stype = &state->stack[spi].slot_type[slot % BPF_REG_SIZE]; + if (*stype == STACK_MISC) + goto mark; + if (*stype == STACK_ZERO) { + /* helper can write anything into the stack */ + *stype = STACK_MISC; + goto mark; } +err: + verbose(env, "invalid indirect read from stack off %d+%d size %d\n", + off, i, access_size); + return -EACCES; +mark: + /* reading any byte out of 8-byte 'spill_slot' will cause + * the whole slot to be marked as 'read' + */ + mark_stack_slot_read(env, env->cur_state, env->cur_state->parent, + spi, state->frameno); } return update_stack_depth(env, state, off); } @@ -4022,8 +4089,19 @@ static bool stacksafe(struct bpf_func_state *old, for (i = 0; i < old->allocated_stack; i++) { spi = i / BPF_REG_SIZE; + if (!(old->stack[spi].spilled_ptr.live & REG_LIVE_READ)) + /* explored state didn't use this */ + return true; + if (old->stack[spi].slot_type[i % BPF_REG_SIZE] == STACK_INVALID) continue; + /* if old state was safe with misc data in the stack + * it will be safe with zero-initialized stack. + * The opposite is not true + */ + if (old->stack[spi].slot_type[i % BPF_REG_SIZE] == STACK_MISC && + cur->stack[spi].slot_type[i % BPF_REG_SIZE] == STACK_ZERO) + continue; if (old->stack[spi].slot_type[i % BPF_REG_SIZE] != cur->stack[spi].slot_type[i % BPF_REG_SIZE]) /* Ex: old explored (safe) state has STACK_SPILL in @@ -4164,10 +4242,6 @@ static int propagate_liveness(struct bpf_verifier_env *env, parent = vparent->frame[frame]; for (i = 0; i < state->allocated_stack / BPF_REG_SIZE && i < parent->allocated_stack / BPF_REG_SIZE; i++) { - if (parent->stack[i].slot_type[0] != STACK_SPILL) - continue; - if (state->stack[i].slot_type[0] != STACK_SPILL) - continue; if (parent->stack[i].spilled_ptr.live & REG_LIVE_READ) continue; if (state->stack[i].spilled_ptr.live & REG_LIVE_READ) @@ -4247,8 +4321,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) struct bpf_func_state *frame = cur->frame[j]; for (i = 0; i < frame->allocated_stack / BPF_REG_SIZE; i++) - if (frame->stack[i].slot_type[0] == STACK_SPILL) - frame->stack[i].spilled_ptr.live = REG_LIVE_NONE; + frame->stack[i].spilled_ptr.live = REG_LIVE_NONE; } return 0; } -- cgit v1.2.3 From 1ea47e01ad6ea0fe99697c54c2413d81dd21fe32 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 14 Dec 2017 17:55:13 -0800 Subject: bpf: add support for bpf_call to interpreter though bpf_call is still the same call instruction and calling convention 'bpf to bpf' and 'bpf to helper' is the same the interpreter has to oparate on 'struct bpf_insn *'. To distinguish these two cases add a kernel internal opcode and mark call insns with it. This opcode is seen by interpreter only. JITs will never see it. Also add tiny bit of debug code to aid interpreter debugging. 
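For readers who have not looked at the interpreter, a minimal computed-goto sketch of how such an interpreter-only opcode can dispatch. This is a toy, not the kernel code: the opcode values, the instruction layout and the way the call target is encoded are placeholders; only the jumptable shape loosely follows ___bpf_prog_run():

	#include <stdint.h>
	#include <stdio.h>

	enum { OP_MOV_IMM, OP_CALL_ARGS, OP_EXIT };	/* invented opcodes */

	struct insn { uint8_t code; int32_t imm; };

	static int64_t run(const struct insn *insn, int64_t *regs)
	{
		static const void *jumptable[] = {
			[OP_MOV_IMM]	= &&MOV_IMM,
			[OP_CALL_ARGS]	= &&CALL_ARGS,
			[OP_EXIT]	= &&EXIT,
		};
	#define CONT ({ insn++; goto *jumptable[insn->code]; })

		goto *jumptable[insn->code];
	MOV_IMM:
		regs[0] = insn->imm;
		CONT;
	CALL_ARGS:
		/* enter the callee's first insn; a real interpreter would
		 * also switch to a stack frame sized for that callee
		 */
		regs[0] = run(insn + insn->imm + 1, regs);
		CONT;
	EXIT:
		return regs[0];
	}

	int main(void)
	{
		struct insn prog[] = {
			{ OP_CALL_ARGS, 1 },	/* "call" the function below */
			{ OP_EXIT, 0 },
			{ OP_MOV_IMM, 42 },	/* callee body */
			{ OP_EXIT, 0 },
		};
		int64_t regs[11] = { 0 };

		printf("%lld\n", (long long)run(prog, regs));	/* 42 */
		return 0;
	}

The point, as the message says, is that the rewritten opcode is only ever visible to the interpreter, so no JIT has to know about it.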
Signed-off-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: Daniel Borkmann --- include/linux/bpf.h | 1 + include/linux/filter.h | 6 ++++ kernel/bpf/core.c | 90 ++++++++++++++++++++++++++++++++++++++++---------- kernel/bpf/verifier.c | 36 ++++++++++++++++++++ 4 files changed, 116 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 54dc7cae2949..8935f6f63d5f 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -402,6 +402,7 @@ static inline void bpf_long_memcpy(void *dst, const void *src, u32 size) /* verify correctness of eBPF program */ int bpf_check(struct bpf_prog **fp, union bpf_attr *attr); +void bpf_patch_call_args(struct bpf_insn *insn, u32 stack_depth); /* Map specifics */ struct net_device *__dev_map_lookup_elem(struct bpf_map *map, u32 key); diff --git a/include/linux/filter.h b/include/linux/filter.h index 5feb441d3dd9..f26e6da1007b 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -58,6 +58,9 @@ struct bpf_prog_aux; /* unused opcode to mark special call to bpf_tail_call() helper */ #define BPF_TAIL_CALL 0xf0 +/* unused opcode to mark call to interpreter with arguments */ +#define BPF_CALL_ARGS 0xe0 + /* As per nm, we expose JITed images as text (code) section for * kallsyms. That way, tools like perf can find it to match * addresses. @@ -710,6 +713,9 @@ bool sk_filter_charge(struct sock *sk, struct sk_filter *fp); void sk_filter_uncharge(struct sock *sk, struct sk_filter *fp); u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); +#define __bpf_call_base_args \ + ((u64 (*)(u64, u64, u64, u64, u64, const struct bpf_insn *)) \ + __bpf_call_base) struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog); void bpf_jit_compile(struct bpf_prog *prog); diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index d32bebf4f2de..dc12c4fd006e 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -217,30 +217,40 @@ int bpf_prog_calc_tag(struct bpf_prog *fp) return 0; } -static bool bpf_is_jmp_and_has_target(const struct bpf_insn *insn) -{ - return BPF_CLASS(insn->code) == BPF_JMP && - /* Call and Exit are both special jumps with no - * target inside the BPF instruction image. - */ - BPF_OP(insn->code) != BPF_CALL && - BPF_OP(insn->code) != BPF_EXIT; -} - static void bpf_adj_branches(struct bpf_prog *prog, u32 pos, u32 delta) { struct bpf_insn *insn = prog->insnsi; u32 i, insn_cnt = prog->len; + bool pseudo_call; + u8 code; + int off; for (i = 0; i < insn_cnt; i++, insn++) { - if (!bpf_is_jmp_and_has_target(insn)) + code = insn->code; + if (BPF_CLASS(code) != BPF_JMP) continue; + if (BPF_OP(code) == BPF_EXIT) + continue; + if (BPF_OP(code) == BPF_CALL) { + if (insn->src_reg == BPF_PSEUDO_CALL) + pseudo_call = true; + else + continue; + } else { + pseudo_call = false; + } + off = pseudo_call ? insn->imm : insn->off; /* Adjust offset of jmps if we cross boundaries. */ - if (i < pos && i + insn->off + 1 > pos) - insn->off += delta; - else if (i > pos + delta && i + insn->off + 1 <= pos + delta) - insn->off -= delta; + if (i < pos && i + off + 1 > pos) + off += delta; + else if (i > pos + delta && i + off + 1 <= pos + delta) + off -= delta; + + if (pseudo_call) + insn->imm = off; + else + insn->off = off; } } @@ -774,8 +784,7 @@ EXPORT_SYMBOL_GPL(__bpf_call_base); * * Decode and execute eBPF instructions. 
*/ -static unsigned int ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn, - u64 *stack) +static u64 ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn, u64 *stack) { u64 tmp; static const void *jumptable[256] = { @@ -835,6 +844,7 @@ static unsigned int ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn, [BPF_ALU64 | BPF_NEG] = &&ALU64_NEG, /* Call instruction */ [BPF_JMP | BPF_CALL] = &&JMP_CALL, + [BPF_JMP | BPF_CALL_ARGS] = &&JMP_CALL_ARGS, [BPF_JMP | BPF_TAIL_CALL] = &&JMP_TAIL_CALL, /* Jumps */ [BPF_JMP | BPF_JA] = &&JMP_JA, @@ -1025,6 +1035,13 @@ select_insn: BPF_R4, BPF_R5); CONT; + JMP_CALL_ARGS: + BPF_R0 = (__bpf_call_base_args + insn->imm)(BPF_R1, BPF_R2, + BPF_R3, BPF_R4, + BPF_R5, + insn + insn->off + 1); + CONT; + JMP_TAIL_CALL: { struct bpf_map *map = (struct bpf_map *) (unsigned long) BPF_R2; struct bpf_array *array = container_of(map, struct bpf_array, map); @@ -1297,6 +1314,23 @@ static unsigned int PROG_NAME(stack_size)(const void *ctx, const struct bpf_insn return ___bpf_prog_run(regs, insn, stack); \ } +#define PROG_NAME_ARGS(stack_size) __bpf_prog_run_args##stack_size +#define DEFINE_BPF_PROG_RUN_ARGS(stack_size) \ +static u64 PROG_NAME_ARGS(stack_size)(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5, \ + const struct bpf_insn *insn) \ +{ \ + u64 stack[stack_size / sizeof(u64)]; \ + u64 regs[MAX_BPF_REG]; \ +\ + FP = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)]; \ + BPF_R1 = r1; \ + BPF_R2 = r2; \ + BPF_R3 = r3; \ + BPF_R4 = r4; \ + BPF_R5 = r5; \ + return ___bpf_prog_run(regs, insn, stack); \ +} + #define EVAL1(FN, X) FN(X) #define EVAL2(FN, X, Y...) FN(X) EVAL1(FN, Y) #define EVAL3(FN, X, Y...) FN(X) EVAL2(FN, Y) @@ -1308,6 +1342,10 @@ EVAL6(DEFINE_BPF_PROG_RUN, 32, 64, 96, 128, 160, 192); EVAL6(DEFINE_BPF_PROG_RUN, 224, 256, 288, 320, 352, 384); EVAL4(DEFINE_BPF_PROG_RUN, 416, 448, 480, 512); +EVAL6(DEFINE_BPF_PROG_RUN_ARGS, 32, 64, 96, 128, 160, 192); +EVAL6(DEFINE_BPF_PROG_RUN_ARGS, 224, 256, 288, 320, 352, 384); +EVAL4(DEFINE_BPF_PROG_RUN_ARGS, 416, 448, 480, 512); + #define PROG_NAME_LIST(stack_size) PROG_NAME(stack_size), static unsigned int (*interpreters[])(const void *ctx, @@ -1316,6 +1354,24 @@ EVAL6(PROG_NAME_LIST, 32, 64, 96, 128, 160, 192) EVAL6(PROG_NAME_LIST, 224, 256, 288, 320, 352, 384) EVAL4(PROG_NAME_LIST, 416, 448, 480, 512) }; +#undef PROG_NAME_LIST +#define PROG_NAME_LIST(stack_size) PROG_NAME_ARGS(stack_size), +static u64 (*interpreters_args[])(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5, + const struct bpf_insn *insn) = { +EVAL6(PROG_NAME_LIST, 32, 64, 96, 128, 160, 192) +EVAL6(PROG_NAME_LIST, 224, 256, 288, 320, 352, 384) +EVAL4(PROG_NAME_LIST, 416, 448, 480, 512) +}; +#undef PROG_NAME_LIST + +void bpf_patch_call_args(struct bpf_insn *insn, u32 stack_depth) +{ + stack_depth = max_t(u32, stack_depth, 1); + insn->off = (s16) insn->imm; + insn->imm = interpreters_args[(round_up(stack_depth, 32) / 32) - 1] - + __bpf_call_base_args; + insn->code = BPF_JMP | BPF_CALL_ARGS; +} bool bpf_prog_array_compatible(struct bpf_array *array, const struct bpf_prog *fp) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index df1ded3faf1d..cdc1f043c69b 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1458,6 +1458,21 @@ static int update_stack_depth(struct bpf_verifier_env *env, return 0; } +static int get_callee_stack_depth(struct bpf_verifier_env *env, + const struct bpf_insn *insn, int idx) +{ + int start = idx + insn->imm + 1, subprog; + + subprog = find_subprog(env, start); + if (subprog < 0) { + WARN_ONCE(1, "verifier bug. 
No program starts at insn %d\n", + start); + return -EFAULT; + } + subprog++; + return env->subprog_stack_depth[subprog]; +} + /* check whether memory at (regno + off) is accessible for t = (read | write) * if t==write, value_regno is a register which value is stored into memory * if t==read, value_regno is a register which will receive the value from memory @@ -4997,6 +5012,24 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) return 0; } +static int fixup_call_args(struct bpf_verifier_env *env) +{ + struct bpf_prog *prog = env->prog; + struct bpf_insn *insn = prog->insnsi; + int i, depth; + + for (i = 0; i < prog->len; i++, insn++) { + if (insn->code != (BPF_JMP | BPF_CALL) || + insn->src_reg != BPF_PSEUDO_CALL) + continue; + depth = get_callee_stack_depth(env, insn, i); + if (depth < 0) + return depth; + bpf_patch_call_args(insn, depth); + } + return 0; +} + /* fixup insn->imm field of bpf_call instructions * and inline eligible helpers as explicit sequence of BPF instructions * @@ -5225,6 +5258,9 @@ skip_full_check: if (ret == 0) ret = fixup_bpf_calls(env); + if (ret == 0) + ret = fixup_call_args(env); + if (log->level && bpf_verifier_log_full(log)) ret = -ENOSPC; if (log->level && !log->ubuf) { -- cgit v1.2.3 From 60b58afc96c9df71871df2dbad42037757ceef26 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 14 Dec 2017 17:55:14 -0800 Subject: bpf: fix net.core.bpf_jit_enable race global bpf_jit_enable variable is tested multiple times in JITs, blinding and verifier core. The malicious root can try to toggle it while loading the programs. This race condition was accounted for and there should be no issues, but it's safer to avoid this race condition. Signed-off-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: Daniel Borkmann --- arch/arm/net/bpf_jit_32.c | 2 +- arch/arm64/net/bpf_jit_comp.c | 2 +- arch/mips/net/ebpf_jit.c | 2 +- arch/powerpc/net/bpf_jit_comp64.c | 2 +- arch/s390/net/bpf_jit_comp.c | 2 +- arch/sparc/net/bpf_jit_comp_64.c | 2 +- arch/x86/net/bpf_jit_comp.c | 2 +- include/linux/filter.h | 5 +++-- kernel/bpf/core.c | 3 ++- kernel/bpf/verifier.c | 2 +- 10 files changed, 13 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/arch/arm/net/bpf_jit_32.c b/arch/arm/net/bpf_jit_32.c index c199990e12b6..4425189bb24c 100644 --- a/arch/arm/net/bpf_jit_32.c +++ b/arch/arm/net/bpf_jit_32.c @@ -1824,7 +1824,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) /* If BPF JIT was not enabled then we must fall back to * the interpreter. 
*/ - if (!bpf_jit_enable) + if (!prog->jit_requested) return orig_prog; /* If constant blinding was enabled and we failed during blinding diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c index ba38d403abb2..288137cb0871 100644 --- a/arch/arm64/net/bpf_jit_comp.c +++ b/arch/arm64/net/bpf_jit_comp.c @@ -844,7 +844,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) int image_size; u8 *image_ptr; - if (!bpf_jit_enable) + if (!prog->jit_requested) return orig_prog; tmp = bpf_jit_blind_constants(prog); diff --git a/arch/mips/net/ebpf_jit.c b/arch/mips/net/ebpf_jit.c index 962b0259b4b6..97069a1b6f43 100644 --- a/arch/mips/net/ebpf_jit.c +++ b/arch/mips/net/ebpf_jit.c @@ -1869,7 +1869,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) unsigned int image_size; u8 *image_ptr; - if (!bpf_jit_enable || !cpu_has_mips64r2) + if (!prog->jit_requested || !cpu_has_mips64r2) return prog; tmp = bpf_jit_blind_constants(prog); diff --git a/arch/powerpc/net/bpf_jit_comp64.c b/arch/powerpc/net/bpf_jit_comp64.c index 46d74e81aff1..d5a5bc43cf8f 100644 --- a/arch/powerpc/net/bpf_jit_comp64.c +++ b/arch/powerpc/net/bpf_jit_comp64.c @@ -993,7 +993,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp) struct bpf_prog *tmp_fp; bool bpf_blinded = false; - if (!bpf_jit_enable) + if (!fp->jit_requested) return org_fp; tmp_fp = bpf_jit_blind_constants(org_fp); diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c index e81c16838b90..f4baa8c514d3 100644 --- a/arch/s390/net/bpf_jit_comp.c +++ b/arch/s390/net/bpf_jit_comp.c @@ -1300,7 +1300,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp) struct bpf_jit jit; int pass; - if (!bpf_jit_enable) + if (!fp->jit_requested) return orig_fp; tmp = bpf_jit_blind_constants(fp); diff --git a/arch/sparc/net/bpf_jit_comp_64.c b/arch/sparc/net/bpf_jit_comp_64.c index 5765e7e711f7..a2f1b5e774a7 100644 --- a/arch/sparc/net/bpf_jit_comp_64.c +++ b/arch/sparc/net/bpf_jit_comp_64.c @@ -1517,7 +1517,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) u8 *image_ptr; int pass; - if (!bpf_jit_enable) + if (!prog->jit_requested) return orig_prog; tmp = bpf_jit_blind_constants(prog); diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 0554e8aef4d5..68859b58ab84 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -1121,7 +1121,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) int pass; int i; - if (!bpf_jit_enable) + if (!prog->jit_requested) return orig_prog; tmp = bpf_jit_blind_constants(prog); diff --git a/include/linux/filter.h b/include/linux/filter.h index f26e6da1007b..3d6edc34932c 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -458,6 +458,7 @@ struct bpf_binary_header { struct bpf_prog { u16 pages; /* Number of allocated pages */ u16 jited:1, /* Is our filter JIT'ed? */ + jit_requested:1,/* archs need to JIT the prog */ locked:1, /* Program image locked? */ gpl_compatible:1, /* Is filter GPL compatible? */ cb_access:1, /* Is control block accessed? 
*/
@@ -804,7 +805,7 @@ static inline bool bpf_prog_ebpf_jited(const struct bpf_prog *fp)
 	return fp->jited && bpf_jit_is_ebpf();
 }
 
-static inline bool bpf_jit_blinding_enabled(void)
+static inline bool bpf_jit_blinding_enabled(struct bpf_prog *prog)
 {
 	/* These are the prerequisites, should someone ever have the
 	 * idea to call blinding outside of them, we make sure to
@@ -812,7 +813,7 @@
 	 */
 	if (!bpf_jit_is_ebpf())
 		return false;
-	if (!bpf_jit_enable)
+	if (!prog->jit_requested)
 		return false;
 	if (!bpf_jit_harden)
 		return false;
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index dc12c4fd006e..bda911644b1c 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -94,6 +94,7 @@ struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags)
 	fp->pages = size / PAGE_SIZE;
 	fp->aux = aux;
 	fp->aux->prog = fp;
+	fp->jit_requested = ebpf_jit_enabled();
 
 	INIT_LIST_HEAD_RCU(&fp->aux->ksym_lnode);
 
@@ -721,7 +722,7 @@ struct bpf_prog *bpf_jit_blind_constants(struct bpf_prog *prog)
 	struct bpf_insn *insn;
 	int i, rewritten;
 
-	if (!bpf_jit_blinding_enabled())
+	if (!bpf_jit_blinding_enabled(prog))
 		return prog;
 
 	clone = bpf_prog_clone_create(prog, GFP_USER);
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index cdc1f043c69b..8e0e4cd0d5e4 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -5080,7 +5080,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
 			/* BPF_EMIT_CALL() assumptions in some of the map_gen_lookup
 			 * handlers are currently limited to 64 bit only.
 			 */
-			if (ebpf_jit_enabled() && BITS_PER_LONG == 64 &&
+			if (prog->jit_requested && BITS_PER_LONG == 64 &&
 			    insn->imm == BPF_FUNC_map_lookup_elem) {
 				map_ptr = env->insn_aux_data[i + delta].map_ptr;
 				if (map_ptr == BPF_MAP_PTR_POISON ||
-- 
cgit v1.2.3 
From 1c2a088a6626d4f51d2f2c97b0cbedbfbf3637f6 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov
Date: Thu, 14 Dec 2017 17:55:15 -0800
Subject: bpf: x64: add JIT support for multi-function programs

Typical JIT does several passes over bpf instructions to compute total size and relative offsets of jumps and calls. With multiple bpf functions calling each other, all relative calls will have invalid offsets initially, so we need an additional last pass over the program to emit calls with correct offsets. For example, in case of three bpf functions:

main:
	call foo
	call bpf_map_lookup
	exit
foo:
	call bar
	exit
bar:
	exit

We will call bpf_int_jit_compile() independently for main(), foo() and bar(). The x64 JIT typically does 4-5 passes to converge. After these initial passes the image for these 3 functions will be good except call targets, since start addresses of foo() and bar() are unknown when we were JITing main() (note that call bpf_map_lookup will be resolved properly during initial passes). Once start addresses of the 3 functions are known, we patch call_insn->imm to point to the right functions and call bpf_int_jit_compile() again, which needs only one pass. Additional safety checks are done to make sure this last pass doesn't produce an image that is larger or smaller than the previous pass.

When constant blinding is on, it's applied to all functions at the first pass, since doing it once again at the last pass can change the size of the JITed code.

Tested on x64 and arm64 hw with JIT on/off, blinding on/off. x64 JITs bpf-to-bpf calls correctly while arm64 falls back to the interpreter. All other JITs that support normal BPF_CALL will behave the same way, since a bpf-to-bpf call is equivalent to a bpf-to-kernel call from the JIT's point of view. 
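To see the converge-then-patch flow in isolation, a hedged toy follows. Every type, size and address in it is invented; only the control flow — JIT once with placeholder targets, then run one extra pass that must not change any image size before the relative call offsets are filled in — mirrors the description above:

	#include <assert.h>
	#include <stdio.h>

	#define NFUNC		3
	#define CALL_SIZE	5	/* e8 + rel32, as on x86-64 */
	#define INSN_SIZE	4	/* every non-call "insn" in this toy */

	/* a function is ninsn plain insns with at most one call inserted
	 * after the first 'call_at' of them (call_at < 0: no call)
	 */
	struct toy_func { int ninsn; int call_at; int callee; };

	static long jit_size(const struct toy_func *f)
	{
		return (long)f->ninsn * INSN_SIZE +
		       (f->call_at >= 0 ? CALL_SIZE : 0);
	}

	int main(void)
	{
		/* main() calls foo(), foo() calls bar(), as in the example */
		struct toy_func prog[NFUNC] = {
			{ .ninsn = 4, .call_at = 1, .callee = 1 },
			{ .ninsn = 2, .call_at = 0, .callee = 2 },
			{ .ninsn = 1, .call_at = -1 },
		};
		long start[NFUNC], size[NFUNC], addr = 0x1000, end;
		int i;

		/* initial passes: sizes converge, call targets still dummies */
		for (i = 0; i < NFUNC; i++) {
			size[i] = jit_size(&prog[i]);
			start[i] = addr;
			addr += size[i];
		}
		/* extra pass: all starts are known; insist nothing grew or
		 * shrank, then emit the real rel32 for every call
		 */
		for (i = 0; i < NFUNC; i++) {
			assert(jit_size(&prog[i]) == size[i]);
			if (prog[i].call_at < 0)
				continue;
			end = start[i] + (long)prog[i].call_at * INSN_SIZE +
			      CALL_SIZE;	/* rel32 is end-of-insn relative */
			printf("func%d: call rel32 = %+ld\n", i,
			       start[prog[i].callee] - end);
		}
		return 0;
	}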
Signed-off-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: Daniel Borkmann --- arch/x86/net/bpf_jit_comp.c | 47 ++++++++++++++-- include/linux/bpf.h | 3 ++ include/linux/bpf_verifier.h | 1 + include/linux/filter.h | 2 + kernel/bpf/core.c | 13 ++++- kernel/bpf/syscall.c | 3 +- kernel/bpf/verifier.c | 126 +++++++++++++++++++++++++++++++++++++++++++ 7 files changed, 189 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 68859b58ab84..87f214fbe66e 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -1109,13 +1109,23 @@ common_load: return proglen; } +struct x64_jit_data { + struct bpf_binary_header *header; + int *addrs; + u8 *image; + int proglen; + struct jit_context ctx; +}; + struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) { struct bpf_binary_header *header = NULL; struct bpf_prog *tmp, *orig_prog = prog; + struct x64_jit_data *jit_data; int proglen, oldproglen = 0; struct jit_context ctx = {}; bool tmp_blinded = false; + bool extra_pass = false; u8 *image = NULL; int *addrs; int pass; @@ -1135,10 +1145,28 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) prog = tmp; } + jit_data = prog->aux->jit_data; + if (!jit_data) { + jit_data = kzalloc(sizeof(*jit_data), GFP_KERNEL); + if (!jit_data) { + prog = orig_prog; + goto out; + } + prog->aux->jit_data = jit_data; + } + addrs = jit_data->addrs; + if (addrs) { + ctx = jit_data->ctx; + oldproglen = jit_data->proglen; + image = jit_data->image; + header = jit_data->header; + extra_pass = true; + goto skip_init_addrs; + } addrs = kmalloc(prog->len * sizeof(*addrs), GFP_KERNEL); if (!addrs) { prog = orig_prog; - goto out; + goto out_addrs; } /* Before first pass, make a rough estimation of addrs[] @@ -1149,6 +1177,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) addrs[i] = proglen; } ctx.cleanup_addr = proglen; +skip_init_addrs: /* JITed image shrinks with every pass and the loop iterates * until the image stops shrinking. Very large bpf programs @@ -1189,7 +1218,15 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) if (image) { bpf_flush_icache(header, image + proglen); - bpf_jit_binary_lock_ro(header); + if (!prog->is_func || extra_pass) { + bpf_jit_binary_lock_ro(header); + } else { + jit_data->addrs = addrs; + jit_data->ctx = ctx; + jit_data->proglen = proglen; + jit_data->image = image; + jit_data->header = header; + } prog->bpf_func = (void *)image; prog->jited = 1; prog->jited_len = proglen; @@ -1197,8 +1234,12 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) prog = orig_prog; } + if (!prog->is_func || extra_pass) { out_addrs: - kfree(addrs); + kfree(addrs); + kfree(jit_data); + prog->aux->jit_data = NULL; + } out: if (tmp_blinded) bpf_jit_prog_release_other(prog, prog == orig_prog ? diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 8935f6f63d5f..da54ef644fcd 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -200,6 +200,9 @@ struct bpf_prog_aux { u32 max_ctx_offset; u32 stack_depth; u32 id; + u32 func_cnt; + struct bpf_prog **func; + void *jit_data; /* JIT specific data. 
arch dependent */ struct latch_tree_node ksym_tnode; struct list_head ksym_lnode; const struct bpf_prog_ops *ops; diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 585d4e17ea88..aaac589e490c 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -143,6 +143,7 @@ struct bpf_insn_aux_data { union { enum bpf_reg_type ptr_type; /* pointer type for load/store insns */ struct bpf_map *map_ptr; /* pointer for call insn into lookup_elem */ + s32 call_imm; /* saved imm field of call insn */ }; int ctx_field_size; /* the ctx field size for load insn, maybe 0 */ bool seen; /* this insn was processed by the verifier */ diff --git a/include/linux/filter.h b/include/linux/filter.h index 3d6edc34932c..e872b4ebaa57 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -463,6 +463,8 @@ struct bpf_prog { gpl_compatible:1, /* Is filter GPL compatible? */ cb_access:1, /* Is control block accessed? */ dst_needed:1, /* Do we need dst entry? */ + blinded:1, /* Was blinded */ + is_func:1, /* program is a bpf function */ kprobe_override:1; /* Do we override a kprobe? */ enum bpf_prog_type type; /* Type of BPF program */ u32 len; /* Number of filter blocks */ diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index bda911644b1c..768e0a02d8c8 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -722,7 +722,7 @@ struct bpf_prog *bpf_jit_blind_constants(struct bpf_prog *prog) struct bpf_insn *insn; int i, rewritten; - if (!bpf_jit_blinding_enabled(prog)) + if (!bpf_jit_blinding_enabled(prog) || prog->blinded) return prog; clone = bpf_prog_clone_create(prog, GFP_USER); @@ -764,6 +764,7 @@ struct bpf_prog *bpf_jit_blind_constants(struct bpf_prog *prog) i += insn_delta; } + clone->blinded = 1; return clone; } #endif /* CONFIG_BPF_JIT */ @@ -1629,11 +1630,19 @@ int bpf_prog_array_copy_info(struct bpf_prog_array __rcu *array, static void bpf_prog_free_deferred(struct work_struct *work) { struct bpf_prog_aux *aux; + int i; aux = container_of(work, struct bpf_prog_aux, work); if (bpf_prog_is_dev_bound(aux)) bpf_prog_offload_destroy(aux->prog); - bpf_jit_free(aux->prog); + for (i = 0; i < aux->func_cnt; i++) + bpf_jit_free(aux->func[i]); + if (aux->func_cnt) { + kfree(aux->func); + bpf_prog_unlock_free(aux->prog); + } else { + bpf_jit_free(aux->prog); + } } /* Free internal BPF program */ diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 2c4cfeaa8d5e..e2e1c78ce1dc 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1194,7 +1194,8 @@ static int bpf_prog_load(union bpf_attr *attr) goto free_used_maps; /* eBPF program is ready to be JITed */ - prog = bpf_prog_select_runtime(prog, &err); + if (!prog->bpf_func) + prog = bpf_prog_select_runtime(prog, &err); if (err < 0) goto free_used_maps; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 8e0e4cd0d5e4..48b2901cf483 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5012,12 +5012,138 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) return 0; } +static int jit_subprogs(struct bpf_verifier_env *env) +{ + struct bpf_prog *prog = env->prog, **func, *tmp; + int i, j, subprog_start, subprog_end = 0, len, subprog; + struct bpf_insn *insn = prog->insnsi; + void *old_bpf_func; + int err = -ENOMEM; + + if (env->subprog_cnt == 0) + return 0; + + for (i = 0; i < prog->len; i++, insn++) { + if (insn->code != (BPF_JMP | BPF_CALL) || + insn->src_reg != BPF_PSEUDO_CALL) + continue; + subprog = find_subprog(env, i + insn->imm + 1); + if (subprog < 0) { + 
WARN_ONCE(1, "verifier bug. No program starts at insn %d\n", + i + insn->imm + 1); + return -EFAULT; + } + /* temporarily remember subprog id inside insn instead of + * aux_data, since next loop will split up all insns into funcs + */ + insn->off = subprog + 1; + /* remember original imm in case JIT fails and fallback + * to interpreter will be needed + */ + env->insn_aux_data[i].call_imm = insn->imm; + /* point imm to __bpf_call_base+1 from JITs point of view */ + insn->imm = 1; + } + + func = kzalloc(sizeof(prog) * (env->subprog_cnt + 1), GFP_KERNEL); + if (!func) + return -ENOMEM; + + for (i = 0; i <= env->subprog_cnt; i++) { + subprog_start = subprog_end; + if (env->subprog_cnt == i) + subprog_end = prog->len; + else + subprog_end = env->subprog_starts[i]; + + len = subprog_end - subprog_start; + func[i] = bpf_prog_alloc(bpf_prog_size(len), GFP_USER); + if (!func[i]) + goto out_free; + memcpy(func[i]->insnsi, &prog->insnsi[subprog_start], + len * sizeof(struct bpf_insn)); + func[i]->len = len; + func[i]->is_func = 1; + /* Use bpf_prog_F_tag to indicate functions in stack traces. + * Long term would need debug info to populate names + */ + func[i]->aux->name[0] = 'F'; + func[i]->aux->stack_depth = env->subprog_stack_depth[i]; + func[i]->jit_requested = 1; + func[i] = bpf_int_jit_compile(func[i]); + if (!func[i]->jited) { + err = -ENOTSUPP; + goto out_free; + } + cond_resched(); + } + /* at this point all bpf functions were successfully JITed + * now populate all bpf_calls with correct addresses and + * run last pass of JIT + */ + for (i = 0; i <= env->subprog_cnt; i++) { + insn = func[i]->insnsi; + for (j = 0; j < func[i]->len; j++, insn++) { + if (insn->code != (BPF_JMP | BPF_CALL) || + insn->src_reg != BPF_PSEUDO_CALL) + continue; + subprog = insn->off; + insn->off = 0; + insn->imm = (u64 (*)(u64, u64, u64, u64, u64)) + func[subprog]->bpf_func - + __bpf_call_base; + } + } + for (i = 0; i <= env->subprog_cnt; i++) { + old_bpf_func = func[i]->bpf_func; + tmp = bpf_int_jit_compile(func[i]); + if (tmp != func[i] || func[i]->bpf_func != old_bpf_func) { + verbose(env, "JIT doesn't support bpf-to-bpf calls\n"); + err = -EFAULT; + goto out_free; + } + cond_resched(); + } + + /* finally lock prog and jit images for all functions and + * populate kallsysm + */ + for (i = 0; i <= env->subprog_cnt; i++) { + bpf_prog_lock_ro(func[i]); + bpf_prog_kallsyms_add(func[i]); + } + prog->jited = 1; + prog->bpf_func = func[0]->bpf_func; + prog->aux->func = func; + prog->aux->func_cnt = env->subprog_cnt + 1; + return 0; +out_free: + for (i = 0; i <= env->subprog_cnt; i++) + if (func[i]) + bpf_jit_free(func[i]); + kfree(func); + /* cleanup main prog to be interpreted */ + prog->jit_requested = 0; + for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) { + if (insn->code != (BPF_JMP | BPF_CALL) || + insn->src_reg != BPF_PSEUDO_CALL) + continue; + insn->off = 0; + insn->imm = env->insn_aux_data[i].call_imm; + } + return err; +} + static int fixup_call_args(struct bpf_verifier_env *env) { struct bpf_prog *prog = env->prog; struct bpf_insn *insn = prog->insnsi; int i, depth; + if (env->prog->jit_requested) + if (jit_subprogs(env) == 0) + return 0; + for (i = 0; i < prog->len; i++, insn++) { if (insn->code != (BPF_JMP | BPF_CALL) || insn->src_reg != BPF_PSEUDO_CALL) -- cgit v1.2.3 From 46df3d209db080395a98fc0875bd05e45e8f44e0 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 15 Dec 2017 21:42:57 -0500 Subject: trace: reenable preemption if we modify the ip Things got moved around between the original 
bpf_override_return patches and the final version, and now the ftrace kprobe dispatcher assumes if you modified the ip that you also enabled preemption. Add a comment about this and re-enable preemption; this fixes the lockdep splat that happened when using this feature. Fixes: 9802d86585db ("bpf: add a bpf_override_function helper") Signed-off-by: Josef Bacik Signed-off-by: Daniel Borkmann --- kernel/trace/trace_kprobe.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 5db849809a56..91f4b57dab82 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1322,8 +1322,15 @@ static int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs) if (tk->tp.flags & TP_FLAG_TRACE) kprobe_trace_func(tk, regs); #ifdef CONFIG_PERF_EVENTS - if (tk->tp.flags & TP_FLAG_PROFILE) + if (tk->tp.flags & TP_FLAG_PROFILE) { ret = kprobe_perf_func(tk, regs); + /* + * The ftrace kprobe handler leaves it up to us to re-enable + * preemption here before returning if we've modified the ip. + */ + if (ret) + preempt_enable_no_resched(); + } #endif return ret; } -- cgit v1.2.3 From e90004d56bf805db7d4401fe51a80a93c311cfe6 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Mon, 18 Dec 2017 14:03:12 +0000 Subject: bpf: fix spelling mistake: "funcation" -> "function" Trivial fix to spelling mistake in error message text. Signed-off-by: Colin Ian King Signed-off-by: Daniel Borkmann --- kernel/bpf/verifier.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 48b2901cf483..56ef21c71bd3 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -772,7 +772,7 @@ static int check_subprogs(struct bpf_verifier_env *env) return -EPERM; } if (bpf_prog_is_dev_bound(env->prog->aux)) { - verbose(env, "funcation calls in offloaded programs are not supported yet\n"); + verbose(env, "function calls in offloaded programs are not supported yet\n"); return -EINVAL; } ret = add_subprog(env, i + insn[i].imm + 1); -- cgit v1.2.3 From fa2d41adb953235c4acaa98f6c980fd9eabe0062 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Mon, 18 Dec 2017 17:47:07 +0000 Subject: bpf: make function skip_callee static and return NULL rather than 0 Function skip_callee is local to the source and does not need to be in global scope, so make it static. Also return NULL rather than 0. Cleans up two sparse warnings: symbol 'skip_callee' was not declared. Should it be static?
Using plain integer as NULL pointer Signed-off-by: Colin Ian King Signed-off-by: Daniel Borkmann --- kernel/bpf/verifier.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 56ef21c71bd3..52c9b32d58bf 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -823,6 +823,7 @@ next: return 0; } +static struct bpf_verifier_state *skip_callee(struct bpf_verifier_env *env, const struct bpf_verifier_state *state, struct bpf_verifier_state *parent, @@ -867,7 +868,7 @@ bug: verbose(env, "verifier bug regno %d tmp %p\n", regno, tmp); verbose(env, "regno %d parent frame %d current frame %d\n", regno, parent->curframe, state->curframe); - return 0; + return NULL; } static int mark_reg_read(struct bpf_verifier_env *env, -- cgit v1.2.3 From 06ef0ccb5a36e1feba9b413ff59a04ecc4407c1c Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Mon, 18 Dec 2017 10:13:44 -0800 Subject: bpf/cgroup: fix a verification error for a CGROUP_DEVICE type prog The tools/testing/selftests/bpf test program test_dev_cgroup fails with the following error when compiled with llvm 6.0. (I did not try with earlier versions.) libbpf: load bpf program failed: Permission denied libbpf: -- BEGIN DUMP LOG --- libbpf: 0: (61) r2 = *(u32 *)(r1 +4) 1: (b7) r0 = 0 2: (55) if r2 != 0x1 goto pc+8 R0=inv0 R1=ctx(id=0,off=0,imm=0) R2=inv1 R10=fp0 3: (69) r2 = *(u16 *)(r1 +0) invalid bpf_context access off=0 size=2 ... The culprit is the following statement in dev_cgroup.c: short type = ctx->access_type & 0xFFFF; This code is typical as the ctx->access_type is assigned as below in kernel/bpf/cgroup.c: struct bpf_cgroup_dev_ctx ctx = { .access_type = (access << 16) | dev_type, .major = major, .minor = minor, }; The compiler converts it to u16 access while the verifier cgroup_dev_is_valid_access rejects any non u32 access. This patch permits the field access_type to be accessible with type u16 and u8 as well. Signed-off-by: Yonghong Song Tested-by: Roman Gushchin Signed-off-by: Daniel Borkmann --- include/uapi/linux/bpf.h | 3 ++- kernel/bpf/cgroup.c | 15 +++++++++++++-- 2 files changed, 15 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index d01f1cb3cfc0..69eabfcb9bdb 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -1012,7 +1012,8 @@ struct bpf_perf_event_value { #define BPF_DEVCG_DEV_CHAR (1ULL << 1) struct bpf_cgroup_dev_ctx { - __u32 access_type; /* (access << 16) | type */ + /* access_type encoded as (BPF_DEVCG_ACC_* << 16) | BPF_DEVCG_DEV_* */ + __u32 access_type; __u32 major; __u32 minor; }; diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index b789ab78d28f..c1c0b60d3f2f 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -568,6 +568,8 @@ static bool cgroup_dev_is_valid_access(int off, int size, enum bpf_access_type type, struct bpf_insn_access_aux *info) { + const int size_default = sizeof(__u32); + if (type == BPF_WRITE) return false; @@ -576,8 +578,17 @@ static bool cgroup_dev_is_valid_access(int off, int size, /* The verifier guarantees that size > 0. 
*/ if (off % size != 0) return false; - if (size != sizeof(__u32)) - return false; + + switch (off) { + case bpf_ctx_range(struct bpf_cgroup_dev_ctx, access_type): + bpf_ctx_record_field_size(info, size_default); + if (!bpf_ctx_narrow_access_ok(off, size, size_default)) + return false; + break; + default: + if (size != size_default) + return false; + } return true; } -- cgit v1.2.3 From ffd2e8df8d138e7436e218e0a9d3447a18b888e6 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Fri, 1 Dec 2017 11:50:33 -0600 Subject: resource: Set type of "reserve=" user-specified resources When we reserve regions because the user specified a "reserve=" parameter, set the resource type to either IORESOURCE_IO (for regions below 0x10000) or IORESOURCE_MEM. The test for 0x10000 is just a heuristic; obviously there can be memory below 0x10000 as well. Improve documentation of the "reserve=" parameter. Signed-off-by: Bjorn Helgaas --- Documentation/admin-guide/kernel-parameters.txt | 6 +++++- kernel/resource.c | 24 ++++++++++++++++++------ 2 files changed, 23 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 6571fbfdb2a1..78cdf6a637fc 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -3675,7 +3675,11 @@ [KNL, SMP] Set scheduler's default relax_domain_level. See Documentation/cgroup-v1/cpusets.txt. - reserve= [KNL,BUGS] Force the kernel to ignore some iomem area + reserve= [KNL,BUGS] Force kernel to ignore I/O ports or memory + Format: <base1>,<size1>[,<base2>,<size2>,...] + Reserve I/O ports or memory so the kernel won't use + them. If <base> is less than 0x10000, the region + is assumed to be I/O ports; otherwise it is memory. reservetop= [X86-32] Format: nn[KMG] diff --git a/kernel/resource.c b/kernel/resource.c index 54ba6de3757c..ba3252f7c319 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -1478,7 +1478,7 @@ void __devm_release_region(struct device *dev, struct resource *parent, EXPORT_SYMBOL(__devm_release_region); /* - * Called from init/main.c to reserve IO ports. + * Reserve I/O ports or memory based on "reserve=" kernel parameter. */ #define MAXRESERVE 4 static int __init reserve_setup(char *str) @@ -1489,26 +1489,38 @@ static int __init reserve_setup(char *str) for (;;) { unsigned int io_start, io_num; int x = reserved; + struct resource *parent; - if (get_option (&str, &io_start) != 2) + if (get_option(&str, &io_start) != 2) break; - if (get_option (&str, &io_num) == 0) + if (get_option(&str, &io_num) == 0) break; if (x < MAXRESERVE) { struct resource *res = reserve + x; + + /* + * If the region starts below 0x10000, we assume it's + * I/O port space; otherwise assume it's memory. + */ + if (io_start < 0x10000) { + res->flags = IORESOURCE_IO; + parent = &ioport_resource; + } else { + res->flags = IORESOURCE_MEM; + parent = &iomem_resource; + } res->name = "reserved"; res->start = io_start; res->end = io_start + io_num - 1; - res->flags = IORESOURCE_BUSY; + res->flags |= IORESOURCE_BUSY; res->desc = IORES_DESC_NONE; res->child = NULL; - if (request_resource(res->start >= 0x10000 ?
&iomem_resource : &ioport_resource, res) == 0) + if (request_resource(parent, res) == 0) reserved = x+1; } } return 1; } - __setup("reserve=", reserve_setup); /* -- cgit v1.2.3 From f37e2334bca5c5653ad15cadc7d68c78324b956b Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Fri, 1 Dec 2017 14:07:18 -0600 Subject: resource: Set type when reserving new regions Set resource structs inserted by __reserve_region_with_split() to have the correct type. Setting the type doesn't fix any functional problem but makes %pR on the resource work better. Signed-off-by: Bjorn Helgaas --- kernel/resource.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/resource.c b/kernel/resource.c index ba3252f7c319..8c527d83ca76 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -1022,6 +1022,7 @@ static void __init __reserve_region_with_split(struct resource *root, struct resource *conflict; struct resource *res = alloc_resource(GFP_ATOMIC); struct resource *next_res = NULL; + int type = resource_type(root); if (!res) return; @@ -1029,7 +1030,7 @@ static void __init __reserve_region_with_split(struct resource *root, res->name = name; res->start = start; res->end = end; - res->flags = IORESOURCE_BUSY; + res->flags = type | IORESOURCE_BUSY; res->desc = IORES_DESC_NONE; while (1) { @@ -1064,7 +1065,7 @@ static void __init __reserve_region_with_split(struct resource *root, next_res->name = name; next_res->start = conflict->end + 1; next_res->end = end; - next_res->flags = IORESOURCE_BUSY; + next_res->flags = type | IORESOURCE_BUSY; next_res->desc = IORES_DESC_NONE; } } else { -- cgit v1.2.3 From 4f74d80971bce93d9e608c40324d662c70eb4664 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 20 Dec 2017 13:42:56 +0100 Subject: bpf: fix kallsyms handling for subprogs Right now kallsyms handling is not working with JITed subprogs. The reason is that since 1c2a088a6626 ("bpf: x64: add JIT support for multi-function programs"), when subprogs are passed to bpf_prog_kallsyms_add() in jit_subprogs(), their prog type is 0, so the BPF core treats them as cBPF programs, as only cBPF programs have a 0 type. Thus, they need to inherit the type from the main prog. Once that is fixed, they are indeed added to the BPF kallsyms infra, but their tag is 0. Therefore, since the intention is to add them as bpf_prog_F_<tag>, we need to pass them to bpf_prog_calc_tag() first. And once this is resolved, there is a use-after-free on prog cleanup: we remove the kallsyms entry from the main prog, later walk all subprogs and call bpf_jit_free() on them. However, the kallsyms linkage was never released on them. Thus, do that for all subprogs right in __bpf_prog_put() when refcount hits 0.
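The resulting teardown order matters; a minimal sketch of __bpf_prog_put() after this change (condensed from the syscall.c hunk below; the tracepoint call is omitted):

    static void __bpf_prog_put(struct bpf_prog *prog, bool do_idr_lock)
    {
            int i;

            if (!atomic_dec_and_test(&prog->aux->refcnt))
                    return;
            /* free the id first ... */
            bpf_prog_free_id(prog, do_idr_lock);
            /* ... then drop the kallsyms entry of every JITed subprog
             * while aux->func[] is still alive ...
             */
            for (i = 0; i < prog->aux->func_cnt; i++)
                    bpf_prog_kallsyms_del(prog->aux->func[i]);
            bpf_prog_kallsyms_del(prog);
            /* ... and only then defer the actual image frees to RCU */
            call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu);
    }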
Fixes: 1c2a088a6626 ("bpf: x64: add JIT support for multi-function programs") Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: Alexei Starovoitov --- kernel/bpf/syscall.c | 6 ++++++ kernel/bpf/verifier.c | 3 +++ 2 files changed, 9 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index e2e1c78ce1dc..30e728dcd35d 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -937,10 +937,16 @@ static void __bpf_prog_put_rcu(struct rcu_head *rcu) static void __bpf_prog_put(struct bpf_prog *prog, bool do_idr_lock) { if (atomic_dec_and_test(&prog->aux->refcnt)) { + int i; + trace_bpf_prog_put_rcu(prog); /* bpf_prog_free_id() must be called first */ bpf_prog_free_id(prog, do_idr_lock); + + for (i = 0; i < prog->aux->func_cnt; i++) + bpf_prog_kallsyms_del(prog->aux->func[i]); bpf_prog_kallsyms_del(prog); + call_rcu(&prog->aux->rcu, __bpf_prog_put_rcu); } } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 52c9b32d58bf..3c3eec58b3e8 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5063,7 +5063,10 @@ static int jit_subprogs(struct bpf_verifier_env *env) goto out_free; memcpy(func[i]->insnsi, &prog->insnsi[subprog_start], len * sizeof(struct bpf_insn)); + func[i]->type = prog->type; func[i]->len = len; + if (bpf_prog_calc_tag(func[i])) + goto out_free; func[i]->is_func = 1; /* Use bpf_prog_F_tag to indicate functions in stack traces. * Long term would need debug info to populate names -- cgit v1.2.3 From 7105e828c087de970fcb5a9509db51bfe6bd7894 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 20 Dec 2017 13:42:57 +0100 Subject: bpf: allow for correlation of maps and helpers in dump Currently a dump of an xlated prog (post verifier stage) doesn't correlate used helpers as well as maps. The prog info lists involved map ids, however there's no correlation of where in the program they are used as of today. Likewise, bpftool does not correlate helper calls with the target functions. The latter can be done w/o any kernel changes through kallsyms, and also has the advantage that this works with inlined helpers and BPF calls. Example, via interpreter: # tc filter show dev foo ingress filter protocol all pref 49152 bpf chain 0 filter protocol all pref 49152 bpf chain 0 handle 0x1 foo.o:[ingress] \ direct-action not_in_hw id 1 tag c74773051b364165 <-- prog id:1 * Output before patch (calls/maps remain unclear): # bpftool prog dump xlated id 1 <-- dump prog id:1 0: (b7) r1 = 2 1: (63) *(u32 *)(r10 -4) = r1 2: (bf) r2 = r10 3: (07) r2 += -4 4: (18) r1 = 0xffff95c47a8d4800 6: (85) call unknown#73040 7: (15) if r0 == 0x0 goto pc+18 8: (bf) r2 = r10 9: (07) r2 += -4 10: (bf) r1 = r0 11: (85) call unknown#73040 12: (15) if r0 == 0x0 goto pc+23 [...] * Output after patch: # bpftool prog dump xlated id 1 0: (b7) r1 = 2 1: (63) *(u32 *)(r10 -4) = r1 2: (bf) r2 = r10 3: (07) r2 += -4 4: (18) r1 = map[id:2] <-- map id:2 6: (85) call bpf_map_lookup_elem#73424 <-- helper call 7: (15) if r0 == 0x0 goto pc+18 8: (bf) r2 = r10 9: (07) r2 += -4 10: (bf) r1 = r0 11: (85) call bpf_map_lookup_elem#73424 12: (15) if r0 == 0x0 goto pc+23 [...] 
# bpftool map show id 2 <-- show/dump/etc map id:2 2: hash_of_maps flags 0x0 key 4B value 4B max_entries 3 memlock 4096B Example, JITed, same prog: # tc filter show dev foo ingress filter protocol all pref 49152 bpf chain 0 filter protocol all pref 49152 bpf chain 0 handle 0x1 foo.o:[ingress] \ direct-action not_in_hw id 3 tag c74773051b364165 jited # bpftool prog show id 3 3: sched_cls tag c74773051b364165 loaded_at Dec 19/13:48 uid 0 xlated 384B jited 257B memlock 4096B map_ids 2 # bpftool prog dump xlated id 3 0: (b7) r1 = 2 1: (63) *(u32 *)(r10 -4) = r1 2: (bf) r2 = r10 3: (07) r2 += -4 4: (18) r1 = map[id:2] <-- map id:2 6: (85) call __htab_map_lookup_elem#77408 <-+ inlined rewrite 7: (15) if r0 == 0x0 goto pc+2 | 8: (07) r0 += 56 | 9: (79) r0 = *(u64 *)(r0 +0) <-+ 10: (15) if r0 == 0x0 goto pc+24 11: (bf) r2 = r10 12: (07) r2 += -4 [...] Example, same prog, but kallsyms disabled (in that case we are also not allowed to pass any relative offsets, etc, so prog becomes pointer sanitized on dump): # sysctl kernel.kptr_restrict=2 kernel.kptr_restrict = 2 # bpftool prog dump xlated id 3 0: (b7) r1 = 2 1: (63) *(u32 *)(r10 -4) = r1 2: (bf) r2 = r10 3: (07) r2 += -4 4: (18) r1 = map[id:2] 6: (85) call bpf_unspec#0 7: (15) if r0 == 0x0 goto pc+2 [...] Example, BPF calls via interpreter: # bpftool prog dump xlated id 1 0: (85) call pc+2#__bpf_prog_run_args32 1: (b7) r0 = 1 2: (95) exit 3: (b7) r0 = 2 4: (95) exit Example, BPF calls via JIT: # sysctl net.core.bpf_jit_enable=1 net.core.bpf_jit_enable = 1 # sysctl net.core.bpf_jit_kallsyms=1 net.core.bpf_jit_kallsyms = 1 # bpftool prog dump xlated id 1 0: (85) call pc+2#bpf_prog_3b185187f1855c4c_F 1: (b7) r0 = 1 2: (95) exit 3: (b7) r0 = 2 4: (95) exit And finally, an example for tail calls that is now working as well wrt correlation: # bpftool prog dump xlated id 2 [...] 10: (b7) r2 = 8 11: (85) call bpf_trace_printk#-41312 12: (bf) r1 = r6 13: (18) r2 = map[id:1] 15: (b7) r3 = 0 16: (85) call bpf_tail_call#12 17: (b7) r1 = 42 18: (6b) *(u16 *)(r6 +46) = r1 19: (b7) r0 = 0 20: (95) exit # bpftool map show id 1 1: prog_array flags 0x0 key 4B value 4B max_entries 1 memlock 4096B Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: Alexei Starovoitov --- include/linux/filter.h | 9 +++ kernel/bpf/core.c | 4 +- kernel/bpf/disasm.c | 65 ++++++++++++++--- kernel/bpf/disasm.h | 29 ++++++-- kernel/bpf/syscall.c | 87 +++++++++++++++++++++-- kernel/bpf/verifier.c | 30 ++++++-- tools/bpf/bpftool/prog.c | 181 ++++++++++++++++++++++++++++++++++++++++++++--- 7 files changed, 370 insertions(+), 35 deletions(-) (limited to 'kernel') diff --git a/include/linux/filter.h b/include/linux/filter.h index e872b4ebaa57..2b0df2703671 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -18,6 +18,7 @@ #include <linux/capability.h> #include <linux/cryptohash.h> #include <linux/set_memory.h> +#include <linux/kallsyms.h> #include <net/sch_generic.h> @@ -724,6 +725,14 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog); void bpf_jit_compile(struct bpf_prog *prog); bool bpf_helper_changes_pkt_data(void *func); +static inline bool bpf_dump_raw_ok(void) +{ + /* Reconstruction of call-sites is dependent on kallsyms, + * thus make dump the same restriction.
+ */ + return kallsyms_show_value() == 1; +} + struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off, const struct bpf_insn *patch, u32 len); diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 768e0a02d8c8..70a534549cd3 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -771,7 +771,9 @@ struct bpf_prog *bpf_jit_blind_constants(struct bpf_prog *prog) /* Base function for offset calculation. Needs to go into .text section, * therefore keeping it non-static as well; will also be used by JITs - * anyway later on, so do not let the compiler omit it. + * anyway later on, so do not let the compiler omit it. This also needs + * to go into kallsyms for correlation from e.g. bpftool, so naming + * must not change. */ noinline u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) { diff --git a/kernel/bpf/disasm.c b/kernel/bpf/disasm.c index 883f88fa5bfc..8740406df2cd 100644 --- a/kernel/bpf/disasm.c +++ b/kernel/bpf/disasm.c @@ -21,10 +21,39 @@ static const char * const func_id_str[] = { }; #undef __BPF_FUNC_STR_FN -const char *func_id_name(int id) +static const char *__func_get_name(const struct bpf_insn_cbs *cbs, + const struct bpf_insn *insn, + char *buff, size_t len) { BUILD_BUG_ON(ARRAY_SIZE(func_id_str) != __BPF_FUNC_MAX_ID); + if (insn->src_reg != BPF_PSEUDO_CALL && + insn->imm >= 0 && insn->imm < __BPF_FUNC_MAX_ID && + func_id_str[insn->imm]) + return func_id_str[insn->imm]; + + if (cbs && cbs->cb_call) + return cbs->cb_call(cbs->private_data, insn); + + if (insn->src_reg == BPF_PSEUDO_CALL) + snprintf(buff, len, "%+d", insn->imm); + + return buff; +} + +static const char *__func_imm_name(const struct bpf_insn_cbs *cbs, + const struct bpf_insn *insn, + u64 full_imm, char *buff, size_t len) +{ + if (cbs && cbs->cb_imm) + return cbs->cb_imm(cbs->private_data, insn, full_imm); + + snprintf(buff, len, "0x%llx", (unsigned long long)full_imm); + return buff; +} + +const char *func_id_name(int id) +{ if (id >= 0 && id < __BPF_FUNC_MAX_ID && func_id_str[id]) return func_id_str[id]; else @@ -83,7 +112,7 @@ static const char *const bpf_jmp_string[16] = { [BPF_EXIT >> 4] = "exit", }; -static void print_bpf_end_insn(bpf_insn_print_cb verbose, +static void print_bpf_end_insn(bpf_insn_print_t verbose, struct bpf_verifier_env *env, const struct bpf_insn *insn) { @@ -92,9 +121,12 @@ static void print_bpf_end_insn(bpf_insn_print_cb verbose, insn->imm, insn->dst_reg); } -void print_bpf_insn(bpf_insn_print_cb verbose, struct bpf_verifier_env *env, - const struct bpf_insn *insn, bool allow_ptr_leaks) +void print_bpf_insn(const struct bpf_insn_cbs *cbs, + struct bpf_verifier_env *env, + const struct bpf_insn *insn, + bool allow_ptr_leaks) { + const bpf_insn_print_t verbose = cbs->cb_print; u8 class = BPF_CLASS(insn->code); if (class == BPF_ALU || class == BPF_ALU64) { @@ -175,12 +207,15 @@ void print_bpf_insn(bpf_insn_print_cb verbose, struct bpf_verifier_env *env, */ u64 imm = ((u64)(insn + 1)->imm << 32) | (u32)insn->imm; bool map_ptr = insn->src_reg == BPF_PSEUDO_MAP_FD; + char tmp[64]; if (map_ptr && !allow_ptr_leaks) imm = 0; - verbose(env, "(%02x) r%d = 0x%llx\n", insn->code, - insn->dst_reg, (unsigned long long)imm); + verbose(env, "(%02x) r%d = %s\n", + insn->code, insn->dst_reg, + __func_imm_name(cbs, insn, imm, + tmp, sizeof(tmp))); } else { verbose(env, "BUG_ld_%02x\n", insn->code); return; @@ -189,12 +224,20 @@ void print_bpf_insn(bpf_insn_print_cb verbose, struct bpf_verifier_env *env, u8 opcode = BPF_OP(insn->code); if (opcode == BPF_CALL) { - if (insn->src_reg == 
BPF_PSEUDO_CALL) - verbose(env, "(%02x) call pc%+d\n", insn->code, - insn->imm); - else + char tmp[64]; + + if (insn->src_reg == BPF_PSEUDO_CALL) { + verbose(env, "(%02x) call pc%s\n", + insn->code, + __func_get_name(cbs, insn, + tmp, sizeof(tmp))); + } else { + strcpy(tmp, "unknown"); verbose(env, "(%02x) call %s#%d\n", insn->code, - func_id_name(insn->imm), insn->imm); + __func_get_name(cbs, insn, + tmp, sizeof(tmp)), + insn->imm); + } } else if (insn->code == (BPF_JMP | BPF_JA)) { verbose(env, "(%02x) goto pc%+d\n", insn->code, insn->off); diff --git a/kernel/bpf/disasm.h b/kernel/bpf/disasm.h index 8de977e420b6..e0857d016f89 100644 --- a/kernel/bpf/disasm.h +++ b/kernel/bpf/disasm.h @@ -17,16 +17,35 @@ #include <linux/bpf.h> #include <linux/kernel.h> #include <linux/stringify.h> +#ifndef __KERNEL__ +#include <stdio.h> +#include <string.h> +#endif + +struct bpf_verifier_env; extern const char *const bpf_alu_string[16]; extern const char *const bpf_class_string[8]; const char *func_id_name(int id); -struct bpf_verifier_env; -typedef void (*bpf_insn_print_cb)(struct bpf_verifier_env *env, - const char *, ...); -void print_bpf_insn(bpf_insn_print_cb verbose, struct bpf_verifier_env *env, - const struct bpf_insn *insn, bool allow_ptr_leaks); +typedef void (*bpf_insn_print_t)(struct bpf_verifier_env *env, + const char *, ...); +typedef const char *(*bpf_insn_revmap_call_t)(void *private_data, + const struct bpf_insn *insn); +typedef const char *(*bpf_insn_print_imm_t)(void *private_data, + const struct bpf_insn *insn, + __u64 full_imm); + +struct bpf_insn_cbs { + bpf_insn_print_t cb_print; + bpf_insn_revmap_call_t cb_call; + bpf_insn_print_imm_t cb_imm; + void *private_data; +}; +void print_bpf_insn(const struct bpf_insn_cbs *cbs, + struct bpf_verifier_env *env, + const struct bpf_insn *insn, + bool allow_ptr_leaks); #endif diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 30e728dcd35d..007802c5ca7d 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1558,6 +1558,67 @@ static int bpf_map_get_fd_by_id(const union bpf_attr *attr) return fd; } +static const struct bpf_map *bpf_map_from_imm(const struct bpf_prog *prog, + unsigned long addr) +{ + int i; + + for (i = 0; i < prog->aux->used_map_cnt; i++) + if (prog->aux->used_maps[i] == (void *)addr) + return prog->aux->used_maps[i]; + return NULL; +} + +static struct bpf_insn *bpf_insn_prepare_dump(const struct bpf_prog *prog) +{ + const struct bpf_map *map; + struct bpf_insn *insns; + u64 imm; + int i; + + insns = kmemdup(prog->insnsi, bpf_prog_insn_size(prog), + GFP_USER); + if (!insns) + return insns; + + for (i = 0; i < prog->len; i++) { + if (insns[i].code == (BPF_JMP | BPF_TAIL_CALL)) { + insns[i].code = BPF_JMP | BPF_CALL; + insns[i].imm = BPF_FUNC_tail_call; + /* fall-through */ + } + if (insns[i].code == (BPF_JMP | BPF_CALL) || + insns[i].code == (BPF_JMP | BPF_CALL_ARGS)) { + if (insns[i].code == (BPF_JMP | BPF_CALL_ARGS)) + insns[i].code = BPF_JMP | BPF_CALL; + if (!bpf_dump_raw_ok()) + insns[i].imm = 0; + continue; + } + + if (insns[i].code != (BPF_LD | BPF_IMM | BPF_DW)) + continue; + + imm = ((u64)insns[i + 1].imm << 32) | (u32)insns[i].imm; + map = bpf_map_from_imm(prog, imm); + if (map) { + insns[i].src_reg = BPF_PSEUDO_MAP_FD; + insns[i].imm = map->id; + insns[i + 1].imm = 0; + continue; + } + + if (!bpf_dump_raw_ok() && + imm == (unsigned long)prog->aux) { + insns[i].imm = 0; + insns[i + 1].imm = 0; + continue; + } + } + + return insns; +} + static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, const union bpf_attr *attr, union bpf_attr __user *uattr) @@ -1608,18
+1669,34 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, ulen = info.jited_prog_len; info.jited_prog_len = prog->jited_len; if (info.jited_prog_len && ulen) { - uinsns = u64_to_user_ptr(info.jited_prog_insns); - ulen = min_t(u32, info.jited_prog_len, ulen); - if (copy_to_user(uinsns, prog->bpf_func, ulen)) - return -EFAULT; + if (bpf_dump_raw_ok()) { + uinsns = u64_to_user_ptr(info.jited_prog_insns); + ulen = min_t(u32, info.jited_prog_len, ulen); + if (copy_to_user(uinsns, prog->bpf_func, ulen)) + return -EFAULT; + } else { + info.jited_prog_insns = 0; + } } ulen = info.xlated_prog_len; info.xlated_prog_len = bpf_prog_insn_size(prog); if (info.xlated_prog_len && ulen) { + struct bpf_insn *insns_sanitized; + bool fault; + + if (prog->blinded && !bpf_dump_raw_ok()) { + info.xlated_prog_insns = 0; + goto done; + } + insns_sanitized = bpf_insn_prepare_dump(prog); + if (!insns_sanitized) + return -ENOMEM; uinsns = u64_to_user_ptr(info.xlated_prog_insns); ulen = min_t(u32, info.xlated_prog_len, ulen); - if (copy_to_user(uinsns, prog->insnsi, ulen)) + fault = copy_to_user(uinsns, insns_sanitized, ulen); + kfree(insns_sanitized); + if (fault) return -EFAULT; } diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 3c3eec58b3e8..4ae46b2cba88 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -4427,9 +4427,12 @@ static int do_check(struct bpf_verifier_env *env) } if (env->log.level) { + const struct bpf_insn_cbs cbs = { + .cb_print = verbose, + }; + verbose(env, "%d: ", insn_idx); - print_bpf_insn(verbose, env, insn, - env->allow_ptr_leaks); + print_bpf_insn(&cbs, env, insn, env->allow_ptr_leaks); } err = ext_analyzer_insn_hook(env, insn_idx, prev_insn_idx); @@ -5017,14 +5020,14 @@ static int jit_subprogs(struct bpf_verifier_env *env) { struct bpf_prog *prog = env->prog, **func, *tmp; int i, j, subprog_start, subprog_end = 0, len, subprog; - struct bpf_insn *insn = prog->insnsi; + struct bpf_insn *insn; void *old_bpf_func; int err = -ENOMEM; if (env->subprog_cnt == 0) return 0; - for (i = 0; i < prog->len; i++, insn++) { + for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) { if (insn->code != (BPF_JMP | BPF_CALL) || insn->src_reg != BPF_PSEUDO_CALL) continue; @@ -5116,6 +5119,25 @@ static int jit_subprogs(struct bpf_verifier_env *env) bpf_prog_lock_ro(func[i]); bpf_prog_kallsyms_add(func[i]); } + + /* Last step: make now unused interpreter insns from main + * prog consistent for later dump requests, so they can + * later look the same as if they were interpreted only. 
+ */ + for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) { + unsigned long addr; + + if (insn->code != (BPF_JMP | BPF_CALL) || + insn->src_reg != BPF_PSEUDO_CALL) + continue; + insn->off = env->insn_aux_data[i].call_imm; + subprog = find_subprog(env, i + insn->off + 1); + addr = (unsigned long)func[subprog + 1]->bpf_func; + addr &= PAGE_MASK; + insn->imm = (u64 (*)(u64, u64, u64, u64, u64)) + addr - __bpf_call_base; + } + prog->jited = 1; prog->bpf_func = func[0]->bpf_func; prog->aux->func = func; diff --git a/tools/bpf/bpftool/prog.c b/tools/bpf/bpftool/prog.c index 037484ceaeaf..42ee8892549c 100644 --- a/tools/bpf/bpftool/prog.c +++ b/tools/bpf/bpftool/prog.c @@ -401,6 +401,88 @@ static int do_show(int argc, char **argv) return err; } +#define SYM_MAX_NAME 256 + +struct kernel_sym { + unsigned long address; + char name[SYM_MAX_NAME]; +}; + +struct dump_data { + unsigned long address_call_base; + struct kernel_sym *sym_mapping; + __u32 sym_count; + char scratch_buff[SYM_MAX_NAME]; +}; + +static int kernel_syms_cmp(const void *sym_a, const void *sym_b) +{ + return ((struct kernel_sym *)sym_a)->address - + ((struct kernel_sym *)sym_b)->address; +} + +static void kernel_syms_load(struct dump_data *dd) +{ + struct kernel_sym *sym; + char buff[256]; + void *tmp, *address; + FILE *fp; + + fp = fopen("/proc/kallsyms", "r"); + if (!fp) + return; + + while (!feof(fp)) { + if (!fgets(buff, sizeof(buff), fp)) + break; + tmp = realloc(dd->sym_mapping, + (dd->sym_count + 1) * + sizeof(*dd->sym_mapping)); + if (!tmp) { +out: + free(dd->sym_mapping); + dd->sym_mapping = NULL; + fclose(fp); + return; + } + dd->sym_mapping = tmp; + sym = &dd->sym_mapping[dd->sym_count]; + if (sscanf(buff, "%p %*c %s", &address, sym->name) != 2) + continue; + sym->address = (unsigned long)address; + if (!strcmp(sym->name, "__bpf_call_base")) { + dd->address_call_base = sym->address; + /* sysctl kernel.kptr_restrict was set */ + if (!sym->address) + goto out; + } + if (sym->address) + dd->sym_count++; + } + + fclose(fp); + + qsort(dd->sym_mapping, dd->sym_count, + sizeof(*dd->sym_mapping), kernel_syms_cmp); +} + +static void kernel_syms_destroy(struct dump_data *dd) +{ + free(dd->sym_mapping); +} + +static struct kernel_sym *kernel_syms_search(struct dump_data *dd, + unsigned long key) +{ + struct kernel_sym sym = { + .address = key, + }; + + return dd->sym_mapping ? + bsearch(&sym, dd->sym_mapping, dd->sym_count, + sizeof(*dd->sym_mapping), kernel_syms_cmp) : NULL; +} + static void print_insn(struct bpf_verifier_env *env, const char *fmt, ...) { va_list args; @@ -410,8 +492,71 @@ static void print_insn(struct bpf_verifier_env *env, const char *fmt, ...) 
va_end(args); } -static void dump_xlated_plain(void *buf, unsigned int len, bool opcodes) +static const char *print_call_pcrel(struct dump_data *dd, + struct kernel_sym *sym, + unsigned long address, + const struct bpf_insn *insn) { + if (sym) + snprintf(dd->scratch_buff, sizeof(dd->scratch_buff), + "%+d#%s", insn->off, sym->name); + else + snprintf(dd->scratch_buff, sizeof(dd->scratch_buff), + "%+d#0x%lx", insn->off, address); + return dd->scratch_buff; +} + +static const char *print_call_helper(struct dump_data *dd, + struct kernel_sym *sym, + unsigned long address) +{ + if (sym) + snprintf(dd->scratch_buff, sizeof(dd->scratch_buff), + "%s", sym->name); + else + snprintf(dd->scratch_buff, sizeof(dd->scratch_buff), + "0x%lx", address); + return dd->scratch_buff; +} + +static const char *print_call(void *private_data, + const struct bpf_insn *insn) +{ + struct dump_data *dd = private_data; + unsigned long address = dd->address_call_base + insn->imm; + struct kernel_sym *sym; + + sym = kernel_syms_search(dd, address); + if (insn->src_reg == BPF_PSEUDO_CALL) + return print_call_pcrel(dd, sym, address, insn); + else + return print_call_helper(dd, sym, address); +} + +static const char *print_imm(void *private_data, + const struct bpf_insn *insn, + __u64 full_imm) +{ + struct dump_data *dd = private_data; + + if (insn->src_reg == BPF_PSEUDO_MAP_FD) + snprintf(dd->scratch_buff, sizeof(dd->scratch_buff), + "map[id:%u]", insn->imm); + else + snprintf(dd->scratch_buff, sizeof(dd->scratch_buff), + "0x%llx", (unsigned long long)full_imm); + return dd->scratch_buff; +} + +static void dump_xlated_plain(struct dump_data *dd, void *buf, + unsigned int len, bool opcodes) +{ + const struct bpf_insn_cbs cbs = { + .cb_print = print_insn, + .cb_call = print_call, + .cb_imm = print_imm, + .private_data = dd, + }; struct bpf_insn *insn = buf; bool double_insn = false; unsigned int i; @@ -425,7 +570,7 @@ static void dump_xlated_plain(void *buf, unsigned int len, bool opcodes) double_insn = insn[i].code == (BPF_LD | BPF_IMM | BPF_DW); printf("% 4d: ", i); - print_bpf_insn(print_insn, NULL, insn + i, true); + print_bpf_insn(&cbs, NULL, insn + i, true); if (opcodes) { printf(" "); @@ -454,8 +599,15 @@ static void print_insn_json(struct bpf_verifier_env *env, const char *fmt, ...) 
va_end(args); } -static void dump_xlated_json(void *buf, unsigned int len, bool opcodes) +static void dump_xlated_json(struct dump_data *dd, void *buf, + unsigned int len, bool opcodes) { + const struct bpf_insn_cbs cbs = { + .cb_print = print_insn_json, + .cb_call = print_call, + .cb_imm = print_imm, + .private_data = dd, + }; struct bpf_insn *insn = buf; bool double_insn = false; unsigned int i; @@ -470,7 +622,7 @@ static void dump_xlated_json(void *buf, unsigned int len, bool opcodes) jsonw_start_object(json_wtr); jsonw_name(json_wtr, "disasm"); - print_bpf_insn(print_insn_json, NULL, insn + i, true); + print_bpf_insn(&cbs, NULL, insn + i, true); if (opcodes) { jsonw_name(json_wtr, "opcodes"); @@ -505,6 +657,7 @@ static void dump_xlated_json(void *buf, unsigned int len, bool opcodes) static int do_dump(int argc, char **argv) { struct bpf_prog_info info = {}; + struct dump_data dd = {}; __u32 len = sizeof(info); unsigned int buf_size; char *filepath = NULL; @@ -592,6 +745,14 @@ static int do_dump(int argc, char **argv) goto err_free; } + if ((member_len == &info.jited_prog_len && + info.jited_prog_insns == 0) || + (member_len == &info.xlated_prog_len && + info.xlated_prog_insns == 0)) { + p_err("error retrieving insn dump: kernel.kptr_restrict set?"); + goto err_free; + } + if (filepath) { fd = open(filepath, O_WRONLY | O_CREAT | O_TRUNC, 0600); if (fd < 0) { @@ -608,17 +769,19 @@ static int do_dump(int argc, char **argv) goto err_free; } } else { - if (member_len == &info.jited_prog_len) + if (member_len == &info.jited_prog_len) { disasm_print_insn(buf, *member_len, opcodes); - else + } else { + kernel_syms_load(&dd); if (json_output) - dump_xlated_json(buf, *member_len, opcodes); + dump_xlated_json(&dd, buf, *member_len, opcodes); else - dump_xlated_plain(buf, *member_len, opcodes); + dump_xlated_plain(&dd, buf, *member_len, opcodes); + kernel_syms_destroy(&dd); + } } free(buf); - return 0; err_free: -- cgit v1.2.3 From fd05e57bb35ad5eb7e261b64e5cf46445250f842 Mon Sep 17 00:00:00 2001 From: Gianluca Borello Date: Sat, 23 Dec 2017 10:09:55 +0000 Subject: bpf: fix stacksafe exploration when comparing states Commit cc2b14d51053 ("bpf: teach verifier to recognize zero initialized stack") introduced a very relaxed check when comparing stacks of different states, effectively returning a positive result in many cases where it shouldn't. This can create problems in cases such as this following C pseudocode: long var; long *x = bpf_map_lookup(...); if (!x) return; if (*x != 0xbeef) var = 0; else var = 1; /* This is the key part, calling a helper causes an explored state * to be saved with the information that "var" is on the stack as * STACK_ZERO, since the helper is first met by the verifier after * the "var = 0" assignment. This state will however be wrongly used * also for the "var = 1" case, so the verifier assumes "var" is always * 0 and will replace the NULL assignment with nops, because the * search pruning prevents it from exploring the faulty branch. */ bpf_ktime_get_ns(); if (var) *(long *)0 = 0xbeef; Fix the issue by making sure that the stack is fully explored before returning a positive comparison result. Also attach a couple tests that highlight the bad behavior. In the first test, without this fix instructions 16 and 17 are replaced with nops instead of being rejected by the verifier. The second test, instead, allows a program to make a potentially illegal read from the stack. 
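In code terms the problem sits in the per-slot loop of stacksafe(); a simplified sketch of that walk (the remaining equivalence checks are elided):

    for (i = 0; i < old->allocated_stack; i++) {
            int spi = i / BPF_REG_SIZE;

            if (!(old->stack[spi].spilled_ptr.live & REG_LIVE_READ))
                    /* explored state didn't read this slot: skip just
                     * this slot (continue), do not declare the whole
                     * stacks equivalent (the old 'return true')
                     */
                    continue;
            /* per-slot type and value equivalence checks ... */
    }

Returning true early judged the states equal on the basis of a single unread slot, which is exactly what enabled the bogus pruning described above.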
Fixes: cc2b14d51053 ("bpf: teach verifier to recognize zero initialized stack") Signed-off-by: Gianluca Borello Acked-by: Alexei Starovoitov Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 2 +- tools/testing/selftests/bpf/test_verifier.c | 51 +++++++++++++++++++++++++++++ 2 files changed, 52 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 4ae46b2cba88..82ae580440b8 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -4107,7 +4107,7 @@ static bool stacksafe(struct bpf_func_state *old, if (!(old->stack[spi].spilled_ptr.live & REG_LIVE_READ)) /* explored state didn't use this */ - return true; + continue; if (old->stack[spi].slot_type[i % BPF_REG_SIZE] == STACK_INVALID) continue; diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c index 3bacff0d6f91..5e79515d10c5 100644 --- a/tools/testing/selftests/bpf/test_verifier.c +++ b/tools/testing/selftests/bpf/test_verifier.c @@ -9715,6 +9715,57 @@ static struct bpf_test tests[] = { .result = REJECT, .prog_type = BPF_PROG_TYPE_XDP, }, + { + "search pruning: all branches should be verified (nop operation)", + .insns = { + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_ST_MEM(BPF_DW, BPF_REG_2, 0, 0), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 11), + BPF_LDX_MEM(BPF_DW, BPF_REG_3, BPF_REG_0, 0), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_3, 0xbeef, 2), + BPF_MOV64_IMM(BPF_REG_4, 0), + BPF_JMP_A(1), + BPF_MOV64_IMM(BPF_REG_4, 1), + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_4, -16), + BPF_EMIT_CALL(BPF_FUNC_ktime_get_ns), + BPF_LDX_MEM(BPF_DW, BPF_REG_5, BPF_REG_10, -16), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_5, 0, 2), + BPF_MOV64_IMM(BPF_REG_6, 0), + BPF_ST_MEM(BPF_DW, BPF_REG_6, 0, 0xdead), + BPF_EXIT_INSN(), + }, + .fixup_map1 = { 3 }, + .errstr = "R6 invalid mem access 'inv'", + .result = REJECT, + .prog_type = BPF_PROG_TYPE_TRACEPOINT, + }, + { + "search pruning: all branches should be verified (invalid stack access)", + .insns = { + BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), + BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), + BPF_ST_MEM(BPF_DW, BPF_REG_2, 0, 0), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_EMIT_CALL(BPF_FUNC_map_lookup_elem), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 8), + BPF_LDX_MEM(BPF_DW, BPF_REG_3, BPF_REG_0, 0), + BPF_MOV64_IMM(BPF_REG_4, 0), + BPF_JMP_IMM(BPF_JEQ, BPF_REG_3, 0xbeef, 2), + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_4, -16), + BPF_JMP_A(1), + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_4, -24), + BPF_EMIT_CALL(BPF_FUNC_ktime_get_ns), + BPF_LDX_MEM(BPF_DW, BPF_REG_5, BPF_REG_10, -16), + BPF_EXIT_INSN(), + }, + .fixup_map1 = { 3 }, + .errstr = "invalid read from stack off -16+0 size 8", + .result = REJECT, + .prog_type = BPF_PROG_TYPE_TRACEPOINT, + }, }; static int probe_filter_length(const struct bpf_insn *fp) -- cgit v1.2.3 From 70a87ffea8acc390ae315fa1930e849f656bdb88 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 25 Dec 2017 13:15:40 -0800 Subject: bpf: fix maximum stack depth tracking logic Instead of computing max stack depth for current call chain during the main verifier pass track stack depth of each function independently and after do_check() is done do another pass over all instructions analyzing depth of all possible call stacks. 
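To illustrate with hypothetical depths: suppose main uses 100 bytes of stack and calls A (200 bytes), which in turn calls B (300 bytes). Each per-function depth is rounded up to the interpreter's 32-byte granularity and summed as the walk descends:

    main           depth = 128                 ok
    main->A        depth = 128 + 224 = 352     ok
    main->A->B     depth = 352 + 320 = 672     > MAX_BPF_STACK (512), rejected

On the way back up the walk subtracts the callee's share again, so a sibling chain such as main->C is checked against the correct running total rather than a stale one.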
Fixes: f4d7e40a5b71 ("bpf: introduce function calls (verification)") Reported-by: Jann Horn Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- include/linux/bpf_verifier.h | 1 + kernel/bpf/verifier.c | 82 +++++++++++++++++++++++++++++++++++--------- 2 files changed, 67 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index aaac589e490c..94a02ceb1246 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -194,6 +194,7 @@ struct bpf_verifier_env { struct bpf_insn_aux_data *insn_aux_data; /* array of per-insn state */ struct bpf_verifer_log log; u32 subprog_starts[BPF_MAX_SUBPROGS]; + /* computes the stack depth of each bpf function */ u16 subprog_stack_depth[BPF_MAX_SUBPROGS + 1]; u32 subprog_cnt; }; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 82ae580440b8..738e919efdf0 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1430,33 +1430,80 @@ static int update_stack_depth(struct bpf_verifier_env *env, const struct bpf_func_state *func, int off) { - u16 stack = env->subprog_stack_depth[func->subprogno], total = 0; - struct bpf_verifier_state *cur = env->cur_state; - int i; + u16 stack = env->subprog_stack_depth[func->subprogno]; if (stack >= -off) return 0; /* update known max for given subprogram */ env->subprog_stack_depth[func->subprogno] = -off; + return 0; +} - /* compute the total for current call chain */ - for (i = 0; i <= cur->curframe; i++) { - u32 depth = env->subprog_stack_depth[cur->frame[i]->subprogno]; - - /* round up to 32-bytes, since this is granularity - * of interpreter stack sizes - */ - depth = round_up(depth, 32); - total += depth; - } +/* starting from main bpf function walk all instructions of the function + * and recursively walk all callees that given function can call. + * Ignore jump and exit insns. + * Since recursion is prevented by check_cfg() this algorithm + * only needs a local stack of MAX_CALL_FRAMES to remember callsites + */ +static int check_max_stack_depth(struct bpf_verifier_env *env) +{ + int depth = 0, frame = 0, subprog = 0, i = 0, subprog_end; + struct bpf_insn *insn = env->prog->insnsi; + int insn_cnt = env->prog->len; + int ret_insn[MAX_CALL_FRAMES]; + int ret_prog[MAX_CALL_FRAMES]; - if (total > MAX_BPF_STACK) { +process_func: + /* round up to 32-bytes, since this is granularity + * of interpreter stack size + */ + depth += round_up(max_t(u32, env->subprog_stack_depth[subprog], 1), 32); + if (depth > MAX_BPF_STACK) { verbose(env, "combined stack size of %d calls is %d. Too large\n", - cur->curframe, total); + frame + 1, depth); return -EACCES; } - return 0; +continue_func: + if (env->subprog_cnt == subprog) + subprog_end = insn_cnt; + else + subprog_end = env->subprog_starts[subprog]; + for (; i < subprog_end; i++) { + if (insn[i].code != (BPF_JMP | BPF_CALL)) + continue; + if (insn[i].src_reg != BPF_PSEUDO_CALL) + continue; + /* remember insn and function to return to */ + ret_insn[frame] = i + 1; + ret_prog[frame] = subprog; + + /* find the callee */ + i = i + insn[i].imm + 1; + subprog = find_subprog(env, i); + if (subprog < 0) { + WARN_ONCE(1, "verifier bug. No program starts at insn %d\n", + i); + return -EFAULT; + } + subprog++; + frame++; + if (frame >= MAX_CALL_FRAMES) { + WARN_ONCE(1, "verifier bug. Call stack is too deep\n"); + return -EFAULT; + } + goto process_func; + } + /* end of for() loop means the last insn of the 'subprog' + * was reached. 
Doesn't matter whether it was JA or EXIT + */ + if (frame == 0) + return 0; + depth -= round_up(max_t(u32, env->subprog_stack_depth[subprog], 1), 32); + frame--; + i = ret_insn[frame]; + subprog = ret_prog[frame]; + goto continue_func; } static int get_callee_stack_depth(struct bpf_verifier_env *env, @@ -5403,6 +5450,9 @@ skip_full_check: if (ret == 0) sanitize_dead_code(env); + if (ret == 0) + ret = check_max_stack_depth(env); + if (ret == 0) /* program is valid, convert *(u32*)(ctx + off) accesses */ ret = convert_ctx_accesses(env); -- cgit v1.2.3 From aada9ce644e53410954daa6beb1f7c4ca158abd7 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 25 Dec 2017 13:15:42 -0800 Subject: bpf: fix max call depth check fix off by one error in max call depth check and add a test Fixes: f4d7e40a5b71 ("bpf: introduce function calls (verification)") Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- kernel/bpf/verifier.c | 4 ++-- tools/testing/selftests/bpf/test_verifier.c | 35 +++++++++++++++++++++++++++++ 2 files changed, 37 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 738e919efdf0..52ad60b3b8be 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2126,9 +2126,9 @@ static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn, struct bpf_func_state *caller, *callee; int i, subprog, target_insn; - if (state->curframe >= MAX_CALL_FRAMES) { + if (state->curframe + 1 >= MAX_CALL_FRAMES) { verbose(env, "the call stack of %d frames is too deep\n", - state->curframe); + state->curframe + 2); return -E2BIG; } diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c index b5a7a6c530dc..5d0a574ce270 100644 --- a/tools/testing/selftests/bpf/test_verifier.c +++ b/tools/testing/selftests/bpf/test_verifier.c @@ -8884,6 +8884,41 @@ static struct bpf_test tests[] = { .result = REJECT, .errstr = "combined stack", }, + { + "calls: stack depth check using three frames. test5", + .insns = { + /* main */ + BPF_RAW_INSN(BPF_JMP|BPF_CALL, 0, 1, 0, 1), /* call A */ + BPF_EXIT_INSN(), + /* A */ + BPF_RAW_INSN(BPF_JMP|BPF_CALL, 0, 1, 0, 1), /* call B */ + BPF_EXIT_INSN(), + /* B */ + BPF_RAW_INSN(BPF_JMP|BPF_CALL, 0, 1, 0, 1), /* call C */ + BPF_EXIT_INSN(), + /* C */ + BPF_RAW_INSN(BPF_JMP|BPF_CALL, 0, 1, 0, 1), /* call D */ + BPF_EXIT_INSN(), + /* D */ + BPF_RAW_INSN(BPF_JMP|BPF_CALL, 0, 1, 0, 1), /* call E */ + BPF_EXIT_INSN(), + /* E */ + BPF_RAW_INSN(BPF_JMP|BPF_CALL, 0, 1, 0, 1), /* call F */ + BPF_EXIT_INSN(), + /* F */ + BPF_RAW_INSN(BPF_JMP|BPF_CALL, 0, 1, 0, 1), /* call G */ + BPF_EXIT_INSN(), + /* G */ + BPF_RAW_INSN(BPF_JMP|BPF_CALL, 0, 1, 0, 1), /* call H */ + BPF_EXIT_INSN(), + /* H */ + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_XDP, + .errstr = "call stack", + .result = REJECT, + }, { "calls: spill into caller stack frame", .insns = { -- cgit v1.2.3 From e0d3974ac77b0d581b92affe5851fc40ad2f42a4 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 27 Dec 2017 18:39:03 -0800 Subject: bpf: offload: don't require rtnl for dev list manipulation We don't need the RTNL lock for all operations on offload state. We only need to hold it around ndo calls. The device offload initialization doesn't require it. The soon-to-come querying of the offload info will only need it partially. We will also be able to remove the waitqueue in following patches. 
Use struct rw_semaphore because map offload will require sleeping with the semaphore held for read. Suggested-by: Kirill Tkhai Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet Signed-off-by: Daniel Borkmann --- kernel/bpf/offload.c | 34 ++++++++++++++++++++++++---------- 1 file changed, 24 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c index 8455b89d1bbf..032079754d88 100644 --- a/kernel/bpf/offload.c +++ b/kernel/bpf/offload.c @@ -20,13 +20,16 @@ #include <linux/netdevice.h> #include <linux/printk.h> #include <linux/rtnetlink.h> +#include <linux/rwsem.h> -/* protected by RTNL */ +/* Protects bpf_prog_offload_devs and offload members of all progs. + * RTNL lock cannot be taken when holding this lock. + */ +static DECLARE_RWSEM(bpf_devs_lock); static LIST_HEAD(bpf_prog_offload_devs); int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr) { - struct net *net = current->nsproxy->net_ns; struct bpf_dev_offload *offload; if (attr->prog_type != BPF_PROG_TYPE_SCHED_CLS && @@ -43,19 +46,26 @@ int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr) offload->prog = prog; init_waitqueue_head(&offload->verifier_done); - rtnl_lock(); - offload->netdev = __dev_get_by_index(net, attr->prog_ifindex); - if (!offload->netdev) { - rtnl_unlock(); - kfree(offload); - return -EINVAL; - } + offload->netdev = dev_get_by_index(current->nsproxy->net_ns, + attr->prog_ifindex); + if (!offload->netdev) + goto err_free; + down_write(&bpf_devs_lock); + if (offload->netdev->reg_state != NETREG_REGISTERED) + goto err_unlock; prog->aux->offload = offload; list_add_tail(&offload->offloads, &bpf_prog_offload_devs); + dev_put(offload->netdev); + up_write(&bpf_devs_lock); return 0; +err_unlock: + up_write(&bpf_devs_lock); + dev_put(offload->netdev); +err_free: + kfree(offload); + return -EINVAL; } static int __bpf_offload_ndo(struct bpf_prog *prog, enum bpf_netdev_command cmd, @@ -126,7 +136,9 @@ void bpf_prog_offload_destroy(struct bpf_prog *prog) wake_up(&offload->verifier_done); rtnl_lock(); + down_write(&bpf_devs_lock); __bpf_prog_offload_destroy(prog); + up_write(&bpf_devs_lock); rtnl_unlock(); kfree(offload); @@ -181,11 +193,13 @@ static int bpf_offload_notification(struct notifier_block *notifier, if (netdev->reg_state != NETREG_UNREGISTERING) break; + down_write(&bpf_devs_lock); list_for_each_entry_safe(offload, tmp, &bpf_prog_offload_devs, offloads) { if (offload->netdev == netdev) __bpf_prog_offload_destroy(offload->prog); } + up_write(&bpf_devs_lock); break; default: break; -- cgit v1.2.3 From 9a18eedb145d080d542766af1d7513ebfccd1604 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 27 Dec 2017 18:39:04 -0800 Subject: bpf: offload: don't use prog->aux->offload as boolean We currently use aux->offload to indicate that a program is bound to a specific device. This forces us to keep the offload structure around even after the device is gone. Add a bool member to struct bpf_prog_aux to indicate if offload was requested.
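The idea in short, condensed from the hunks below: the question "was this program loaded for a device?" must stay answerable even after the device, and with it the offload structure, is gone:

    /* at load time, set once and never cleared */
    prog->aux->offload_requested = !!attr->prog_ifindex;

    static inline bool bpf_prog_is_dev_bound(struct bpf_prog_aux *aux)
    {
            return aux->offload_requested; /* not: aux->offload != NULL */
    }

An orphaned program thus still reports as device-bound while aux->offload itself can be torn down as soon as the netdev disappears.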
Suggested-by: Alexei Starovoitov Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet Acked-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- include/linux/bpf.h | 3 ++- kernel/bpf/syscall.c | 4 +++- 2 files changed, 5 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index da54ef644fcd..838eee10e979 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -201,6 +201,7 @@ struct bpf_prog_aux { u32 stack_depth; u32 id; u32 func_cnt; + bool offload_requested; struct bpf_prog **func; void *jit_data; /* JIT specific data. arch dependent */ struct latch_tree_node ksym_tnode; @@ -529,7 +530,7 @@ int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr); static inline bool bpf_prog_is_dev_bound(struct bpf_prog_aux *aux) { - return aux->offload; + return aux->offload_requested; } #else static inline int bpf_prog_offload_init(struct bpf_prog *prog, diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 007802c5ca7d..e0afc2e39fd5 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1157,6 +1157,8 @@ static int bpf_prog_load(union bpf_attr *attr) if (!prog) return -ENOMEM; + prog->aux->offload_requested = !!attr->prog_ifindex; + err = security_bpf_prog_alloc(prog->aux); if (err) goto free_prog_nouncharge; @@ -1178,7 +1180,7 @@ static int bpf_prog_load(union bpf_attr *attr) atomic_set(&prog->aux->refcnt, 1); prog->gpl_compatible = is_gpl ? 1 : 0; - if (attr->prog_ifindex) { + if (bpf_prog_is_dev_bound(prog->aux)) { err = bpf_prog_offload_init(prog, attr); if (err) goto free_prog; -- cgit v1.2.3 From cae1927c0b4a93ae15de824faca1f6f611a44fcd Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 27 Dec 2017 18:39:05 -0800 Subject: bpf: offload: allow netdev to disappear while verifier is running To allow verifier instruction callbacks without any extra locking NETDEV_UNREGISTER notification would wait on a waitqueue for verifier to finish. This design decision was made when rtnl lock was providing all the locking. Use the read/write lock instead and remove the workqueue. Verifier will now call into the offload code, so dev_ops are moved to offload structure. Since verifier calls are all under bpf_prog_is_dev_bound() we no longer need static inline implementations to please builds with CONFIG_NET=n. 
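The per-instruction path then reduces to the following pattern (condensed from the offload.c hunk below):

    int bpf_prog_offload_verify_insn(struct bpf_verifier_env *env,
                                     int insn_idx, int prev_insn_idx)
    {
            struct bpf_dev_offload *offload;
            int ret = -ENODEV;

            down_read(&bpf_devs_lock);
            offload = env->prog->aux->offload;
            if (offload->netdev) /* NULLed under the write lock on unregister */
                    ret = offload->dev_ops->insn_hook(env, insn_idx,
                                                      prev_insn_idx);
            up_read(&bpf_devs_lock);

            return ret;
    }

NETDEV_UNREGISTER takes bpf_devs_lock for write, so it simply waits out any in-flight callback; no verifier_running flag or verifier_done waitqueue handshake is needed anymore.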
Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet Acked-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- drivers/net/ethernet/netronome/nfp/bpf/main.h | 2 +- drivers/net/ethernet/netronome/nfp/bpf/verifier.c | 2 +- drivers/net/netdevsim/bpf.c | 2 +- include/linux/bpf.h | 9 +++++-- include/linux/bpf_verifier.h | 16 ++---------- include/linux/netdevice.h | 4 +-- kernel/bpf/offload.c | 30 ++++++++++++----------- kernel/bpf/verifier.c | 20 ++++++--------- 8 files changed, 37 insertions(+), 48 deletions(-) (limited to 'kernel') diff --git a/drivers/net/ethernet/netronome/nfp/bpf/main.h b/drivers/net/ethernet/netronome/nfp/bpf/main.h index aae1be9ed056..89a9b6393882 100644 --- a/drivers/net/ethernet/netronome/nfp/bpf/main.h +++ b/drivers/net/ethernet/netronome/nfp/bpf/main.h @@ -238,7 +238,7 @@ struct nfp_bpf_vnic { int nfp_bpf_jit(struct nfp_prog *prog); -extern const struct bpf_ext_analyzer_ops nfp_bpf_analyzer_ops; +extern const struct bpf_prog_offload_ops nfp_bpf_analyzer_ops; struct netdev_bpf; struct nfp_app; diff --git a/drivers/net/ethernet/netronome/nfp/bpf/verifier.c b/drivers/net/ethernet/netronome/nfp/bpf/verifier.c index 9c2608445bd8..d8870c2f11f3 100644 --- a/drivers/net/ethernet/netronome/nfp/bpf/verifier.c +++ b/drivers/net/ethernet/netronome/nfp/bpf/verifier.c @@ -260,6 +260,6 @@ nfp_verify_insn(struct bpf_verifier_env *env, int insn_idx, int prev_insn_idx) return 0; } -const struct bpf_ext_analyzer_ops nfp_bpf_analyzer_ops = { +const struct bpf_prog_offload_ops nfp_bpf_analyzer_ops = { .insn_hook = nfp_verify_insn, }; diff --git a/drivers/net/netdevsim/bpf.c b/drivers/net/netdevsim/bpf.c index a243fa7ae02f..5134d5c1306c 100644 --- a/drivers/net/netdevsim/bpf.c +++ b/drivers/net/netdevsim/bpf.c @@ -66,7 +66,7 @@ nsim_bpf_verify_insn(struct bpf_verifier_env *env, int insn_idx, int prev_insn) return 0; } -static const struct bpf_ext_analyzer_ops nsim_bpf_analyzer_ops = { +static const struct bpf_prog_offload_ops nsim_bpf_analyzer_ops = { .insn_hook = nsim_bpf_verify_insn, }; diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 838eee10e979..669549f7e3e8 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -17,6 +17,7 @@ #include <linux/numa.h> #include <linux/wait.h> +struct bpf_verifier_env; struct perf_event; struct bpf_prog; struct bpf_map; @@ -184,14 +185,18 @@ struct bpf_verifier_ops { struct bpf_prog *prog, u32 *target_size); }; +struct bpf_prog_offload_ops { + int (*insn_hook)(struct bpf_verifier_env *env, + int insn_idx, int prev_insn_idx); +}; + struct bpf_dev_offload { struct bpf_prog *prog; struct net_device *netdev; void *dev_priv; struct list_head offloads; bool dev_state; - bool verifier_running; - wait_queue_head_t verifier_done; + const struct bpf_prog_offload_ops *dev_ops; }; struct bpf_prog_aux { diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 883a35d50cd5..2feb218c001d 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -166,12 +166,6 @@ static inline bool bpf_verifier_log_full(const struct bpf_verifer_log *log) return log->len_used >= log->len_total - 1; } -struct bpf_verifier_env; -struct bpf_ext_analyzer_ops { - int (*insn_hook)(struct bpf_verifier_env *env, - int insn_idx, int prev_insn_idx); -}; - #define BPF_MAX_SUBPROGS 256 /* single container for all structs @@ -185,7 +179,6 @@ struct bpf_verifier_env { bool strict_alignment; /* perform strict pointer alignment checks */ struct bpf_verifier_state *cur_state; /* current verifier state */ struct bpf_verifier_state_list **explored_states; /*
search pruning optimization */ - const struct bpf_ext_analyzer_ops *dev_ops; /* device analyzer ops */ struct bpf_map *used_maps[MAX_USED_MAPS]; /* array of map's used by eBPF program */ u32 used_map_cnt; /* number of used maps */ u32 id_gen; /* used to generate unique reg IDs */ @@ -206,13 +199,8 @@ static inline struct bpf_reg_state *cur_regs(struct bpf_verifier_env *env) return cur->frame[cur->curframe]->regs; } -#if defined(CONFIG_NET) && defined(CONFIG_BPF_SYSCALL) int bpf_prog_offload_verifier_prep(struct bpf_verifier_env *env); -#else -static inline int bpf_prog_offload_verifier_prep(struct bpf_verifier_env *env) -{ - return -EOPNOTSUPP; -} -#endif +int bpf_prog_offload_verify_insn(struct bpf_verifier_env *env, + int insn_idx, int prev_insn_idx); #endif /* _LINUX_BPF_VERIFIER_H */ diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 352066e4eeef..49bfc6eec74c 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -804,7 +804,7 @@ enum bpf_netdev_command { BPF_OFFLOAD_DESTROY, }; -struct bpf_ext_analyzer_ops; +struct bpf_prog_offload_ops; struct netlink_ext_ack; struct netdev_bpf { @@ -826,7 +826,7 @@ struct netdev_bpf { /* BPF_OFFLOAD_VERIFIER_PREP */ struct { struct bpf_prog *prog; - const struct bpf_ext_analyzer_ops *ops; /* callee set */ + const struct bpf_prog_offload_ops *ops; /* callee set */ } verifier; /* BPF_OFFLOAD_TRANSLATE, BPF_OFFLOAD_DESTROY */ struct { diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c index 032079754d88..69ddc3899bab 100644 --- a/kernel/bpf/offload.c +++ b/kernel/bpf/offload.c @@ -44,7 +44,6 @@ int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr) return -ENOMEM; offload->prog = prog; - init_waitqueue_head(&offload->verifier_done); offload->netdev = dev_get_by_index(current->nsproxy->net_ns, attr->prog_ifindex); @@ -97,15 +96,28 @@ int bpf_prog_offload_verifier_prep(struct bpf_verifier_env *env) if (err) goto exit_unlock; - env->dev_ops = data.verifier.ops; - + env->prog->aux->offload->dev_ops = data.verifier.ops; env->prog->aux->offload->dev_state = true; - env->prog->aux->offload->verifier_running = true; exit_unlock: rtnl_unlock(); return err; } +int bpf_prog_offload_verify_insn(struct bpf_verifier_env *env, + int insn_idx, int prev_insn_idx) +{ + struct bpf_dev_offload *offload; + int ret = -ENODEV; + + down_read(&bpf_devs_lock); + offload = env->prog->aux->offload; + if (offload->netdev) + ret = offload->dev_ops->insn_hook(env, insn_idx, prev_insn_idx); + up_read(&bpf_devs_lock); + + return ret; +} + static void __bpf_prog_offload_destroy(struct bpf_prog *prog) { struct bpf_dev_offload *offload = prog->aux->offload; @@ -117,9 +129,6 @@ static void __bpf_prog_offload_destroy(struct bpf_prog *prog) data.offload.prog = prog; - if (offload->verifier_running) - wait_event(offload->verifier_done, !offload->verifier_running); - if (offload->dev_state) WARN_ON(__bpf_offload_ndo(prog, BPF_OFFLOAD_DESTROY, &data)); @@ -132,9 +141,6 @@ void bpf_prog_offload_destroy(struct bpf_prog *prog) { struct bpf_dev_offload *offload = prog->aux->offload; - offload->verifier_running = false; - wake_up(&offload->verifier_done); - rtnl_lock(); down_write(&bpf_devs_lock); __bpf_prog_offload_destroy(prog); @@ -146,15 +152,11 @@ void bpf_prog_offload_destroy(struct bpf_prog *prog) static int bpf_prog_offload_translate(struct bpf_prog *prog) { - struct bpf_dev_offload *offload = prog->aux->offload; struct netdev_bpf data = {}; int ret; data.offload.prog = prog; - offload->verifier_running = false; - 
wake_up(&offload->verifier_done); - rtnl_lock(); ret = __bpf_offload_ndo(prog, BPF_OFFLOAD_TRANSLATE, &data); rtnl_unlock(); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 98d8637cf70d..a2b211262c25 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -4438,15 +4438,6 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) return 0; } -static int ext_analyzer_insn_hook(struct bpf_verifier_env *env, - int insn_idx, int prev_insn_idx) -{ - if (env->dev_ops && env->dev_ops->insn_hook) - return env->dev_ops->insn_hook(env, insn_idx, prev_insn_idx); - - return 0; -} - static int do_check(struct bpf_verifier_env *env) { struct bpf_verifier_state *state; @@ -4531,9 +4522,12 @@ static int do_check(struct bpf_verifier_env *env) print_bpf_insn(&cbs, env, insn, env->allow_ptr_leaks); } - err = ext_analyzer_insn_hook(env, insn_idx, prev_insn_idx); - if (err) - return err; + if (bpf_prog_is_dev_bound(env->prog->aux)) { + err = bpf_prog_offload_verify_insn(env, insn_idx, + prev_insn_idx); + if (err) + return err; + } regs = cur_regs(env); env->insn_aux_data[insn_idx].seen = true; @@ -5463,7 +5457,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr) if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)) env->strict_alignment = true; - if (env->prog->aux->offload) { + if (bpf_prog_is_dev_bound(env->prog->aux)) { ret = bpf_prog_offload_verifier_prep(env); if (ret) goto err_unlock; -- cgit v1.2.3 From ce3b9db4db0e0e2b9761c56d08615ea0159e4a1b Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 27 Dec 2017 18:39:06 -0800 Subject: bpf: offload: free prog->aux->offload when device disappears All bpf offload operations should now be under bpf_devs_lock, it's safe to free and clear the entire offload structure, not only the netdev pointer. __bpf_prog_offload_destroy() will no longer be called multiple times. Suggested-by: Alexei Starovoitov Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet Acked-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- kernel/bpf/offload.c | 23 +++++++++-------------- 1 file changed, 9 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c index 69ddc3899bab..3126e1a842e6 100644 --- a/kernel/bpf/offload.c +++ b/kernel/bpf/offload.c @@ -70,12 +70,14 @@ err_free: static int __bpf_offload_ndo(struct bpf_prog *prog, enum bpf_netdev_command cmd, struct netdev_bpf *data) { - struct net_device *netdev = prog->aux->offload->netdev; + struct bpf_dev_offload *offload = prog->aux->offload; + struct net_device *netdev; ASSERT_RTNL(); - if (!netdev) + if (!offload) return -ENODEV; + netdev = offload->netdev; if (!netdev->netdev_ops->ndo_bpf) return -EOPNOTSUPP; @@ -111,7 +113,7 @@ int bpf_prog_offload_verify_insn(struct bpf_verifier_env *env, down_read(&bpf_devs_lock); offload = env->prog->aux->offload; - if (offload->netdev) + if (offload) ret = offload->dev_ops->insn_hook(env, insn_idx, prev_insn_idx); up_read(&bpf_devs_lock); @@ -123,31 +125,24 @@ static void __bpf_prog_offload_destroy(struct bpf_prog *prog) struct bpf_dev_offload *offload = prog->aux->offload; struct netdev_bpf data = {}; - /* Caution - if netdev is destroyed before the program, this function - * will be called twice. 
- */ - data.offload.prog = prog; if (offload->dev_state) WARN_ON(__bpf_offload_ndo(prog, BPF_OFFLOAD_DESTROY, &data)); - offload->dev_state = false; list_del_init(&offload->offloads); - offload->netdev = NULL; + kfree(offload); + prog->aux->offload = NULL; } void bpf_prog_offload_destroy(struct bpf_prog *prog) { - struct bpf_dev_offload *offload = prog->aux->offload; - rtnl_lock(); down_write(&bpf_devs_lock); - __bpf_prog_offload_destroy(prog); + if (prog->aux->offload) + __bpf_prog_offload_destroy(prog); up_write(&bpf_devs_lock); rtnl_unlock(); - - kfree(offload); } static int bpf_prog_offload_translate(struct bpf_prog *prog) -- cgit v1.2.3 From ad8ad79f4f6078f456792f7f8d344da2be9bc74f Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 27 Dec 2017 18:39:07 -0800 Subject: bpf: offload: free program id when device disappears Bound programs are quite useless after their device disappears. They are simply waiting for reference count to go to zero, don't list them in BPF_PROG_GET_NEXT_ID by freeing their ID early. Note that orphaned offload programs will return -ENODEV on BPF_OBJ_GET_INFO_BY_FD so user will never see ID 0. Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet Acked-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- include/linux/bpf.h | 2 ++ kernel/bpf/offload.c | 3 +++ kernel/bpf/syscall.c | 9 +++++++-- 3 files changed, 12 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 669549f7e3e8..9a916ab34299 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -357,6 +357,8 @@ void bpf_prog_put(struct bpf_prog *prog); int __bpf_prog_charge(struct user_struct *user, u32 pages); void __bpf_prog_uncharge(struct user_struct *user, u32 pages); +void bpf_prog_free_id(struct bpf_prog *prog, bool do_idr_lock); + struct bpf_map *bpf_map_get_with_uref(u32 ufd); struct bpf_map *__bpf_map_get(struct fd f); struct bpf_map * __must_check bpf_map_inc(struct bpf_map *map, bool uref); diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c index 3126e1a842e6..e4f1668a021c 100644 --- a/kernel/bpf/offload.c +++ b/kernel/bpf/offload.c @@ -130,6 +130,9 @@ static void __bpf_prog_offload_destroy(struct bpf_prog *prog) if (offload->dev_state) WARN_ON(__bpf_offload_ndo(prog, BPF_OFFLOAD_DESTROY, &data)); + /* Make sure BPF_PROG_GET_NEXT_ID can't find this dead program */ + bpf_prog_free_id(prog, true); + list_del_init(&offload->offloads); kfree(offload); prog->aux->offload = NULL; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index e0afc2e39fd5..e02dafa6f402 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -905,9 +905,13 @@ static int bpf_prog_alloc_id(struct bpf_prog *prog) return id > 0 ? 0 : id; } -static void bpf_prog_free_id(struct bpf_prog *prog, bool do_idr_lock) +void bpf_prog_free_id(struct bpf_prog *prog, bool do_idr_lock) { - /* cBPF to eBPF migrations are currently not in the idr store. */ + /* cBPF to eBPF migrations are currently not in the idr store. + * Offloaded programs are removed from the store when their device + * disappears - even if someone grabs an fd to them they are unusable, + * simply waiting for refcnt to drop to be freed. 
+ */ if (!prog->aux->id) return; @@ -917,6 +921,7 @@ static void bpf_prog_free_id(struct bpf_prog *prog, bool do_idr_lock) __acquire(&prog_idr_lock); idr_remove(&prog_idr, prog->aux->id); + prog->aux->id = 0; if (do_idr_lock) spin_unlock_bh(&prog_idr_lock); -- cgit v1.2.3 From 675fc275a3a2d905535207237402c6d8dcb5fa4b Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 27 Dec 2017 18:39:09 -0800 Subject: bpf: offload: report device information for offloaded programs Report to the user ifindex and namespace information of offloaded programs. If device has disappeared return -ENODEV. Specify the namespace using dev/inode combination. CC: Eric W. Biederman Signed-off-by: Jakub Kicinski Signed-off-by: Daniel Borkmann --- include/linux/bpf.h | 2 ++ include/uapi/linux/bpf.h | 3 +++ kernel/bpf/offload.c | 59 ++++++++++++++++++++++++++++++++++++++++++ kernel/bpf/syscall.c | 6 +++++ tools/include/uapi/linux/bpf.h | 3 +++ 5 files changed, 73 insertions(+) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 9a916ab34299..7810ae57b357 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -531,6 +531,8 @@ static inline struct bpf_prog *bpf_prog_get_type(u32 ufd, int bpf_prog_offload_compile(struct bpf_prog *prog); void bpf_prog_offload_destroy(struct bpf_prog *prog); +int bpf_prog_offload_info_fill(struct bpf_prog_info *info, + struct bpf_prog *prog); #if defined(CONFIG_NET) && defined(CONFIG_BPF_SYSCALL) int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 69eabfcb9bdb..f2f8b36e2ad4 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -921,6 +921,9 @@ struct bpf_prog_info { __u32 nr_map_ids; __aligned_u64 map_ids; char name[BPF_OBJ_NAME_LEN]; + __u32 ifindex; + __u64 netns_dev; + __u64 netns_ino; } __attribute__((aligned(8))); struct bpf_map_info { diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c index e4f1668a021c..040d4e0edf3f 100644 --- a/kernel/bpf/offload.c +++ b/kernel/bpf/offload.c @@ -16,9 +16,11 @@ #include #include #include +#include #include #include #include +#include #include #include @@ -176,6 +178,63 @@ int bpf_prog_offload_compile(struct bpf_prog *prog) return bpf_prog_offload_translate(prog); } +struct ns_get_path_bpf_prog_args { + struct bpf_prog *prog; + struct bpf_prog_info *info; +}; + +static struct ns_common *bpf_prog_offload_info_fill_ns(void *private_data) +{ + struct ns_get_path_bpf_prog_args *args = private_data; + struct bpf_prog_aux *aux = args->prog->aux; + struct ns_common *ns; + struct net *net; + + rtnl_lock(); + down_read(&bpf_devs_lock); + + if (aux->offload) { + args->info->ifindex = aux->offload->netdev->ifindex; + net = dev_net(aux->offload->netdev); + get_net(net); + ns = &net->ns; + } else { + args->info->ifindex = 0; + ns = NULL; + } + + up_read(&bpf_devs_lock); + rtnl_unlock(); + + return ns; +} + +int bpf_prog_offload_info_fill(struct bpf_prog_info *info, + struct bpf_prog *prog) +{ + struct ns_get_path_bpf_prog_args args = { + .prog = prog, + .info = info, + }; + struct inode *ns_inode; + struct path ns_path; + void *res; + + res = ns_get_path_cb(&ns_path, bpf_prog_offload_info_fill_ns, &args); + if (IS_ERR(res)) { + if (!info->ifindex) + return -ENODEV; + return PTR_ERR(res); + } + + ns_inode = ns_path.dentry->d_inode; + info->netns_dev = new_encode_dev(ns_inode->i_sb->s_dev); + info->netns_ino = ns_inode->i_ino; + path_put(&ns_path); + + return 0; +} + const struct bpf_prog_ops bpf_offload_prog_ops 
= { }; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index e02dafa6f402..ebf0fb23e237 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1707,6 +1707,12 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, return -EFAULT; } + if (bpf_prog_is_dev_bound(prog->aux)) { + err = bpf_prog_offload_info_fill(&info, prog); + if (err) + return err; + } + done: if (copy_to_user(uinfo, &info, info_len) || put_user(info_len, &uattr->info.info_len)) diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index db1b0923a308..4e8c60acfa32 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -921,6 +921,9 @@ struct bpf_prog_info { __u32 nr_map_ids; __aligned_u64 map_ids; char name[BPF_OBJ_NAME_LEN]; + __u32 ifindex; + __u64 netns_dev; + __u64 netns_ino; } __attribute__((aligned(8))); struct bpf_map_info { -- cgit v1.2.3 From 82975c46da8275a2a212cd94049bbef9bb961da2 Mon Sep 17 00:00:00 2001 From: Suzuki K Poulose Date: Tue, 2 Jan 2018 11:25:26 +0000 Subject: perf: Export perf_event_update_userpage Export perf_event_update_userpage() so that PMU drivers that use it can be built as modules. Acked-by: Peter Zijlstra Signed-off-by: Suzuki K Poulose Signed-off-by: Will Deacon --- kernel/events/core.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 4df5b695bf0d..64ec2c9d2c44 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -4904,6 +4904,7 @@ void perf_event_update_userpage(struct perf_event *event) unlock: rcu_read_unlock(); } +EXPORT_SYMBOL_GPL(perf_event_update_userpage); static int perf_mmap_fault(struct vm_fault *vmf) { -- cgit v1.2.3 From 0b44bf9a6f5cde099ae21b4aa94553484203769a Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Thu, 17 Aug 2017 15:45:38 -0500 Subject: signal: Simplify and fix kdb_send_sig - Rename from kdb_send_sig_info to kdb_send_sig, as there is no meaningful siginfo sent - Use SEND_SIG_PRIV instead of generating a siginfo for a kdb signal. The generated siginfo had a bogus rationale and was not correct in the face of pid namespaces. SEND_SIG_PRIV is simpler and actually correct. - As the code grabs siglock, just send the signal with siglock held instead of dropping siglock and attempting to grab it again. - Move the sig_valid test into kdb_kill where it can generate a good error message. Signed-off-by: Eric W.
Biederman --- kernel/debug/kdb/kdb_main.c | 10 ++-------- kernel/debug/kdb/kdb_private.h | 2 +- kernel/signal.c | 14 +++++++------- 3 files changed, 10 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index c8146d53ca67..dbb0781a0533 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -2441,7 +2441,6 @@ static int kdb_kill(int argc, const char **argv) long sig, pid; char *endp; struct task_struct *p; - struct siginfo info; if (argc != 2) return KDB_ARGCOUNT; @@ -2449,7 +2448,7 @@ static int kdb_kill(int argc, const char **argv) sig = simple_strtol(argv[1], &endp, 0); if (*endp) return KDB_BADINT; - if (sig >= 0) { + if ((sig >= 0) || !valid_signal(-sig)) { kdb_printf("Invalid signal parameter.<-signal>\n"); return 0; } @@ -2470,12 +2469,7 @@ static int kdb_kill(int argc, const char **argv) return 0; } p = p->group_leader; - info.si_signo = sig; - info.si_errno = 0; - info.si_code = SI_USER; - info.si_pid = pid; /* same capabilities as process being signalled */ - info.si_uid = 0; /* kdb has root authority */ - kdb_send_sig_info(p, &info); + kdb_send_sig(p, sig); return 0; } diff --git a/kernel/debug/kdb/kdb_private.h b/kernel/debug/kdb/kdb_private.h index fc224fbcf954..1e5a502ba4a7 100644 --- a/kernel/debug/kdb/kdb_private.h +++ b/kernel/debug/kdb/kdb_private.h @@ -208,7 +208,7 @@ extern unsigned long kdb_task_state(const struct task_struct *p, extern void kdb_ps_suppressed(void); extern void kdb_ps1(const struct task_struct *p); extern void kdb_print_nameval(const char *name, unsigned long val); -extern void kdb_send_sig_info(struct task_struct *p, struct siginfo *info); +extern void kdb_send_sig(struct task_struct *p, int sig); extern void kdb_meminfo_proc_show(void); extern char *kdb_getstr(char *, size_t, const char *); extern void kdb_gdb_state_pass(char *buf); diff --git a/kernel/signal.c b/kernel/signal.c index 9558664bd9ec..c894162ec96c 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -3684,26 +3684,25 @@ void __init signals_init(void) #ifdef CONFIG_KGDB_KDB #include /* - * kdb_send_sig_info - Allows kdb to send signals without exposing + * kdb_send_sig - Allows kdb to send signals without exposing * signal internals. This function checks if the required locks are * available before calling the main signal code, to avoid kdb * deadlocks. */ -void -kdb_send_sig_info(struct task_struct *t, struct siginfo *info) +void kdb_send_sig(struct task_struct *t, int sig) { static struct task_struct *kdb_prev_t; - int sig, new_t; + int new_t, ret; if (!spin_trylock(&t->sighand->siglock)) { kdb_printf("Can't do kill command now.\n" "The sigmask lock is held somewhere else in " "kernel, try again later\n"); return; } - spin_unlock(&t->sighand->siglock); new_t = kdb_prev_t != t; kdb_prev_t = t; if (t->state != TASK_RUNNING && new_t) { + spin_unlock(&t->sighand->siglock); kdb_printf("Process is not RUNNING, sending a signal from " "kdb risks deadlock\n" "on the run queue locks. 
" @@ -3712,8 +3711,9 @@ kdb_send_sig_info(struct task_struct *t, struct siginfo *info) "the deadlock.\n"); return; } - sig = info->si_signo; - if (send_sig_info(sig, info, t)) + ret = send_signal(sig, SEND_SIG_PRIV, t, false); + spin_unlock(&t->sighand->siglock); + if (ret) kdb_printf("Fail to deliver Signal %d to process %d.\n", sig, t->pid); else -- cgit v1.2.3 From cca10d58d25d271f05e1115132b4c2d913bb652e Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Thu, 21 Dec 2017 14:41:49 +0900 Subject: printk: add console_msg_format command line option 0day and kernelCI automatically parse kernel log - basically some sort of grepping using the pre-defined text patterns - in order to detect and report regressions/errors. There are several sources they get the kernel logs from: a) dmesg or /proc/ksmg This is the preferred way. Because `dmesg --raw' (see later Note) and /proc/kmsg output contains facility and log level, which greatly simplifies grepping for EMERG/ALERT/CRIT/ERR messages. b) serial consoles This option is harder to maintain, because serial console messages don't contain facility and log level. This patch introduces a `console_msg_format=' command line option, to switch between different message formatting on serial consoles. For the time being we have just two options - default and syslog. The "default" option just keeps the existing format. While the "syslog" option makes serial console messages to appear in syslog format [syslog() syscall], matching the `dmesg -S --raw' and `cat /proc/kmsg' output formats: - facility and log level - time stamp (depends on printk_time/PRINTK_TIME) - message <%u>[time stamp] text\n NOTE: while Kevin and Fengguang talk about "dmesg --raw", it's actually "dmesg -S --raw" that always prints messages in syslog format [per Petr Mladek]. Running "dmesg --raw" may produce output in non-syslog format sometimes. console_msg_format=syslog enables syslog format, thus in documentation we mention "dmesg -S --raw", not "dmesg --raw". Per Kevin Hilman: : Right now we can get this info from a "dmesg --raw" after bootup, : but it would be really nice in certain automation frameworks to : have a kernel command-line option to enable printing of loglevels : in default boot log. : : This is especially useful when ingesting kernel logs into advanced : search/analytics frameworks (I'm playing with and ELK stack: Elastic : Search, Logstash, Kibana). : : The other important reason for having this on the command line is that : for testing linux-next (and other bleeding edge developer branches), : it's common that we never make it to userspace, so can't even run : "dmesg --raw" (or equivalent.) So we really want this on the primary : boot (serial) console. 
Per Fengguang Wu, 0day scripts should quickly benefit from that feature, because they will be able to switch to a more reliable parsing, based on messages' facility and log levels [1]: `#{grep} -a -E -e '^<[0123]>' -e '^kern :(err |crit |alert |emerg )' instead of doing text pattern matching `#{grep} -a -F -f /lkp/printk-error-messages #{kmsg_file} | grep -a -v -E -f #{LKP_SRC}/etc/oops-pattern | grep -a -v -F -f #{LKP_SRC}/etc/kmsg-blacklist` [1] https://github.com/fengguang/lkp-tests/blob/master/lib/dmesg.rb Link: http://lkml.kernel.org/r/20171221054149.4398-1-sergey.senozhatsky@gmail.com To: Steven Rostedt Cc: Linus Torvalds Cc: Fengguang Wu Cc: Kevin Hilman Cc: Mark Brown Cc: Greg Kroah-Hartman Cc: Andrew Morton Cc: LKML Signed-off-by: Sergey Senozhatsky Reviewed-by: Fengguang Wu Reviewed-by: Kevin Hilman Tested-by: Kevin Hilman Signed-off-by: Petr Mladek --- Documentation/admin-guide/kernel-parameters.txt | 14 ++++++++++++++ kernel/printk/printk.c | 23 ++++++++++++++++++++++- 2 files changed, 36 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 62436bd5f34a..af614995b71d 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -640,6 +640,20 @@ console=brl,ttyS0 For now, only VisioBraille is supported. + console_msg_format= + [KNL] Change console messages format + default + By default we print messages on consoles in + "[time stamp] text\n" format (time stamp may not be + printed, depending on CONFIG_PRINTK_TIME or + `printk_time' param). + syslog + Switch to syslog format: "<%u>[time stamp] text\n" + IOW, each message will have a facility and loglevel + prefix. The format is similar to one used by syslog() + syscall, or to executing "dmesg -S --raw" or to reading + from /proc/kmsg. + consoleblank= [KNL] The console blank (screen saver) timeout in seconds. A value of 0 disables the blank timer. Defaults to 0. diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 5d81206a572d..568729e0dc2c 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -277,6 +277,13 @@ EXPORT_SYMBOL(console_set_on_cmdline); /* Flag: console code may call schedule() */ static int console_may_schedule; +enum con_msg_format_flags { + MSG_FORMAT_DEFAULT = 0, + MSG_FORMAT_SYSLOG = (1 << 0), +}; + +static int console_msg_format = MSG_FORMAT_DEFAULT; + /* * The printk log buffer consists of a chain of concatenated variable * length records. Every record starts with a record header, containing @@ -1913,6 +1920,17 @@ static int __add_preferred_console(char *name, int idx, char *options, c->index = idx; return 0; } + +static int __init console_msg_format_setup(char *str) +{ + if (!strcmp(str, "syslog")) + console_msg_format = MSG_FORMAT_SYSLOG; + if (!strcmp(str, "default")) + console_msg_format = MSG_FORMAT_DEFAULT; + return 1; +} +__setup("console_msg_format=", console_msg_format_setup); + /* * Set up a console. Called via do_early_param() in init/main.c * for each "console=" parameter in the boot command line. 
@@ -2215,7 +2233,10 @@ skip: goto skip; } - len += msg_print_text(msg, false, text + len, sizeof(text) - len); + len += msg_print_text(msg, + console_msg_format & MSG_FORMAT_SYSLOG, + text + len, + sizeof(text) - len); if (nr_ext_console_drivers) { ext_len = msg_print_ext_header(ext_text, sizeof(ext_text), -- cgit v1.2.3 From c20a71a7a3841a6198721a0b4276f3298b38bbcf Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Wed, 3 Jan 2018 17:57:39 -0800 Subject: bpf: sockmap remove unused function This was added for some work that was eventually factored out, but the helper call was missed. Remove it now and add it back later if needed. Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann --- kernel/bpf/sockmap.c | 8 -------- 1 file changed, 8 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index 5ee2e41893d9..3f662ee23a34 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -96,14 +96,6 @@ static inline struct smap_psock *smap_psock_sk(const struct sock *sk) return rcu_dereference_sk_user_data(sk); } -/* compute the linear packet data range [data, data_end) for skb when - * sk_skb type programs are in use. - */ -static inline void bpf_compute_data_end_sk_skb(struct sk_buff *skb) -{ - TCP_SKB_CB(skb)->bpf.data_end = skb->data + skb_headlen(skb); -} - enum __sk_action { __SK_DROP = 0, __SK_PASS, -- cgit v1.2.3 From 5f103c5d4dbadec0f2cacd39b6429e1b8a8cf983 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Wed, 3 Jan 2018 17:57:56 -0800 Subject: bpf: only build sockmap with CONFIG_INET The sockmap infrastructure is only aware of TCP sockets at the moment. In the future we plan to add UDP. In both cases CONFIG_INET should be built-in. So let's only build sockmap if CONFIG_INET is enabled. Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann --- include/linux/bpf.h | 2 +- include/linux/bpf_types.h | 2 +- kernel/bpf/Makefile | 2 ++ 3 files changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 7810ae57b357..9e03046d1df2 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -554,7 +554,7 @@ static inline bool bpf_prog_is_dev_bound(struct bpf_prog_aux *aux) } #endif /* CONFIG_NET && CONFIG_BPF_SYSCALL */ -#if defined(CONFIG_STREAM_PARSER) && defined(CONFIG_BPF_SYSCALL) +#if defined(CONFIG_STREAM_PARSER) && defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_INET) struct sock *__sock_map_lookup_elem(struct bpf_map *map, u32 key); int sock_map_prog(struct bpf_map *map, struct bpf_prog *prog, u32 type); #else diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index 978c1d9c9383..19b8349a3809 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -42,7 +42,7 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_ARRAY_OF_MAPS, array_of_maps_map_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_HASH_OF_MAPS, htab_of_maps_map_ops) #ifdef CONFIG_NET BPF_MAP_TYPE(BPF_MAP_TYPE_DEVMAP, dev_map_ops) -#ifdef CONFIG_STREAM_PARSER +#if defined(CONFIG_STREAM_PARSER) && defined(CONFIG_INET) BPF_MAP_TYPE(BPF_MAP_TYPE_SOCKMAP, sock_map_ops) #endif BPF_MAP_TYPE(BPF_MAP_TYPE_CPUMAP, cpu_map_ops) diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index e691da0b3bab..a713fd23ec88 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -9,9 +9,11 @@ obj-$(CONFIG_BPF_SYSCALL) += devmap.o obj-$(CONFIG_BPF_SYSCALL) += cpumap.o obj-$(CONFIG_BPF_SYSCALL) += offload.o ifeq ($(CONFIG_STREAM_PARSER),y) +ifeq ($(CONFIG_INET),y) obj-$(CONFIG_BPF_SYSCALL) += sockmap.o endif endif +endif ifeq
($(CONFIG_PERF_EVENTS),y) obj-$(CONFIG_BPF_SYSCALL) += stackmap.o endif -- cgit v1.2.3 From 08b21fbf4bc4befbd77cd19422a81149ba8cda00 Mon Sep 17 00:00:00 2001 From: Cheah Kok Cheong Date: Thu, 21 Dec 2017 19:35:30 +0800 Subject: padata: add SPDX identifier Add SPDX license identifier according to the type of license text found in the file. Cc: Philippe Ombredanne Signed-off-by: Cheah Kok Cheong Acked-by: Steffen Klassert Signed-off-by: Herbert Xu --- kernel/padata.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/padata.c b/kernel/padata.c index 57c0074d50cc..d568cc56405f 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0 /* * padata.c - generic interface to process data streams in parallel * -- cgit v1.2.3 From 3b14f08d169400a5a635b299a273db23d2be8e49 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Tue, 12 Dec 2017 16:34:53 +0900 Subject: irq debug: do not use print_symbol() print_symbol() is a very old API that has been obsoleted by %pS format specifier in a normal printk() call. Replace print_symbol() with a direct printk("%pS") call and avoid using continuous lines. Link: http://lkml.kernel.org/r/20171212073453.21455-1-sergey.senozhatsky@gmail.com To: Andrew Morton To: Russell King To: Catalin Marinas To: Mark Salter To: Tony Luck To: David Howells To: Yoshinori Sato To: Guan Xuetao To: Borislav Petkov To: Greg Kroah-Hartman To: Thomas Gleixner To: Peter Zijlstra To: Vineet Gupta To: Fengguang Wu To: David Laight Cc: Steven Rostedt Cc: Petr Mladek Cc: LKML Cc: linux-arm-kernel@lists.infradead.org Cc: linux-c6x-dev@linux-c6x.org Cc: linux-ia64@vger.kernel.org Cc: linux-am33-list@redhat.com Cc: linux-sh@vger.kernel.org Cc: linux-edac@vger.kernel.org Cc: x86@kernel.org Cc: linux-snps-arc@lists.infradead.org Signed-off-by: Sergey Senozhatsky [pmladek@suse.com: updated commit message] Signed-off-by: Petr Mladek --- kernel/irq/debug.h | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/debug.h b/kernel/irq/debug.h index 17f05ef8f575..7e06dd275c17 100644 --- a/kernel/irq/debug.h +++ b/kernel/irq/debug.h @@ -3,8 +3,6 @@ * Debugging printout: */ -#include - #define ___P(f) if (desc->status_use_accessors & f) printk("%14s set\n", #f) #define ___PS(f) if (desc->istate & f) printk("%14s set\n", #f) /* FIXME */ @@ -14,14 +12,14 @@ static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc) { printk("irq %d, desc: %p, depth: %d, count: %d, unhandled: %d\n", irq, desc, desc->depth, desc->irq_count, desc->irqs_unhandled); - printk("->handle_irq(): %p, ", desc->handle_irq); - print_symbol("%s\n", (unsigned long)desc->handle_irq); - printk("->irq_data.chip(): %p, ", desc->irq_data.chip); - print_symbol("%s\n", (unsigned long)desc->irq_data.chip); + printk("->handle_irq(): %p, %pS\n", + desc->handle_irq, desc->handle_irq); + printk("->irq_data.chip(): %p, %pS\n", + desc->irq_data.chip, desc->irq_data.chip); printk("->action(): %p\n", desc->action); if (desc->action) { - printk("->action->handler(): %p, ", desc->action->handler); - print_symbol("%s\n", (unsigned long)desc->action->handler); + printk("->action->handler(): %p, %pS\n", + desc->action->handler, desc->action->handler); } ___P(IRQ_LEVEL); -- cgit v1.2.3 From a4a0683fd5e64e029421a465525352f01d57f27a Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 1 Dec 2017 17:22:19 -0500 Subject: bpf_obj_do_pin(): switch to vfs_mkobj(), quit abusing ->mknod() Signed-off-by: Al Viro --- 
kernel/bpf/inode.c | 50 ++++++++++++++++++++++---------------------------- 1 file changed, 22 insertions(+), 28 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index 01aaef1a77c5..2b75faccc771 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -150,39 +150,29 @@ static int bpf_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) return 0; } -static int bpf_mkobj_ops(struct inode *dir, struct dentry *dentry, - umode_t mode, const struct inode_operations *iops) +static int bpf_mkobj_ops(struct dentry *dentry, umode_t mode, void *raw, + const struct inode_operations *iops) { - struct inode *inode; - - inode = bpf_get_inode(dir->i_sb, dir, mode | S_IFREG); + struct inode *dir = dentry->d_parent->d_inode; + struct inode *inode = bpf_get_inode(dir->i_sb, dir, mode); if (IS_ERR(inode)) return PTR_ERR(inode); inode->i_op = iops; - inode->i_private = dentry->d_fsdata; + inode->i_private = raw; bpf_dentry_finalize(dentry, inode, dir); return 0; } -static int bpf_mkobj(struct inode *dir, struct dentry *dentry, umode_t mode, - dev_t devt) +static int bpf_mkprog(struct dentry *dentry, umode_t mode, void *arg) { - enum bpf_type type = MINOR(devt); - - if (MAJOR(devt) != UNNAMED_MAJOR || !S_ISREG(mode) || - dentry->d_fsdata == NULL) - return -EPERM; + return bpf_mkobj_ops(dentry, mode, arg, &bpf_prog_iops); +} - switch (type) { - case BPF_TYPE_PROG: - return bpf_mkobj_ops(dir, dentry, mode, &bpf_prog_iops); - case BPF_TYPE_MAP: - return bpf_mkobj_ops(dir, dentry, mode, &bpf_map_iops); - default: - return -EPERM; - } +static int bpf_mkmap(struct dentry *dentry, umode_t mode, void *arg) +{ + return bpf_mkobj_ops(dentry, mode, arg, &bpf_map_iops); } static struct dentry * @@ -218,7 +208,6 @@ static int bpf_symlink(struct inode *dir, struct dentry *dentry, static const struct inode_operations bpf_dir_iops = { .lookup = bpf_lookup, - .mknod = bpf_mkobj, .mkdir = bpf_mkdir, .symlink = bpf_symlink, .rmdir = simple_rmdir, @@ -234,7 +223,6 @@ static int bpf_obj_do_pin(const struct filename *pathname, void *raw, struct inode *dir; struct path path; umode_t mode; - dev_t devt; int ret; dentry = kern_path_create(AT_FDCWD, pathname->name, &path, 0); @@ -242,9 +230,8 @@ static int bpf_obj_do_pin(const struct filename *pathname, void *raw, return PTR_ERR(dentry); mode = S_IFREG | ((S_IRUSR | S_IWUSR) & ~current_umask()); - devt = MKDEV(UNNAMED_MAJOR, type); - ret = security_path_mknod(&path, dentry, mode, devt); + ret = security_path_mknod(&path, dentry, mode, 0); if (ret) goto out; @@ -254,9 +241,16 @@ static int bpf_obj_do_pin(const struct filename *pathname, void *raw, goto out; } - dentry->d_fsdata = raw; - ret = vfs_mknod(dir, dentry, mode, devt); - dentry->d_fsdata = NULL; + switch (type) { + case BPF_TYPE_PROG: + ret = vfs_mkobj(dentry, mode, bpf_mkprog, raw); + break; + case BPF_TYPE_MAP: + ret = vfs_mkobj(dentry, mode, bpf_mkmap, raw); + break; + default: + ret = -EPERM; + } out: done_path_create(&path, dentry); return ret; -- cgit v1.2.3 From 16f07c551e3ac433874031433231140d38e69ccd Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Thu, 4 Jan 2018 13:55:03 -0800 Subject: bpf: implement syscall command BPF_MAP_GET_NEXT_KEY for stacktrace map Currently, bpf syscall command BPF_MAP_GET_NEXT_KEY is not supported for stacktrace map. However, there are use cases where user space wants to enumerate all stacktrace map entries where BPF_MAP_GET_NEXT_KEY command will be really helpful. 
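As a rough user-space sketch of such an enumeration loop (not part of this patch; it assumes libbpf's bpf_map_get_next_key()/bpf_map_delete_elem() syscall wrappers, an assumed header path, and an already-opened stacktrace map fd):

  #include <stdio.h>
  #include <linux/types.h>
  #include <bpf/bpf.h>	/* libbpf syscall wrappers; header path assumed */

  /* Walk every stack id currently in the map. Per the semantics
   * described below, a NULL key yields the first valid key. */
  static void dump_stack_ids(int map_fd)
  {
  	__u32 key, next_key;
  	const void *cur = NULL;	/* NULL -> start from the first key */

  	while (!bpf_map_get_next_key(map_fd, cur, &next_key)) {
  		printf("stack id %u\n", next_key);
  		key = next_key;
  		cur = &key;
  	}
  }

The delete-all use case mentioned next works the same way: delete the key just returned, then keep asking for the first key until -ENOENT.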
In addition, if user space wants to delete all map entries in order to save memory and does not want to close the map file descriptor, BPF_MAP_GET_NEXT_KEY may help improve performance if map entries are sparsely populated. The implementation behaves like the BPF_MAP_GET_NEXT_KEY implementation in hashtab. If the user provides a NULL key pointer or an invalid key, the first key is returned. Otherwise, the first valid key after the input parameter "key" is returned, or -ENOENT if no valid key can be found. Signed-off-by: Yonghong Song Signed-off-by: Daniel Borkmann --- kernel/bpf/stackmap.c | 28 ++++++++++++++++++++++-- 1 file changed, 26 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index a15bc636cc98..6c63c2222ea8 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -226,9 +226,33 @@ int bpf_stackmap_copy(struct bpf_map *map, void *key, void *value) return 0; } -static int stack_map_get_next_key(struct bpf_map *map, void *key, void *next_key) +static int stack_map_get_next_key(struct bpf_map *map, void *key, + void *next_key) { - return -EINVAL; + struct bpf_stack_map *smap = container_of(map, + struct bpf_stack_map, map); + u32 id; + + WARN_ON_ONCE(!rcu_read_lock_held()); + + if (!key) { + id = 0; + } else { + id = *(u32 *)key; + if (id >= smap->n_buckets || !smap->buckets[id]) + id = 0; + else + id++; + } + + while (id < smap->n_buckets && !smap->buckets[id]) + id++; + + if (id >= smap->n_buckets) + return -ENOENT; + + *(u32 *)next_key = id; + return 0; } static int stack_map_update_elem(struct bpf_map *map, void *key, void *value, -- cgit v1.2.3 From 983c751532738b83c4c5b51192ebc6fbfe9330f7 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 8 Jan 2018 05:38:32 -0800 Subject: workqueue: separate out init_rescuer() Separate out init_rescuer() from __alloc_workqueue_key() to prepare for early init support for WQ_MEM_RECLAIM. This patch doesn't introduce any functional changes. Signed-off-by: Tejun Heo Cc: Paul McKenney --- kernel/workqueue.c | 56 ++++++++++++++++++++++++++++++++---------------------- 1 file changed, 33 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 43d18cb46308..5baac7c8a5f9 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -3939,6 +3939,37 @@ static int wq_clamp_max_active(int max_active, unsigned int flags, return clamp_val(max_active, 1, lim); } +/* + * Workqueues which may be used during memory reclaim should have a rescuer + * to guarantee forward progress. + */ +static int init_rescuer(struct workqueue_struct *wq) +{ + struct worker *rescuer; + int ret; + + if (!(wq->flags & WQ_MEM_RECLAIM)) + return 0; + + rescuer = alloc_worker(NUMA_NO_NODE); + if (!rescuer) + return -ENOMEM; + + rescuer->rescue_wq = wq; + rescuer->task = kthread_create(rescuer_thread, rescuer, "%s", wq->name); + ret = PTR_ERR_OR_ZERO(rescuer->task); + if (ret) { + kfree(rescuer); + return ret; + } + + wq->rescuer = rescuer; + kthread_bind_mask(rescuer->task, cpu_possible_mask); + wake_up_process(rescuer->task); + + return 0; +} + struct workqueue_struct *__alloc_workqueue_key(const char *fmt, unsigned int flags, int max_active, @@ -4001,29 +4032,8 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt, if (alloc_and_link_pwqs(wq) < 0) goto err_free_wq; - /* - * Workqueues which may be used during memory reclaim should - * have a rescuer to guarantee forward progress.
- */ - if (flags & WQ_MEM_RECLAIM) { - struct worker *rescuer; - - rescuer = alloc_worker(NUMA_NO_NODE); - if (!rescuer) - goto err_destroy; - - rescuer->rescue_wq = wq; - rescuer->task = kthread_create(rescuer_thread, rescuer, "%s", - wq->name); - if (IS_ERR(rescuer->task)) { - kfree(rescuer); - goto err_destroy; - } - - wq->rescuer = rescuer; - kthread_bind_mask(rescuer->task, cpu_possible_mask); - wake_up_process(rescuer->task); - } + if (init_rescuer(wq) < 0) + goto err_destroy; if ((wq->flags & WQ_SYSFS) && workqueue_sysfs_register(wq)) goto err_destroy; -- cgit v1.2.3 From 40c17f75dfa9ec163478efd3f7443fd6af9992c9 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 8 Jan 2018 05:38:37 -0800 Subject: workqueue: allow WQ_MEM_RECLAIM on early init workqueues Workqueues can be created early during boot before workqueue subsystem in fully online - work items are queued waiting for later full initialization. However, early init wasn't supported for WQ_MEM_RECLAIM workqueues causing unnecessary annoyances for a subset of users. Expand early init support to include WQ_MEM_RECLAIM workqueues. Signed-off-by: Tejun Heo Cc: Paul McKenney --- kernel/workqueue.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 5baac7c8a5f9..c86cc1ed678b 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -4032,7 +4032,7 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt, if (alloc_and_link_pwqs(wq) < 0) goto err_free_wq; - if (init_rescuer(wq) < 0) + if (wq_online && init_rescuer(wq) < 0) goto err_destroy; if ((wq->flags & WQ_SYSFS) && workqueue_sysfs_register(wq)) @@ -5639,6 +5639,8 @@ int __init workqueue_init(void) * archs such as power and arm64. As per-cpu pools created * previously could be missing node hint and unbound pools NUMA * affinity, fix them up. + * + * Also, while iterating workqueues, create rescuers if requested. */ wq_numa_init(); @@ -5650,8 +5652,12 @@ int __init workqueue_init(void) } } - list_for_each_entry(wq, &workqueues, list) + list_for_each_entry(wq, &workqueues, list) { wq_update_unbound_numa(wq, smp_processor_id(), true); + WARN(init_rescuer(wq), + "workqueue: failed to create early rescuer for %s", + wq->name); + } mutex_unlock(&wq_pool_mutex); -- cgit v1.2.3 From 5896351ea9360072f8bdd9eee186861a9d13db6d Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Mon, 8 Jan 2018 07:51:17 -0800 Subject: bpf: fix verifier GPF in kmalloc failure path syzbot reported the following panic in the verifier triggered by kmalloc error injection: kasan: GPF could be caused by NULL-ptr deref or user memory access RIP: 0010:copy_func_state kernel/bpf/verifier.c:403 [inline] RIP: 0010:copy_verifier_state+0x364/0x590 kernel/bpf/verifier.c:431 Call Trace: pop_stack+0x8c/0x270 kernel/bpf/verifier.c:449 push_stack kernel/bpf/verifier.c:491 [inline] check_cond_jmp_op kernel/bpf/verifier.c:3598 [inline] do_check+0x4b60/0xa050 kernel/bpf/verifier.c:4731 bpf_check+0x3296/0x58c0 kernel/bpf/verifier.c:5489 bpf_prog_load+0xa2a/0x1b00 kernel/bpf/syscall.c:1198 SYSC_bpf kernel/bpf/syscall.c:1807 [inline] SyS_bpf+0x1044/0x4420 kernel/bpf/syscall.c:1769 when copy_verifier_state() aborts in the middle due to kmalloc failure some of the frames could have been partially copied while current free_verifier_state() loop for (i = 0; i <= state->curframe; i++) assumed that all frames are non-null. Simply fix it by adding 'if (!state)' to free_func_state(). 
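Roughly, the teardown path in question looks like this (simplified sketch, not the verbatim kernel code):

  /* free_verifier_state(): every frame slot up to curframe is visited,
   * but after a failed partial copy the tail slots are still NULL, so
   * free_func_state() must tolerate a NULL argument. */
  for (i = 0; i <= state->curframe; i++) {
  	free_func_state(state->frame[i]);	/* may be NULL */
  	state->frame[i] = NULL;
  }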
Also avoid stressing copy frame logic more if kzalloc fails in push_stack() free env->cur_state right away. Fixes: f4d7e40a5b71 ("bpf: introduce function calls (verification)") Reported-by: syzbot+32ac5a3e473f2e01cfc7@syzkaller.appspotmail.com Reported-by: syzbot+fa99e24f3c29d269a7d5@syzkaller.appspotmail.com Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- kernel/bpf/verifier.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index a2b211262c25..d921ab387b0b 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -375,6 +375,8 @@ static int realloc_func_state(struct bpf_func_state *state, int size, static void free_func_state(struct bpf_func_state *state) { + if (!state) + return; kfree(state->stack); kfree(state); } @@ -487,6 +489,8 @@ static struct bpf_verifier_state *push_stack(struct bpf_verifier_env *env, } return &elem->st; err: + free_verifier_state(env->cur_state, true); + env->cur_state = NULL; /* pop all elements and return */ while (!pop_stack(env, NULL, NULL)); return NULL; -- cgit v1.2.3 From 24e6d5a59ac7d31adc0322de2d0117dfa370936f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 29 Dec 2017 08:53:53 +0100 Subject: mm: pass the vmem_altmap to arch_add_memory and __add_pages We can just pass this on instead of having to do a radix tree lookup without proper locking 2 levels into the callchain. Signed-off-by: Christoph Hellwig Signed-off-by: Dan Williams --- arch/ia64/mm/init.c | 5 +++-- arch/powerpc/mm/mem.c | 5 +++-- arch/s390/mm/init.c | 5 +++-- arch/sh/mm/init.c | 5 +++-- arch/x86/mm/init_32.c | 5 +++-- arch/x86/mm/init_64.c | 11 ++++++----- include/linux/memory_hotplug.h | 17 ++++++++++------- kernel/memremap.c | 3 ++- mm/hmm.c | 5 +++-- mm/memory_hotplug.c | 7 +++---- 10 files changed, 39 insertions(+), 29 deletions(-) (limited to 'kernel') diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c index 7af4e05bb61e..2e2e4f532204 100644 --- a/arch/ia64/mm/init.c +++ b/arch/ia64/mm/init.c @@ -647,13 +647,14 @@ mem_init (void) } #ifdef CONFIG_MEMORY_HOTPLUG -int arch_add_memory(int nid, u64 start, u64 size, bool want_memblock) +int arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap, + bool want_memblock) { unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; int ret; - ret = __add_pages(nid, start_pfn, nr_pages, want_memblock); + ret = __add_pages(nid, start_pfn, nr_pages, altmap, want_memblock); if (ret) printk("%s: Problem encountered in __add_pages() as ret=%d\n", __func__, ret); diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index 4362b86ef84c..e670cfc2766e 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -127,7 +127,8 @@ int __weak remove_section_mapping(unsigned long start, unsigned long end) return -ENODEV; } -int arch_add_memory(int nid, u64 start, u64 size, bool want_memblock) +int arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap, + bool want_memblock) { unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; @@ -144,7 +145,7 @@ int arch_add_memory(int nid, u64 start, u64 size, bool want_memblock) return -EFAULT; } - return __add_pages(nid, start_pfn, nr_pages, want_memblock); + return __add_pages(nid, start_pfn, nr_pages, altmap, want_memblock); } #ifdef CONFIG_MEMORY_HOTREMOVE diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c index 671535e64aba..e12c5af50cd7 100644 --- a/arch/s390/mm/init.c +++ 
b/arch/s390/mm/init.c @@ -222,7 +222,8 @@ device_initcall(s390_cma_mem_init); #endif /* CONFIG_CMA */ -int arch_add_memory(int nid, u64 start, u64 size, bool want_memblock) +int arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap, + bool want_memblock) { unsigned long start_pfn = PFN_DOWN(start); unsigned long size_pages = PFN_DOWN(size); @@ -232,7 +233,7 @@ int arch_add_memory(int nid, u64 start, u64 size, bool want_memblock) if (rc) return rc; - rc = __add_pages(nid, start_pfn, size_pages, want_memblock); + rc = __add_pages(nid, start_pfn, size_pages, altmap, want_memblock); if (rc) vmem_remove_mapping(start, size); return rc; diff --git a/arch/sh/mm/init.c b/arch/sh/mm/init.c index afc54d593a26..552afbf55bad 100644 --- a/arch/sh/mm/init.c +++ b/arch/sh/mm/init.c @@ -485,14 +485,15 @@ void free_initrd_mem(unsigned long start, unsigned long end) #endif #ifdef CONFIG_MEMORY_HOTPLUG -int arch_add_memory(int nid, u64 start, u64 size, bool want_memblock) +int arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap, + bool want_memblock) { unsigned long start_pfn = PFN_DOWN(start); unsigned long nr_pages = size >> PAGE_SHIFT; int ret; /* We only have ZONE_NORMAL, so this is easy.. */ - ret = __add_pages(nid, start_pfn, nr_pages, want_memblock); + ret = __add_pages(nid, start_pfn, nr_pages, altmap, want_memblock); if (unlikely(ret)) printk("%s: Failed, __add_pages() == %d\n", __func__, ret); diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 135c9a7898c7..8a3091511a71 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -829,12 +829,13 @@ void __init mem_init(void) } #ifdef CONFIG_MEMORY_HOTPLUG -int arch_add_memory(int nid, u64 start, u64 size, bool want_memblock) +int arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap, + bool want_memblock) { unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; - return __add_pages(nid, start_pfn, nr_pages, want_memblock); + return __add_pages(nid, start_pfn, nr_pages, altmap, want_memblock); } #ifdef CONFIG_MEMORY_HOTREMOVE diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 8acdc35c2dfa..e80bb4189254 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -772,12 +772,12 @@ static void update_end_of_memory_vars(u64 start, u64 size) } } -int add_pages(int nid, unsigned long start_pfn, - unsigned long nr_pages, bool want_memblock) +int add_pages(int nid, unsigned long start_pfn, unsigned long nr_pages, + struct vmem_altmap *altmap, bool want_memblock) { int ret; - ret = __add_pages(nid, start_pfn, nr_pages, want_memblock); + ret = __add_pages(nid, start_pfn, nr_pages, altmap, want_memblock); WARN_ON_ONCE(ret); /* update max_pfn, max_low_pfn and high_memory */ @@ -787,14 +787,15 @@ int add_pages(int nid, unsigned long start_pfn, return ret; } -int arch_add_memory(int nid, u64 start, u64 size, bool want_memblock) +int arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap, + bool want_memblock) { unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; init_memory_mapping(start, start + size); - return add_pages(nid, start_pfn, nr_pages, want_memblock); + return add_pages(nid, start_pfn, nr_pages, altmap, want_memblock); } #define PAGE_INUSE 0xFD diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 58e110aee7ab..db276afbefcc 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -13,6 +13,7 @@ struct 
pglist_data; struct mem_section; struct memory_block; struct resource; +struct vmem_altmap; #ifdef CONFIG_MEMORY_HOTPLUG /* @@ -131,18 +132,19 @@ extern int __remove_pages(struct zone *zone, unsigned long start_pfn, #endif /* CONFIG_MEMORY_HOTREMOVE */ /* reasonably generic interface to expand the physical pages */ -extern int __add_pages(int nid, unsigned long start_pfn, - unsigned long nr_pages, bool want_memblock); +extern int __add_pages(int nid, unsigned long start_pfn, unsigned long nr_pages, + struct vmem_altmap *altmap, bool want_memblock); #ifndef CONFIG_ARCH_HAS_ADD_PAGES static inline int add_pages(int nid, unsigned long start_pfn, - unsigned long nr_pages, bool want_memblock) + unsigned long nr_pages, struct vmem_altmap *altmap, + bool want_memblock) { - return __add_pages(nid, start_pfn, nr_pages, want_memblock); + return __add_pages(nid, start_pfn, nr_pages, altmap, want_memblock); } #else /* ARCH_HAS_ADD_PAGES */ -int add_pages(int nid, unsigned long start_pfn, - unsigned long nr_pages, bool want_memblock); +int add_pages(int nid, unsigned long start_pfn, unsigned long nr_pages, + struct vmem_altmap *altmap, bool want_memblock); #endif /* ARCH_HAS_ADD_PAGES */ #ifdef CONFIG_NUMA @@ -318,7 +320,8 @@ extern int walk_memory_range(unsigned long start_pfn, unsigned long end_pfn, void *arg, int (*func)(struct memory_block *, void *)); extern int add_memory(int nid, u64 start, u64 size); extern int add_memory_resource(int nid, struct resource *resource, bool online); -extern int arch_add_memory(int nid, u64 start, u64 size, bool want_memblock); +extern int arch_add_memory(int nid, u64 start, u64 size, + struct vmem_altmap *altmap, bool want_memblock); extern void move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn, unsigned long nr_pages); extern int offline_pages(unsigned long start_pfn, unsigned long nr_pages); diff --git a/kernel/memremap.c b/kernel/memremap.c index 403ab9cdb949..8488cdeead16 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -382,6 +382,7 @@ void *devm_memremap_pages(struct device *dev, struct resource *res, if (altmap) { memcpy(&page_map->altmap, altmap, sizeof(*altmap)); pgmap->altmap = &page_map->altmap; + altmap = pgmap->altmap; } pgmap->ref = ref; pgmap->res = &page_map->res; @@ -427,7 +428,7 @@ void *devm_memremap_pages(struct device *dev, struct resource *res, goto err_pfn_remap; mem_hotplug_begin(); - error = arch_add_memory(nid, align_start, align_size, false); + error = arch_add_memory(nid, align_start, align_size, altmap, false); if (!error) move_pfn_range_to_zone(&NODE_DATA(nid)->node_zones[ZONE_DEVICE], align_start >> PAGE_SHIFT, diff --git a/mm/hmm.c b/mm/hmm.c index ea19742a5d60..231aaacd1997 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -931,10 +931,11 @@ static int hmm_devmem_pages_create(struct hmm_devmem *devmem) * want the linear mapping and thus use arch_add_memory(). */ if (devmem->pagemap.type == MEMORY_DEVICE_PUBLIC) - ret = arch_add_memory(nid, align_start, align_size, false); + ret = arch_add_memory(nid, align_start, align_size, NULL, + false); else ret = add_pages(nid, align_start >> PAGE_SHIFT, - align_size >> PAGE_SHIFT, false); + align_size >> PAGE_SHIFT, NULL, false); if (ret) { mem_hotplug_done(); goto error_add_memory; diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 5c6f96e6b334..fc0485dcece1 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -292,18 +292,17 @@ static int __meminit __add_section(int nid, unsigned long phys_start_pfn, * add the new pages. 
*/ int __ref __add_pages(int nid, unsigned long phys_start_pfn, - unsigned long nr_pages, bool want_memblock) + unsigned long nr_pages, struct vmem_altmap *altmap, + bool want_memblock) { unsigned long i; int err = 0; int start_sec, end_sec; - struct vmem_altmap *altmap; /* during initialize mem_map, align hot-added range to section */ start_sec = pfn_to_section_nr(phys_start_pfn); end_sec = pfn_to_section_nr(phys_start_pfn + nr_pages - 1); - altmap = to_vmem_altmap((unsigned long) pfn_to_page(phys_start_pfn)); if (altmap) { /* * Validate altmap is within bounds of the total request @@ -1148,7 +1147,7 @@ int __ref add_memory_resource(int nid, struct resource *res, bool online) } /* call arch's memory hotadd */ - ret = arch_add_memory(nid, start, size, true); + ret = arch_add_memory(nid, start, size, NULL, true); if (ret < 0) goto error; -- cgit v1.2.3 From da024512a1fa5c979257e442130ee1d468285057 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 29 Dec 2017 08:53:55 +0100 Subject: mm: pass the vmem_altmap to arch_remove_memory and __remove_pages We can just pass this on instead of having to do a radix tree lookup without proper locking 2 levels into the callchain. Signed-off-by: Christoph Hellwig Signed-off-by: Dan Williams --- arch/ia64/mm/init.c | 4 ++-- arch/powerpc/mm/mem.c | 6 ++---- arch/s390/mm/init.c | 2 +- arch/sh/mm/init.c | 4 ++-- arch/x86/mm/init_32.c | 4 ++-- arch/x86/mm/init_64.c | 6 ++---- include/linux/memory_hotplug.h | 5 +++-- kernel/memremap.c | 2 +- mm/hmm.c | 4 ++-- mm/memory_hotplug.c | 8 ++------ 10 files changed, 19 insertions(+), 26 deletions(-) (limited to 'kernel') diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c index 2e2e4f532204..6a8ce9e1536e 100644 --- a/arch/ia64/mm/init.c +++ b/arch/ia64/mm/init.c @@ -663,7 +663,7 @@ int arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap, } #ifdef CONFIG_MEMORY_HOTREMOVE -int arch_remove_memory(u64 start, u64 size) +int arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap) { unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; @@ -671,7 +671,7 @@ int arch_remove_memory(u64 start, u64 size) int ret; zone = page_zone(pfn_to_page(start_pfn)); - ret = __remove_pages(zone, start_pfn, nr_pages); + ret = __remove_pages(zone, start_pfn, nr_pages, altmap); if (ret) pr_warn("%s: Problem encountered in __remove_pages() as" " ret=%d\n", __func__, ret); diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index e670cfc2766e..22aa528b78a2 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -149,11 +149,10 @@ int arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap, } #ifdef CONFIG_MEMORY_HOTREMOVE -int arch_remove_memory(u64 start, u64 size) +int arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap) { unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; - struct vmem_altmap *altmap; struct page *page; int ret; @@ -162,11 +161,10 @@ int arch_remove_memory(u64 start, u64 size) * when querying the zone. 
*/ page = pfn_to_page(start_pfn); - altmap = to_vmem_altmap((unsigned long) page); if (altmap) page += vmem_altmap_offset(altmap); - ret = __remove_pages(page_zone(page), start_pfn, nr_pages); + ret = __remove_pages(page_zone(page), start_pfn, nr_pages, altmap); if (ret) return ret; diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c index e12c5af50cd7..3fa3e5323612 100644 --- a/arch/s390/mm/init.c +++ b/arch/s390/mm/init.c @@ -240,7 +240,7 @@ int arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap, } #ifdef CONFIG_MEMORY_HOTREMOVE -int arch_remove_memory(u64 start, u64 size) +int arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap) { /* * There is no hardware or firmware interface which could trigger a diff --git a/arch/sh/mm/init.c b/arch/sh/mm/init.c index 552afbf55bad..ce0bbaa7e404 100644 --- a/arch/sh/mm/init.c +++ b/arch/sh/mm/init.c @@ -510,7 +510,7 @@ EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid); #endif #ifdef CONFIG_MEMORY_HOTREMOVE -int arch_remove_memory(u64 start, u64 size) +int arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap) { unsigned long start_pfn = PFN_DOWN(start); unsigned long nr_pages = size >> PAGE_SHIFT; @@ -518,7 +518,7 @@ int arch_remove_memory(u64 start, u64 size) int ret; zone = page_zone(pfn_to_page(start_pfn)); - ret = __remove_pages(zone, start_pfn, nr_pages); + ret = __remove_pages(zone, start_pfn, nr_pages, altmap); if (unlikely(ret)) pr_warn("%s: Failed, __remove_pages() == %d\n", __func__, ret); diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 8a3091511a71..79cb066f40c0 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -839,14 +839,14 @@ int arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap, } #ifdef CONFIG_MEMORY_HOTREMOVE -int arch_remove_memory(u64 start, u64 size) +int arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap) { unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; struct zone *zone; zone = page_zone(pfn_to_page(start_pfn)); - return __remove_pages(zone, start_pfn, nr_pages); + return __remove_pages(zone, start_pfn, nr_pages, altmap); } #endif #endif diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 594902ef56ef..3c046618cc7e 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -1132,21 +1132,19 @@ kernel_physical_mapping_remove(unsigned long start, unsigned long end) remove_pagetable(start, end, true); } -int __ref arch_remove_memory(u64 start, u64 size) +int __ref arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap) { unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; struct page *page = pfn_to_page(start_pfn); - struct vmem_altmap *altmap; struct zone *zone; int ret; /* With altmap the first mapped page is offset from @start */ - altmap = to_vmem_altmap((unsigned long) page); if (altmap) page += vmem_altmap_offset(altmap); zone = page_zone(page); - ret = __remove_pages(zone, start_pfn, nr_pages); + ret = __remove_pages(zone, start_pfn, nr_pages, altmap); WARN_ON_ONCE(ret); kernel_physical_mapping_remove(start, start + size); diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index cbdd6d52e877..e71927d0d46b 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -126,9 +126,10 @@ static inline bool movable_node_is_enabled(void) #ifdef CONFIG_MEMORY_HOTREMOVE extern bool is_pageblock_removable_nolock(struct page *page); -extern int 
arch_remove_memory(u64 start, u64 size); +extern int arch_remove_memory(u64 start, u64 size, + struct vmem_altmap *altmap); extern int __remove_pages(struct zone *zone, unsigned long start_pfn, - unsigned long nr_pages); + unsigned long nr_pages, struct vmem_altmap *altmap); #endif /* CONFIG_MEMORY_HOTREMOVE */ /* reasonably generic interface to expand the physical pages */ diff --git a/kernel/memremap.c b/kernel/memremap.c index 8488cdeead16..380fca1c4a02 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -304,7 +304,7 @@ static void devm_memremap_pages_release(struct device *dev, void *data) align_size = ALIGN(resource_size(res), SECTION_SIZE); mem_hotplug_begin(); - arch_remove_memory(align_start, align_size); + arch_remove_memory(align_start, align_size, pgmap->altmap); mem_hotplug_done(); untrack_pfn(NULL, PHYS_PFN(align_start), align_size); diff --git a/mm/hmm.c b/mm/hmm.c index 231aaacd1997..5d17ba89062f 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -838,10 +838,10 @@ static void hmm_devmem_release(struct device *dev, void *data) mem_hotplug_begin(); if (resource->desc == IORES_DESC_DEVICE_PRIVATE_MEMORY) - __remove_pages(zone, start_pfn, npages); + __remove_pages(zone, start_pfn, npages, NULL); else arch_remove_memory(start_pfn << PAGE_SHIFT, - npages << PAGE_SHIFT); + npages << PAGE_SHIFT, NULL); mem_hotplug_done(); hmm_devmem_radix_release(resource); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index b36f1822c432..eae6bf47caf7 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -569,7 +569,7 @@ static int __remove_section(struct zone *zone, struct mem_section *ms, * calling offline_pages(). */ int __remove_pages(struct zone *zone, unsigned long phys_start_pfn, - unsigned long nr_pages) + unsigned long nr_pages, struct vmem_altmap *altmap) { unsigned long i; unsigned long map_offset = 0; @@ -577,10 +577,6 @@ int __remove_pages(struct zone *zone, unsigned long phys_start_pfn, /* In the ZONE_DEVICE case device driver owns the memory region */ if (is_dev_zone(zone)) { - struct page *page = pfn_to_page(phys_start_pfn); - struct vmem_altmap *altmap; - - altmap = to_vmem_altmap((unsigned long) page); if (altmap) map_offset = vmem_altmap_offset(altmap); } else { @@ -1890,7 +1886,7 @@ void __ref remove_memory(int nid, u64 start, u64 size) memblock_free(start, size); memblock_remove(start, size); - arch_remove_memory(start, size); + arch_remove_memory(start, size, NULL); try_offline_node(nid); -- cgit v1.2.3 From a99583e780c751003ac9c0105eec9a3b23ec3bc4 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 29 Dec 2017 08:53:57 +0100 Subject: mm: pass the vmem_altmap to memmap_init_zone Pass the vmem_altmap two levels down instead of needing a lookup. 
Signed-off-by: Christoph Hellwig Signed-off-by: Dan Williams --- arch/ia64/mm/init.c | 9 +++++---- include/linux/memory_hotplug.h | 2 +- include/linux/mm.h | 4 ++-- kernel/memremap.c | 2 +- mm/hmm.c | 2 +- mm/memory_hotplug.c | 9 +++++---- mm/page_alloc.c | 6 +++--- 7 files changed, 18 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c index 6a8ce9e1536e..18278b448530 100644 --- a/arch/ia64/mm/init.c +++ b/arch/ia64/mm/init.c @@ -501,7 +501,7 @@ virtual_memmap_init(u64 start, u64 end, void *arg) if (map_start < map_end) memmap_init_zone((unsigned long)(map_end - map_start), args->nid, args->zone, page_to_pfn(map_start), - MEMMAP_EARLY); + MEMMAP_EARLY, NULL); return 0; } @@ -509,9 +509,10 @@ void __meminit memmap_init (unsigned long size, int nid, unsigned long zone, unsigned long start_pfn) { - if (!vmem_map) - memmap_init_zone(size, nid, zone, start_pfn, MEMMAP_EARLY); - else { + if (!vmem_map) { + memmap_init_zone(size, nid, zone, start_pfn, MEMMAP_EARLY, + NULL); + } else { struct page *start; struct memmap_init_callback_data args; diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 20dd98ad44a0..aba5f86eb038 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -324,7 +324,7 @@ extern int add_memory_resource(int nid, struct resource *resource, bool online); extern int arch_add_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap, bool want_memblock); extern void move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn, - unsigned long nr_pages); + unsigned long nr_pages, struct vmem_altmap *altmap); extern int offline_pages(unsigned long start_pfn, unsigned long nr_pages); extern bool is_memblock_offlined(struct memory_block *mem); extern void remove_memory(int nid, u64 start, u64 size); diff --git a/include/linux/mm.h b/include/linux/mm.h index 9d4cd4c1dc6d..fd01135324b6 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2069,8 +2069,8 @@ static inline void zero_resv_unavail(void) {} #endif extern void set_dma_reserve(unsigned long new_dma_reserve); -extern void memmap_init_zone(unsigned long, int, unsigned long, - unsigned long, enum memmap_context); +extern void memmap_init_zone(unsigned long, int, unsigned long, unsigned long, + enum memmap_context, struct vmem_altmap *); extern void setup_per_zone_wmarks(void); extern int __meminit init_per_zone_wmark_min(void); extern void mem_init(void); diff --git a/kernel/memremap.c b/kernel/memremap.c index 380fca1c4a02..64b12c806cc5 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -432,7 +432,7 @@ void *devm_memremap_pages(struct device *dev, struct resource *res, if (!error) move_pfn_range_to_zone(&NODE_DATA(nid)->node_zones[ZONE_DEVICE], align_start >> PAGE_SHIFT, - align_size >> PAGE_SHIFT); + align_size >> PAGE_SHIFT, altmap); mem_hotplug_done(); if (error) goto err_add_memory; diff --git a/mm/hmm.c b/mm/hmm.c index 5d17ba89062f..2f2e13c61040 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -942,7 +942,7 @@ static int hmm_devmem_pages_create(struct hmm_devmem *devmem) } move_pfn_range_to_zone(&NODE_DATA(nid)->node_zones[ZONE_DEVICE], align_start >> PAGE_SHIFT, - align_size >> PAGE_SHIFT); + align_size >> PAGE_SHIFT, NULL); mem_hotplug_done(); for (pfn = devmem->pfn_first; pfn < devmem->pfn_last; pfn++) { diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index a8dde9734120..12df8a5fadcc 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -798,8 +798,8 @@ static void __meminit 
resize_pgdat_range(struct pglist_data *pgdat, unsigned lon pgdat->node_spanned_pages = max(start_pfn + nr_pages, old_end_pfn) - pgdat->node_start_pfn; } -void __ref move_pfn_range_to_zone(struct zone *zone, - unsigned long start_pfn, unsigned long nr_pages) +void __ref move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn, + unsigned long nr_pages, struct vmem_altmap *altmap) { struct pglist_data *pgdat = zone->zone_pgdat; int nid = pgdat->node_id; @@ -824,7 +824,8 @@ void __ref move_pfn_range_to_zone(struct zone *zone, * expects the zone spans the pfn range. All the pages in the range * are reserved so nobody should be touching them so we should be safe */ - memmap_init_zone(nr_pages, nid, zone_idx(zone), start_pfn, MEMMAP_HOTPLUG); + memmap_init_zone(nr_pages, nid, zone_idx(zone), start_pfn, + MEMMAP_HOTPLUG, altmap); set_zone_contiguous(zone); } @@ -896,7 +897,7 @@ static struct zone * __meminit move_pfn_range(int online_type, int nid, struct zone *zone; zone = zone_for_pfn_range(online_type, nid, start_pfn, nr_pages); - move_pfn_range_to_zone(zone, start_pfn, nr_pages); + move_pfn_range_to_zone(zone, start_pfn, nr_pages, NULL); return zone; } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 7e5e775e97f4..1748dd4a4b1b 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5314,9 +5314,9 @@ void __ref build_all_zonelists(pg_data_t *pgdat) * done. Non-atomic initialization, single-pass. */ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, - unsigned long start_pfn, enum memmap_context context) + unsigned long start_pfn, enum memmap_context context, + struct vmem_altmap *altmap) { - struct vmem_altmap *altmap = to_vmem_altmap(__pfn_to_phys(start_pfn)); unsigned long end_pfn = start_pfn + size; pg_data_t *pgdat = NODE_DATA(nid); unsigned long pfn; @@ -5417,7 +5417,7 @@ static void __meminit zone_init_free_lists(struct zone *zone) #ifndef __HAVE_ARCH_MEMMAP_INIT #define memmap_init(size, nid, zone, start_pfn) \ - memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY) + memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY, NULL) #endif static int zone_batchsize(struct zone *zone) -- cgit v1.2.3 From 0822acb86cf340cd45b3af6436cec7e3bb24ebd2 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 29 Dec 2017 08:54:00 +0100 Subject: mm: move get_dev_pagemap out of line MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This is a pretty big function, which should be out of line in general, and a no-op stub if CONFIG_ZONE_DEVICE is not set.
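For illustration, the shape of the resulting interface (a sketch condensed from the memremap.h hunks below, not new code): /* Out-of-line lookup when ZONE_DEVICE is built in, and a no-op * stub otherwise, so callers need no #ifdefs of their own. */ #ifdef CONFIG_ZONE_DEVICE struct dev_pagemap *get_dev_pagemap(unsigned long pfn, struct dev_pagemap *pgmap); #else static inline struct dev_pagemap *get_dev_pagemap(unsigned long pfn, struct dev_pagemap *pgmap) { return NULL; } #endif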
Signed-off-by: Christoph Hellwig Reviewed-by: Logan Gunthorpe Signed-off-by: Dan Williams --- include/linux/memremap.h | 39 ++++----------------------------------- kernel/memremap.c | 36 ++++++++++++++++++++++++++++++++++-- 2 files changed, 38 insertions(+), 37 deletions(-) (limited to 'kernel') diff --git a/include/linux/memremap.h b/include/linux/memremap.h index d5a6736d9737..26e8aaba27d5 100644 --- a/include/linux/memremap.h +++ b/include/linux/memremap.h @@ -133,7 +133,8 @@ struct dev_pagemap { #ifdef CONFIG_ZONE_DEVICE void *devm_memremap_pages(struct device *dev, struct resource *res, struct percpu_ref *ref, struct vmem_altmap *altmap); -struct dev_pagemap *find_dev_pagemap(resource_size_t phys); +struct dev_pagemap *get_dev_pagemap(unsigned long pfn, + struct dev_pagemap *pgmap); unsigned long vmem_altmap_offset(struct vmem_altmap *altmap); void vmem_altmap_free(struct vmem_altmap *altmap, unsigned long nr_pfns); @@ -153,7 +154,8 @@ static inline void *devm_memremap_pages(struct device *dev, return ERR_PTR(-ENXIO); } -static inline struct dev_pagemap *find_dev_pagemap(resource_size_t phys) +static inline struct dev_pagemap *get_dev_pagemap(unsigned long pfn, + struct dev_pagemap *pgmap) { return NULL; } @@ -183,39 +185,6 @@ static inline bool is_device_public_page(const struct page *page) } #endif /* CONFIG_DEVICE_PRIVATE || CONFIG_DEVICE_PUBLIC */ -/** - * get_dev_pagemap() - take a new live reference on the dev_pagemap for @pfn - * @pfn: page frame number to lookup page_map - * @pgmap: optional known pgmap that already has a reference - * - * @pgmap allows the overhead of a lookup to be bypassed when @pfn lands in the - * same mapping. - */ -static inline struct dev_pagemap *get_dev_pagemap(unsigned long pfn, - struct dev_pagemap *pgmap) -{ - const struct resource *res = pgmap ? pgmap->res : NULL; - resource_size_t phys = PFN_PHYS(pfn); - - /* - * In the cached case we're already holding a live reference so - * we can simply do a blind increment - */ - if (res && phys >= res->start && phys <= res->end) { - percpu_ref_get(pgmap->ref); - return pgmap; - } - - /* fall back to slow path lookup */ - rcu_read_lock(); - pgmap = find_dev_pagemap(phys); - if (pgmap && !percpu_ref_tryget_live(pgmap->ref)) - pgmap = NULL; - rcu_read_unlock(); - - return pgmap; -} - static inline void put_dev_pagemap(struct dev_pagemap *pgmap) { if (pgmap) diff --git a/kernel/memremap.c b/kernel/memremap.c index 64b12c806cc5..3df6cd4ffb40 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -314,7 +314,7 @@ static void devm_memremap_pages_release(struct device *dev, void *data) } /* assumes rcu_read_lock() held at entry */ -struct dev_pagemap *find_dev_pagemap(resource_size_t phys) +static struct dev_pagemap *find_dev_pagemap(resource_size_t phys) { struct page_map *page_map; @@ -501,8 +501,40 @@ struct vmem_altmap *to_vmem_altmap(unsigned long memmap_start) return pgmap ? pgmap->altmap : NULL; } -#endif /* CONFIG_ZONE_DEVICE */ +/** + * get_dev_pagemap() - take a new live reference on the dev_pagemap for @pfn + * @pfn: page frame number to lookup page_map + * @pgmap: optional known pgmap that already has a reference + * + * @pgmap allows the overhead of a lookup to be bypassed when @pfn lands in the + * same mapping. + */ +struct dev_pagemap *get_dev_pagemap(unsigned long pfn, + struct dev_pagemap *pgmap) +{ + const struct resource *res = pgmap ? 
pgmap->res : NULL; + resource_size_t phys = PFN_PHYS(pfn); + + /* + * In the cached case we're already holding a live reference so + * we can simply do a blind increment + */ + if (res && phys >= res->start && phys <= res->end) { + percpu_ref_get(pgmap->ref); + return pgmap; + } + + /* fall back to slow path lookup */ + rcu_read_lock(); + pgmap = find_dev_pagemap(phys); + if (pgmap && !percpu_ref_tryget_live(pgmap->ref)) + pgmap = NULL; + rcu_read_unlock(); + + return pgmap; +} +#endif /* CONFIG_ZONE_DEVICE */ #if IS_ENABLED(CONFIG_DEVICE_PRIVATE) || IS_ENABLED(CONFIG_DEVICE_PUBLIC) void put_zone_device_private_or_public_page(struct page *page) -- cgit v1.2.3 From 832d7aa051106c927cae05ced29d3fd31459ed21 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 29 Dec 2017 08:54:01 +0100 Subject: mm: optimize dev_pagemap reference counting around get_dev_pagemap Change the calling convention so that get_dev_pagemap always consumes the previous reference instead of doing this using an explicit earlier call to put_dev_pagemap in the callers. The callers will still need to put the final reference after finishing the loop over the pages. Signed-off-by: Christoph Hellwig Reviewed-by: Logan Gunthorpe Signed-off-by: Dan Williams --- kernel/memremap.c | 17 +++++++++-------- mm/gup.c | 7 +++++-- 2 files changed, 14 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/memremap.c b/kernel/memremap.c index 3df6cd4ffb40..891c77487a6a 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -507,22 +507,23 @@ struct vmem_altmap *to_vmem_altmap(unsigned long memmap_start) * @pfn: page frame number to lookup page_map * @pgmap: optional known pgmap that already has a reference * - * @pgmap allows the overhead of a lookup to be bypassed when @pfn lands in the - * same mapping. + * If @pgmap is non-NULL and covers @pfn it will be returned as-is. If @pgmap + * is non-NULL but does not cover @pfn the reference to it will be released. */ struct dev_pagemap *get_dev_pagemap(unsigned long pfn, struct dev_pagemap *pgmap) { - const struct resource *res = pgmap ? pgmap->res : NULL; resource_size_t phys = PFN_PHYS(pfn); /* - * In the cached case we're already holding a live reference so - * we can simply do a blind increment + * In the cached case we're already holding a live reference. */ - if (res && phys >= res->start && phys <= res->end) { - percpu_ref_get(pgmap->ref); - return pgmap; + if (pgmap) { + const struct resource *res = pgmap ? 
pgmap->res : NULL; + + if (res && phys >= res->start && phys <= res->end) + return pgmap; + put_dev_pagemap(pgmap); } /* fall back to slow path lookup */ diff --git a/mm/gup.c b/mm/gup.c index e0d82b6706d7..3affe7544b0c 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -1410,7 +1410,6 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end, VM_BUG_ON_PAGE(compound_head(page) != head, page); - put_dev_pagemap(pgmap); SetPageReferenced(page); pages[*nr] = page; (*nr)++; @@ -1420,6 +1419,8 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end, ret = 1; pte_unmap: + if (pgmap) + put_dev_pagemap(pgmap); pte_unmap(ptem); return ret; } @@ -1459,10 +1460,12 @@ static int __gup_device_huge(unsigned long pfn, unsigned long addr, SetPageReferenced(page); pages[*nr] = page; get_page(page); - put_dev_pagemap(pgmap); (*nr)++; pfn++; } while (addr += PAGE_SIZE, addr != end); + + if (pgmap) + put_dev_pagemap(pgmap); return 1; } -- cgit v1.2.3 From 0628b8c650718f4dfedfcdc9ed136bf7e394aae7 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 29 Dec 2017 08:54:02 +0100 Subject: memremap: remove to_vmem_altmap All callers are gone now. Signed-off-by: Christoph Hellwig Signed-off-by: Dan Williams --- include/linux/memremap.h | 9 --------- kernel/memremap.c | 26 -------------------------- 2 files changed, 35 deletions(-) (limited to 'kernel') diff --git a/include/linux/memremap.h b/include/linux/memremap.h index 26e8aaba27d5..3fddcfe57bb0 100644 --- a/include/linux/memremap.h +++ b/include/linux/memremap.h @@ -26,15 +26,6 @@ struct vmem_altmap { unsigned long alloc; }; -#ifdef CONFIG_ZONE_DEVICE -struct vmem_altmap *to_vmem_altmap(unsigned long memmap_start); -#else -static inline struct vmem_altmap *to_vmem_altmap(unsigned long memmap_start) -{ - return NULL; -} -#endif - /* * Specialize ZONE_DEVICE memory into multiple types each having differents * usage. diff --git a/kernel/memremap.c b/kernel/memremap.c index 891c77487a6a..b09517439dec 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -476,32 +476,6 @@ void vmem_altmap_free(struct vmem_altmap *altmap, unsigned long nr_pfns) altmap->alloc -= nr_pfns; } -struct vmem_altmap *to_vmem_altmap(unsigned long memmap_start) -{ - /* - * 'memmap_start' is the virtual address for the first "struct - * page" in this range of the vmemmap array. In the case of - * CONFIG_SPARSEMEM_VMEMMAP a page_to_pfn conversion is simple - * pointer arithmetic, so we can perform this to_vmem_altmap() - * conversion without concern for the initialization state of - * the struct page fields. - */ - struct page *page = (struct page *) memmap_start; - struct dev_pagemap *pgmap; - - /* - * Unconditionally retrieve a dev_pagemap associated with the - * given physical address, this is only for use in the - * arch_{add|remove}_memory() for setting up and tearing down - * the memmap. - */ - rcu_read_lock(); - pgmap = find_dev_pagemap(__pfn_to_phys(page_to_pfn(page))); - rcu_read_unlock(); - - return pgmap ? pgmap->altmap : NULL; -} - /** * get_dev_pagemap() - take a new live reference on the dev_pagemap for @pfn * @pfn: page frame number to lookup page_map -- cgit v1.2.3 From 7003e3b1f64d0195ea9d31aed0b096ad38f3cb54 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 29 Dec 2017 08:54:03 +0100 Subject: memremap: simplify duplicate region handling in devm_memremap_pages __radix_tree_insert already checks for duplicates and returns -EEXIST in that case, so remove the duplicate (and racy) duplicates check. 
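For illustration, the pattern being relied on here, in a hypothetical standalone helper (example_register_range() is not kernel code): /* Let the insert itself detect duplicates atomically via -EEXIST * instead of a racy lookup-then-insert sequence. */ static int example_register_range(struct radix_tree_root *root, unsigned long index, void *entry) { int error = radix_tree_insert(root, index, entry); if (error == -EEXIST) pr_err("index %lu is already registered\n", index); return error; }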
Signed-off-by: Christoph Hellwig Reviewed-by: Logan Gunthorpe Signed-off-by: Dan Williams --- kernel/memremap.c | 11 ----------- 1 file changed, 11 deletions(-) (limited to 'kernel') diff --git a/kernel/memremap.c b/kernel/memremap.c index b09517439dec..12e78528fea4 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -396,17 +396,6 @@ void *devm_memremap_pages(struct device *dev, struct resource *res, align_end = align_start + align_size - 1; foreach_order_pgoff(res, order, pgoff) { - struct dev_pagemap *dup; - - rcu_read_lock(); - dup = find_dev_pagemap(res->start + PFN_PHYS(pgoff)); - rcu_read_unlock(); - if (dup) { - dev_err(dev, "%s: %pr collides with mapping for %s\n", - __func__, res, dev_name(dup->dev)); - error = -EBUSY; - break; - } error = __radix_tree_insert(&pgmap_radix, PHYS_PFN(res->start) + pgoff, order, page_map); if (error) { -- cgit v1.2.3 From e7744aa25cffe26d3767c9ffcf4e130cca1dff00 Mon Sep 17 00:00:00 2001 From: Logan Gunthorpe Date: Fri, 29 Dec 2017 08:54:04 +0100 Subject: memremap: drop private struct page_map 'struct page_map' is a private structure of 'struct dev_pagemap' but the latter replicates all the same fields as the former so there isn't much value in it. Thus drop it in favour of a completely public struct. This is a clean up in preparation for a more generally useful 'devm_memremap_pages' interface. Signed-off-by: Logan Gunthorpe Signed-off-by: Christoph Hellwig Signed-off-by: Dan Williams --- include/linux/memremap.h | 5 ++-- kernel/memremap.c | 68 ++++++++++++++++++------------------------ mm/hmm.c | 2 +- 3 files changed, 30 insertions(+), 45 deletions(-) (limited to 'kernel') diff --git a/include/linux/memremap.h b/include/linux/memremap.h index 3fddcfe57bb0..1cb5f39d25c1 100644 --- a/include/linux/memremap.h +++ b/include/linux/memremap.h @@ -113,8 +113,9 @@ typedef void (*dev_page_free_t)(struct page *page, void *data); struct dev_pagemap { dev_page_fault_t page_fault; dev_page_free_t page_free; - struct vmem_altmap *altmap; - const struct resource *res; + struct vmem_altmap altmap; + bool altmap_valid; + struct resource res; struct percpu_ref *ref; struct device *dev; void *data; diff --git a/kernel/memremap.c b/kernel/memremap.c index 12e78528fea4..9207c44cce20 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -188,13 +188,6 @@ static RADIX_TREE(pgmap_radix, GFP_KERNEL); #define SECTION_MASK ~((1UL << PA_SECTION_SHIFT) - 1) #define SECTION_SIZE (1UL << PA_SECTION_SHIFT) -struct page_map { - struct resource res; - struct percpu_ref *ref; - struct dev_pagemap pgmap; - struct vmem_altmap altmap; -}; - static unsigned long order_at(struct resource *res, unsigned long pgoff) { unsigned long phys_pgoff = PHYS_PFN(res->start) + pgoff; @@ -260,22 +253,21 @@ static void pgmap_radix_release(struct resource *res) synchronize_rcu(); } -static unsigned long pfn_first(struct page_map *page_map) +static unsigned long pfn_first(struct dev_pagemap *pgmap) { - struct dev_pagemap *pgmap = &page_map->pgmap; - const struct resource *res = &page_map->res; - struct vmem_altmap *altmap = pgmap->altmap; + const struct resource *res = &pgmap->res; + struct vmem_altmap *altmap = &pgmap->altmap; unsigned long pfn; pfn = res->start >> PAGE_SHIFT; - if (altmap) + if (pgmap->altmap_valid) pfn += vmem_altmap_offset(altmap); return pfn; } -static unsigned long pfn_end(struct page_map *page_map) +static unsigned long pfn_end(struct dev_pagemap *pgmap) { - const struct resource *res = &page_map->res; + const struct resource *res = &pgmap->res; return (res->start 
+ resource_size(res)) >> PAGE_SHIFT; } @@ -285,13 +277,12 @@ static unsigned long pfn_end(struct page_map *page_map) static void devm_memremap_pages_release(struct device *dev, void *data) { - struct page_map *page_map = data; - struct resource *res = &page_map->res; + struct dev_pagemap *pgmap = data; + struct resource *res = &pgmap->res; resource_size_t align_start, align_size; - struct dev_pagemap *pgmap = &page_map->pgmap; unsigned long pfn; - for_each_device_pfn(pfn, page_map) + for_each_device_pfn(pfn, pgmap) put_page(pfn_to_page(pfn)); if (percpu_ref_tryget_live(pgmap->ref)) { @@ -304,24 +295,22 @@ static void devm_memremap_pages_release(struct device *dev, void *data) align_size = ALIGN(resource_size(res), SECTION_SIZE); mem_hotplug_begin(); - arch_remove_memory(align_start, align_size, pgmap->altmap); + arch_remove_memory(align_start, align_size, pgmap->altmap_valid ? + &pgmap->altmap : NULL); mem_hotplug_done(); untrack_pfn(NULL, PHYS_PFN(align_start), align_size); pgmap_radix_release(res); - dev_WARN_ONCE(dev, pgmap->altmap && pgmap->altmap->alloc, - "%s: failed to free all reserved pages\n", __func__); + dev_WARN_ONCE(dev, pgmap->altmap.alloc, + "%s: failed to free all reserved pages\n", __func__); } /* assumes rcu_read_lock() held at entry */ static struct dev_pagemap *find_dev_pagemap(resource_size_t phys) { - struct page_map *page_map; - WARN_ON_ONCE(!rcu_read_lock_held()); - page_map = radix_tree_lookup(&pgmap_radix, PHYS_PFN(phys)); - return page_map ? &page_map->pgmap : NULL; + return radix_tree_lookup(&pgmap_radix, PHYS_PFN(phys)); } /** @@ -349,7 +338,6 @@ void *devm_memremap_pages(struct device *dev, struct resource *res, unsigned long pfn, pgoff, order; pgprot_t pgprot = PAGE_KERNEL; struct dev_pagemap *pgmap; - struct page_map *page_map; int error, nid, is_ram, i = 0; align_start = res->start & ~(SECTION_SIZE - 1); @@ -370,22 +358,20 @@ void *devm_memremap_pages(struct device *dev, struct resource *res, if (!ref) return ERR_PTR(-EINVAL); - page_map = devres_alloc_node(devm_memremap_pages_release, - sizeof(*page_map), GFP_KERNEL, dev_to_node(dev)); - if (!page_map) + pgmap = devres_alloc_node(devm_memremap_pages_release, + sizeof(*pgmap), GFP_KERNEL, dev_to_node(dev)); + if (!pgmap) return ERR_PTR(-ENOMEM); - pgmap = &page_map->pgmap; - memcpy(&page_map->res, res, sizeof(*res)); + memcpy(&pgmap->res, res, sizeof(*res)); pgmap->dev = dev; if (altmap) { - memcpy(&page_map->altmap, altmap, sizeof(*altmap)); - pgmap->altmap = &page_map->altmap; - altmap = pgmap->altmap; + memcpy(&pgmap->altmap, altmap, sizeof(*altmap)); + pgmap->altmap_valid = true; + altmap = &pgmap->altmap; } pgmap->ref = ref; - pgmap->res = &page_map->res; pgmap->type = MEMORY_DEVICE_HOST; pgmap->page_fault = NULL; pgmap->page_free = NULL; @@ -397,7 +383,7 @@ void *devm_memremap_pages(struct device *dev, struct resource *res, foreach_order_pgoff(res, order, pgoff) { error = __radix_tree_insert(&pgmap_radix, - PHYS_PFN(res->start) + pgoff, order, page_map); + PHYS_PFN(res->start) + pgoff, order, pgmap); if (error) { dev_err(dev, "%s: failed: %d\n", __func__, error); break; @@ -426,7 +412,7 @@ void *devm_memremap_pages(struct device *dev, struct resource *res, if (error) goto err_add_memory; - for_each_device_pfn(pfn, page_map) { + for_each_device_pfn(pfn, pgmap) { struct page *page = pfn_to_page(pfn); /* @@ -441,7 +427,7 @@ void *devm_memremap_pages(struct device *dev, struct resource *res, if (!(++i % 1024)) cond_resched(); } - devres_add(dev, page_map); + devres_add(dev, pgmap); return 
__va(res->start); err_add_memory: @@ -449,7 +435,7 @@ void *devm_memremap_pages(struct device *dev, struct resource *res, err_pfn_remap: err_radix: pgmap_radix_release(res); - devres_free(page_map); + devres_free(pgmap); return ERR_PTR(error); } EXPORT_SYMBOL(devm_memremap_pages); @@ -482,9 +468,7 @@ struct dev_pagemap *get_dev_pagemap(unsigned long pfn, * In the cached case we're already holding a live reference. */ if (pgmap) { - const struct resource *res = pgmap ? pgmap->res : NULL; - - if (res && phys >= res->start && phys <= res->end) + if (phys >= pgmap->res.start && phys <= pgmap->res.end) return pgmap; put_dev_pagemap(pgmap); } diff --git a/mm/hmm.c b/mm/hmm.c index 2f2e13c61040..320fdc87f064 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -882,7 +882,7 @@ static int hmm_devmem_pages_create(struct hmm_devmem *devmem) else devmem->pagemap.type = MEMORY_DEVICE_PRIVATE; - devmem->pagemap.res = devmem->resource; + devmem->pagemap.res = *devmem->resource; devmem->pagemap.page_fault = hmm_devmem_fault; devmem->pagemap.page_free = hmm_devmem_free; devmem->pagemap.dev = devmem->device; -- cgit v1.2.3 From e8d5134833006a46fcbefc5f4a84d0b62bd520e7 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 29 Dec 2017 08:54:05 +0100 Subject: memremap: change devm_memremap_pages interface to use struct dev_pagemap This new interface is similar to how struct device (and many others) work. The caller initializes a 'struct dev_pagemap' as required and calls 'devm_memremap_pages'. This allows the pagemap structure to be embedded in another structure and thus container_of can be used. In this way application specific members can be stored in a containing struct. This will be used by the P2P infrastructure and HMM could probably be cleaned up to use it as well (instead of having its own, similar 'hmm_devmem_pages_create' function). 
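For illustration, a hedged sketch of the embedding this enables (struct example_dev and its helpers are hypothetical; the devm_memremap_pages() calling convention matches this patch): struct example_dev { struct percpu_ref ref; struct dev_pagemap pgmap; /* embedded, not separately allocated */ }; /* container_of() recovers the driver structure from a pgmap pointer */ static struct example_dev *to_example_dev(struct dev_pagemap *pgmap) { return container_of(pgmap, struct example_dev, pgmap); } static int example_probe(struct device *dev, struct example_dev *edev) { void *addr; /* res, ref and type must be set up by the caller beforehand */ edev->pgmap.ref = &edev->ref; addr = devm_memremap_pages(dev, &edev->pgmap); return IS_ERR(addr) ? PTR_ERR(addr) : 0; }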
Signed-off-by: Logan Gunthorpe Signed-off-by: Christoph Hellwig Signed-off-by: Dan Williams --- drivers/dax/pmem.c | 20 ++++++++------- drivers/nvdimm/nd.h | 9 +++---- drivers/nvdimm/pfn_devs.c | 27 ++++++++++++--------- drivers/nvdimm/pmem.c | 37 +++++++++++++++------------- drivers/nvdimm/pmem.h | 1 + include/linux/memremap.h | 6 ++--- kernel/memremap.c | 51 ++++++++++++++++----------------------- tools/testing/nvdimm/test/iomap.c | 7 +++--- 8 files changed, 77 insertions(+), 81 deletions(-) (limited to 'kernel') diff --git a/drivers/dax/pmem.c b/drivers/dax/pmem.c index 8d8c852ba8f2..31b6ecce4c64 100644 --- a/drivers/dax/pmem.c +++ b/drivers/dax/pmem.c @@ -21,6 +21,7 @@ struct dax_pmem { struct device *dev; struct percpu_ref ref; + struct dev_pagemap pgmap; struct completion cmp; }; @@ -69,20 +70,23 @@ static int dax_pmem_probe(struct device *dev) struct nd_namespace_common *ndns; struct nd_dax *nd_dax = to_nd_dax(dev); struct nd_pfn *nd_pfn = &nd_dax->nd_pfn; - struct vmem_altmap __altmap, *altmap = NULL; ndns = nvdimm_namespace_common_probe(dev); if (IS_ERR(ndns)) return PTR_ERR(ndns); nsio = to_nd_namespace_io(&ndns->dev); + dax_pmem = devm_kzalloc(dev, sizeof(*dax_pmem), GFP_KERNEL); + if (!dax_pmem) + return -ENOMEM; + /* parse the 'pfn' info block via ->rw_bytes */ rc = devm_nsio_enable(dev, nsio); if (rc) return rc; - altmap = nvdimm_setup_pfn(nd_pfn, &res, &__altmap); - if (IS_ERR(altmap)) - return PTR_ERR(altmap); + rc = nvdimm_setup_pfn(nd_pfn, &dax_pmem->pgmap); + if (rc) + return rc; devm_nsio_disable(dev, nsio); pfn_sb = nd_pfn->pfn_sb; @@ -94,10 +98,6 @@ static int dax_pmem_probe(struct device *dev) return -EBUSY; } - dax_pmem = devm_kzalloc(dev, sizeof(*dax_pmem), GFP_KERNEL); - if (!dax_pmem) - return -ENOMEM; - dax_pmem->dev = dev; init_completion(&dax_pmem->cmp); rc = percpu_ref_init(&dax_pmem->ref, dax_pmem_percpu_release, 0, @@ -110,7 +110,8 @@ static int dax_pmem_probe(struct device *dev) if (rc) return rc; - addr = devm_memremap_pages(dev, &res, &dax_pmem->ref, altmap); + dax_pmem->pgmap.ref = &dax_pmem->ref; + addr = devm_memremap_pages(dev, &dax_pmem->pgmap); if (IS_ERR(addr)) return PTR_ERR(addr); @@ -120,6 +121,7 @@ static int dax_pmem_probe(struct device *dev) return rc; /* adjust the dax_region resource to the start of data */ + memcpy(&res, &dax_pmem->pgmap.res, sizeof(res)); res.start += le64_to_cpu(pfn_sb->dataoff); rc = sscanf(dev_name(&ndns->dev), "namespace%d.%d", ®ion_id, &id); diff --git a/drivers/nvdimm/nd.h b/drivers/nvdimm/nd.h index e958f3724c41..8d6375ee0fda 100644 --- a/drivers/nvdimm/nd.h +++ b/drivers/nvdimm/nd.h @@ -368,15 +368,14 @@ unsigned int pmem_sector_size(struct nd_namespace_common *ndns); void nvdimm_badblocks_populate(struct nd_region *nd_region, struct badblocks *bb, const struct resource *res); #if IS_ENABLED(CONFIG_ND_CLAIM) -struct vmem_altmap *nvdimm_setup_pfn(struct nd_pfn *nd_pfn, - struct resource *res, struct vmem_altmap *altmap); +int nvdimm_setup_pfn(struct nd_pfn *nd_pfn, struct dev_pagemap *pgmap); int devm_nsio_enable(struct device *dev, struct nd_namespace_io *nsio); void devm_nsio_disable(struct device *dev, struct nd_namespace_io *nsio); #else -static inline struct vmem_altmap *nvdimm_setup_pfn(struct nd_pfn *nd_pfn, - struct resource *res, struct vmem_altmap *altmap) +static inline int nvdimm_setup_pfn(struct nd_pfn *nd_pfn, + struct dev_pagemap *pgmap) { - return ERR_PTR(-ENXIO); + return -ENXIO; } static inline int devm_nsio_enable(struct device *dev, struct nd_namespace_io *nsio) diff --git 
a/drivers/nvdimm/pfn_devs.c b/drivers/nvdimm/pfn_devs.c index 2adada1a5855..f5c4e8c6e29d 100644 --- a/drivers/nvdimm/pfn_devs.c +++ b/drivers/nvdimm/pfn_devs.c @@ -542,9 +542,10 @@ static unsigned long init_altmap_reserve(resource_size_t base) return reserve; } -static struct vmem_altmap *__nvdimm_setup_pfn(struct nd_pfn *nd_pfn, - struct resource *res, struct vmem_altmap *altmap) +static int __nvdimm_setup_pfn(struct nd_pfn *nd_pfn, struct dev_pagemap *pgmap) { + struct resource *res = &pgmap->res; + struct vmem_altmap *altmap = &pgmap->altmap; struct nd_pfn_sb *pfn_sb = nd_pfn->pfn_sb; u64 offset = le64_to_cpu(pfn_sb->dataoff); u32 start_pad = __le32_to_cpu(pfn_sb->start_pad); @@ -561,11 +562,13 @@ static struct vmem_altmap *__nvdimm_setup_pfn(struct nd_pfn *nd_pfn, res->start += start_pad; res->end -= end_trunc; + pgmap->type = MEMORY_DEVICE_HOST; + if (nd_pfn->mode == PFN_MODE_RAM) { if (offset < SZ_8K) - return ERR_PTR(-EINVAL); + return -EINVAL; nd_pfn->npfns = le64_to_cpu(pfn_sb->npfns); - altmap = NULL; + pgmap->altmap_valid = false; } else if (nd_pfn->mode == PFN_MODE_PMEM) { nd_pfn->npfns = PFN_SECTION_ALIGN_UP((resource_size(res) - offset) / PAGE_SIZE); @@ -577,10 +580,11 @@ static struct vmem_altmap *__nvdimm_setup_pfn(struct nd_pfn *nd_pfn, memcpy(altmap, &__altmap, sizeof(*altmap)); altmap->free = PHYS_PFN(offset - SZ_8K); altmap->alloc = 0; + pgmap->altmap_valid = true; } else - return ERR_PTR(-ENXIO); + return -ENXIO; - return altmap; + return 0; } static u64 phys_pmem_align_down(struct nd_pfn *nd_pfn, u64 phys) @@ -708,19 +712,18 @@ static int nd_pfn_init(struct nd_pfn *nd_pfn) * Determine the effective resource range and vmem_altmap from an nd_pfn * instance. */ -struct vmem_altmap *nvdimm_setup_pfn(struct nd_pfn *nd_pfn, - struct resource *res, struct vmem_altmap *altmap) +int nvdimm_setup_pfn(struct nd_pfn *nd_pfn, struct dev_pagemap *pgmap) { int rc; if (!nd_pfn->uuid || !nd_pfn->ndns) - return ERR_PTR(-ENODEV); + return -ENODEV; rc = nd_pfn_init(nd_pfn); if (rc) - return ERR_PTR(rc); + return rc; - /* we need a valid pfn_sb before we can init a vmem_altmap */ - return __nvdimm_setup_pfn(nd_pfn, res, altmap); + /* we need a valid pfn_sb before we can init a dev_pagemap */ + return __nvdimm_setup_pfn(nd_pfn, pgmap); } EXPORT_SYMBOL_GPL(nvdimm_setup_pfn); diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c index 7fbc5c5dc8e1..cf074b1ce219 100644 --- a/drivers/nvdimm/pmem.c +++ b/drivers/nvdimm/pmem.c @@ -298,34 +298,34 @@ static int pmem_attach_disk(struct device *dev, { struct nd_namespace_io *nsio = to_nd_namespace_io(&ndns->dev); struct nd_region *nd_region = to_nd_region(dev->parent); - struct vmem_altmap __altmap, *altmap = NULL; int nid = dev_to_node(dev), fua, wbc; struct resource *res = &nsio->res; + struct resource bb_res; struct nd_pfn *nd_pfn = NULL; struct dax_device *dax_dev; struct nd_pfn_sb *pfn_sb; struct pmem_device *pmem; - struct resource pfn_res; struct request_queue *q; struct device *gendev; struct gendisk *disk; void *addr; + int rc; + + pmem = devm_kzalloc(dev, sizeof(*pmem), GFP_KERNEL); + if (!pmem) + return -ENOMEM; /* while nsio_rw_bytes is active, parse a pfn info block if present */ if (is_nd_pfn(dev)) { nd_pfn = to_nd_pfn(dev); - altmap = nvdimm_setup_pfn(nd_pfn, &pfn_res, &__altmap); - if (IS_ERR(altmap)) - return PTR_ERR(altmap); + rc = nvdimm_setup_pfn(nd_pfn, &pmem->pgmap); + if (rc) + return rc; } /* we're attaching a block device, disable raw namespace access */ devm_nsio_disable(dev, nsio); - pmem = devm_kzalloc(dev, 
sizeof(*pmem), GFP_KERNEL); - if (!pmem) - return -ENOMEM; - dev_set_drvdata(dev, pmem); pmem->phys_addr = res->start; pmem->size = resource_size(res); @@ -350,19 +350,22 @@ static int pmem_attach_disk(struct device *dev, return -ENOMEM; pmem->pfn_flags = PFN_DEV; + pmem->pgmap.ref = &q->q_usage_counter; if (is_nd_pfn(dev)) { - addr = devm_memremap_pages(dev, &pfn_res, &q->q_usage_counter, - altmap); + addr = devm_memremap_pages(dev, &pmem->pgmap); pfn_sb = nd_pfn->pfn_sb; pmem->data_offset = le64_to_cpu(pfn_sb->dataoff); - pmem->pfn_pad = resource_size(res) - resource_size(&pfn_res); + pmem->pfn_pad = resource_size(res) - + resource_size(&pmem->pgmap.res); pmem->pfn_flags |= PFN_MAP; - res = &pfn_res; /* for badblocks populate */ - res->start += pmem->data_offset; + memcpy(&bb_res, &pmem->pgmap.res, sizeof(bb_res)); + bb_res.start += pmem->data_offset; } else if (pmem_should_map_pages(dev)) { - addr = devm_memremap_pages(dev, &nsio->res, - &q->q_usage_counter, NULL); + memcpy(&pmem->pgmap.res, &nsio->res, sizeof(pmem->pgmap.res)); + pmem->pgmap.altmap_valid = false; + addr = devm_memremap_pages(dev, &pmem->pgmap); pmem->pfn_flags |= PFN_MAP; + memcpy(&bb_res, &pmem->pgmap.res, sizeof(bb_res)); } else addr = devm_memremap(dev, pmem->phys_addr, pmem->size, ARCH_MEMREMAP_PMEM); @@ -401,7 +404,7 @@ static int pmem_attach_disk(struct device *dev, / 512); if (devm_init_badblocks(dev, &pmem->bb)) return -ENOMEM; - nvdimm_badblocks_populate(nd_region, &pmem->bb, res); + nvdimm_badblocks_populate(nd_region, &pmem->bb, &bb_res); disk->bb = &pmem->bb; dax_dev = alloc_dax(pmem, disk->disk_name, &pmem_dax_ops); diff --git a/drivers/nvdimm/pmem.h b/drivers/nvdimm/pmem.h index 6a3cd2a10db6..a64ebc78b5df 100644 --- a/drivers/nvdimm/pmem.h +++ b/drivers/nvdimm/pmem.h @@ -22,6 +22,7 @@ struct pmem_device { struct badblocks bb; struct dax_device *dax_dev; struct gendisk *disk; + struct dev_pagemap pgmap; }; long __pmem_direct_access(struct pmem_device *pmem, pgoff_t pgoff, diff --git a/include/linux/memremap.h b/include/linux/memremap.h index 1cb5f39d25c1..7b4899c06f49 100644 --- a/include/linux/memremap.h +++ b/include/linux/memremap.h @@ -123,8 +123,7 @@ struct dev_pagemap { }; #ifdef CONFIG_ZONE_DEVICE -void *devm_memremap_pages(struct device *dev, struct resource *res, - struct percpu_ref *ref, struct vmem_altmap *altmap); +void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap); struct dev_pagemap *get_dev_pagemap(unsigned long pfn, struct dev_pagemap *pgmap); @@ -134,8 +133,7 @@ void vmem_altmap_free(struct vmem_altmap *altmap, unsigned long nr_pfns); static inline bool is_zone_device_page(const struct page *page); #else static inline void *devm_memremap_pages(struct device *dev, - struct resource *res, struct percpu_ref *ref, - struct vmem_altmap *altmap) + struct dev_pagemap *pgmap) { /* * Fail attempts to call devm_memremap_pages() without diff --git a/kernel/memremap.c b/kernel/memremap.c index 9207c44cce20..a9a948cd3d7f 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -275,9 +275,10 @@ static unsigned long pfn_end(struct dev_pagemap *pgmap) #define for_each_device_pfn(pfn, map) \ for (pfn = pfn_first(map); pfn < pfn_end(map); pfn++) -static void devm_memremap_pages_release(struct device *dev, void *data) +static void devm_memremap_pages_release(void *data) { struct dev_pagemap *pgmap = data; + struct device *dev = pgmap->dev; struct resource *res = &pgmap->res; resource_size_t align_start, align_size; unsigned long pfn; @@ -316,29 +317,34 @@ static struct dev_pagemap 
*find_dev_pagemap(resource_size_t phys) /** * devm_memremap_pages - remap and provide memmap backing for the given resource * @dev: hosting device for @res - * @res: "host memory" address range - * @ref: a live per-cpu reference count - * @altmap: optional descriptor for allocating the memmap from @res + * @pgmap: pointer to a struct dev_pgmap * * Notes: - * 1/ @ref must be 'live' on entry and 'dead' before devm_memunmap_pages() time - * (or devm release event). The expected order of events is that @ref has + * 1/ At a minimum the res, ref and type members of @pgmap must be initialized + * by the caller before passing it to this function + * + * 2/ The altmap field may optionally be initialized, in which case altmap_valid + * must be set to true + * + * 3/ pgmap.ref must be 'live' on entry and 'dead' before devm_memunmap_pages() + * time (or devm release event). The expected order of events is that ref has * been through percpu_ref_kill() before devm_memremap_pages_release(). The * wait for the completion of all references being dropped and * percpu_ref_exit() must occur after devm_memremap_pages_release(). * - * 2/ @res is expected to be a host memory range that could feasibly be + * 4/ res is expected to be a host memory range that could feasibly be * treated as a "System RAM" range, i.e. not a device mmio range, but * this is not enforced. */ -void *devm_memremap_pages(struct device *dev, struct resource *res, - struct percpu_ref *ref, struct vmem_altmap *altmap) +void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) { resource_size_t align_start, align_size, align_end; + struct vmem_altmap *altmap = pgmap->altmap_valid ? + &pgmap->altmap : NULL; unsigned long pfn, pgoff, order; pgprot_t pgprot = PAGE_KERNEL; - struct dev_pagemap *pgmap; int error, nid, is_ram, i = 0; + struct resource *res = &pgmap->res; align_start = res->start & ~(SECTION_SIZE - 1); align_size = ALIGN(res->start + resource_size(res), SECTION_SIZE) @@ -355,27 +361,10 @@ void *devm_memremap_pages(struct device *dev, struct resource *res, if (is_ram == REGION_INTERSECTS) return __va(res->start); - if (!ref) + if (!pgmap->ref) return ERR_PTR(-EINVAL); - pgmap = devres_alloc_node(devm_memremap_pages_release, - sizeof(*pgmap), GFP_KERNEL, dev_to_node(dev)); - if (!pgmap) - return ERR_PTR(-ENOMEM); - - memcpy(&pgmap->res, res, sizeof(*res)); - pgmap->dev = dev; - if (altmap) { - memcpy(&pgmap->altmap, altmap, sizeof(*altmap)); - pgmap->altmap_valid = true; - altmap = &pgmap->altmap; - } - pgmap->ref = ref; - pgmap->type = MEMORY_DEVICE_HOST; - pgmap->page_fault = NULL; - pgmap->page_free = NULL; - pgmap->data = NULL; mutex_lock(&pgmap_lock); error = 0; @@ -423,11 +412,13 @@ void *devm_memremap_pages(struct device *dev, struct resource *res, */ list_del(&page->lru); page->pgmap = pgmap; - percpu_ref_get(ref); + percpu_ref_get(pgmap->ref); if (!(++i % 1024)) cond_resched(); } - devres_add(dev, pgmap); + + devm_add_action(dev, devm_memremap_pages_release, pgmap); + return __va(res->start); err_add_memory: diff --git a/tools/testing/nvdimm/test/iomap.c b/tools/testing/nvdimm/test/iomap.c index e1f75a1914a1..ff9d3a5825e1 100644 --- a/tools/testing/nvdimm/test/iomap.c +++ b/tools/testing/nvdimm/test/iomap.c @@ -104,15 +104,14 @@ void *__wrap_devm_memremap(struct device *dev, resource_size_t offset, } EXPORT_SYMBOL(__wrap_devm_memremap); -void *__wrap_devm_memremap_pages(struct device *dev, struct resource *res, - struct percpu_ref *ref, struct vmem_altmap *altmap) +void *__wrap_devm_memremap_pages(struct device 
*dev, struct dev_pagemap *pgmap) { - resource_size_t offset = res->start; + resource_size_t offset = pgmap->res.start; struct nfit_test_resource *nfit_res = get_nfit_res(offset); if (nfit_res) return nfit_res->buf + offset - nfit_res->res.start; - return devm_memremap_pages(dev, res, ref, altmap); + return devm_memremap_pages(dev, pgmap); } EXPORT_SYMBOL(__wrap_devm_memremap_pages); -- cgit v1.2.3 From e697c5b90e97792187e45f8d78fb2bfa62eb0496 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 29 Dec 2017 08:54:06 +0100 Subject: memremap: merge find_dev_pagemap into get_dev_pagemap There is only one caller of the trivial function find_dev_pagemap left, so just merge it into the caller. Signed-off-by: Christoph Hellwig Signed-off-by: Dan Williams --- kernel/memremap.c | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/memremap.c b/kernel/memremap.c index a9a948cd3d7f..ada31b0d76d4 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -306,14 +306,6 @@ static void devm_memremap_pages_release(void *data) "%s: failed to free all reserved pages\n", __func__); } -/* assumes rcu_read_lock() held at entry */ -static struct dev_pagemap *find_dev_pagemap(resource_size_t phys) -{ - WARN_ON_ONCE(!rcu_read_lock_held()); - - return radix_tree_lookup(&pgmap_radix, PHYS_PFN(phys)); -} - /** * devm_memremap_pages - remap and provide memmap backing for the given resource * @dev: hosting device for @res @@ -466,7 +458,7 @@ struct dev_pagemap *get_dev_pagemap(unsigned long pfn, /* fall back to slow path lookup */ rcu_read_lock(); - pgmap = find_dev_pagemap(phys); + pgmap = radix_tree_lookup(&pgmap_radix, PHYS_PFN(phys)); if (pgmap && !percpu_ref_tryget_live(pgmap->ref)) pgmap = NULL; rcu_read_unlock(); -- cgit v1.2.3 From b865ea64304ed591b7ab92d74efb12eff5ff4cbb Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Fri, 10 Nov 2017 08:48:25 +0900 Subject: sections: split dereference_function_descriptor() There are two format specifiers to print out a pointer in symbolic format: '%pS/%ps' and '%pF/%pf'. On most architectures, the two mean exactly the same thing, but some architectures (ia64, ppc64, parisc64) use an indirect pointer for C function pointers, where the function pointer points to a function descriptor (which in turn contains the actual pointer to the code). The '%pF/%pf', when used appropriately, automatically does the appropriate function descriptor dereference on such architectures. The "when used appropriately" part is tricky. Basically this is a subtle ABI detail, specific to some platforms, that made it to the API level and people can be unaware of it and miss the whole "we need to dereference the function" business out. [1] proves that point (note that it fixes only '%pF' and '%pS', there might be '%pf' and '%ps' cases as well). It appears that we can handle everything within the affected arches and make '%pS/%ps' smart enough to retire '%pF/%pf'. Function descriptors live in .opd elf section and all affected arches (ia64, ppc64, parisc64) handle it properly for kernel and modules. So we, technically, can decide if the dereference is needed by simply looking at the pointer: if it belongs to .opd section then we need to dereference it. The kernel and modules have their own .opd sections, obviously, that's why we need to split dereference_function_descriptor() and use separate kernel and module dereference arch callbacks. This patch does the first step, it a) adds dereference_kernel_function_descriptor() function. 
b) adds a weak alias to dereference_module_function_descriptor() function. So, for the time being, we will have: 1) dereference_function_descriptor() A generic function, that simply dereferences the pointer. There is a bunch of places that call it: kgdbts, init/main.c, extable, etc. 2) dereference_kernel_function_descriptor() A function to call on kernel symbols that does kernel .opd section address range test. 3) dereference_module_function_descriptor() A function to call on modules' symbols that does modules' .opd section address range test. [1] https://marc.info/?l=linux-kernel&m=150472969730573 Link: http://lkml.kernel.org/r/20171109234830.5067-2-sergey.senozhatsky@gmail.com To: Fenghua Yu To: Benjamin Herrenschmidt To: Paul Mackerras To: Michael Ellerman To: James Bottomley Cc: Andrew Morton Cc: Jessica Yu Cc: Steven Rostedt Cc: linux-ia64@vger.kernel.org Cc: linux-parisc@vger.kernel.org Cc: linuxppc-dev@lists.ozlabs.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Sergey Senozhatsky Tested-by: Tony Luck #ia64 Tested-by: Santosh Sivaraj #powerpc Tested-by: Helge Deller #parisc64 Signed-off-by: Petr Mladek --- include/asm-generic/sections.h | 8 ++++++-- include/linux/module.h | 10 ++++++++++ kernel/module.c | 6 ++++++ 3 files changed, 22 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/include/asm-generic/sections.h b/include/asm-generic/sections.h index 03cc5f9bba71..849cd8eb5ca0 100644 --- a/include/asm-generic/sections.h +++ b/include/asm-generic/sections.h @@ -30,6 +30,7 @@ * __ctors_start, __ctors_end * __irqentry_text_start, __irqentry_text_end * __softirqentry_text_start, __softirqentry_text_end + * __start_opd, __end_opd */ extern char _text[], _stext[], _etext[]; extern char _data[], _sdata[], _edata[]; @@ -49,12 +50,15 @@ extern char __start_once[], __end_once[]; /* Start and end of .ctors section - used for constructor calls. */ extern char __ctors_start[], __ctors_end[]; +/* Start and end of .opd section - used for function descriptors. */ +extern char __start_opd[], __end_opd[]; + extern __visible const void __nosave_begin, __nosave_end; -/* function descriptor handling (if any). Override - * in asm/sections.h */ +/* Function descriptor handling (if any). Override in asm/sections.h */ #ifndef dereference_function_descriptor #define dereference_function_descriptor(p) (p) +#define dereference_kernel_function_descriptor(p) (p) #endif /* random extra sections (if any). Override diff --git a/include/linux/module.h b/include/linux/module.h index c69b49abe877..e6249795f9e2 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -606,6 +606,9 @@ int ref_module(struct module *a, struct module *b); __mod ? __mod->name : "kernel"; \ }) +/* Dereference module function descriptor */ +void *dereference_module_function_descriptor(struct module *mod, void *ptr); + /* For kallsyms to ask for address resolution. namebuf should be at * least KSYM_NAME_LEN long: a pointer to namebuf is returned if * found, otherwise NULL. 
*/ @@ -760,6 +763,13 @@ static inline bool is_module_sig_enforced(void) return false; } +/* Dereference module function descriptor */ +static inline +void *dereference_module_function_descriptor(struct module *mod, void *ptr) +{ + return ptr; +} + #endif /* CONFIG_MODULES */ #ifdef CONFIG_SYSFS diff --git a/kernel/module.c b/kernel/module.c index f0411a271765..65f6561d70e1 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -3938,6 +3938,12 @@ static const char *get_ksymbol(struct module *mod, return symname(kallsyms, best); } +void * __weak dereference_module_function_descriptor(struct module *mod, + void *ptr) +{ + return ptr; +} + /* For kallsyms to ask for address resolution. NULL means not found. Careful * not to lock to avoid deadlock on oopses, simply disable preemption. */ const char *module_address_lookup(unsigned long addr, -- cgit v1.2.3 From 04b8eb7a4ccd9ef9343e2720ccf2a5db8cfe2f67 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Wed, 6 Dec 2017 13:36:49 +0900 Subject: symbol lookup: introduce dereference_symbol_descriptor() dereference_symbol_descriptor() invokes appropriate ARCH specific function descriptor dereference callbacks: - dereference_kernel_function_descriptor() if the pointer is a kernel symbol; - dereference_module_function_descriptor() if the pointer is a module symbol. This is the last step needed to make '%pS/%ps' smart enough to handle function descriptor dereference on affected ARCHs and to retire '%pF/%pf'. To refresh it: Some architectures (ia64, ppc64, parisc64) use an indirect pointer for C function pointers - the function pointer points to a function descriptor and we need to dereference it to get the actual function pointer. Function descriptors live in .opd elf section and all affected ARCHs (ia64, ppc64, parisc64) handle it properly for kernel and modules. So we, technically, can decide if the dereference is needed by simply looking at the pointer: if it belongs to .opd section then we need to dereference it. The kernel and modules have their own .opd sections, obviously, that's why we need to split dereference_function_descriptor() and use separate kernel and module dereference arch callbacks. 
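For illustration, a sketch of what an arch-side kernel callback can look like once the __start_opd/__end_opd markers exist (the real ia64/ppc64/parisc64 implementations are added by separate patches, so this is an assumption about their shape, not a quote): /* A pointer is a function descriptor iff it falls inside the * kernel's .opd section; only then is it dereferenced. */ void *dereference_kernel_function_descriptor(void *ptr) { if (ptr < (void *)__start_opd || ptr >= (void *)__end_opd) return ptr; return dereference_function_descriptor(ptr); }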
Link: http://lkml.kernel.org/r/20171206043649.GB15885@jagdpanzerIV Cc: Fenghua Yu Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Michael Ellerman Cc: James Bottomley Cc: Andrew Morton Cc: Jessica Yu Cc: Steven Rostedt Cc: linux-ia64@vger.kernel.org Cc: linux-parisc@vger.kernel.org Cc: linuxppc-dev@lists.ozlabs.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Sergey Senozhatsky Tested-by: Tony Luck #ia64 Tested-by: Santosh Sivaraj #powerpc Tested-by: Helge Deller #parisc64 Signed-off-by: Petr Mladek --- Documentation/printk-formats.txt | 35 +++++++++----------------- include/linux/kallsyms.h | 54 ++++++++++++++++++++++++++++++++++++++++ kernel/kallsyms.c | 35 -------------------------- lib/vsprintf.c | 5 ++-- 4 files changed, 68 insertions(+), 61 deletions(-) (limited to 'kernel') diff --git a/Documentation/printk-formats.txt b/Documentation/printk-formats.txt index 361789df51ec..58c44cce90b6 100644 --- a/Documentation/printk-formats.txt +++ b/Documentation/printk-formats.txt @@ -50,42 +50,31 @@ Symbols/Function Pointers :: + %pS versatile_init+0x0/0x110 + %ps versatile_init %pF versatile_init+0x0/0x110 %pf versatile_init - %pS versatile_init+0x0/0x110 %pSR versatile_init+0x9/0x110 (with __builtin_extract_return_addr() translation) - %ps versatile_init %pB prev_fn_of_versatile_init+0x88/0x88 -The ``F`` and ``f`` specifiers are for printing function pointers, -for example, f->func, &gettimeofday. They have the same result as -``S`` and ``s`` specifiers. But they do an extra conversion on -ia64, ppc64 and parisc64 architectures where the function pointers -are actually function descriptors. +The ``S`` and ``s`` specifiers are used for printing a pointer in symbolic +format. They result in the symbol name with (``S``) or without (``s``) +offsets. If KALLSYMS are disabled then the symbol address is printed instead. -The ``S`` and ``s`` specifiers can be used for printing symbols -from direct addresses, for example, __builtin_return_address(0), -(void *)regs->ip. They result in the symbol name with (``S``) or -without (``s``) offsets. If KALLSYMS are disabled then the symbol -address is printed instead. +Note, that the ``F`` and ``f`` specifiers are identical to ``S`` (``s``) +and thus deprecated. We have ``F`` and ``f`` because on ia64, ppc64 and +parisc64 function pointers are indirect and, in fact, are function +descriptors, which require additional dereferencing before we can lookup +the symbol. As of now, ``S`` and ``s`` perform dereferencing on those +platforms (when needed), so ``F`` and ``f`` exist for compatibility +reasons only. The ``B`` specifier results in the symbol name with offsets and should be used when printing stack backtraces. The specifier takes into consideration the effect of compiler optimisations which may occur when tail-call``s are used and marked with the noreturn GCC attribute. -Examples:: - - printk("Going to call: %pF\n", gettimeofday); - printk("Going to call: %pF\n", p->func); - printk("%s: called from %pS\n", __func__, (void *)_RET_IP_); - printk("%s: called from %pS\n", __func__, - (void *)__builtin_return_address(0)); - printk("Faulted at %pS\n", (void *)regs->ip); - printk(" %s%pB\n", (reliable ? "" : "? 
"), (void *)*stack); - - Kernel Pointers =============== diff --git a/include/linux/kallsyms.h b/include/linux/kallsyms.h index 708f337d780b..e4f2e5a65f14 100644 --- a/include/linux/kallsyms.h +++ b/include/linux/kallsyms.h @@ -9,6 +9,10 @@ #include #include #include +#include +#include + +#include #define KSYM_NAME_LEN 128 #define KSYM_SYMBOL_LEN (sizeof("%s+%#lx/%#lx [%s]") + (KSYM_NAME_LEN - 1) + \ @@ -22,6 +26,56 @@ struct module; +static inline int is_kernel_inittext(unsigned long addr) +{ + if (addr >= (unsigned long)_sinittext + && addr <= (unsigned long)_einittext) + return 1; + return 0; +} + +static inline int is_kernel_text(unsigned long addr) +{ + if ((addr >= (unsigned long)_stext && addr <= (unsigned long)_etext) || + arch_is_kernel_text(addr)) + return 1; + return in_gate_area_no_mm(addr); +} + +static inline int is_kernel(unsigned long addr) +{ + if (addr >= (unsigned long)_stext && addr <= (unsigned long)_end) + return 1; + return in_gate_area_no_mm(addr); +} + +static inline int is_ksym_addr(unsigned long addr) +{ + if (IS_ENABLED(CONFIG_KALLSYMS_ALL)) + return is_kernel(addr); + + return is_kernel_text(addr) || is_kernel_inittext(addr); +} + +static inline void *dereference_symbol_descriptor(void *ptr) +{ +#ifdef HAVE_DEREFERENCE_FUNCTION_DESCRIPTOR + struct module *mod; + + ptr = dereference_kernel_function_descriptor(ptr); + if (is_ksym_addr((unsigned long)ptr)) + return ptr; + + preempt_disable(); + mod = __module_address((unsigned long)ptr); + preempt_enable(); + + if (mod) + ptr = dereference_module_function_descriptor(mod, ptr); +#endif + return ptr; +} + #ifdef CONFIG_KALLSYMS /* Lookup the address for a symbol. Returns 0 if not found. */ unsigned long kallsyms_lookup_name(const char *name); diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 531ffa984bc2..0e4c0922908a 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -12,7 +12,6 @@ * compression (see scripts/kallsyms.c for a more complete description) */ #include -#include #include #include #include @@ -20,15 +19,12 @@ #include #include #include /* for cond_resched */ -#include #include #include #include #include #include -#include - /* * These will be re-linked against their real values * during the second link stage. 
@@ -52,37 +48,6 @@ extern const u16 kallsyms_token_index[] __weak; extern const unsigned long kallsyms_markers[] __weak; -static inline int is_kernel_inittext(unsigned long addr) -{ - if (addr >= (unsigned long)_sinittext - && addr <= (unsigned long)_einittext) - return 1; - return 0; -} - -static inline int is_kernel_text(unsigned long addr) -{ - if ((addr >= (unsigned long)_stext && addr <= (unsigned long)_etext) || - arch_is_kernel_text(addr)) - return 1; - return in_gate_area_no_mm(addr); -} - -static inline int is_kernel(unsigned long addr) -{ - if (addr >= (unsigned long)_stext && addr <= (unsigned long)_end) - return 1; - return in_gate_area_no_mm(addr); -} - -static int is_ksym_addr(unsigned long addr) -{ - if (IS_ENABLED(CONFIG_KALLSYMS_ALL)) - return is_kernel(addr); - - return is_kernel_text(addr) || is_kernel_inittext(addr); -} - /* * Expand a compressed symbol data into the resulting uncompressed string, * if uncompressed string is too long (>= maxlen), it will be truncated, diff --git a/lib/vsprintf.c b/lib/vsprintf.c index 1746bae94d41..16e2eefb0f79 100644 --- a/lib/vsprintf.c +++ b/lib/vsprintf.c @@ -40,7 +40,6 @@ #include "../mm/internal.h" /* For the trace_print_flags arrays */ #include <asm/page.h> /* for PAGE_SIZE */ -#include <asm/sections.h> /* for dereference_function_descriptor() */ #include <asm/byteorder.h> /* cpu_to_le16 */ #include @@ -1723,10 +1722,10 @@ char *pointer(const char *fmt, char *buf, char *end, void *ptr, switch (*fmt) { case 'F': case 'f': - ptr = dereference_function_descriptor(ptr); - /* Fallthrough */ case 'S': case 's': + ptr = dereference_symbol_descriptor(ptr); + /* Fallthrough */ case 'B': return symbol_string(buf, end, ptr, spec, fmt); case 'R': -- cgit v1.2.3 From 430e68d10baf55e4c40d4dd1de8201c1caf5dddd Mon Sep 17 00:00:00 2001 From: Quentin Monnet Date: Wed, 10 Jan 2018 12:26:06 +0000 Subject: bpf: export function to write into verifier log buffer Rename the BPF verifier `verbose()` to `bpf_verifier_log_write()` and export it, so that other components (in particular, drivers for BPF offload) can reuse the user buffer log to dump error messages at verification time. Renaming `verbose()` was necessary in order to avoid exporting such a generic name to the global namespace. However, to prevent too much pain for backports, the calls to `verbose()` in the kernel BPF verifier were not changed. Instead, use function aliasing to make `verbose` point to `bpf_verifier_log_write`. Another solution would be to make a wrapper around `verbose()`, but since it is a variadic function, I don't see a clean way to do that without creating two identical wrappers, one for the verifier and one to export. 
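For illustration, the aliasing trick in a standalone, hypothetical userspace C program (not kernel code; it only miniaturizes the technique used in the diff below): #include <stdarg.h> #include <stdio.h> /* Two names, one body: an alias avoids the va_list plumbing that * wrapping a variadic function would otherwise require. */ __attribute__((format(printf, 1, 2))) void exported_log(const char *fmt, ...) { va_list args; va_start(args, fmt); vprintf(fmt, args); va_end(args); } static __attribute__((format(printf, 1, 2), alias("exported_log"))) void verbose(const char *fmt, ...); int main(void) { verbose("value = %d\n", 42); /* resolves to exported_log() */ return 0; }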
Signed-off-by: Quentin Monnet Reviewed-by: Jakub Kicinski Signed-off-by: Daniel Borkmann --- include/linux/bpf_verifier.h | 3 +++ kernel/bpf/verifier.c | 16 ++++++++++++---- 2 files changed, 15 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index 2feb218c001d..6b66cd1aa0b9 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -192,6 +192,9 @@ struct bpf_verifier_env { u32 subprog_cnt; }; +__printf(2, 3) void bpf_verifier_log_write(struct bpf_verifier_env *env, + const char *fmt, ...); + static inline struct bpf_reg_state *cur_regs(struct bpf_verifier_env *env) { struct bpf_verifier_state *cur = env->cur_state; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index d921ab387b0b..3b2b47666180 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -169,11 +169,11 @@ struct bpf_call_arg_meta { static DEFINE_MUTEX(bpf_verifier_lock); /* log_level controls verbosity level of eBPF verifier. - * verbose() is used to dump the verification trace to the log, so the user - * can figure out what's wrong with the program + * bpf_verifier_log_write() is used to dump the verification trace to the log, + * so the user can figure out what's wrong with the program */ -static __printf(2, 3) void verbose(struct bpf_verifier_env *env, - const char *fmt, ...) +__printf(2, 3) void bpf_verifier_log_write(struct bpf_verifier_env *env, + const char *fmt, ...) { struct bpf_verifer_log *log = &env->log; unsigned int n; @@ -197,6 +197,14 @@ static __printf(2, 3) void verbose(struct bpf_verifier_env *env, else log->ubuf = NULL; } +EXPORT_SYMBOL_GPL(bpf_verifier_log_write); +/* Historically bpf_verifier_log_write was called verbose, but the name was too + * generic for symbol export. The function was renamed, but not the calls in + * the verifier to avoid complicating backports. Hence the alias below. + */ +static __printf(2, 3) void verbose(struct bpf_verifier_env *env, + const char *fmt, ...) + __attribute__((alias("bpf_verifier_log_write"))); static bool type_is_pkt_pointer(enum bpf_reg_type type) { -- cgit v1.2.3 From d0807da78e11d46f18399cbf8c4028c731346766 Mon Sep 17 00:00:00 2001 From: Miroslav Benes Date: Wed, 10 Jan 2018 11:01:28 +0100 Subject: livepatch: Remove immediate feature The immediate flag has been used to disable per-task consistency and patch all tasks immediately. It could be useful if the patch doesn't change any function or data semantics. However, it causes problems on its own. The consistency model is currently broken with respect to immediate patches:

	func		a
	patches		1i
			2i
			3

When patch 3 is applied, only the 2i function is checked (by the stack checking facility). There might be a task sleeping in 1i though. Such a task is migrated to 3, because we do not check 1i in klp_check_stack_func() at all. The coming atomic replace feature would be easier to implement and more reliable without immediate. Thus, remove the immediate feature completely and save us from these problems. Note that the force feature has a similar problem. However, it is considered a last resort. If used, the administrator should not apply any new live patches and should plan to reboot into an updated kernel. Architectures now need to provide HAVE_RELIABLE_STACKTRACE to fully support livepatch. 
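For illustration, what a patch module's structures look like after the removal, modeled on samples/livepatch/livepatch-sample.c as updated by this patch (a sketch, not the full sample): static struct klp_func funcs[] = { { .old_name = "cmdline_proc_show", .new_func = livepatch_cmdline_proc_show, }, { } }; static struct klp_object objs[] = { { /* NULL name means vmlinux */ .funcs = funcs, }, { } }; /* no .immediate member left to set, at any level */ static struct klp_patch patch = { .mod = THIS_MODULE, .objs = objs, };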
Signed-off-by: Miroslav Benes Acked-by: Josh Poimboeuf Signed-off-by: Jiri Kosina --- Documentation/livepatch/livepatch.txt | 89 +++++++++------------------- include/linux/livepatch.h | 4 -- kernel/livepatch/core.c | 12 +--- kernel/livepatch/transition.c | 49 ++------------- samples/livepatch/livepatch-callbacks-demo.c | 15 ----- samples/livepatch/livepatch-sample.c | 15 ----- samples/livepatch/livepatch-shadow-fix1.c | 15 ----- samples/livepatch/livepatch-shadow-fix2.c | 15 ----- 8 files changed, 33 insertions(+), 181 deletions(-) (limited to 'kernel') diff --git a/Documentation/livepatch/livepatch.txt b/Documentation/livepatch/livepatch.txt index 896ba8941702..1ae2de758c08 100644 --- a/Documentation/livepatch/livepatch.txt +++ b/Documentation/livepatch/livepatch.txt @@ -72,8 +72,7 @@ example, they add a NULL pointer or a boundary check, fix a race by adding a missing memory barrier, or add some locking around a critical section. Most of these changes are self contained and the function presents itself the same way to the rest of the system. In this case, the functions might -be updated independently one by one. (This can be done by setting the -'immediate' flag in the klp_patch struct.) +be updated independently one by one. But there are more complex fixes. For example, a patch might change ordering of locking in multiple functions at the same time. Or a patch @@ -125,12 +124,6 @@ safe to patch tasks: b) Patching CPU-bound user tasks. If the task is highly CPU-bound then it will get patched the next time it gets interrupted by an IRQ. - c) In the future it could be useful for applying patches for - architectures which don't yet have HAVE_RELIABLE_STACKTRACE. In - this case you would have to signal most of the tasks on the - system. However this isn't supported yet because there's - currently no way to patch kthreads without - HAVE_RELIABLE_STACKTRACE. 3. For idle "swapper" tasks, since they don't ever exit the kernel, they instead have a klp_update_patch_state() call in the idle loop which @@ -138,27 +131,16 @@ safe to patch tasks: (Note there's not yet such an approach for kthreads.) -All the above approaches may be skipped by setting the 'immediate' flag -in the 'klp_patch' struct, which will disable per-task consistency and -patch all tasks immediately. This can be useful if the patch doesn't -change any function or data semantics. Note that, even with this flag -set, it's possible that some tasks may still be running with an old -version of the function, until that function returns. +Architectures which don't have HAVE_RELIABLE_STACKTRACE solely rely on +the second approach. It's highly likely that some tasks may still be +running with an old version of the function, until that function +returns. In this case you would have to signal the tasks. This +especially applies to kthreads. They may not be woken up and would need +to be forced. See below for more information. -There's also an 'immediate' flag in the 'klp_func' struct which allows -you to specify that certain functions in the patch can be applied -without per-task consistency. This might be useful if you want to patch -a common function like schedule(), and the function change doesn't need -consistency but the rest of the patch does. - -For architectures which don't have HAVE_RELIABLE_STACKTRACE, the user -must set patch->immediate which causes all tasks to be patched -immediately. This option should be used with care, only when the patch -doesn't change any function or data semantics. 
- -In the future, architectures which don't have HAVE_RELIABLE_STACKTRACE -may be allowed to use per-task consistency if we can come up with -another way to patch kthreads. +Unless we can come up with another way to patch kthreads, architectures +without HAVE_RELIABLE_STACKTRACE are not considered fully supported by +the kernel livepatching. The /sys/kernel/livepatch//transition file shows whether a patch is in transition. Only a single patch (the topmost patch on the stack) @@ -197,6 +179,11 @@ modules is permanently disabled when the force feature is used. It cannot be guaranteed there is no task sleeping in such module. It implies unbounded reference count if a patch module is disabled and enabled in a loop. +Moreover, the usage of force may also affect future applications of live +patches and cause even more harm to the system. Administrator should first +consider to simply cancel a transition (see above). If force is used, reboot +should be planned and no more live patches applied. + 3.1 Adding consistency model support to new architectures --------------------------------------------------------- @@ -234,13 +221,6 @@ few options: a good backup option for those architectures which don't have reliable stack traces yet. -In the meantime, patches for such architectures can bypass the -consistency model by setting klp_patch.immediate to true. This option -is perfectly fine for patches which don't change the semantics of the -patched functions. In practice, this is usable for ~90% of security -fixes. Use of this option also means the patch can't be unloaded after -it has been disabled. - 4. Livepatch module =================== @@ -296,9 +276,6 @@ into three levels: only for a particular object ( vmlinux or a kernel module ). Note that kallsyms allows for searching symbols according to the object name. - There's also an 'immediate' flag which, when set, patches the - function immediately, bypassing the consistency model safety checks. - + struct klp_object defines an array of patched functions (struct klp_func) in the same object. Where the object is either vmlinux (NULL) or a module name. @@ -317,9 +294,6 @@ into three levels: symbols are found. The only exception are symbols from objects (kernel modules) that have not been loaded yet. - Setting the 'immediate' flag applies the patch to all tasks - immediately, bypassing the consistency model safety checks. - For more details on how the patch is applied on a per-task basis, see the "Consistency model" section. @@ -334,14 +308,12 @@ section "Livepatch life-cycle" below for more details about these two operations. Module removal is only safe when there are no users of the underlying -functions. The immediate consistency model is not able to detect this. The -code just redirects the functions at the very beginning and it does not -check if the functions are in use. In other words, it knows when the -functions get called but it does not know when the functions return. -Therefore it cannot be decided when the livepatch module can be safely -removed. This is solved by a hybrid consistency model. When the system is -transitioned to a new patch state (patched/unpatched) it is guaranteed that -no task sleeps or runs in the old code. +functions. This is the reason why the force feature permanently disables +the removal. The forced tasks entered the functions but we cannot say +that they returned back. Therefore it cannot be decided when the +livepatch module can be safely removed. 
When the system is successfully +transitioned to a new patch state (patched/unpatched) without being +forced it is guaranteed that no task sleeps or runs in the old code. 5. Livepatch life-cycle @@ -355,19 +327,12 @@ First, the patch is applied only when all patched symbols for already loaded objects are found. The error handling is much easier if this check is done before particular functions get redirected. -Second, the immediate consistency model does not guarantee that anyone is not -sleeping in the new code after the patch is reverted. This means that the new -code needs to stay around "forever". If the code is there, one could apply it -again. Therefore it makes sense to separate the operations that might be done -once and those that need to be repeated when the patch is enabled (applied) -again. - -Third, it might take some time until the entire system is migrated -when a more complex consistency model is used. The patch revert might -block the livepatch module removal for too long. Therefore it is useful -to revert the patch using a separate operation that might be called -explicitly. But it does not make sense to remove all information -until the livepatch module is really removed. +Second, it might take some time until the entire system is migrated with +the hybrid consistency model being used. The patch revert might block +the livepatch module removal for too long. Therefore it is useful to +revert the patch using a separate operation that might be called +explicitly. But it does not make sense to remove all information until +the livepatch module is really removed. 5.1. Registration diff --git a/include/linux/livepatch.h b/include/linux/livepatch.h index fc5c1be3f6f4..4754f01c1abb 100644 --- a/include/linux/livepatch.h +++ b/include/linux/livepatch.h @@ -40,7 +40,6 @@ * @new_func: pointer to the patched function code * @old_sympos: a hint indicating which symbol position the old function * can be found (optional) - * @immediate: patch the func immediately, bypassing safety mechanisms * @old_addr: the address of the function being patched * @kobj: kobject for sysfs resources * @stack_node: list node for klp_ops func_stack list @@ -76,7 +75,6 @@ struct klp_func { * in kallsyms for the given object is used. */ unsigned long old_sympos; - bool immediate; /* internal */ unsigned long old_addr; @@ -137,7 +135,6 @@ struct klp_object { * struct klp_patch - patch structure for live patching * @mod: reference to the live patch module * @objs: object entries for kernel objects to be patched - * @immediate: patch all funcs immediately, bypassing safety mechanisms * @list: list node for global list of registered patches * @kobj: kobject for sysfs resources * @enabled: the patch is enabled (but operation may be incomplete) @@ -147,7 +144,6 @@ struct klp_patch { /* external */ struct module *mod; struct klp_object *objs; - bool immediate; /* internal */ struct list_head list; diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index 1c3c9b27c916..41be6061b92f 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c @@ -366,11 +366,6 @@ static int __klp_enable_patch(struct klp_patch *patch) /* * A reference is taken on the patch module to prevent it from being * unloaded. - * - * Note: For immediate (no consistency model) patches we don't allow - * patch modules to unload since there is no safe/sane method to - * determine if a thread is still running in the patched code contained - * in the patch module once the ftrace registration is successful. 
*/ if (!try_module_get(patch->mod)) return -ENODEV; @@ -890,12 +885,7 @@ int klp_register_patch(struct klp_patch *patch) if (!klp_initialized()) return -ENODEV; - /* - * Architectures without reliable stack traces have to set - * patch->immediate because there's currently no way to patch kthreads - * with the consistency model. - */ - if (!klp_have_reliable_stack() && !patch->immediate) { + if (!klp_have_reliable_stack()) { pr_err("This architecture doesn't have support for the livepatch consistency model.\n"); return -ENOSYS; } diff --git a/kernel/livepatch/transition.c b/kernel/livepatch/transition.c index be5bfa533ee8..7c6631e693bc 100644 --- a/kernel/livepatch/transition.c +++ b/kernel/livepatch/transition.c @@ -82,7 +82,6 @@ static void klp_complete_transition(void) struct klp_func *func; struct task_struct *g, *task; unsigned int cpu; - bool immediate_func = false; pr_debug("'%s': completing %s transition\n", klp_transition_patch->mod->name, @@ -104,16 +103,9 @@ static void klp_complete_transition(void) klp_synchronize_transition(); } - if (klp_transition_patch->immediate) - goto done; - - klp_for_each_object(klp_transition_patch, obj) { - klp_for_each_func(obj, func) { + klp_for_each_object(klp_transition_patch, obj) + klp_for_each_func(obj, func) func->transition = false; - if (func->immediate) - immediate_func = true; - } - } /* Prevent klp_ftrace_handler() from seeing KLP_UNDEFINED state */ if (klp_target_state == KLP_PATCHED) @@ -132,7 +124,6 @@ static void klp_complete_transition(void) task->patch_state = KLP_UNDEFINED; } -done: klp_for_each_object(klp_transition_patch, obj) { if (!klp_is_object_loaded(obj)) continue; @@ -146,16 +137,11 @@ done: klp_target_state == KLP_PATCHED ? "patching" : "unpatching"); /* - * See complementary comment in __klp_enable_patch() for why we - * keep the module reference for immediate patches. - * - * klp_forced or immediate_func set implies unbounded increase of - * module's ref count if the module is disabled/enabled in a loop. + * klp_forced set implies unbounded increase of module's ref count if + * the module is disabled/enabled in a loop. */ - if (!klp_forced && !klp_transition_patch->immediate && - !immediate_func && klp_target_state == KLP_UNPATCHED) { + if (!klp_forced && klp_target_state == KLP_UNPATCHED) module_put(klp_transition_patch->mod); - } klp_target_state = KLP_UNDEFINED; klp_transition_patch = NULL; @@ -223,9 +209,6 @@ static int klp_check_stack_func(struct klp_func *func, struct klp_ops *ops; int i; - if (func->immediate) - return 0; - for (i = 0; i < trace->nr_entries; i++) { address = trace->entries[i]; @@ -387,13 +370,6 @@ void klp_try_complete_transition(void) WARN_ON_ONCE(klp_target_state == KLP_UNDEFINED); - /* - * If the patch can be applied or reverted immediately, skip the - * per-task transitions. - */ - if (klp_transition_patch->immediate) - goto success; - /* * Try to switch the tasks to the target patch state by walking their * stacks and looking for any to-be-patched or to-be-unpatched @@ -437,7 +413,6 @@ void klp_try_complete_transition(void) return; } -success: /* we're done, now cleanup the data structures */ klp_complete_transition(); } @@ -457,13 +432,6 @@ void klp_start_transition(void) klp_transition_patch->mod->name, klp_target_state == KLP_PATCHED ? "patching" : "unpatching"); - /* - * If the patch can be applied or reverted immediately, skip the - * per-task transitions. - */ - if (klp_transition_patch->immediate) - return; - /* * Mark all normal tasks as needing a patch state update. 
They'll * switch either in klp_try_complete_transition() or as they exit the @@ -513,13 +481,6 @@ void klp_init_transition(struct klp_patch *patch, int state) pr_debug("'%s': initializing %s transition\n", patch->mod->name, klp_target_state == KLP_PATCHED ? "patching" : "unpatching"); - /* - * If the patch can be applied or reverted immediately, skip the - * per-task transitions. - */ - if (patch->immediate) - return; - /* * Initialize all tasks to the initial patch state to prepare them for * switching to the target state. diff --git a/samples/livepatch/livepatch-callbacks-demo.c b/samples/livepatch/livepatch-callbacks-demo.c index 3d115bd68442..72f9e6d1387b 100644 --- a/samples/livepatch/livepatch-callbacks-demo.c +++ b/samples/livepatch/livepatch-callbacks-demo.c @@ -197,21 +197,6 @@ static int livepatch_callbacks_demo_init(void) { int ret; - if (!klp_have_reliable_stack() && !patch.immediate) { - /* - * WARNING: Be very careful when using 'patch.immediate' in - * your patches. It's ok to use it for simple patches like - * this, but for more complex patches which change function - * semantics, locking semantics, or data structures, it may not - * be safe. Use of this option will also prevent removal of - * the patch. - * - * See Documentation/livepatch/livepatch.txt for more details. - */ - patch.immediate = true; - pr_notice("The consistency model isn't supported for your architecture. Bypassing safety mechanisms and applying the patch immediately.\n"); - } - ret = klp_register_patch(&patch); if (ret) return ret; diff --git a/samples/livepatch/livepatch-sample.c b/samples/livepatch/livepatch-sample.c index 84795223f15f..2d554dd930e2 100644 --- a/samples/livepatch/livepatch-sample.c +++ b/samples/livepatch/livepatch-sample.c @@ -71,21 +71,6 @@ static int livepatch_init(void) { int ret; - if (!klp_have_reliable_stack() && !patch.immediate) { - /* - * WARNING: Be very careful when using 'patch.immediate' in - * your patches. It's ok to use it for simple patches like - * this, but for more complex patches which change function - * semantics, locking semantics, or data structures, it may not - * be safe. Use of this option will also prevent removal of - * the patch. - * - * See Documentation/livepatch/livepatch.txt for more details. - */ - patch.immediate = true; - pr_notice("The consistency model isn't supported for your architecture. Bypassing safety mechanisms and applying the patch immediately.\n"); - } - ret = klp_register_patch(&patch); if (ret) return ret; diff --git a/samples/livepatch/livepatch-shadow-fix1.c b/samples/livepatch/livepatch-shadow-fix1.c index fbe0a1f3d99b..830c55514f9f 100644 --- a/samples/livepatch/livepatch-shadow-fix1.c +++ b/samples/livepatch/livepatch-shadow-fix1.c @@ -133,21 +133,6 @@ static int livepatch_shadow_fix1_init(void) { int ret; - if (!klp_have_reliable_stack() && !patch.immediate) { - /* - * WARNING: Be very careful when using 'patch.immediate' in - * your patches. It's ok to use it for simple patches like - * this, but for more complex patches which change function - * semantics, locking semantics, or data structures, it may not - * be safe. Use of this option will also prevent removal of - * the patch. - * - * See Documentation/livepatch/livepatch.txt for more details. - */ - patch.immediate = true; - pr_notice("The consistency model isn't supported for your architecture. 
Bypassing safety mechanisms and applying the patch immediately.\n"); - } - ret = klp_register_patch(&patch); if (ret) return ret; diff --git a/samples/livepatch/livepatch-shadow-fix2.c b/samples/livepatch/livepatch-shadow-fix2.c index 53c1794bdc5f..ff9948f0ec00 100644 --- a/samples/livepatch/livepatch-shadow-fix2.c +++ b/samples/livepatch/livepatch-shadow-fix2.c @@ -128,21 +128,6 @@ static int livepatch_shadow_fix2_init(void) { int ret; - if (!klp_have_reliable_stack() && !patch.immediate) { - /* - * WARNING: Be very careful when using 'patch.immediate' in - * your patches. It's ok to use it for simple patches like - * this, but for more complex patches which change function - * semantics, locking semantics, or data structures, it may not - * be safe. Use of this option will also prevent removal of - * the patch. - * - * See Documentation/livepatch/livepatch.txt for more details. - */ - patch.immediate = true; - pr_notice("The consistency model isn't supported for your architecture. Bypassing safety mechanisms and applying the patch immediately.\n"); - } - ret = klp_register_patch(&patch); if (ret) return ret; -- cgit v1.2.3 From 8869016d3a58cbe7c31c70f4f008a92122b271c7 Mon Sep 17 00:00:00 2001 From: Miroslav Benes Date: Thu, 21 Dec 2017 14:40:43 +0100 Subject: livepatch: add locking to force and signal functions klp_send_signals() and klp_force_transition() do not acquire klp_mutex, because it seemed to be superfluous. A potential race in klp_send_signals() was harmless and there was nothing in klp_force_transition() which needed to be synchronized. That changed with the addition of klp_forced variable during the review process. There is a small window now, when klp_complete_transition() does not see klp_forced set to true while all tasks have been already transitioned to the target state. module_put() is called and the module can be removed. Acquire klp_mutex in sysfs callback to prevent it. Do the same for the signal sending just to be sure. There is no real downside to that. Fixes: c99a2be790b07 ("livepatch: force transition to finish") Fixes: 43347d56c8d9d ("livepatch: send a fake signal to all blocking tasks") Reported-by: Jason Baron Signed-off-by: Miroslav Benes Reviewed-by: Petr Mladek Acked-by: Josh Poimboeuf Signed-off-by: Jiri Kosina --- kernel/livepatch/core.c | 52 ++++++++++++++++++++++++++----------------------- 1 file changed, 28 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index 1c3c9b27c916..8fd8e8f126da 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c @@ -537,22 +537,24 @@ static ssize_t signal_store(struct kobject *kobj, struct kobj_attribute *attr, int ret; bool val; - patch = container_of(kobj, struct klp_patch, kobj); - - /* - * klp_mutex lock is not grabbed here intentionally. It is not really - * needed. The race window is harmless and grabbing the lock would only - * hold the action back. 
- */ - if (patch != klp_transition_patch) - return -EINVAL; - ret = kstrtobool(buf, &val); if (ret) return ret; - if (val) - klp_send_signals(); + if (!val) + return count; + + mutex_lock(&klp_mutex); + + patch = container_of(kobj, struct klp_patch, kobj); + if (patch != klp_transition_patch) { + mutex_unlock(&klp_mutex); + return -EINVAL; + } + + klp_send_signals(); + + mutex_unlock(&klp_mutex); return count; } @@ -564,22 +566,24 @@ static ssize_t force_store(struct kobject *kobj, struct kobj_attribute *attr, int ret; bool val; - patch = container_of(kobj, struct klp_patch, kobj); - - /* - * klp_mutex lock is not grabbed here intentionally. It is not really - * needed. The race window is harmless and grabbing the lock would only - * hold the action back. - */ - if (patch != klp_transition_patch) - return -EINVAL; - ret = kstrtobool(buf, &val); if (ret) return ret; - if (val) - klp_force_transition(); + if (!val) + return count; + + mutex_lock(&klp_mutex); + + patch = container_of(kobj, struct klp_patch, kobj); + if (patch != klp_transition_patch) { + mutex_unlock(&klp_mutex); + return -EINVAL; + } + + klp_force_transition(); + + mutex_unlock(&klp_mutex); return count; } -- cgit v1.2.3 From 526c3ddb6aa270fe6f71d227eac1e189746cc257 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 3 Jan 2018 18:07:12 -0600 Subject: signal/arm64: Document conflicts with SI_USER and SIGFPE,SIGTRAP,SIGBUS Setting si_code to 0 results in userspace seeing an si_code of 0. This is the same si_code as SI_USER. Posix and common sense require that SI_USER not be a signal-specific si_code. As such, this use of 0 for the si_code is a pretty horribly broken ABI. Further, use of si_code == 0 guaranteed that copy_siginfo_to_user saw a value of __SI_KILL and now sees a value of SIL_KILL, with the result that the uid and pid fields are copied and the si_addr field might be copied by accident but certainly not by design. This makes for a very flaky implementation. Utilizing FPE_FIXME, BUS_FIXME and TRAP_FIXME, siginfo_layout will now return SIL_FAULT and the appropriate fields will be reliably copied. But folks, this is a new and unique kind of bad. This is massively untested code bad. This is inventing new and unique ways to get siginfo wrong bad. This is don't even think about Posix or what siginfo means bad. This is lots of eyeballs all missing the fact that the code does the wrong thing bad. This is getting stuck making the same mistake over and over bad. I really hope we can find a fix for this that does not break userspace on a port as new as arm64. Possible ABI fixes include:
- Send the signal without siginfo
- Don't generate a signal
- Possibly assign and use an appropriate si_code
- Don't handle cases which can't happen
Cc: Catalin Marinas Cc: Will Deacon Cc: Tyler Baicar Cc: James Morse Cc: Tony Lindgren Cc: Nicolas Pitre Cc: Olof Johansson Cc: Santosh Shilimkar Cc: Arnd Bergmann Cc: linux-arm-kernel@lists.infradead.org Ref: 53631b54c870 ("arm64: Floating point and SIMD") Ref: 32015c235603 ("arm64: exception: handle Synchronous External Abort") Ref: 1d18c47c735e ("arm64: MMU fault handling and page table management") Signed-off-by: "Eric W. 
Biederman" --- arch/arm64/include/uapi/asm/siginfo.h | 21 +++++++ arch/arm64/kernel/fpsimd.c | 2 +- arch/arm64/mm/fault.c | 114 +++++++++++++++++----------------- kernel/signal.c | 4 ++ 4 files changed, 83 insertions(+), 58 deletions(-) (limited to 'kernel') diff --git a/arch/arm64/include/uapi/asm/siginfo.h b/arch/arm64/include/uapi/asm/siginfo.h index 574d12f86039..9b4d91277742 100644 --- a/arch/arm64/include/uapi/asm/siginfo.h +++ b/arch/arm64/include/uapi/asm/siginfo.h @@ -21,4 +21,25 @@ #include +/* + * SIGFPE si_codes + */ +#ifdef __KERNEL__ +#define FPE_FIXME 0 /* Broken dup of SI_USER */ +#endif /* __KERNEL__ */ + +/* + * SIGBUS si_codes + */ +#ifdef __KERNEL__ +#define BUS_FIXME 0 /* Broken dup of SI_USER */ +#endif /* __KERNEL__ */ + +/* + * SIGTRAP si_codes + */ +#ifdef __KERNEL__ +#define TRAP_FIXME 0 /* Broken dup of SI_USER */ +#endif /* __KERNEL__ */ + #endif diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c index fae81f7964b4..ad0edf31e247 100644 --- a/arch/arm64/kernel/fpsimd.c +++ b/arch/arm64/kernel/fpsimd.c @@ -867,7 +867,7 @@ asmlinkage void do_fpsimd_acc(unsigned int esr, struct pt_regs *regs) asmlinkage void do_fpsimd_exc(unsigned int esr, struct pt_regs *regs) { siginfo_t info; - unsigned int si_code = 0; + unsigned int si_code = FPE_FIXME; if (esr & FPEXC_IOF) si_code = FPE_FLTINV; diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index 9b7f89df49db..abe200587334 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -596,7 +596,7 @@ static int do_sea(unsigned long addr, unsigned int esr, struct pt_regs *regs) info.si_signo = SIGBUS; info.si_errno = 0; - info.si_code = 0; + info.si_code = BUS_FIXME; if (esr & ESR_ELx_FnV) info.si_addr = NULL; else @@ -607,70 +607,70 @@ static int do_sea(unsigned long addr, unsigned int esr, struct pt_regs *regs) } static const struct fault_info fault_info[] = { - { do_bad, SIGBUS, 0, "ttbr address size fault" }, - { do_bad, SIGBUS, 0, "level 1 address size fault" }, - { do_bad, SIGBUS, 0, "level 2 address size fault" }, - { do_bad, SIGBUS, 0, "level 3 address size fault" }, + { do_bad, SIGBUS, BUS_FIXME, "ttbr address size fault" }, + { do_bad, SIGBUS, BUS_FIXME, "level 1 address size fault" }, + { do_bad, SIGBUS, BUS_FIXME, "level 2 address size fault" }, + { do_bad, SIGBUS, BUS_FIXME, "level 3 address size fault" }, { do_translation_fault, SIGSEGV, SEGV_MAPERR, "level 0 translation fault" }, { do_translation_fault, SIGSEGV, SEGV_MAPERR, "level 1 translation fault" }, { do_translation_fault, SIGSEGV, SEGV_MAPERR, "level 2 translation fault" }, { do_translation_fault, SIGSEGV, SEGV_MAPERR, "level 3 translation fault" }, - { do_bad, SIGBUS, 0, "unknown 8" }, + { do_bad, SIGBUS, BUS_FIXME, "unknown 8" }, { do_page_fault, SIGSEGV, SEGV_ACCERR, "level 1 access flag fault" }, { do_page_fault, SIGSEGV, SEGV_ACCERR, "level 2 access flag fault" }, { do_page_fault, SIGSEGV, SEGV_ACCERR, "level 3 access flag fault" }, - { do_bad, SIGBUS, 0, "unknown 12" }, + { do_bad, SIGBUS, BUS_FIXME, "unknown 12" }, { do_page_fault, SIGSEGV, SEGV_ACCERR, "level 1 permission fault" }, { do_page_fault, SIGSEGV, SEGV_ACCERR, "level 2 permission fault" }, { do_page_fault, SIGSEGV, SEGV_ACCERR, "level 3 permission fault" }, - { do_sea, SIGBUS, 0, "synchronous external abort" }, - { do_bad, SIGBUS, 0, "unknown 17" }, - { do_bad, SIGBUS, 0, "unknown 18" }, - { do_bad, SIGBUS, 0, "unknown 19" }, - { do_sea, SIGBUS, 0, "level 0 (translation table walk)" }, - { do_sea, SIGBUS, 0, "level 1 (translation table walk)" }, - { 
do_sea, SIGBUS, 0, "level 2 (translation table walk)" }, - { do_sea, SIGBUS, 0, "level 3 (translation table walk)" }, - { do_sea, SIGBUS, 0, "synchronous parity or ECC error" }, // Reserved when RAS is implemented - { do_bad, SIGBUS, 0, "unknown 25" }, - { do_bad, SIGBUS, 0, "unknown 26" }, - { do_bad, SIGBUS, 0, "unknown 27" }, - { do_sea, SIGBUS, 0, "level 0 synchronous parity error (translation table walk)" }, // Reserved when RAS is implemented - { do_sea, SIGBUS, 0, "level 1 synchronous parity error (translation table walk)" }, // Reserved when RAS is implemented - { do_sea, SIGBUS, 0, "level 2 synchronous parity error (translation table walk)" }, // Reserved when RAS is implemented - { do_sea, SIGBUS, 0, "level 3 synchronous parity error (translation table walk)" }, // Reserved when RAS is implemented - { do_bad, SIGBUS, 0, "unknown 32" }, + { do_sea, SIGBUS, BUS_FIXME, "synchronous external abort" }, + { do_bad, SIGBUS, BUS_FIXME, "unknown 17" }, + { do_bad, SIGBUS, BUS_FIXME, "unknown 18" }, + { do_bad, SIGBUS, BUS_FIXME, "unknown 19" }, + { do_sea, SIGBUS, BUS_FIXME, "level 0 (translation table walk)" }, + { do_sea, SIGBUS, BUS_FIXME, "level 1 (translation table walk)" }, + { do_sea, SIGBUS, BUS_FIXME, "level 2 (translation table walk)" }, + { do_sea, SIGBUS, BUS_FIXME, "level 3 (translation table walk)" }, + { do_sea, SIGBUS, BUS_FIXME, "synchronous parity or ECC error" }, // Reserved when RAS is implemented + { do_bad, SIGBUS, BUS_FIXME, "unknown 25" }, + { do_bad, SIGBUS, BUS_FIXME, "unknown 26" }, + { do_bad, SIGBUS, BUS_FIXME, "unknown 27" }, + { do_sea, SIGBUS, BUS_FIXME, "level 0 synchronous parity error (translation table walk)" }, // Reserved when RAS is implemented + { do_sea, SIGBUS, BUS_FIXME, "level 1 synchronous parity error (translation table walk)" }, // Reserved when RAS is implemented + { do_sea, SIGBUS, BUS_FIXME, "level 2 synchronous parity error (translation table walk)" }, // Reserved when RAS is implemented + { do_sea, SIGBUS, BUS_FIXME, "level 3 synchronous parity error (translation table walk)" }, // Reserved when RAS is implemented + { do_bad, SIGBUS, BUS_FIXME, "unknown 32" }, { do_alignment_fault, SIGBUS, BUS_ADRALN, "alignment fault" }, - { do_bad, SIGBUS, 0, "unknown 34" }, - { do_bad, SIGBUS, 0, "unknown 35" }, - { do_bad, SIGBUS, 0, "unknown 36" }, - { do_bad, SIGBUS, 0, "unknown 37" }, - { do_bad, SIGBUS, 0, "unknown 38" }, - { do_bad, SIGBUS, 0, "unknown 39" }, - { do_bad, SIGBUS, 0, "unknown 40" }, - { do_bad, SIGBUS, 0, "unknown 41" }, - { do_bad, SIGBUS, 0, "unknown 42" }, - { do_bad, SIGBUS, 0, "unknown 43" }, - { do_bad, SIGBUS, 0, "unknown 44" }, - { do_bad, SIGBUS, 0, "unknown 45" }, - { do_bad, SIGBUS, 0, "unknown 46" }, - { do_bad, SIGBUS, 0, "unknown 47" }, - { do_bad, SIGBUS, 0, "TLB conflict abort" }, - { do_bad, SIGBUS, 0, "Unsupported atomic hardware update fault" }, - { do_bad, SIGBUS, 0, "unknown 50" }, - { do_bad, SIGBUS, 0, "unknown 51" }, - { do_bad, SIGBUS, 0, "implementation fault (lockdown abort)" }, - { do_bad, SIGBUS, 0, "implementation fault (unsupported exclusive)" }, - { do_bad, SIGBUS, 0, "unknown 54" }, - { do_bad, SIGBUS, 0, "unknown 55" }, - { do_bad, SIGBUS, 0, "unknown 56" }, - { do_bad, SIGBUS, 0, "unknown 57" }, - { do_bad, SIGBUS, 0, "unknown 58" }, - { do_bad, SIGBUS, 0, "unknown 59" }, - { do_bad, SIGBUS, 0, "unknown 60" }, - { do_bad, SIGBUS, 0, "section domain fault" }, - { do_bad, SIGBUS, 0, "page domain fault" }, - { do_bad, SIGBUS, 0, "unknown 63" }, + { do_bad, SIGBUS, BUS_FIXME, "unknown 34" }, + { 
do_bad, SIGBUS, BUS_FIXME, "unknown 35" }, + { do_bad, SIGBUS, BUS_FIXME, "unknown 36" }, + { do_bad, SIGBUS, BUS_FIXME, "unknown 37" }, + { do_bad, SIGBUS, BUS_FIXME, "unknown 38" }, + { do_bad, SIGBUS, BUS_FIXME, "unknown 39" }, + { do_bad, SIGBUS, BUS_FIXME, "unknown 40" }, + { do_bad, SIGBUS, BUS_FIXME, "unknown 41" }, + { do_bad, SIGBUS, BUS_FIXME, "unknown 42" }, + { do_bad, SIGBUS, BUS_FIXME, "unknown 43" }, + { do_bad, SIGBUS, BUS_FIXME, "unknown 44" }, + { do_bad, SIGBUS, BUS_FIXME, "unknown 45" }, + { do_bad, SIGBUS, BUS_FIXME, "unknown 46" }, + { do_bad, SIGBUS, BUS_FIXME, "unknown 47" }, + { do_bad, SIGBUS, BUS_FIXME, "TLB conflict abort" }, + { do_bad, SIGBUS, BUS_FIXME, "Unsupported atomic hardware update fault" }, + { do_bad, SIGBUS, BUS_FIXME, "unknown 50" }, + { do_bad, SIGBUS, BUS_FIXME, "unknown 51" }, + { do_bad, SIGBUS, BUS_FIXME, "implementation fault (lockdown abort)" }, + { do_bad, SIGBUS, BUS_FIXME, "implementation fault (unsupported exclusive)" }, + { do_bad, SIGBUS, BUS_FIXME, "unknown 54" }, + { do_bad, SIGBUS, BUS_FIXME, "unknown 55" }, + { do_bad, SIGBUS, BUS_FIXME, "unknown 56" }, + { do_bad, SIGBUS, BUS_FIXME, "unknown 57" }, + { do_bad, SIGBUS, BUS_FIXME, "unknown 58" }, + { do_bad, SIGBUS, BUS_FIXME, "unknown 59" }, + { do_bad, SIGBUS, BUS_FIXME, "unknown 60" }, + { do_bad, SIGBUS, BUS_FIXME, "section domain fault" }, + { do_bad, SIGBUS, BUS_FIXME, "page domain fault" }, + { do_bad, SIGBUS, BUS_FIXME, "unknown 63" }, }; int handle_guest_sea(phys_addr_t addr, unsigned int esr) @@ -739,11 +739,11 @@ static struct fault_info __refdata debug_fault_info[] = { { do_bad, SIGTRAP, TRAP_HWBKPT, "hardware breakpoint" }, { do_bad, SIGTRAP, TRAP_HWBKPT, "hardware single-step" }, { do_bad, SIGTRAP, TRAP_HWBKPT, "hardware watchpoint" }, - { do_bad, SIGBUS, 0, "unknown 3" }, + { do_bad, SIGBUS, BUS_FIXME, "unknown 3" }, { do_bad, SIGTRAP, TRAP_BRKPT, "aarch32 BKPT" }, - { do_bad, SIGTRAP, 0, "aarch32 vector catch" }, + { do_bad, SIGTRAP, TRAP_FIXME, "aarch32 vector catch" }, { early_brk64, SIGTRAP, TRAP_BRKPT, "aarch64 BRK" }, - { do_bad, SIGBUS, 0, "unknown 7" }, + { do_bad, SIGBUS, BUS_FIXME, "unknown 7" }, }; void __init hook_debug_fault_code(int nr, diff --git a/kernel/signal.c b/kernel/signal.c index c894162ec96c..fd182a845490 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2711,6 +2711,10 @@ enum siginfo_layout siginfo_layout(int sig, int si_code) #ifdef FPE_FIXME if ((sig == SIGFPE) && (si_code == FPE_FIXME)) layout = SIL_FAULT; +#endif +#ifdef BUS_FIXME + if ((sig == SIGBUS) && (si_code == BUS_FIXME)) + layout = SIL_FAULT; #endif } return layout; -- cgit v1.2.3 From faf1f22b61f2715224ba9b579e2a983e99b86823 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 5 Jan 2018 17:27:42 -0600 Subject: signal: Ensure generic siginfos the kernel sends have all bits initialized Call clear_siginfo to ensure stack-allocated siginfos are fully initialized before being passed to the signal sending functions. This ensures that if there is the kind of confusion documented by TRAP_FIXME, FPE_FIXME, or BUS_FIXME the kernel won't send uninitialized data to userspace when the kernel generates a signal with SI_USER but the copy to userspace assumes it is a different kind of signal, and different fields are initialized. This also prepares the way for turning copy_siginfo_to_user into a copy_to_user, by removing the need in many cases to perform a field-by-field copy simply to skip the uninitialized fields. Signed-off-by: "Eric W. 
Biederman" --- fs/fcntl.c | 1 + ipc/mqueue.c | 1 + kernel/signal.c | 9 ++++++++- 3 files changed, 10 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/fs/fcntl.c b/fs/fcntl.c index 0522e283a4f4..c17369659f4a 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -737,6 +737,7 @@ static void send_sigio_to_task(struct task_struct *p, delivered even if we can't queue. Failure to queue in this case _should_ be reported; we fall back to SIGIO in that case. --sct */ + clear_siginfo(&si); si.si_signo = signum; si.si_errno = 0; si.si_code = reason; diff --git a/ipc/mqueue.c b/ipc/mqueue.c index 9649ecd8a73a..17bc8b874d92 100644 --- a/ipc/mqueue.c +++ b/ipc/mqueue.c @@ -639,6 +639,7 @@ static void __do_notify(struct mqueue_inode_info *info) case SIGEV_SIGNAL: /* sends signal */ + clear_siginfo(&sig_i); sig_i.si_signo = info->notify.sigev_signo; sig_i.si_errno = 0; sig_i.si_code = SI_MESGQ; diff --git a/kernel/signal.c b/kernel/signal.c index fd182a845490..241d54958bbb 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -549,6 +549,7 @@ still_pending: * a fast-pathed signal or we must have been * out of queue space. So zero out the info. */ + clear_siginfo(info); info->si_signo = sig; info->si_errno = 0; info->si_code = SI_USER; @@ -1043,6 +1044,7 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t, list_add_tail(&q->list, &pending->list); switch ((unsigned long) info) { case (unsigned long) SEND_SIG_NOINFO: + clear_siginfo(&q->info); q->info.si_signo = sig; q->info.si_errno = 0; q->info.si_code = SI_USER; @@ -1051,6 +1053,7 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t, q->info.si_uid = from_kuid_munged(current_user_ns(), current_uid()); break; case (unsigned long) SEND_SIG_PRIV: + clear_siginfo(&q->info); q->info.si_signo = sig; q->info.si_errno = 0; q->info.si_code = SI_KERNEL; @@ -1623,6 +1626,7 @@ bool do_notify_parent(struct task_struct *tsk, int sig) sig = SIGCHLD; } + clear_siginfo(&info); info.si_signo = sig; info.si_errno = 0; /* @@ -1717,6 +1721,7 @@ static void do_notify_parent_cldstop(struct task_struct *tsk, parent = tsk->real_parent; } + clear_siginfo(&info); info.si_signo = SIGCHLD; info.si_errno = 0; /* @@ -1929,7 +1934,7 @@ static void ptrace_do_notify(int signr, int exit_code, int why) { siginfo_t info; - memset(&info, 0, sizeof info); + clear_siginfo(&info); info.si_signo = signr; info.si_code = exit_code; info.si_pid = task_pid_vnr(current); @@ -2136,6 +2141,7 @@ static int ptrace_signal(int signr, siginfo_t *info) * have updated *info via PTRACE_SETSIGINFO. */ if (signr != info->si_signo) { + clear_siginfo(info); info->si_signo = signr; info->si_errno = 0; info->si_code = SI_USER; @@ -2941,6 +2947,7 @@ SYSCALL_DEFINE2(kill, pid_t, pid, int, sig) { struct siginfo info; + clear_siginfo(&info); info.si_signo = sig; info.si_errno = 0; info.si_code = SI_USER; -- cgit v1.2.3 From aba1be2f8c26ea322dd9c43047c1aff90528f032 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 19 Jul 2017 21:23:15 -0500 Subject: signal: Ensure no siginfo union member increases the size of struct siginfo Signed-off-by: "Eric W. Biederman" --- kernel/signal.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 241d54958bbb..b9e5d825ee46 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -3688,6 +3688,7 @@ void __init signals_init(void) /* If this check fails, the __ARCH_SI_PREAMBLE_SIZE value is wrong! 
*/ BUILD_BUG_ON(__ARCH_SI_PREAMBLE_SIZE != offsetof(struct siginfo, _sifields._pad)); + BUILD_BUG_ON(sizeof(struct siginfo) != SI_MAX_SIZE); sigqueue_cachep = KMEM_CACHE(sigqueue, SLAB_PANIC); } -- cgit v1.2.3 From 9943d3accb4ec5a85e72c65a9959257fa7d12500 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 24 Jul 2017 14:53:03 -0500 Subject: signal: Clear si_sys_private before copying siginfo to userspace In preparation for unconditionally copying the whole of siginfo to userspace clear si_sys_private. So this kernel internal value is guaranteed not to make it to userspace. Signed-off-by: "Eric W. Biederman" --- kernel/signal.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index b9e5d825ee46..18aa55c1bb4f 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -643,6 +643,9 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info) spin_unlock(&tsk->sighand->siglock); posixtimer_rearm(info); spin_lock(&tsk->sighand->siglock); + + /* Don't expose the si_sys_private value to userspace */ + info->si_sys_private = 0; } #endif return signr; -- cgit v1.2.3 From 30073566ca64cbc005f4fbcc21f0af7db83940a2 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 31 Jul 2017 15:47:40 -0500 Subject: signal/ia64: switch the last arch-specific copy_siginfo_to_user() to generic version Signed-off-by: "Eric W. Biederman" --- arch/ia64/include/uapi/asm/siginfo.h | 2 -- arch/ia64/kernel/signal.c | 52 ------------------------------------ kernel/signal.c | 9 ++++--- 3 files changed, 5 insertions(+), 58 deletions(-) (limited to 'kernel') diff --git a/arch/ia64/include/uapi/asm/siginfo.h b/arch/ia64/include/uapi/asm/siginfo.h index 3c5417d66628..66839031b767 100644 --- a/arch/ia64/include/uapi/asm/siginfo.h +++ b/arch/ia64/include/uapi/asm/siginfo.h @@ -11,8 +11,6 @@ #define __ARCH_SI_PREAMBLE_SIZE (4 * sizeof(int)) -#define HAVE_ARCH_COPY_SIGINFO_TO_USER - #include #define si_imm _sifields._sigfault._imm /* as per UNIX SysV ABI spec */ diff --git a/arch/ia64/kernel/signal.c b/arch/ia64/kernel/signal.c index a254cc98f95c..54547c7cf8a2 100644 --- a/arch/ia64/kernel/signal.c +++ b/arch/ia64/kernel/signal.c @@ -105,58 +105,6 @@ restore_sigcontext (struct sigcontext __user *sc, struct sigscratch *scr) return err; } -int -copy_siginfo_to_user (siginfo_t __user *to, const siginfo_t *from) -{ - if (!access_ok(VERIFY_WRITE, to, sizeof(siginfo_t))) - return -EFAULT; - if (from->si_code < 0) { - if (__copy_to_user(to, from, sizeof(siginfo_t))) - return -EFAULT; - return 0; - } else { - int err; - - /* - * If you change siginfo_t structure, please be sure this code is fixed - * accordingly. It should never copy any pad contained in the structure - * to avoid security leaks, but must copy the generic 3 ints plus the - * relevant union member. 
- */ - err = __put_user(from->si_signo, &to->si_signo); - err |= __put_user(from->si_errno, &to->si_errno); - err |= __put_user(from->si_code, &to->si_code); - switch (siginfo_layout(from->si_signo, from->si_code)) { - case SIL_FAULT: - err |= __put_user(from->si_flags, &to->si_flags); - err |= __put_user(from->si_isr, &to->si_isr); - case SIL_POLL: - err |= __put_user(from->si_addr, &to->si_addr); - err |= __put_user(from->si_imm, &to->si_imm); - break; - case SIL_TIMER: - err |= __put_user(from->si_tid, &to->si_tid); - err |= __put_user(from->si_overrun, &to->si_overrun); - err |= __put_user(from->si_ptr, &to->si_ptr); - break; - case SIL_RT: - err |= __put_user(from->si_uid, &to->si_uid); - err |= __put_user(from->si_pid, &to->si_pid); - err |= __put_user(from->si_ptr, &to->si_ptr); - break; - case SIL_CHLD: - err |= __put_user(from->si_utime, &to->si_utime); - err |= __put_user(from->si_stime, &to->si_stime); - err |= __put_user(from->si_status, &to->si_status); - case SIL_KILL: - err |= __put_user(from->si_uid, &to->si_uid); - err |= __put_user(from->si_pid, &to->si_pid); - break; - } - return err; - } -} - long ia64_rt_sigreturn (struct sigscratch *scr) { diff --git a/kernel/signal.c b/kernel/signal.c index 18aa55c1bb4f..62c642899290 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2729,8 +2729,6 @@ enum siginfo_layout siginfo_layout(int sig, int si_code) return layout; } -#ifndef HAVE_ARCH_COPY_SIGINFO_TO_USER - int copy_siginfo_to_user(siginfo_t __user *to, const siginfo_t *from) { int err; @@ -2769,6 +2767,11 @@ int copy_siginfo_to_user(siginfo_t __user *to, const siginfo_t *from) #ifdef __ARCH_SI_TRAPNO err |= __put_user(from->si_trapno, &to->si_trapno); #endif +#ifdef __ia64__ + err |= __put_user(from->si_imm, &to->si_imm); + err |= __put_user(from->si_flags, &to->si_flags); + err |= __put_user(from->si_isr, &to->si_isr); +#endif #ifdef BUS_MCEERR_AO /* * Other callers might not initialize the si_lsb field, @@ -2812,8 +2815,6 @@ int copy_siginfo_to_user(siginfo_t __user *to, const siginfo_t *from) return err; } -#endif - /** * do_sigtimedwait - wait for queued signals specified in @which * @which: queued signals to wait for -- cgit v1.2.3 From 0326e7ef057d214ed43a77557078c24e98b84af9 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Thu, 27 Jul 2017 11:59:46 -0500 Subject: signal: Remove unnecessary ifdefs now that there is only one struct siginfo Remove HAVE_ARCH_SIGINFO_T Remove __ARCH_SIGSYS Signed-off-by: "Eric W. Biederman" --- include/linux/signal.h | 2 -- include/uapi/asm-generic/siginfo.h | 8 -------- kernel/signal.c | 4 ---- 3 files changed, 14 deletions(-) (limited to 'kernel') diff --git a/include/linux/signal.h b/include/linux/signal.h index 87abf0c29ed7..a9bc7e1b077e 100644 --- a/include/linux/signal.h +++ b/include/linux/signal.h @@ -30,9 +30,7 @@ enum siginfo_layout { SIL_FAULT, SIL_CHLD, SIL_RT, -#ifdef __ARCH_SIGSYS SIL_SYS, -#endif }; enum siginfo_layout siginfo_layout(int sig, int si_code); diff --git a/include/uapi/asm-generic/siginfo.h b/include/uapi/asm-generic/siginfo.h index 00829f74dcb6..a650d252de0a 100644 --- a/include/uapi/asm-generic/siginfo.h +++ b/include/uapi/asm-generic/siginfo.h @@ -44,8 +44,6 @@ typedef union sigval { #define __ARCH_SI_ATTRIBUTES #endif -#ifndef HAVE_ARCH_SIGINFO_T - typedef struct siginfo { int si_signo; #ifndef __ARCH_HAS_SWAPPED_SIGINFO @@ -128,10 +126,6 @@ typedef struct siginfo { } _sifields; } __ARCH_SI_ATTRIBUTES siginfo_t; -/* If the arch shares siginfo, then it has SIGSYS. 
*/ -#define __ARCH_SIGSYS -#endif - /* * How these fields are to be accessed. */ @@ -156,11 +150,9 @@ typedef struct siginfo { #define si_pkey _sifields._sigfault._pkey #define si_band _sifields._sigpoll._band #define si_fd _sifields._sigpoll._fd -#ifdef __ARCH_SIGSYS #define si_call_addr _sifields._sigsys._call_addr #define si_syscall _sifields._sigsys._syscall #define si_arch _sifields._sigsys._arch -#endif /* * si_code values diff --git a/kernel/signal.c b/kernel/signal.c index 62c642899290..47c87b1d0b8a 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2697,9 +2697,7 @@ enum siginfo_layout siginfo_layout(int sig, int si_code) #endif [SIGCHLD] = { NSIGCHLD, SIL_CHLD }, [SIGPOLL] = { NSIGPOLL, SIL_POLL }, -#ifdef __ARCH_SIGSYS [SIGSYS] = { NSIGSYS, SIL_SYS }, -#endif }; if ((sig < ARRAY_SIZE(filter)) && (si_code <= filter[sig].limit)) layout = filter[sig].layout; @@ -2804,13 +2802,11 @@ int copy_siginfo_to_user(siginfo_t __user *to, const siginfo_t *from) err |= __put_user(from->si_uid, &to->si_uid); err |= __put_user(from->si_ptr, &to->si_ptr); break; -#ifdef __ARCH_SIGSYS case SIL_SYS: err |= __put_user(from->si_call_addr, &to->si_call_addr); err |= __put_user(from->si_syscall, &to->si_syscall); err |= __put_user(from->si_arch, &to->si_arch); break; -#endif } return err; } -- cgit v1.2.3 From b4da3340eae2c3932144be3e81ccfd4e424d87b7 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Sat, 13 Jan 2018 02:54:04 +0900 Subject: tracing/kprobe: bpf: Check error injectable event is on function entry Check whether an error injectable event is on the function entry or not. Currently it checks whether the event is an ftrace-based kprobe, but that is wrong. It should check whether the event is on the entry of the target function. Since error injection overrides a function to just return with a modified return value, that operation must be done before the target function starts building its stack frame. As a side effect, bpf error injection no longer needs to depend on the function tracer. It can work with sw-breakpoint based kprobe events too. 
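A rough model of the entry-point requirement, using hypothetical types (the kernel's actual check is built on kprobe_on_func_entry()): a forced early return is only safe while the target has not yet built a stack frame, i.e. when the probe sits on the function's first instruction.

	#include <stdbool.h>

	struct probe_loc {
		unsigned long sym_start;	/* start address of the target function */
		unsigned long addr;		/* address the probe is planted at */
	};

	/*
	 * Overriding a function to "just return" must happen at offset 0;
	 * past that point the function may own a stack frame that the
	 * forced return would never tear down. (This sketch assumes a
	 * fully resolved address; the real check also accepts
	 * symbol+offset forms.)
	 */
	static bool safe_to_override(const struct probe_loc *p)
	{
		return p->addr == p->sym_start;
	}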
Signed-off-by: Masami Hiramatsu Reviewed-by: Josef Bacik Signed-off-by: Alexei Starovoitov --- arch/x86/include/asm/kprobes.h | 4 +--- arch/x86/kernel/kprobes/core.c | 14 ++++++++++++++ arch/x86/kernel/kprobes/ftrace.c | 14 -------------- kernel/trace/Kconfig | 2 -- kernel/trace/bpf_trace.c | 8 ++++---- kernel/trace/trace_kprobe.c | 9 ++++++--- kernel/trace/trace_probe.h | 12 ++++++------ 7 files changed, 31 insertions(+), 32 deletions(-) (limited to 'kernel') diff --git a/arch/x86/include/asm/kprobes.h b/arch/x86/include/asm/kprobes.h index 36abb23a7a35..367d99cff426 100644 --- a/arch/x86/include/asm/kprobes.h +++ b/arch/x86/include/asm/kprobes.h @@ -67,9 +67,7 @@ extern const int kretprobe_blacklist_size; void arch_remove_kprobe(struct kprobe *p); asmlinkage void kretprobe_trampoline(void); -#ifdef CONFIG_KPROBES_ON_FTRACE -extern void arch_ftrace_kprobe_override_function(struct pt_regs *regs); -#endif +extern void arch_kprobe_override_function(struct pt_regs *regs); /* Architecture specific copy of original instruction*/ struct arch_specific_insn { diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index bd36f3c33cd0..b02a377d5905 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -1183,3 +1183,17 @@ int arch_trampoline_kprobe(struct kprobe *p) { return 0; } + +asmlinkage void override_func(void); +asm( + ".type override_func, @function\n" + "override_func:\n" + " ret\n" + ".size override_func, .-override_func\n" +); + +void arch_kprobe_override_function(struct pt_regs *regs) +{ + regs->ip = (unsigned long)&override_func; +} +NOKPROBE_SYMBOL(arch_kprobe_override_function); diff --git a/arch/x86/kernel/kprobes/ftrace.c b/arch/x86/kernel/kprobes/ftrace.c index 1ea748d682fd..8dc0161cec8f 100644 --- a/arch/x86/kernel/kprobes/ftrace.c +++ b/arch/x86/kernel/kprobes/ftrace.c @@ -97,17 +97,3 @@ int arch_prepare_kprobe_ftrace(struct kprobe *p) p->ainsn.boostable = false; return 0; } - -asmlinkage void override_func(void); -asm( - ".type override_func, @function\n" - "override_func:\n" - " ret\n" - ".size override_func, .-override_func\n" -); - -void arch_ftrace_kprobe_override_function(struct pt_regs *regs) -{ - regs->ip = (unsigned long)&override_func; -} -NOKPROBE_SYMBOL(arch_ftrace_kprobe_override_function); diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index ae3a2d519e50..6400e1bf97c5 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -533,9 +533,7 @@ config FUNCTION_PROFILER config BPF_KPROBE_OVERRIDE bool "Enable BPF programs to override a kprobed function" depends on BPF_EVENTS - depends on KPROBES_ON_FTRACE depends on HAVE_KPROBE_OVERRIDE - depends on DYNAMIC_FTRACE_WITH_REGS default n help Allows BPF to override the execution of a probed function and diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index f6d2327ecb59..1966ad3bf3e0 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -85,7 +85,7 @@ BPF_CALL_2(bpf_override_return, struct pt_regs *, regs, unsigned long, rc) { __this_cpu_write(bpf_kprobe_override, 1); regs_set_return_value(regs, rc); - arch_ftrace_kprobe_override_function(regs); + arch_kprobe_override_function(regs); return 0; } @@ -800,11 +800,11 @@ int perf_event_attach_bpf_prog(struct perf_event *event, int ret = -EEXIST; /* - * Kprobe override only works for ftrace based kprobes, and only if they - * are on the opt-in list. + * Kprobe override only works if they are on the function entry, + * and only if they are on the opt-in list. 
*/ if (prog->kprobe_override && - (!trace_kprobe_ftrace(event->tp_event) || + (!trace_kprobe_on_func_entry(event->tp_event) || !trace_kprobe_error_injectable(event->tp_event))) return -EINVAL; diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 91f4b57dab82..3c8deb977a8b 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -88,13 +88,16 @@ static nokprobe_inline unsigned long trace_kprobe_nhit(struct trace_kprobe *tk) return nhit; } -int trace_kprobe_ftrace(struct trace_event_call *call) +bool trace_kprobe_on_func_entry(struct trace_event_call *call) { struct trace_kprobe *tk = (struct trace_kprobe *)call->data; - return kprobe_ftrace(&tk->rp.kp); + + return kprobe_on_func_entry(tk->rp.kp.addr, + tk->rp.kp.addr ? NULL : tk->rp.kp.symbol_name, + tk->rp.kp.addr ? 0 : tk->rp.kp.offset); } -int trace_kprobe_error_injectable(struct trace_event_call *call) +bool trace_kprobe_error_injectable(struct trace_event_call *call) { struct trace_kprobe *tk = (struct trace_kprobe *)call->data; unsigned long addr; diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index 5e54d748c84c..e101c5bb9eda 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -252,8 +252,8 @@ struct symbol_cache; unsigned long update_symbol_cache(struct symbol_cache *sc); void free_symbol_cache(struct symbol_cache *sc); struct symbol_cache *alloc_symbol_cache(const char *sym, long offset); -int trace_kprobe_ftrace(struct trace_event_call *call); -int trace_kprobe_error_injectable(struct trace_event_call *call); +bool trace_kprobe_on_func_entry(struct trace_event_call *call); +bool trace_kprobe_error_injectable(struct trace_event_call *call); #else /* uprobes do not support symbol fetch methods */ #define fetch_symbol_u8 NULL @@ -280,14 +280,14 @@ alloc_symbol_cache(const char *sym, long offset) return NULL; } -static inline int trace_kprobe_ftrace(struct trace_event_call *call) +static inline bool trace_kprobe_on_func_entry(struct trace_event_call *call) { - return 0; + return false; } -static inline int trace_kprobe_error_injectable(struct trace_event_call *call) +static inline bool trace_kprobe_error_injectable(struct trace_event_call *call) { - return 0; + return false; } #endif /* CONFIG_KPROBE_EVENTS */ -- cgit v1.2.3 From 66665ad2f1023d3ffb0c12eea9e0a6d0b613ecb3 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Sat, 13 Jan 2018 02:54:33 +0900 Subject: tracing/kprobe: bpf: Compare instruction pointer with original one Compare the instruction pointer with the original one on the stack, instead of using the per-cpu bpf_kprobe_override flag. This patch also consolidates the reset_current_kprobe() and preempt_enable_no_resched() blocks. Those can be done in one place. 
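The ip-comparison approach is easy to model on its own; this sketch mirrors only the control flow, with hypothetical names: save the instruction pointer before invoking the handler and treat any change as "execution was redirected".

	#include <stdbool.h>

	struct regs {
		unsigned long ip;	/* stand-in for the pt_regs program counter */
	};

	typedef int (*probe_handler_t)(struct regs *regs);

	/*
	 * No side flag needed: if the handler rewrote regs->ip, the probe
	 * site will not resume normally, so the post-handler single
	 * stepping must be skipped.
	 */
	static bool handler_overrode_ip(probe_handler_t handler, struct regs *regs)
	{
		unsigned long orig_ip = regs->ip;

		handler(regs);
		return regs->ip != orig_ip;
	}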
Signed-off-by: Masami Hiramatsu Reviewed-by: Josef Bacik Signed-off-by: Alexei Starovoitov --- kernel/trace/bpf_trace.c | 1 - kernel/trace/trace_kprobe.c | 21 +++++++-------------- 2 files changed, 7 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 1966ad3bf3e0..24ed6363e00f 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -83,7 +83,6 @@ EXPORT_SYMBOL_GPL(trace_call_bpf); #ifdef CONFIG_BPF_KPROBE_OVERRIDE BPF_CALL_2(bpf_override_return, struct pt_regs *, regs, unsigned long, rc) { - __this_cpu_write(bpf_kprobe_override, 1); regs_set_return_value(regs, rc); arch_kprobe_override_function(regs); return 0; diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 3c8deb977a8b..b8c90441bc87 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -42,8 +42,6 @@ struct trace_kprobe { (offsetof(struct trace_kprobe, tp.args) + \ (sizeof(struct probe_arg) * (n))) -DEFINE_PER_CPU(int, bpf_kprobe_override); - static nokprobe_inline bool trace_kprobe_is_return(struct trace_kprobe *tk) { return tk->rp.handler != NULL; @@ -1205,6 +1203,7 @@ kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs) int rctx; if (bpf_prog_array_valid(call)) { + unsigned long orig_ip = instruction_pointer(regs); int ret; ret = trace_call_bpf(call, regs); /* * We need to check and see if we modified the pc of the * pt_regs, and if so clear the kprobe and return 1 so that we - * don't do the instruction skipping. Also reset our state so - * we are clean the next pass through. + * don't do the single stepping. + * The ftrace kprobe handler leaves it up to us to re-enable + * preemption here before returning if we've modified the ip. */ - if (__this_cpu_read(bpf_kprobe_override)) { - __this_cpu_write(bpf_kprobe_override, 0); + if (orig_ip != instruction_pointer(regs)) { reset_current_kprobe(); + preempt_enable_no_resched(); return 1; } if (!ret) @@ -1325,15 +1325,8 @@ static int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs) if (tk->tp.flags & TP_FLAG_TRACE) kprobe_trace_func(tk, regs); #ifdef CONFIG_PERF_EVENTS - if (tk->tp.flags & TP_FLAG_PROFILE) { + if (tk->tp.flags & TP_FLAG_PROFILE) ret = kprobe_perf_func(tk, regs); - /* - * The ftrace kprobe handler leaves it up to us to re-enable - * preemption here before returning if we've modified the ip. - */ - if (ret) - preempt_enable_no_resched(); - } #endif return ret; } -- cgit v1.2.3 From 540adea3809f61115d2a1ea4ed6e627613452ba1 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Sat, 13 Jan 2018 02:55:03 +0900 Subject: error-injection: Separate error-injection from kprobe The error-injection framework is not limited to kprobes or bpf; other kernel subsystems can use it freely for checking the safety of error injection, e.g. livepatch, ftrace, etc. So this separates the error-injection framework from kprobes. Some differences have been made:
- The word "kprobe" is removed from all APIs/structures.
- BPF_ALLOW_ERROR_INJECTION() is renamed to ALLOW_ERROR_INJECTION(), since it is not limited to BPF either.
- CONFIG_FUNCTION_ERROR_INJECTION is the config item for this feature. It is automatically enabled if the arch supports error injection for kprobes, ftrace, etc. 
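The section-based whitelist that ALLOW_ERROR_INJECTION() builds can be sketched in userspace, assuming GCC or Clang on an ELF target; the section and symbol names below are illustrative, and the kernel additionally brackets the section with linker-script symbols so it can walk the entries as an array.

	#include <stdio.h>

	/*
	 * Each use drops the function's address into a dedicated section,
	 * turning that section into an array of whitelisted addresses
	 * that a consumer can scan at runtime.
	 */
	#define ALLOW_ERROR_INJECTION(fname)					\
	static unsigned long							\
		__attribute__((used, section("error_injection_whitelist")))	\
		_eil_addr_##fname = (unsigned long)fname

	static int open_resource(void)
	{
		return 0;	/* imagine a fallible setup path here */
	}
	ALLOW_ERROR_INJECTION(open_resource);

	int main(void)
	{
		printf("whitelisted: %#lx\n", (unsigned long)open_resource);
		return 0;
	}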
Signed-off-by: Masami Hiramatsu Reviewed-by: Josef Bacik Signed-off-by: Alexei Starovoitov --- arch/Kconfig | 2 +- arch/x86/Kconfig | 2 +- arch/x86/include/asm/error-injection.h | 13 ++ arch/x86/kernel/kprobes/core.c | 14 --- arch/x86/lib/Makefile | 1 + arch/x86/lib/error-inject.c | 19 +++ fs/btrfs/disk-io.c | 4 +- fs/btrfs/free-space-cache.c | 4 +- include/asm-generic/error-injection.h | 20 ++++ include/asm-generic/vmlinux.lds.h | 14 +-- include/linux/bpf.h | 11 -- include/linux/error-injection.h | 21 ++++ include/linux/kprobes.h | 1 - include/linux/module.h | 6 +- kernel/kprobes.c | 163 ------------------------- kernel/module.c | 8 +- kernel/trace/Kconfig | 2 +- kernel/trace/bpf_trace.c | 4 +- kernel/trace/trace_kprobe.c | 3 +- lib/Kconfig.debug | 4 + lib/Makefile | 1 + lib/error-inject.c | 213 +++++++++++++++++++++++++++++++++ 22 files changed, 317 insertions(+), 213 deletions(-) create mode 100644 arch/x86/include/asm/error-injection.h create mode 100644 arch/x86/lib/error-inject.c create mode 100644 include/asm-generic/error-injection.h create mode 100644 include/linux/error-injection.h create mode 100644 lib/error-inject.c (limited to 'kernel') diff --git a/arch/Kconfig b/arch/Kconfig index d3f4aaf9cb7a..97376accfb14 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -196,7 +196,7 @@ config HAVE_OPTPROBES config HAVE_KPROBES_ON_FTRACE bool -config HAVE_KPROBE_OVERRIDE +config HAVE_FUNCTION_ERROR_INJECTION bool config HAVE_NMI diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 45dc6233f2b9..366b19cb79b7 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -154,7 +154,7 @@ config X86 select HAVE_KERNEL_XZ select HAVE_KPROBES select HAVE_KPROBES_ON_FTRACE - select HAVE_KPROBE_OVERRIDE + select HAVE_FUNCTION_ERROR_INJECTION select HAVE_KRETPROBES select HAVE_KVM select HAVE_LIVEPATCH if X86_64 diff --git a/arch/x86/include/asm/error-injection.h b/arch/x86/include/asm/error-injection.h new file mode 100644 index 000000000000..47b7a1296245 --- /dev/null +++ b/arch/x86/include/asm/error-injection.h @@ -0,0 +1,13 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_ERROR_INJECTION_H +#define _ASM_ERROR_INJECTION_H + +#include +#include +#include +#include + +asmlinkage void just_return_func(void); +void override_function_with_return(struct pt_regs *regs); + +#endif /* _ASM_ERROR_INJECTION_H */ diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index b02a377d5905..bd36f3c33cd0 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -1183,17 +1183,3 @@ int arch_trampoline_kprobe(struct kprobe *p) { return 0; } - -asmlinkage void override_func(void); -asm( - ".type override_func, @function\n" - "override_func:\n" - " ret\n" - ".size override_func, .-override_func\n" -); - -void arch_kprobe_override_function(struct pt_regs *regs) -{ - regs->ip = (unsigned long)&override_func; -} -NOKPROBE_SYMBOL(arch_kprobe_override_function); diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile index 7b181b61170e..171377b83be1 100644 --- a/arch/x86/lib/Makefile +++ b/arch/x86/lib/Makefile @@ -26,6 +26,7 @@ lib-y += memcpy_$(BITS).o lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem.o lib-$(CONFIG_INSTRUCTION_DECODER) += insn.o inat.o insn-eval.o lib-$(CONFIG_RANDOMIZE_BASE) += kaslr.o +lib-$(CONFIG_FUNCTION_ERROR_INJECTION) += error-inject.o obj-y += msr.o msr-reg.o msr-reg-export.o hweight.o diff --git a/arch/x86/lib/error-inject.c b/arch/x86/lib/error-inject.c new file mode 100644 index 000000000000..7b881d03d0dd --- /dev/null +++ 
b/arch/x86/lib/error-inject.c @@ -0,0 +1,19 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include + +asmlinkage void just_return_func(void); + +asm( + ".type just_return_func, @function\n" + "just_return_func:\n" + " ret\n" + ".size just_return_func, .-just_return_func\n" +); + +void override_function_with_return(struct pt_regs *regs) +{ + regs->ip = (unsigned long)&just_return_func; +} +NOKPROBE_SYMBOL(override_function_with_return); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 5da18ebc9222..9798e21ebe9d 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -30,7 +30,7 @@ #include #include #include -#include +#include #include #include "ctree.h" #include "disk-io.h" @@ -3124,7 +3124,7 @@ recovery_tree_root: goto fail_block_groups; goto retry_root_backup; } -BPF_ALLOW_ERROR_INJECTION(open_ctree); +ALLOW_ERROR_INJECTION(open_ctree); static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate) { diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index fb1382893bfc..ef847699031a 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -22,7 +22,7 @@ #include #include #include -#include +#include #include "ctree.h" #include "free-space-cache.h" #include "transaction.h" @@ -333,7 +333,7 @@ static int io_ctl_init(struct btrfs_io_ctl *io_ctl, struct inode *inode, return 0; } -BPF_ALLOW_ERROR_INJECTION(io_ctl_init); +ALLOW_ERROR_INJECTION(io_ctl_init); static void io_ctl_free(struct btrfs_io_ctl *io_ctl) { diff --git a/include/asm-generic/error-injection.h b/include/asm-generic/error-injection.h new file mode 100644 index 000000000000..08352c9d9f97 --- /dev/null +++ b/include/asm-generic/error-injection.h @@ -0,0 +1,20 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_GENERIC_ERROR_INJECTION_H +#define _ASM_GENERIC_ERROR_INJECTION_H + +#if defined(__KERNEL__) && !defined(__ASSEMBLY__) +#ifdef CONFIG_FUNCTION_ERROR_INJECTION +/* + * Whitelist generating macro. Specify functions which can be + * error-injectable using this macro. + */ +#define ALLOW_ERROR_INJECTION(fname) \ +static unsigned long __used \ + __attribute__((__section__("_error_injection_whitelist"))) \ + _eil_addr_##fname = (unsigned long)fname; +#else +#define ALLOW_ERROR_INJECTION(fname) +#endif +#endif + +#endif /* _ASM_GENERIC_ERROR_INJECTION_H */ diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index a2e8582d094a..f2068cca5206 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -136,13 +136,13 @@ #define KPROBE_BLACKLIST() #endif -#ifdef CONFIG_BPF_KPROBE_OVERRIDE -#define ERROR_INJECT_LIST() . = ALIGN(8); \ - VMLINUX_SYMBOL(__start_kprobe_error_inject_list) = .; \ - KEEP(*(_kprobe_error_inject_list)) \ - VMLINUX_SYMBOL(__stop_kprobe_error_inject_list) = .; +#ifdef CONFIG_FUNCTION_ERROR_INJECTION +#define ERROR_INJECT_WHITELIST() .
= ALIGN(8); \ + VMLINUX_SYMBOL(__start_error_injection_whitelist) = .;\ + KEEP(*(_error_injection_whitelist)) \ + VMLINUX_SYMBOL(__stop_error_injection_whitelist) = .; #else -#define ERROR_INJECT_LIST() +#define ERROR_INJECT_WHITELIST() #endif #ifdef CONFIG_EVENT_TRACING @@ -573,7 +573,7 @@ FTRACE_EVENTS() \ TRACE_SYSCALLS() \ KPROBE_BLACKLIST() \ - ERROR_INJECT_LIST() \ + ERROR_INJECT_WHITELIST() \ MEM_DISCARD(init.rodata) \ CLK_OF_TABLES() \ RESERVEDMEM_OF_TABLES() \ diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 44f26f6df8fc..3496977203a3 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -613,15 +613,4 @@ extern const struct bpf_func_proto bpf_sock_map_update_proto; void bpf_user_rnd_init_once(void); u64 bpf_user_rnd_u32(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5); -#if defined(__KERNEL__) && !defined(__ASSEMBLY__) -#ifdef CONFIG_BPF_KPROBE_OVERRIDE -#define BPF_ALLOW_ERROR_INJECTION(fname) \ -static unsigned long __used \ - __attribute__((__section__("_kprobe_error_inject_list"))) \ - _eil_addr_##fname = (unsigned long)fname; -#else -#define BPF_ALLOW_ERROR_INJECTION(fname) -#endif -#endif - #endif /* _LINUX_BPF_H */ diff --git a/include/linux/error-injection.h b/include/linux/error-injection.h new file mode 100644 index 000000000000..130a67c50dac --- /dev/null +++ b/include/linux/error-injection.h @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_ERROR_INJECTION_H +#define _LINUX_ERROR_INJECTION_H + +#ifdef CONFIG_FUNCTION_ERROR_INJECTION + +#include + +extern bool within_error_injection_list(unsigned long addr); + +#else /* !CONFIG_FUNCTION_ERROR_INJECTION */ + +#include +static inline bool within_error_injection_list(unsigned long addr) +{ + return false; +} + +#endif + +#endif /* _LINUX_ERROR_INJECTION_H */ diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h index 963fd364f3d6..9440a2fc8893 100644 --- a/include/linux/kprobes.h +++ b/include/linux/kprobes.h @@ -271,7 +271,6 @@ extern bool arch_kprobe_on_func_entry(unsigned long offset); extern bool kprobe_on_func_entry(kprobe_opcode_t *addr, const char *sym, unsigned long offset); extern bool within_kprobe_blacklist(unsigned long addr); -extern bool within_kprobe_error_injection_list(unsigned long addr); struct kprobe_insn_cache { struct mutex mutex; diff --git a/include/linux/module.h b/include/linux/module.h index 548fa09fa806..792e51d83bda 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -476,9 +476,9 @@ struct module { unsigned int num_ctors; #endif -#ifdef CONFIG_BPF_KPROBE_OVERRIDE - unsigned int num_kprobe_ei_funcs; - unsigned long *kprobe_ei_funcs; +#ifdef CONFIG_FUNCTION_ERROR_INJECTION + unsigned int num_ei_funcs; + unsigned long *ei_funcs; #endif } ____cacheline_aligned __randomize_layout; #ifndef MODULE_ARCH_INIT diff --git a/kernel/kprobes.c b/kernel/kprobes.c index b4aab48ad258..da2ccf142358 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -83,16 +83,6 @@ static raw_spinlock_t *kretprobe_table_lock_ptr(unsigned long hash) return &(kretprobe_table_locks[hash].lock); } -/* List of symbols that can be overriden for error injection. 
*/ -static LIST_HEAD(kprobe_error_injection_list); -static DEFINE_MUTEX(kprobe_ei_mutex); -struct kprobe_ei_entry { - struct list_head list; - unsigned long start_addr; - unsigned long end_addr; - void *priv; -}; - /* Blacklist -- list of struct kprobe_blacklist_entry */ static LIST_HEAD(kprobe_blacklist); @@ -1404,17 +1394,6 @@ bool within_kprobe_blacklist(unsigned long addr) return false; } -bool within_kprobe_error_injection_list(unsigned long addr) -{ - struct kprobe_ei_entry *ent; - - list_for_each_entry(ent, &kprobe_error_injection_list, list) { - if (addr >= ent->start_addr && addr < ent->end_addr) - return true; - } - return false; -} - /* * If we have a symbol_name argument, look it up and add the offset field * to it. This way, we can specify a relative address to a symbol. @@ -2189,86 +2168,6 @@ static int __init populate_kprobe_blacklist(unsigned long *start, return 0; } -#ifdef CONFIG_BPF_KPROBE_OVERRIDE -/* Markers of the _kprobe_error_inject_list section */ -extern unsigned long __start_kprobe_error_inject_list[]; -extern unsigned long __stop_kprobe_error_inject_list[]; - -/* - * Lookup and populate the kprobe_error_injection_list. - * - * For safety reasons we only allow certain functions to be overriden with - * bpf_error_injection, so we need to populate the list of the symbols that have - * been marked as safe for overriding. - */ -static void populate_kprobe_error_injection_list(unsigned long *start, - unsigned long *end, - void *priv) -{ - unsigned long *iter; - struct kprobe_ei_entry *ent; - unsigned long entry, offset = 0, size = 0; - - mutex_lock(&kprobe_ei_mutex); - for (iter = start; iter < end; iter++) { - entry = arch_deref_entry_point((void *)*iter); - - if (!kernel_text_address(entry) || - !kallsyms_lookup_size_offset(entry, &size, &offset)) { - pr_err("Failed to find error inject entry at %p\n", - (void *)entry); - continue; - } - - ent = kmalloc(sizeof(*ent), GFP_KERNEL); - if (!ent) - break; - ent->start_addr = entry; - ent->end_addr = entry + size; - ent->priv = priv; - INIT_LIST_HEAD(&ent->list); - list_add_tail(&ent->list, &kprobe_error_injection_list); - } - mutex_unlock(&kprobe_ei_mutex); -} - -static void __init populate_kernel_kprobe_ei_list(void) -{ - populate_kprobe_error_injection_list(__start_kprobe_error_inject_list, - __stop_kprobe_error_inject_list, - NULL); -} - -static void module_load_kprobe_ei_list(struct module *mod) -{ - if (!mod->num_kprobe_ei_funcs) - return; - populate_kprobe_error_injection_list(mod->kprobe_ei_funcs, - mod->kprobe_ei_funcs + - mod->num_kprobe_ei_funcs, mod); -} - -static void module_unload_kprobe_ei_list(struct module *mod) -{ - struct kprobe_ei_entry *ent, *n; - if (!mod->num_kprobe_ei_funcs) - return; - - mutex_lock(&kprobe_ei_mutex); - list_for_each_entry_safe(ent, n, &kprobe_error_injection_list, list) { - if (ent->priv == mod) { - list_del_init(&ent->list); - kfree(ent); - } - } - mutex_unlock(&kprobe_ei_mutex); -} -#else -static inline void __init populate_kernel_kprobe_ei_list(void) {} -static inline void module_load_kprobe_ei_list(struct module *m) {} -static inline void module_unload_kprobe_ei_list(struct module *m) {} -#endif - /* Module notifier call back, checking kprobes on the module */ static int kprobes_module_callback(struct notifier_block *nb, unsigned long val, void *data) @@ -2279,11 +2178,6 @@ static int kprobes_module_callback(struct notifier_block *nb, unsigned int i; int checkcore = (val == MODULE_STATE_GOING); - if (val == MODULE_STATE_COMING) - module_load_kprobe_ei_list(mod); - else if (val 
== MODULE_STATE_GOING) - module_unload_kprobe_ei_list(mod); - if (val != MODULE_STATE_GOING && val != MODULE_STATE_LIVE) return NOTIFY_DONE; @@ -2346,8 +2240,6 @@ static int __init init_kprobes(void) pr_err("Please take care of using kprobes.\n"); } - populate_kernel_kprobe_ei_list(); - if (kretprobe_blacklist_size) { /* lookup the function address from its name */ for (i = 0; kretprobe_blacklist[i].name != NULL; i++) { @@ -2515,56 +2407,6 @@ static const struct file_operations debugfs_kprobe_blacklist_ops = { .release = seq_release, }; -/* - * kprobes/error_injection_list -- shows which functions can be overriden for - * error injection. - * */ -static void *kprobe_ei_seq_start(struct seq_file *m, loff_t *pos) -{ - mutex_lock(&kprobe_ei_mutex); - return seq_list_start(&kprobe_error_injection_list, *pos); -} - -static void kprobe_ei_seq_stop(struct seq_file *m, void *v) -{ - mutex_unlock(&kprobe_ei_mutex); -} - -static void *kprobe_ei_seq_next(struct seq_file *m, void *v, loff_t *pos) -{ - return seq_list_next(v, &kprobe_error_injection_list, pos); -} - -static int kprobe_ei_seq_show(struct seq_file *m, void *v) -{ - char buffer[KSYM_SYMBOL_LEN]; - struct kprobe_ei_entry *ent = - list_entry(v, struct kprobe_ei_entry, list); - - sprint_symbol(buffer, ent->start_addr); - seq_printf(m, "%s\n", buffer); - return 0; -} - -static const struct seq_operations kprobe_ei_seq_ops = { - .start = kprobe_ei_seq_start, - .next = kprobe_ei_seq_next, - .stop = kprobe_ei_seq_stop, - .show = kprobe_ei_seq_show, -}; - -static int kprobe_ei_open(struct inode *inode, struct file *filp) -{ - return seq_open(filp, &kprobe_ei_seq_ops); -} - -static const struct file_operations debugfs_kprobe_ei_ops = { - .open = kprobe_ei_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; - static void arm_all_kprobes(void) { struct hlist_head *head; @@ -2706,11 +2548,6 @@ static int __init debugfs_kprobe_init(void) if (!file) goto error; - file = debugfs_create_file("error_injection_list", 0444, dir, NULL, - &debugfs_kprobe_ei_ops); - if (!file) - goto error; - return 0; error: diff --git a/kernel/module.c b/kernel/module.c index bd695bfdc5c4..601494d4b7ea 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -3118,10 +3118,10 @@ static int find_module_sections(struct module *mod, struct load_info *info) sizeof(*mod->ftrace_callsites), &mod->num_ftrace_callsites); #endif -#ifdef CONFIG_BPF_KPROBE_OVERRIDE - mod->kprobe_ei_funcs = section_objs(info, "_kprobe_error_inject_list", - sizeof(*mod->kprobe_ei_funcs), - &mod->num_kprobe_ei_funcs); +#ifdef CONFIG_FUNCTION_ERROR_INJECTION + mod->ei_funcs = section_objs(info, "_error_injection_whitelist", + sizeof(*mod->ei_funcs), + &mod->num_ei_funcs); #endif mod->extable = section_objs(info, "__ex_table", sizeof(*mod->extable), &mod->num_exentries); diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 6400e1bf97c5..7114c885a78a 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -533,7 +533,7 @@ config FUNCTION_PROFILER config BPF_KPROBE_OVERRIDE bool "Enable BPF programs to override a kprobed function" depends on BPF_EVENTS - depends on HAVE_KPROBE_OVERRIDE + depends on FUNCTION_ERROR_INJECTION default n help Allows BPF to override the execution of a probed function and diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 24ed6363e00f..f274468cbc45 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -14,7 +14,7 @@ #include #include #include -#include +#include #include "trace_probe.h" #include 
"trace.h" @@ -84,7 +84,7 @@ EXPORT_SYMBOL_GPL(trace_call_bpf); BPF_CALL_2(bpf_override_return, struct pt_regs *, regs, unsigned long, rc) { regs_set_return_value(regs, rc); - arch_kprobe_override_function(regs); + override_function_with_return(regs); return 0; } diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index b8c90441bc87..1fad24acd444 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -21,6 +21,7 @@ #include #include #include +#include #include "trace_probe.h" @@ -107,7 +108,7 @@ bool trace_kprobe_error_injectable(struct trace_event_call *call) } else { addr = (unsigned long)tk->rp.kp.addr; } - return within_kprobe_error_injection_list(addr); + return within_error_injection_list(addr); } static int register_kprobe_event(struct trace_kprobe *tk); diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 9d5b78aad4c5..2a33efdd1fea 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1500,6 +1500,10 @@ config FAULT_INJECTION Provide fault-injection framework. For more details, see Documentation/fault-injection/. +config FUNCTION_ERROR_INJECTION + def_bool y + depends on HAVE_FUNCTION_ERROR_INJECTION && KPROBES + config FAILSLAB bool "Fault-injection capability for kmalloc" depends on FAULT_INJECTION diff --git a/lib/Makefile b/lib/Makefile index a6c8529dd9b2..75ec13778cd8 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -149,6 +149,7 @@ obj-$(CONFIG_NETDEV_NOTIFIER_ERROR_INJECT) += netdev-notifier-error-inject.o obj-$(CONFIG_MEMORY_NOTIFIER_ERROR_INJECT) += memory-notifier-error-inject.o obj-$(CONFIG_OF_RECONFIG_NOTIFIER_ERROR_INJECT) += \ of-reconfig-notifier-error-inject.o +obj-$(CONFIG_FUNCTION_ERROR_INJECTION) += error-inject.o lib-$(CONFIG_GENERIC_BUG) += bug.o diff --git a/lib/error-inject.c b/lib/error-inject.c new file mode 100644 index 000000000000..bccadcf3c981 --- /dev/null +++ b/lib/error-inject.c @@ -0,0 +1,213 @@ +// SPDX-License-Identifier: GPL-2.0 +// error-inject.c: Function-level error injection table +#include +#include +#include +#include +#include +#include +#include +#include + +/* Whitelist of symbols that can be overridden for error injection. */ +static LIST_HEAD(error_injection_list); +static DEFINE_MUTEX(ei_mutex); +struct ei_entry { + struct list_head list; + unsigned long start_addr; + unsigned long end_addr; + void *priv; +}; + +bool within_error_injection_list(unsigned long addr) +{ + struct ei_entry *ent; + bool ret = false; + + mutex_lock(&ei_mutex); + list_for_each_entry(ent, &error_injection_list, list) { + if (addr >= ent->start_addr && addr < ent->end_addr) { + ret = true; + break; + } + } + mutex_unlock(&ei_mutex); + return ret; +} + +/* + * Lookup and populate the error_injection_list. + * + * For safety reasons we only allow certain functions to be overridden with + * bpf_error_injection, so we need to populate the list of the symbols that have + * been marked as safe for overriding. 
+ */ +static void populate_error_injection_list(unsigned long *start, + unsigned long *end, void *priv) +{ + unsigned long *iter; + struct ei_entry *ent; + unsigned long entry, offset = 0, size = 0; + + mutex_lock(&ei_mutex); + for (iter = start; iter < end; iter++) { + entry = arch_deref_entry_point((void *)*iter); + + if (!kernel_text_address(entry) || + !kallsyms_lookup_size_offset(entry, &size, &offset)) { + pr_err("Failed to find error inject entry at %p\n", + (void *)entry); + continue; + } + + ent = kmalloc(sizeof(*ent), GFP_KERNEL); + if (!ent) + break; + ent->start_addr = entry; + ent->end_addr = entry + size; + ent->priv = priv; + INIT_LIST_HEAD(&ent->list); + list_add_tail(&ent->list, &error_injection_list); + } + mutex_unlock(&ei_mutex); +} + +/* Markers of the _error_inject_whitelist section */ +extern unsigned long __start_error_injection_whitelist[]; +extern unsigned long __stop_error_injection_whitelist[]; + +static void __init populate_kernel_ei_list(void) +{ + populate_error_injection_list(__start_error_injection_whitelist, + __stop_error_injection_whitelist, + NULL); +} + +#ifdef CONFIG_MODULES +static void module_load_ei_list(struct module *mod) +{ + if (!mod->num_ei_funcs) + return; + + populate_error_injection_list(mod->ei_funcs, + mod->ei_funcs + mod->num_ei_funcs, mod); +} + +static void module_unload_ei_list(struct module *mod) +{ + struct ei_entry *ent, *n; + + if (!mod->num_ei_funcs) + return; + + mutex_lock(&ei_mutex); + list_for_each_entry_safe(ent, n, &error_injection_list, list) { + if (ent->priv == mod) { + list_del_init(&ent->list); + kfree(ent); + } + } + mutex_unlock(&ei_mutex); +} + +/* Module notifier call back, checking error injection table on the module */ +static int ei_module_callback(struct notifier_block *nb, + unsigned long val, void *data) +{ + struct module *mod = data; + + if (val == MODULE_STATE_COMING) + module_load_ei_list(mod); + else if (val == MODULE_STATE_GOING) + module_unload_ei_list(mod); + + return NOTIFY_DONE; +} + +static struct notifier_block ei_module_nb = { + .notifier_call = ei_module_callback, + .priority = 0 +}; + +static __init int module_ei_init(void) +{ + return register_module_notifier(&ei_module_nb); +} +#else /* !CONFIG_MODULES */ +#define module_ei_init() (0) +#endif + +/* + * error_injection/whitelist -- shows which functions can be overridden for + * error injection. 
+ */ +static void *ei_seq_start(struct seq_file *m, loff_t *pos) +{ + mutex_lock(&ei_mutex); + return seq_list_start(&error_injection_list, *pos); +} + +static void ei_seq_stop(struct seq_file *m, void *v) +{ + mutex_unlock(&ei_mutex); +} + +static void *ei_seq_next(struct seq_file *m, void *v, loff_t *pos) +{ + return seq_list_next(v, &error_injection_list, pos); +} + +static int ei_seq_show(struct seq_file *m, void *v) +{ + struct ei_entry *ent = list_entry(v, struct ei_entry, list); + + seq_printf(m, "%pf\n", (void *)ent->start_addr); + return 0; +} + +static const struct seq_operations ei_seq_ops = { + .start = ei_seq_start, + .next = ei_seq_next, + .stop = ei_seq_stop, + .show = ei_seq_show, +}; + +static int ei_open(struct inode *inode, struct file *filp) +{ + return seq_open(filp, &ei_seq_ops); +} + +static const struct file_operations debugfs_ei_ops = { + .open = ei_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static int __init ei_debugfs_init(void) +{ + struct dentry *dir, *file; + + dir = debugfs_create_dir("error_injection", NULL); + if (!dir) + return -ENOMEM; + + file = debugfs_create_file("list", 0444, dir, NULL, &debugfs_ei_ops); + if (!file) { + debugfs_remove(dir); + return -ENOMEM; + } + + return 0; +} + +static int __init init_error_injection(void) +{ + populate_kernel_ei_list(); + + if (!module_ei_init()) + ei_debugfs_init(); + + return 0; +} +late_initcall(init_error_injection); -- cgit v1.2.3 From 4b1a29a7f5425d32640b34b8a755f34e02f64d0f Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Sat, 13 Jan 2018 02:56:03 +0900 Subject: error-injection: Support fault injection framework Support in-kernel fault-injection framework via debugfs. This allows you to inject a conditional error to specified function using debugfs interfaces. Here is the result of test script described in Documentation/fault-injection/fault-injection.txt =========== # ./test_fail_function.sh 1+0 records in 1+0 records out 1048576 bytes (1.0 MB, 1.0 MiB) copied, 0.0227404 s, 46.1 MB/s btrfs-progs v4.4 See http://btrfs.wiki.kernel.org for more information. Label: (null) UUID: bfa96010-12e9-4360-aed0-42eec7af5798 Node size: 16384 Sector size: 4096 Filesystem size: 1001.00MiB Block group profiles: Data: single 8.00MiB Metadata: DUP 58.00MiB System: DUP 12.00MiB SSD detected: no Incompat features: extref, skinny-metadata Number of devices: 1 Devices: ID SIZE PATH 1 1001.00MiB /dev/loop2 mount: mount /dev/loop2 on /opt/tmpmnt failed: Cannot allocate memory SUCCESS! =========== Signed-off-by: Masami Hiramatsu Reviewed-by: Josef Bacik Signed-off-by: Alexei Starovoitov --- Documentation/fault-injection/fault-injection.txt | 68 +++++ kernel/Makefile | 1 + kernel/fail_function.c | 349 ++++++++++++++++++++++ lib/Kconfig.debug | 10 + 4 files changed, 428 insertions(+) create mode 100644 kernel/fail_function.c (limited to 'kernel') diff --git a/Documentation/fault-injection/fault-injection.txt b/Documentation/fault-injection/fault-injection.txt index 918972babcd8..f4a32463ca48 100644 --- a/Documentation/fault-injection/fault-injection.txt +++ b/Documentation/fault-injection/fault-injection.txt @@ -30,6 +30,12 @@ o fail_mmc_request injects MMC data errors on devices permitted by setting debugfs entries under /sys/kernel/debug/mmc0/fail_mmc_request +o fail_function + + injects error return on specific functions, which are marked by + ALLOW_ERROR_INJECTION() macro, by setting debugfs entries + under /sys/kernel/debug/fail_function. No boot option supported. 
+ Configure fault-injection capabilities behavior ----------------------------------------------- @@ -123,6 +129,29 @@ configuration of fault-injection capabilities. default is 'N', setting it to 'Y' will disable failure injections when dealing with private (address space) futexes. +- /sys/kernel/debug/fail_function/inject: + + Format: { 'function-name' | '!function-name' | '' } + specifies the target function of error injection by name. + If the function name is prefixed with '!', the given function is + removed from the injection list. If nothing is specified (''), + the injection list is cleared. + +- /sys/kernel/debug/fail_function/injectable: + + (read only) shows error-injectable functions and what type of + error values can be specified. The error type will be one of + the following: + - NULL: retval must be 0. + - ERRNO: retval must be -1 to -MAX_ERRNO (-4096). + - ERR_NULL: retval must be 0 or -1 to -MAX_ERRNO (-4096). + +- /sys/kernel/debug/fail_function/<function-name>/retval: + + specifies the "error" return value to inject into the given + function. This file is created when the user specifies a new + injection entry. + o Boot option In order to inject faults while debugfs is not available (early boot time), @@ -268,6 +297,45 @@ trap "echo 0 > /sys/kernel/debug/$FAILTYPE/probability" SIGINT SIGTERM EXIT echo "Injecting errors into the module $module... (interrupt to stop)" sleep 1000000 +------------------------------------------------------------------------------ + +o Inject open_ctree error during btrfs mount + +#!/bin/bash + +rm -f testfile.img +dd if=/dev/zero of=testfile.img bs=1M seek=1000 count=1 +DEVICE=$(losetup --show -f testfile.img) +mkfs.btrfs -f $DEVICE +mkdir -p tmpmnt + +FAILTYPE=fail_function +FAILFUNC=open_ctree +echo $FAILFUNC > /sys/kernel/debug/$FAILTYPE/inject +echo -12 > /sys/kernel/debug/$FAILTYPE/$FAILFUNC/retval +echo N > /sys/kernel/debug/$FAILTYPE/task-filter +echo 100 > /sys/kernel/debug/$FAILTYPE/probability +echo 0 > /sys/kernel/debug/$FAILTYPE/interval +echo -1 > /sys/kernel/debug/$FAILTYPE/times +echo 0 > /sys/kernel/debug/$FAILTYPE/space +echo 1 > /sys/kernel/debug/$FAILTYPE/verbose + +mount -t btrfs $DEVICE tmpmnt +if [ $? -ne 0 ] +then + echo "SUCCESS!" +else + echo "FAILED!"
+ umount tmpmnt +fi + +echo > /sys/kernel/debug/$FAILTYPE/inject + +rmdir tmpmnt +losetup -d $DEVICE +rm testfile.img + + Tool to run command with failslab or fail_page_alloc ---------------------------------------------------- In order to make it easier to accomplish the tasks mentioned above, we can use diff --git a/kernel/Makefile b/kernel/Makefile index 172d151d429c..f85ae5dfa474 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -81,6 +81,7 @@ obj-$(CONFIG_AUDIT_TREE) += audit_tree.o obj-$(CONFIG_GCOV_KERNEL) += gcov/ obj-$(CONFIG_KCOV) += kcov.o obj-$(CONFIG_KPROBES) += kprobes.o +obj-$(CONFIG_FAIL_FUNCTION) += fail_function.o obj-$(CONFIG_KGDB) += debug/ obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o obj-$(CONFIG_LOCKUP_DETECTOR) += watchdog.o diff --git a/kernel/fail_function.c b/kernel/fail_function.c new file mode 100644 index 000000000000..21b0122cb39c --- /dev/null +++ b/kernel/fail_function.c @@ -0,0 +1,349 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * fail_function.c: Function-based error injection + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static int fei_kprobe_handler(struct kprobe *kp, struct pt_regs *regs); + +struct fei_attr { + struct list_head list; + struct kprobe kp; + unsigned long retval; +}; +static DEFINE_MUTEX(fei_lock); +static LIST_HEAD(fei_attr_list); +static DECLARE_FAULT_ATTR(fei_fault_attr); +static struct dentry *fei_debugfs_dir; + +static unsigned long adjust_error_retval(unsigned long addr, unsigned long retv) +{ + switch (get_injectable_error_type(addr)) { + case EI_ETYPE_NULL: + if (retv != 0) + return 0; + break; + case EI_ETYPE_ERRNO: + if (retv < (unsigned long)-MAX_ERRNO) + return (unsigned long)-EINVAL; + break; + case EI_ETYPE_ERRNO_NULL: + if (retv != 0 && retv < (unsigned long)-MAX_ERRNO) + return (unsigned long)-EINVAL; + break; + } + + return retv; +} + +static struct fei_attr *fei_attr_new(const char *sym, unsigned long addr) +{ + struct fei_attr *attr; + + attr = kzalloc(sizeof(*attr), GFP_KERNEL); + if (attr) { + attr->kp.symbol_name = kstrdup(sym, GFP_KERNEL); + if (!attr->kp.symbol_name) { + kfree(attr); + return NULL; + } + attr->kp.pre_handler = fei_kprobe_handler; + attr->retval = adjust_error_retval(addr, 0); + INIT_LIST_HEAD(&attr->list); + } + return attr; +} + +static void fei_attr_free(struct fei_attr *attr) +{ + if (attr) { + kfree(attr->kp.symbol_name); + kfree(attr); + } +} + +static struct fei_attr *fei_attr_lookup(const char *sym) +{ + struct fei_attr *attr; + + list_for_each_entry(attr, &fei_attr_list, list) { + if (!strcmp(attr->kp.symbol_name, sym)) + return attr; + } + + return NULL; +} + +static bool fei_attr_is_valid(struct fei_attr *_attr) +{ + struct fei_attr *attr; + + list_for_each_entry(attr, &fei_attr_list, list) { + if (attr == _attr) + return true; + } + + return false; +} + +static int fei_retval_set(void *data, u64 val) +{ + struct fei_attr *attr = data; + unsigned long retv = (unsigned long)val; + int err = 0; + + mutex_lock(&fei_lock); + /* + * Since this operation can be done after retval file is removed, + * It is safer to check the attr is still valid before accessing + * its member. 
+ */ + if (!fei_attr_is_valid(attr)) { + err = -ENOENT; + goto out; + } + + if (attr->kp.addr) { + if (adjust_error_retval((unsigned long)attr->kp.addr, + val) != retv) + err = -EINVAL; + } + if (!err) + attr->retval = val; +out: + mutex_unlock(&fei_lock); + + return err; +} + +static int fei_retval_get(void *data, u64 *val) +{ + struct fei_attr *attr = data; + int err = 0; + + mutex_lock(&fei_lock); + /* Here we also validate @attr to ensure it still exists. */ + if (!fei_attr_is_valid(attr)) + err = -ENOENT; + else + *val = attr->retval; + mutex_unlock(&fei_lock); + + return err; +} +DEFINE_DEBUGFS_ATTRIBUTE(fei_retval_ops, fei_retval_get, fei_retval_set, + "%llx\n"); + +static int fei_debugfs_add_attr(struct fei_attr *attr) +{ + struct dentry *dir; + + dir = debugfs_create_dir(attr->kp.symbol_name, fei_debugfs_dir); + if (!dir) + return -ENOMEM; + + if (!debugfs_create_file("retval", 0600, dir, attr, &fei_retval_ops)) { + debugfs_remove_recursive(dir); + return -ENOMEM; + } + + return 0; +} + +static void fei_debugfs_remove_attr(struct fei_attr *attr) +{ + struct dentry *dir; + + dir = debugfs_lookup(attr->kp.symbol_name, fei_debugfs_dir); + if (dir) + debugfs_remove_recursive(dir); +} + +static int fei_kprobe_handler(struct kprobe *kp, struct pt_regs *regs) +{ + struct fei_attr *attr = container_of(kp, struct fei_attr, kp); + + if (should_fail(&fei_fault_attr, 1)) { + regs_set_return_value(regs, attr->retval); + override_function_with_return(regs); + /* Kprobe specific fixup */ + reset_current_kprobe(); + preempt_enable_no_resched(); + return 1; + } + + return 0; +} +NOKPROBE_SYMBOL(fei_kprobe_handler) + +static void *fei_seq_start(struct seq_file *m, loff_t *pos) +{ + mutex_lock(&fei_lock); + return seq_list_start(&fei_attr_list, *pos); +} + +static void fei_seq_stop(struct seq_file *m, void *v) +{ + mutex_unlock(&fei_lock); +} + +static void *fei_seq_next(struct seq_file *m, void *v, loff_t *pos) +{ + return seq_list_next(v, &fei_attr_list, pos); +} + +static int fei_seq_show(struct seq_file *m, void *v) +{ + struct fei_attr *attr = list_entry(v, struct fei_attr, list); + + seq_printf(m, "%pf\n", attr->kp.addr); + return 0; +} + +static const struct seq_operations fei_seq_ops = { + .start = fei_seq_start, + .next = fei_seq_next, + .stop = fei_seq_stop, + .show = fei_seq_show, +}; + +static int fei_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &fei_seq_ops); +} + +static void fei_attr_remove(struct fei_attr *attr) +{ + fei_debugfs_remove_attr(attr); + unregister_kprobe(&attr->kp); + list_del(&attr->list); + fei_attr_free(attr); +} + +static void fei_attr_remove_all(void) +{ + struct fei_attr *attr, *n; + + list_for_each_entry_safe(attr, n, &fei_attr_list, list) { + fei_attr_remove(attr); + } +} + +static ssize_t fei_write(struct file *file, const char __user *buffer, + size_t count, loff_t *ppos) +{ + struct fei_attr *attr; + unsigned long addr; + char *buf, *sym; + int ret; + + /* cut off if it is too long */ + if (count > KSYM_NAME_LEN) + count = KSYM_NAME_LEN; + buf = kmalloc(sizeof(char) * (count + 1), GFP_KERNEL); + if (!buf) + return -ENOMEM; + + if (copy_from_user(buf, buffer, count)) { + ret = -EFAULT; + goto out; + } + buf[count] = '\0'; + sym = strstrip(buf); + + mutex_lock(&fei_lock); + + /* Writing just spaces will remove all injection points */ + if (sym[0] == '\0') { + fei_attr_remove_all(); + ret = count; + goto out; + } + /* Writing !function will remove one injection point */ + if (sym[0] == '!') { + attr = fei_attr_lookup(sym + 1); + if 
(!attr) { + ret = -ENOENT; + goto out; + } + fei_attr_remove(attr); + ret = count; + goto out; + } + + addr = kallsyms_lookup_name(sym); + if (!addr) { + ret = -EINVAL; + goto out; + } + if (!within_error_injection_list(addr)) { + ret = -ERANGE; + goto out; + } + if (fei_attr_lookup(sym)) { + ret = -EBUSY; + goto out; + } + attr = fei_attr_new(sym, addr); + if (!attr) { + ret = -ENOMEM; + goto out; + } + + ret = register_kprobe(&attr->kp); + if (!ret) + ret = fei_debugfs_add_attr(attr); + if (ret < 0) + fei_attr_remove(attr); + else { + list_add_tail(&attr->list, &fei_attr_list); + ret = count; + } +out: + kfree(buf); + mutex_unlock(&fei_lock); + return ret; +} + +static const struct file_operations fei_ops = { + .open = fei_open, + .read = seq_read, + .write = fei_write, + .llseek = seq_lseek, + .release = seq_release, +}; + +static int __init fei_debugfs_init(void) +{ + struct dentry *dir; + + dir = fault_create_debugfs_attr("fail_function", NULL, + &fei_fault_attr); + if (IS_ERR(dir)) + return PTR_ERR(dir); + + /* injectable attribute is just a symlink of error_inject/list */ + if (!debugfs_create_symlink("injectable", dir, + "../error_injection/list")) + goto error; + + if (!debugfs_create_file("inject", 0600, dir, NULL, &fei_ops)) + goto error; + + fei_debugfs_dir = dir; + + return 0; +error: + debugfs_remove_recursive(dir); + return -ENOMEM; +} + +late_initcall(fei_debugfs_init); diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 2a33efdd1fea..890d4766cef3 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1551,6 +1551,16 @@ config FAIL_FUTEX help Provide fault-injection capability for futexes. +config FAIL_FUNCTION + bool "Fault-injection capability for functions" + depends on FAULT_INJECTION_DEBUG_FS && FUNCTION_ERROR_INJECTION + help + Provide function-based fault-injection capability. + This will allow you to override a specific function so that it + returns a given value instead. As a result, the function caller + will see an error value and have to handle it. This is useful to + test the error handling in various subsystems. + config FAULT_INJECTION_DEBUG_FS bool "Debugfs entries for fault-injection capabilities" depends on FAULT_INJECTION && SYSFS && DEBUG_FS -- cgit v1.2.3 From 1110f3a9bcf394c06b81a98206aee9b6860653c8 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 11 Jan 2018 20:29:03 -0800 Subject: bpf: add map_alloc_check callback .map_alloc callbacks contain a number of checks validating user-provided map attributes against constraints of a particular map type. For offloaded maps we will need to check map attributes without actually allocating any memory on the host. Add a new callback for validating attributes before any memory is allocated. This callback can be selectively implemented by map types for sharing code with offloads, or simply to separate the logical steps of validation and allocation.
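As a hedged sketch of the intended split (not part of the patch; the mymap_* names and struct are hypothetical), a map type using the new callback might look like this:

	/* Hypothetical map type: pure attribute validation is separated
	 * from allocation, so offloads can reuse the checks without
	 * allocating host memory.
	 */
	struct mymap {
		struct bpf_map map;
		/* private state ... */
	};

	static int mymap_alloc_check(union bpf_attr *attr)
	{
		/* validation only; no memory is touched here */
		if (attr->key_size != 4 || attr->value_size == 0 ||
		    attr->max_entries == 0)
			return -EINVAL;
		return 0;
	}

	static struct bpf_map *mymap_alloc(union bpf_attr *attr)
	{
		struct mymap *m;

		/* attributes were already validated by ->map_alloc_check() */
		m = kzalloc(sizeof(*m), GFP_USER);
		if (!m)
			return ERR_PTR(-ENOMEM);
		return &m->map;
	}

	const struct bpf_map_ops mymap_ops = {
		.map_alloc_check = mymap_alloc_check,	/* optional */
		.map_alloc	 = mymap_alloc,
		/* .map_free, .map_lookup_elem, ... */
	};

find_and_alloc_map() invokes ->map_alloc_check() first, when the map type provides one, and only then calls ->map_alloc(), as the syscall.c hunk below shows.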
Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet Signed-off-by: Daniel Borkmann --- include/linux/bpf.h | 1 + kernel/bpf/syscall.c | 17 +++++++++++++---- 2 files changed, 14 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 3496977203a3..263598619f90 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -25,6 +25,7 @@ struct bpf_map; /* map is generic key/value storage optionally accesible by eBPF programs */ struct bpf_map_ops { /* funcs callable from userspace (via syscall) */ + int (*map_alloc_check)(union bpf_attr *attr); struct bpf_map *(*map_alloc)(union bpf_attr *attr); void (*map_release)(struct bpf_map *map, struct file *map_file); void (*map_free)(struct bpf_map *map); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 2bac0dc8baba..c0ac03a04880 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -96,16 +96,25 @@ static int check_uarg_tail_zero(void __user *uaddr, static struct bpf_map *find_and_alloc_map(union bpf_attr *attr) { + const struct bpf_map_ops *ops; struct bpf_map *map; + int err; - if (attr->map_type >= ARRAY_SIZE(bpf_map_types) || - !bpf_map_types[attr->map_type]) + if (attr->map_type >= ARRAY_SIZE(bpf_map_types)) + return ERR_PTR(-EINVAL); + ops = bpf_map_types[attr->map_type]; + if (!ops) return ERR_PTR(-EINVAL); - map = bpf_map_types[attr->map_type]->map_alloc(attr); + if (ops->map_alloc_check) { + err = ops->map_alloc_check(attr); + if (err) + return ERR_PTR(err); + } + map = ops->map_alloc(attr); if (IS_ERR(map)) return map; - map->ops = bpf_map_types[attr->map_type]; + map->ops = ops; map->map_type = attr->map_type; return map; } -- cgit v1.2.3 From daffc5a2e6f4bf4f99b00e183117920e321b6763 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 11 Jan 2018 20:29:04 -0800 Subject: bpf: hashtab: move attribute validation before allocation Number of attribute checks are currently performed after hashtab is already allocated. Move them to be able to split them out to the check function later on. Checks have to now be performed on the attr union directly instead of the members of bpf_map, since bpf_map will be allocated later. No functional changes. Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet Signed-off-by: Daniel Borkmann --- kernel/bpf/hashtab.c | 47 +++++++++++++++++++++++------------------------ 1 file changed, 23 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 3905d4bc5b80..b80f42adf068 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -269,6 +269,28 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) if (numa_node != NUMA_NO_NODE && (percpu || percpu_lru)) return ERR_PTR(-EINVAL); + /* check sanity of attributes. + * value_size == 0 may be allowed in the future to use map as a set + */ + if (attr->max_entries == 0 || attr->key_size == 0 || + attr->value_size == 0) + return ERR_PTR(-EINVAL); + + if (attr->key_size > MAX_BPF_STACK) + /* eBPF programs initialize keys on stack, so they cannot be + * larger than max stack size + */ + return ERR_PTR(-E2BIG); + + if (attr->value_size >= KMALLOC_MAX_SIZE - + MAX_BPF_STACK - sizeof(struct htab_elem)) + /* if value_size is bigger, the user space won't be able to + * access the elements via bpf syscall. 
This check also makes + * sure that the elem_size doesn't overflow and it's + * kmalloc-able later in htab_map_update_elem() + */ + return ERR_PTR(-E2BIG); + htab = kzalloc(sizeof(*htab), GFP_USER); if (!htab) return ERR_PTR(-ENOMEM); @@ -281,14 +303,6 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) htab->map.map_flags = attr->map_flags; htab->map.numa_node = numa_node; - /* check sanity of attributes. - * value_size == 0 may be allowed in the future to use map as a set - */ - err = -EINVAL; - if (htab->map.max_entries == 0 || htab->map.key_size == 0 || - htab->map.value_size == 0) - goto free_htab; - if (percpu_lru) { /* ensure each CPU's lru list has >=1 elements. * since we are at it, make each lru list has the same @@ -304,22 +318,6 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) /* hash table size must be power of 2 */ htab->n_buckets = roundup_pow_of_two(htab->map.max_entries); - err = -E2BIG; - if (htab->map.key_size > MAX_BPF_STACK) - /* eBPF programs initialize keys on stack, so they cannot be - * larger than max stack size - */ - goto free_htab; - - if (htab->map.value_size >= KMALLOC_MAX_SIZE - - MAX_BPF_STACK - sizeof(struct htab_elem)) - /* if value_size is bigger, the user space won't be able to - * access the elements via bpf syscall. This check also makes - * sure that the elem_size doesn't overflow and it's - * kmalloc-able later in htab_map_update_elem() - */ - goto free_htab; - htab->elem_size = sizeof(struct htab_elem) + round_up(htab->map.key_size, 8); if (percpu) @@ -327,6 +325,7 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) else htab->elem_size += round_up(htab->map.value_size, 8); + err = -E2BIG; /* prevent zero size kmalloc and check for u32 overflow */ if (htab->n_buckets == 0 || htab->n_buckets > U32_MAX / sizeof(struct bucket)) -- cgit v1.2.3 From 9328e0d1bc09e96bd7dc85374f5c2a1e0e04e539 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 11 Jan 2018 20:29:05 -0800 Subject: bpf: hashtab: move checks out of alloc function Use the new callback to perform allocation checks for hash maps. Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet Signed-off-by: Daniel Borkmann --- kernel/bpf/hashtab.c | 55 +++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 39 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index b80f42adf068..7fd6519444d3 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -227,7 +227,7 @@ static int alloc_extra_elems(struct bpf_htab *htab) } /* Called from syscall */ -static struct bpf_map *htab_map_alloc(union bpf_attr *attr) +static int htab_map_alloc_check(union bpf_attr *attr) { bool percpu = (attr->map_type == BPF_MAP_TYPE_PERCPU_HASH || attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH); @@ -241,9 +241,6 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) bool percpu_lru = (attr->map_flags & BPF_F_NO_COMMON_LRU); bool prealloc = !(attr->map_flags & BPF_F_NO_PREALLOC); int numa_node = bpf_map_attr_numa_node(attr); - struct bpf_htab *htab; - int err, i; - u64 cost; BUILD_BUG_ON(offsetof(struct htab_elem, htab) != offsetof(struct htab_elem, hash_node.pprev)); @@ -254,33 +251,33 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) /* LRU implementation is much complicated than other * maps. Hence, limit to CAP_SYS_ADMIN for now. 
*/ - return ERR_PTR(-EPERM); + return -EPERM; if (attr->map_flags & ~HTAB_CREATE_FLAG_MASK) /* reserved bits should not be used */ - return ERR_PTR(-EINVAL); + return -EINVAL; if (!lru && percpu_lru) - return ERR_PTR(-EINVAL); + return -EINVAL; if (lru && !prealloc) - return ERR_PTR(-ENOTSUPP); + return -ENOTSUPP; if (numa_node != NUMA_NO_NODE && (percpu || percpu_lru)) - return ERR_PTR(-EINVAL); + return -EINVAL; /* check sanity of attributes. * value_size == 0 may be allowed in the future to use map as a set */ if (attr->max_entries == 0 || attr->key_size == 0 || attr->value_size == 0) - return ERR_PTR(-EINVAL); + return -EINVAL; if (attr->key_size > MAX_BPF_STACK) /* eBPF programs initialize keys on stack, so they cannot be * larger than max stack size */ - return ERR_PTR(-E2BIG); + return -E2BIG; if (attr->value_size >= KMALLOC_MAX_SIZE - MAX_BPF_STACK - sizeof(struct htab_elem)) @@ -289,7 +286,28 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) * sure that the elem_size doesn't overflow and it's * kmalloc-able later in htab_map_update_elem() */ - return ERR_PTR(-E2BIG); + return -E2BIG; + + return 0; +} + +static struct bpf_map *htab_map_alloc(union bpf_attr *attr) +{ + bool percpu = (attr->map_type == BPF_MAP_TYPE_PERCPU_HASH || + attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH); + bool lru = (attr->map_type == BPF_MAP_TYPE_LRU_HASH || + attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH); + /* percpu_lru means each cpu has its own LRU list. + * it is different from BPF_MAP_TYPE_PERCPU_HASH where + * the map's value itself is percpu. percpu_lru has + * nothing to do with the map's value. + */ + bool percpu_lru = (attr->map_flags & BPF_F_NO_COMMON_LRU); + bool prealloc = !(attr->map_flags & BPF_F_NO_PREALLOC); + int numa_node = bpf_map_attr_numa_node(attr); + struct bpf_htab *htab; + int err, i; + u64 cost; htab = kzalloc(sizeof(*htab), GFP_USER); if (!htab) @@ -1142,6 +1160,7 @@ static void htab_map_free(struct bpf_map *map) } const struct bpf_map_ops htab_map_ops = { + .map_alloc_check = htab_map_alloc_check, .map_alloc = htab_map_alloc, .map_free = htab_map_free, .map_get_next_key = htab_map_get_next_key, @@ -1152,6 +1171,7 @@ const struct bpf_map_ops htab_map_ops = { }; const struct bpf_map_ops htab_lru_map_ops = { + .map_alloc_check = htab_map_alloc_check, .map_alloc = htab_map_alloc, .map_free = htab_map_free, .map_get_next_key = htab_map_get_next_key, @@ -1235,6 +1255,7 @@ int bpf_percpu_hash_update(struct bpf_map *map, void *key, void *value, } const struct bpf_map_ops htab_percpu_map_ops = { + .map_alloc_check = htab_map_alloc_check, .map_alloc = htab_map_alloc, .map_free = htab_map_free, .map_get_next_key = htab_map_get_next_key, @@ -1244,6 +1265,7 @@ const struct bpf_map_ops htab_percpu_map_ops = { }; const struct bpf_map_ops htab_lru_percpu_map_ops = { + .map_alloc_check = htab_map_alloc_check, .map_alloc = htab_map_alloc, .map_free = htab_map_free, .map_get_next_key = htab_map_get_next_key, @@ -1252,11 +1274,11 @@ const struct bpf_map_ops htab_lru_percpu_map_ops = { .map_delete_elem = htab_lru_map_delete_elem, }; -static struct bpf_map *fd_htab_map_alloc(union bpf_attr *attr) +static int fd_htab_map_alloc_check(union bpf_attr *attr) { if (attr->value_size != sizeof(u32)) - return ERR_PTR(-EINVAL); - return htab_map_alloc(attr); + return -EINVAL; + return htab_map_alloc_check(attr); } static void fd_htab_map_free(struct bpf_map *map) @@ -1327,7 +1349,7 @@ static struct bpf_map *htab_of_map_alloc(union bpf_attr *attr) if (IS_ERR(inner_map_meta)) return 
inner_map_meta; - map = fd_htab_map_alloc(attr); + map = htab_map_alloc(attr); if (IS_ERR(map)) { bpf_map_meta_free(inner_map_meta); return map; @@ -1371,6 +1393,7 @@ static void htab_of_map_free(struct bpf_map *map) } const struct bpf_map_ops htab_of_maps_map_ops = { + .map_alloc_check = fd_htab_map_alloc_check, .map_alloc = htab_of_map_alloc, .map_free = htab_of_map_free, .map_get_next_key = htab_map_get_next_key, -- cgit v1.2.3 From bd475643d74e8ed78bfd36d941053b0e45974e8e Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 11 Jan 2018 20:29:06 -0800 Subject: bpf: add helper for copying attrs to struct bpf_map All map types reimplement the field-by-field copy of union bpf_attr members into struct bpf_map. Add a helper to perform this operation. Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet Acked-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- include/linux/bpf.h | 1 + kernel/bpf/cpumap.c | 8 +------- kernel/bpf/devmap.c | 8 +------- kernel/bpf/hashtab.c | 9 +-------- kernel/bpf/lpm_trie.c | 7 +------ kernel/bpf/sockmap.c | 8 +------- kernel/bpf/stackmap.c | 6 +----- kernel/bpf/syscall.c | 10 ++++++++++ 8 files changed, 17 insertions(+), 40 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 263598619f90..c60ddfb34d41 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -378,6 +378,7 @@ void bpf_map_put(struct bpf_map *map); int bpf_map_precharge_memlock(u32 pages); void *bpf_map_area_alloc(size_t size, int numa_node); void bpf_map_area_free(void *base); +void bpf_map_init_from_attr(struct bpf_map *map, union bpf_attr *attr); extern int sysctl_unprivileged_bpf_disabled; diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index ce5b669003b2..192151ec9d12 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -94,13 +94,7 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr) if (!cmap) return ERR_PTR(-ENOMEM); - /* mandatory map attributes */ - cmap->map.map_type = attr->map_type; - cmap->map.key_size = attr->key_size; - cmap->map.value_size = attr->value_size; - cmap->map.max_entries = attr->max_entries; - cmap->map.map_flags = attr->map_flags; - cmap->map.numa_node = bpf_map_attr_numa_node(attr); + bpf_map_init_from_attr(&cmap->map, attr); /* Pre-limit array size based on NR_CPUS, not final CPU check */ if (cmap->map.max_entries > NR_CPUS) { diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c index ebdef54bf7df..565f9ece9115 100644 --- a/kernel/bpf/devmap.c +++ b/kernel/bpf/devmap.c @@ -93,13 +93,7 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr) if (!dtab) return ERR_PTR(-ENOMEM); - /* mandatory map attributes */ - dtab->map.map_type = attr->map_type; - dtab->map.key_size = attr->key_size; - dtab->map.value_size = attr->value_size; - dtab->map.max_entries = attr->max_entries; - dtab->map.map_flags = attr->map_flags; - dtab->map.numa_node = bpf_map_attr_numa_node(attr); + bpf_map_init_from_attr(&dtab->map, attr); /* make sure page count doesn't overflow */ cost = (u64) dtab->map.max_entries * sizeof(struct bpf_dtab_netdev *); diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 7fd6519444d3..b76828f23b49 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -304,7 +304,6 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) */ bool percpu_lru = (attr->map_flags & BPF_F_NO_COMMON_LRU); bool prealloc = !(attr->map_flags & BPF_F_NO_PREALLOC); - int numa_node = bpf_map_attr_numa_node(attr); struct bpf_htab *htab; int err, i; u64 cost; @@ -313,13 
+312,7 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) if (!htab) return ERR_PTR(-ENOMEM); - /* mandatory map attributes */ - htab->map.map_type = attr->map_type; - htab->map.key_size = attr->key_size; - htab->map.value_size = attr->value_size; - htab->map.max_entries = attr->max_entries; - htab->map.map_flags = attr->map_flags; - htab->map.numa_node = numa_node; + bpf_map_init_from_attr(&htab->map, attr); if (percpu_lru) { /* ensure each CPU's lru list has >=1 elements. diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index 885e45479680..584e02227671 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -522,12 +522,7 @@ static struct bpf_map *trie_alloc(union bpf_attr *attr) return ERR_PTR(-ENOMEM); /* copy mandatory map attributes */ - trie->map.map_type = attr->map_type; - trie->map.key_size = attr->key_size; - trie->map.value_size = attr->value_size; - trie->map.max_entries = attr->max_entries; - trie->map.map_flags = attr->map_flags; - trie->map.numa_node = bpf_map_attr_numa_node(attr); + bpf_map_init_from_attr(&trie->map, attr); trie->data_size = attr->key_size - offsetof(struct bpf_lpm_trie_key, data); trie->max_prefixlen = trie->data_size * 8; diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index 079968680bc3..0314d1783d77 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -513,13 +513,7 @@ static struct bpf_map *sock_map_alloc(union bpf_attr *attr) if (!stab) return ERR_PTR(-ENOMEM); - /* mandatory map attributes */ - stab->map.map_type = attr->map_type; - stab->map.key_size = attr->key_size; - stab->map.value_size = attr->value_size; - stab->map.max_entries = attr->max_entries; - stab->map.map_flags = attr->map_flags; - stab->map.numa_node = bpf_map_attr_numa_node(attr); + bpf_map_init_from_attr(&stab->map, attr); /* make sure page count doesn't overflow */ cost = (u64) stab->map.max_entries * sizeof(struct sock *); diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index 6c63c2222ea8..b0ecf43f5894 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -88,14 +88,10 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr) if (cost >= U32_MAX - PAGE_SIZE) goto free_smap; - smap->map.map_type = attr->map_type; - smap->map.key_size = attr->key_size; + bpf_map_init_from_attr(&smap->map, attr); smap->map.value_size = value_size; - smap->map.max_entries = attr->max_entries; - smap->map.map_flags = attr->map_flags; smap->n_buckets = n_buckets; smap->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; - smap->map.numa_node = bpf_map_attr_numa_node(attr); err = bpf_map_precharge_memlock(smap->map.pages); if (err) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index c0ac03a04880..a3f726bb42ea 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -143,6 +143,16 @@ void bpf_map_area_free(void *area) kvfree(area); } +void bpf_map_init_from_attr(struct bpf_map *map, union bpf_attr *attr) +{ + map->map_type = attr->map_type; + map->key_size = attr->key_size; + map->value_size = attr->value_size; + map->max_entries = attr->max_entries; + map->map_flags = attr->map_flags; + map->numa_node = bpf_map_attr_numa_node(attr); +} + int bpf_map_precharge_memlock(u32 pages) { struct user_struct *user = get_current_user(); -- cgit v1.2.3 From 0a9c1991f285f829fd786fa2a9c824c2a3f87bc6 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 11 Jan 2018 20:29:07 -0800 Subject: bpf: rename bpf_dev_offload -> bpf_prog_offload With map offload coming, we need to call program offload structure something less 
ambiguous. Pure rename, no functional changes. Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet Signed-off-by: Daniel Borkmann --- drivers/net/ethernet/netronome/nfp/bpf/offload.c | 2 +- include/linux/bpf.h | 4 ++-- kernel/bpf/offload.c | 10 +++++----- 3 files changed, 8 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/drivers/net/ethernet/netronome/nfp/bpf/offload.c b/drivers/net/ethernet/netronome/nfp/bpf/offload.c index 320b2250d29a..6590228d3755 100644 --- a/drivers/net/ethernet/netronome/nfp/bpf/offload.c +++ b/drivers/net/ethernet/netronome/nfp/bpf/offload.c @@ -237,7 +237,7 @@ int nfp_net_bpf_offload(struct nfp_net *nn, struct bpf_prog *prog, int err; if (prog) { - struct bpf_dev_offload *offload = prog->aux->offload; + struct bpf_prog_offload *offload = prog->aux->offload; if (!offload) return -EINVAL; diff --git a/include/linux/bpf.h b/include/linux/bpf.h index c60ddfb34d41..9fff1ace1d8e 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -200,7 +200,7 @@ struct bpf_prog_offload_ops { int insn_idx, int prev_insn_idx); }; -struct bpf_dev_offload { +struct bpf_prog_offload { struct bpf_prog *prog; struct net_device *netdev; void *dev_priv; @@ -230,7 +230,7 @@ struct bpf_prog_aux { #ifdef CONFIG_SECURITY void *security; #endif - struct bpf_dev_offload *offload; + struct bpf_prog_offload *offload; union { struct work_struct work; struct rcu_head rcu; diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c index 040d4e0edf3f..001ddfde7874 100644 --- a/kernel/bpf/offload.c +++ b/kernel/bpf/offload.c @@ -32,7 +32,7 @@ static LIST_HEAD(bpf_prog_offload_devs); int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr) { - struct bpf_dev_offload *offload; + struct bpf_prog_offload *offload; if (attr->prog_type != BPF_PROG_TYPE_SCHED_CLS && attr->prog_type != BPF_PROG_TYPE_XDP) @@ -72,7 +72,7 @@ err_free: static int __bpf_offload_ndo(struct bpf_prog *prog, enum bpf_netdev_command cmd, struct netdev_bpf *data) { - struct bpf_dev_offload *offload = prog->aux->offload; + struct bpf_prog_offload *offload = prog->aux->offload; struct net_device *netdev; ASSERT_RTNL(); @@ -110,7 +110,7 @@ exit_unlock: int bpf_prog_offload_verify_insn(struct bpf_verifier_env *env, int insn_idx, int prev_insn_idx) { - struct bpf_dev_offload *offload; + struct bpf_prog_offload *offload; int ret = -ENODEV; down_read(&bpf_devs_lock); @@ -124,7 +124,7 @@ int bpf_prog_offload_verify_insn(struct bpf_verifier_env *env, static void __bpf_prog_offload_destroy(struct bpf_prog *prog) { - struct bpf_dev_offload *offload = prog->aux->offload; + struct bpf_prog_offload *offload = prog->aux->offload; struct netdev_bpf data = {}; data.offload.prog = prog; @@ -242,7 +242,7 @@ static int bpf_offload_notification(struct notifier_block *notifier, ulong event, void *ptr) { struct net_device *netdev = netdev_notifier_info_to_dev(ptr); - struct bpf_dev_offload *offload, *tmp; + struct bpf_prog_offload *offload, *tmp; ASSERT_RTNL(); -- cgit v1.2.3 From 5bc2d55c6a69ef9efd11740359974b08ea11f1d7 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 11 Jan 2018 20:29:08 -0800 Subject: bpf: offload: factor out netdev checking at allocation time Add a helper to check whether the netdev could be found and whether it has an .ndo_bpf callback. There is no need to check the callback every time it's invoked; ndos can't reasonably be swapped for a set without .ndo_bpf while a program is loaded. bpf_dev_offload_check() will also be used by map offload.
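A compressed sketch of the pattern this helper enables (example_offload_netdev_get() is a hypothetical wrapper; the real logic lives in bpf_prog_offload_init() in the diff below):

	/* Validate the device once at attach time; later ->ndo_bpf()
	 * calls can then rely on the callback being present.
	 */
	static int example_offload_netdev_get(struct bpf_prog_offload *offload,
					      struct net *net, u32 ifindex)
	{
		int err;

		offload->netdev = dev_get_by_index(net, ifindex);
		err = bpf_dev_offload_check(offload->netdev);
		if (err) {
			/* bpf_dev_offload_check() tolerates a NULL netdev,
			 * so the error path must test it before dropping
			 * the reference.
			 */
			if (offload->netdev)
				dev_put(offload->netdev);
			return err;
		}
		return 0;
	}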
Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet Signed-off-by: Daniel Borkmann --- kernel/bpf/offload.c | 28 ++++++++++++++++++++-------- 1 file changed, 20 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c index 001ddfde7874..cdd1e19a668b 100644 --- a/kernel/bpf/offload.c +++ b/kernel/bpf/offload.c @@ -30,9 +30,19 @@ static DECLARE_RWSEM(bpf_devs_lock); static LIST_HEAD(bpf_prog_offload_devs); +static int bpf_dev_offload_check(struct net_device *netdev) +{ + if (!netdev) + return -EINVAL; + if (!netdev->netdev_ops->ndo_bpf) + return -EOPNOTSUPP; + return 0; +} + int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr) { struct bpf_prog_offload *offload; + int err; if (attr->prog_type != BPF_PROG_TYPE_SCHED_CLS && attr->prog_type != BPF_PROG_TYPE_XDP) @@ -49,12 +59,15 @@ int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr) offload->netdev = dev_get_by_index(current->nsproxy->net_ns, attr->prog_ifindex); - if (!offload->netdev) - goto err_free; + err = bpf_dev_offload_check(offload->netdev); + if (err) + goto err_maybe_put; down_write(&bpf_devs_lock); - if (offload->netdev->reg_state != NETREG_REGISTERED) + if (offload->netdev->reg_state != NETREG_REGISTERED) { + err = -EINVAL; goto err_unlock; + } prog->aux->offload = offload; list_add_tail(&offload->offloads, &bpf_prog_offload_devs); dev_put(offload->netdev); @@ -63,10 +76,11 @@ int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr) return 0; err_unlock: up_write(&bpf_devs_lock); - dev_put(offload->netdev); -err_free: +err_maybe_put: + if (offload->netdev) + dev_put(offload->netdev); kfree(offload); - return -EINVAL; + return err; } static int __bpf_offload_ndo(struct bpf_prog *prog, enum bpf_netdev_command cmd, @@ -80,8 +94,6 @@ static int __bpf_offload_ndo(struct bpf_prog *prog, enum bpf_netdev_command cmd, if (!offload) return -ENODEV; netdev = offload->netdev; - if (!netdev->netdev_ops->ndo_bpf) - return -EOPNOTSUPP; data->command = cmd; -- cgit v1.2.3 From a38845729ea3985db5d2544ec3ef3dc8f6313a27 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 11 Jan 2018 20:29:09 -0800 Subject: bpf: offload: add map offload infrastructure BPF map offload follows a similar path to program offload. At creation time, users may specify the ifindex of the device on which they want to create the map. The map will be validated by the kernel's .map_alloc_check callback, and the device driver will be called for the actual allocation. The map will have an empty set of operations associated with it (save for alloc and free callbacks). The real device callbacks are kept in map->offload->dev_ops because they have slightly different signatures. Map operations are called in process context, so the driver may communicate with HW freely (msleep(), wait(), etc.). Map alloc and free callbacks are muxed via the existing .ndo_bpf, and are always called with the rtnl lock held. Maps and programs are guaranteed to be destroyed before .ndo_uninit (i.e. before unregister_netdev() returns). Map callbacks are invoked with bpf_devs_lock *read* locked; drivers must take care of exclusive locking if necessary. All offload-specific branches are marked with unlikely() (through bpf_map_is_dev_bound()), given that the branch penalty will be negligible compared to IO anyway, and we don't want to penalize the SW path unnecessarily.
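To make the driver side concrete, a hedged sketch of servicing the two new commands from .ndo_bpf; every mydrv_* name is hypothetical and the HW-facing helpers are assumed to be implemented elsewhere in such a driver:

	/* Per-element callbacks run in process context with bpf_devs_lock
	 * read-locked; bodies are assumed to exist elsewhere in the
	 * hypothetical driver.
	 */
	static int mydrv_map_get_next_key(struct bpf_offloaded_map *offmap,
					  void *key, void *next_key);
	static int mydrv_map_lookup(struct bpf_offloaded_map *offmap,
				    void *key, void *value);
	static int mydrv_map_update(struct bpf_offloaded_map *offmap,
				    void *key, void *value, u64 flags);
	static int mydrv_map_delete(struct bpf_offloaded_map *offmap, void *key);
	static void *mydrv_map_create(struct net_device *netdev,
				      struct bpf_map *map);
	static void mydrv_map_destroy(void *priv);

	static const struct bpf_map_dev_ops mydrv_map_dev_ops = {
		.map_get_next_key	= mydrv_map_get_next_key,
		.map_lookup_elem	= mydrv_map_lookup,
		.map_update_elem	= mydrv_map_update,
		.map_delete_elem	= mydrv_map_delete,
	};

	static int mydrv_ndo_bpf(struct net_device *netdev,
				 struct netdev_bpf *bpf)
	{
		switch (bpf->command) {
		case BPF_OFFLOAD_MAP_ALLOC:
			/* rtnl is held; program the HW, then publish the ops */
			bpf->offmap->dev_priv = mydrv_map_create(netdev,
								 &bpf->offmap->map);
			if (IS_ERR(bpf->offmap->dev_priv))
				return PTR_ERR(bpf->offmap->dev_priv);
			bpf->offmap->dev_ops = &mydrv_map_dev_ops;
			return 0;
		case BPF_OFFLOAD_MAP_FREE:
			mydrv_map_destroy(bpf->offmap->dev_priv);
			return 0;
		default:
			return -EINVAL;
		}
	}

Keeping these device callbacks in a separate bpf_map_dev_ops (rather than in bpf_map_ops) matches the note above about their slightly different signatures: they take the bpf_offloaded_map directly.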
Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet Signed-off-by: Daniel Borkmann --- include/linux/bpf.h | 59 +++++++++++++ include/linux/netdevice.h | 6 ++ include/uapi/linux/bpf.h | 1 + kernel/bpf/offload.c | 188 +++++++++++++++++++++++++++++++++++++++-- kernel/bpf/syscall.c | 44 ++++++++-- kernel/bpf/verifier.c | 7 ++ tools/include/uapi/linux/bpf.h | 1 + 7 files changed, 293 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 9fff1ace1d8e..5c2c104dc2c5 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -74,6 +74,33 @@ struct bpf_map { char name[BPF_OBJ_NAME_LEN]; }; +struct bpf_offloaded_map; + +struct bpf_map_dev_ops { + int (*map_get_next_key)(struct bpf_offloaded_map *map, + void *key, void *next_key); + int (*map_lookup_elem)(struct bpf_offloaded_map *map, + void *key, void *value); + int (*map_update_elem)(struct bpf_offloaded_map *map, + void *key, void *value, u64 flags); + int (*map_delete_elem)(struct bpf_offloaded_map *map, void *key); +}; + +struct bpf_offloaded_map { + struct bpf_map map; + struct net_device *netdev; + const struct bpf_map_dev_ops *dev_ops; + void *dev_priv; + struct list_head offloads; +}; + +static inline struct bpf_offloaded_map *map_to_offmap(struct bpf_map *map) +{ + return container_of(map, struct bpf_offloaded_map, map); +} + +extern const struct bpf_map_ops bpf_map_offload_ops; + /* function argument constraints */ enum bpf_arg_type { ARG_DONTCARE = 0, /* unused argument in helper function */ @@ -369,6 +396,7 @@ int __bpf_prog_charge(struct user_struct *user, u32 pages); void __bpf_prog_uncharge(struct user_struct *user, u32 pages); void bpf_prog_free_id(struct bpf_prog *prog, bool do_idr_lock); +void bpf_map_free_id(struct bpf_map *map, bool do_idr_lock); struct bpf_map *bpf_map_get_with_uref(u32 ufd); struct bpf_map *__bpf_map_get(struct fd f); @@ -556,6 +584,15 @@ void bpf_prog_offload_destroy(struct bpf_prog *prog); int bpf_prog_offload_info_fill(struct bpf_prog_info *info, struct bpf_prog *prog); +int bpf_map_offload_lookup_elem(struct bpf_map *map, void *key, void *value); +int bpf_map_offload_update_elem(struct bpf_map *map, + void *key, void *value, u64 flags); +int bpf_map_offload_delete_elem(struct bpf_map *map, void *key); +int bpf_map_offload_get_next_key(struct bpf_map *map, + void *key, void *next_key); + +bool bpf_offload_dev_match(struct bpf_prog *prog, struct bpf_map *map); + #if defined(CONFIG_NET) && defined(CONFIG_BPF_SYSCALL) int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr); @@ -563,6 +600,14 @@ static inline bool bpf_prog_is_dev_bound(struct bpf_prog_aux *aux) { return aux->offload_requested; } + +static inline bool bpf_map_is_dev_bound(struct bpf_map *map) +{ + return unlikely(map->ops == &bpf_map_offload_ops); +} + +struct bpf_map *bpf_map_offload_map_alloc(union bpf_attr *attr); +void bpf_map_offload_map_free(struct bpf_map *map); #else static inline int bpf_prog_offload_init(struct bpf_prog *prog, union bpf_attr *attr) @@ -574,6 +619,20 @@ static inline bool bpf_prog_is_dev_bound(struct bpf_prog_aux *aux) { return false; } + +static inline bool bpf_map_is_dev_bound(struct bpf_map *map) +{ + return false; +} + +static inline struct bpf_map *bpf_map_offload_map_alloc(union bpf_attr *attr) +{ + return ERR_PTR(-EOPNOTSUPP); +} + +static inline void bpf_map_offload_map_free(struct bpf_map *map) +{ +} #endif /* CONFIG_NET && CONFIG_BPF_SYSCALL */ #if defined(CONFIG_STREAM_PARSER) && defined(CONFIG_BPF_SYSCALL) && 
defined(CONFIG_INET) diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index ef7b348e8498..0b3ab42d50fe 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -804,6 +804,8 @@ enum bpf_netdev_command { BPF_OFFLOAD_VERIFIER_PREP, BPF_OFFLOAD_TRANSLATE, BPF_OFFLOAD_DESTROY, + BPF_OFFLOAD_MAP_ALLOC, + BPF_OFFLOAD_MAP_FREE, }; struct bpf_prog_offload_ops; @@ -834,6 +836,10 @@ struct netdev_bpf { struct { struct bpf_prog *prog; } offload; + /* BPF_OFFLOAD_MAP_ALLOC, BPF_OFFLOAD_MAP_FREE */ + struct { + struct bpf_offloaded_map *offmap; + }; }; }; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 395d261948de..7c2259e8bc54 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -245,6 +245,7 @@ union bpf_attr { * BPF_F_NUMA_NODE is set). */ char map_name[BPF_OBJ_NAME_LEN]; + __u32 map_ifindex; /* ifindex of netdev to create on */ }; struct { /* anonymous struct used by BPF_MAP_*_ELEM commands */ diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c index cdd1e19a668b..453785fa1881 100644 --- a/kernel/bpf/offload.c +++ b/kernel/bpf/offload.c @@ -24,11 +24,13 @@ #include #include -/* Protects bpf_prog_offload_devs and offload members of all progs. +/* Protects bpf_prog_offload_devs, bpf_map_offload_devs and offload members + * of all progs. * RTNL lock cannot be taken when holding this lock. */ static DECLARE_RWSEM(bpf_devs_lock); static LIST_HEAD(bpf_prog_offload_devs); +static LIST_HEAD(bpf_map_offload_devs); static int bpf_dev_offload_check(struct net_device *netdev) { @@ -250,11 +252,186 @@ int bpf_prog_offload_info_fill(struct bpf_prog_info *info, const struct bpf_prog_ops bpf_offload_prog_ops = { }; +static int bpf_map_offload_ndo(struct bpf_offloaded_map *offmap, + enum bpf_netdev_command cmd) +{ + struct netdev_bpf data = {}; + struct net_device *netdev; + + ASSERT_RTNL(); + + data.command = cmd; + data.offmap = offmap; + /* Caller must make sure netdev is valid */ + netdev = offmap->netdev; + + return netdev->netdev_ops->ndo_bpf(netdev, &data); +} + +struct bpf_map *bpf_map_offload_map_alloc(union bpf_attr *attr) +{ + struct net *net = current->nsproxy->net_ns; + struct bpf_offloaded_map *offmap; + int err; + + if (!capable(CAP_SYS_ADMIN)) + return ERR_PTR(-EPERM); + if (attr->map_type != BPF_MAP_TYPE_HASH) + return ERR_PTR(-EINVAL); + + offmap = kzalloc(sizeof(*offmap), GFP_USER); + if (!offmap) + return ERR_PTR(-ENOMEM); + + bpf_map_init_from_attr(&offmap->map, attr); + + rtnl_lock(); + down_write(&bpf_devs_lock); + offmap->netdev = __dev_get_by_index(net, attr->map_ifindex); + err = bpf_dev_offload_check(offmap->netdev); + if (err) + goto err_unlock; + + err = bpf_map_offload_ndo(offmap, BPF_OFFLOAD_MAP_ALLOC); + if (err) + goto err_unlock; + + list_add_tail(&offmap->offloads, &bpf_map_offload_devs); + up_write(&bpf_devs_lock); + rtnl_unlock(); + + return &offmap->map; + +err_unlock: + up_write(&bpf_devs_lock); + rtnl_unlock(); + kfree(offmap); + return ERR_PTR(err); +} + +static void __bpf_map_offload_destroy(struct bpf_offloaded_map *offmap) +{ + WARN_ON(bpf_map_offload_ndo(offmap, BPF_OFFLOAD_MAP_FREE)); + /* Make sure BPF_MAP_GET_NEXT_ID can't find this dead map */ + bpf_map_free_id(&offmap->map, true); + list_del_init(&offmap->offloads); + offmap->netdev = NULL; +} + +void bpf_map_offload_map_free(struct bpf_map *map) +{ + struct bpf_offloaded_map *offmap = map_to_offmap(map); + + rtnl_lock(); + down_write(&bpf_devs_lock); + if (offmap->netdev) + __bpf_map_offload_destroy(offmap); + 
up_write(&bpf_devs_lock); + rtnl_unlock(); + + kfree(offmap); +} + +int bpf_map_offload_lookup_elem(struct bpf_map *map, void *key, void *value) +{ + struct bpf_offloaded_map *offmap = map_to_offmap(map); + int ret = -ENODEV; + + down_read(&bpf_devs_lock); + if (offmap->netdev) + ret = offmap->dev_ops->map_lookup_elem(offmap, key, value); + up_read(&bpf_devs_lock); + + return ret; +} + +int bpf_map_offload_update_elem(struct bpf_map *map, + void *key, void *value, u64 flags) +{ + struct bpf_offloaded_map *offmap = map_to_offmap(map); + int ret = -ENODEV; + + if (unlikely(flags > BPF_EXIST)) + return -EINVAL; + + down_read(&bpf_devs_lock); + if (offmap->netdev) + ret = offmap->dev_ops->map_update_elem(offmap, key, value, + flags); + up_read(&bpf_devs_lock); + + return ret; +} + +int bpf_map_offload_delete_elem(struct bpf_map *map, void *key) +{ + struct bpf_offloaded_map *offmap = map_to_offmap(map); + int ret = -ENODEV; + + down_read(&bpf_devs_lock); + if (offmap->netdev) + ret = offmap->dev_ops->map_delete_elem(offmap, key); + up_read(&bpf_devs_lock); + + return ret; +} + +int bpf_map_offload_get_next_key(struct bpf_map *map, void *key, void *next_key) +{ + struct bpf_offloaded_map *offmap = map_to_offmap(map); + int ret = -ENODEV; + + down_read(&bpf_devs_lock); + if (offmap->netdev) + ret = offmap->dev_ops->map_get_next_key(offmap, key, next_key); + up_read(&bpf_devs_lock); + + return ret; +} + +bool bpf_offload_dev_match(struct bpf_prog *prog, struct bpf_map *map) +{ + struct bpf_offloaded_map *offmap; + struct bpf_prog_offload *offload; + bool ret; + + if (!!bpf_prog_is_dev_bound(prog->aux) != !!bpf_map_is_dev_bound(map)) + return false; + if (!bpf_prog_is_dev_bound(prog->aux)) + return true; + + down_read(&bpf_devs_lock); + offload = prog->aux->offload; + offmap = map_to_offmap(map); + + ret = offload && offload->netdev == offmap->netdev; + up_read(&bpf_devs_lock); + + return ret; +} + +static void bpf_offload_orphan_all_progs(struct net_device *netdev) +{ + struct bpf_prog_offload *offload, *tmp; + + list_for_each_entry_safe(offload, tmp, &bpf_prog_offload_devs, offloads) + if (offload->netdev == netdev) + __bpf_prog_offload_destroy(offload->prog); +} + +static void bpf_offload_orphan_all_maps(struct net_device *netdev) +{ + struct bpf_offloaded_map *offmap, *tmp; + + list_for_each_entry_safe(offmap, tmp, &bpf_map_offload_devs, offloads) + if (offmap->netdev == netdev) + __bpf_map_offload_destroy(offmap); +} + static int bpf_offload_notification(struct notifier_block *notifier, ulong event, void *ptr) { struct net_device *netdev = netdev_notifier_info_to_dev(ptr); - struct bpf_prog_offload *offload, *tmp; ASSERT_RTNL(); @@ -265,11 +442,8 @@ static int bpf_offload_notification(struct notifier_block *notifier, break; down_write(&bpf_devs_lock); - list_for_each_entry_safe(offload, tmp, &bpf_prog_offload_devs, - offloads) { - if (offload->netdev == netdev) - __bpf_prog_offload_destroy(offload->prog); - } + bpf_offload_orphan_all_progs(netdev); + bpf_offload_orphan_all_maps(netdev); up_write(&bpf_devs_lock); break; default: diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index a3f726bb42ea..c691b9e972e3 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -94,6 +94,11 @@ static int check_uarg_tail_zero(void __user *uaddr, return 0; } +const struct bpf_map_ops bpf_map_offload_ops = { + .map_alloc = bpf_map_offload_map_alloc, + .map_free = bpf_map_offload_map_free, +}; + static struct bpf_map *find_and_alloc_map(union bpf_attr *attr) { const struct bpf_map_ops *ops; @@ 
-111,6 +116,8 @@ static struct bpf_map *find_and_alloc_map(union bpf_attr *attr) if (err) return ERR_PTR(err); } + if (attr->map_ifindex) + ops = &bpf_map_offload_ops; map = ops->map_alloc(attr); if (IS_ERR(map)) return map; @@ -208,16 +215,25 @@ static int bpf_map_alloc_id(struct bpf_map *map) return id > 0 ? 0 : id; } -static void bpf_map_free_id(struct bpf_map *map, bool do_idr_lock) +void bpf_map_free_id(struct bpf_map *map, bool do_idr_lock) { unsigned long flags; + /* Offloaded maps are removed from the IDR store when their device + * disappears - even if someone holds an fd to them they are unusable, + * the memory is gone, all ops will fail; they are simply waiting for + * refcnt to drop to be freed. + */ + if (!map->id) + return; + if (do_idr_lock) spin_lock_irqsave(&map_idr_lock, flags); else __acquire(&map_idr_lock); idr_remove(&map_idr, map->id); + map->id = 0; if (do_idr_lock) spin_unlock_irqrestore(&map_idr_lock, flags); @@ -397,7 +413,7 @@ static int bpf_obj_name_cpy(char *dst, const char *src) return 0; } -#define BPF_MAP_CREATE_LAST_FIELD map_name +#define BPF_MAP_CREATE_LAST_FIELD map_ifindex /* called via syscall */ static int map_create(union bpf_attr *attr) { @@ -585,8 +601,10 @@ static int map_lookup_elem(union bpf_attr *attr) if (!value) goto free_key; - if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || - map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) { + if (bpf_map_is_dev_bound(map)) { + err = bpf_map_offload_lookup_elem(map, key, value); + } else if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || + map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) { err = bpf_percpu_hash_copy(map, key, value); } else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) { err = bpf_percpu_array_copy(map, key, value); @@ -673,7 +691,10 @@ static int map_update_elem(union bpf_attr *attr) goto free_value; /* Need to create a kthread, thus must support schedule */ - if (map->map_type == BPF_MAP_TYPE_CPUMAP) { + if (bpf_map_is_dev_bound(map)) { + err = bpf_map_offload_update_elem(map, key, value, attr->flags); + goto out; + } else if (map->map_type == BPF_MAP_TYPE_CPUMAP) { err = map->ops->map_update_elem(map, key, value, attr->flags); goto out; } @@ -750,6 +771,11 @@ static int map_delete_elem(union bpf_attr *attr) goto err_put; } + if (bpf_map_is_dev_bound(map)) { + err = bpf_map_offload_delete_elem(map, key); + goto out; + } + preempt_disable(); __this_cpu_inc(bpf_prog_active); rcu_read_lock(); @@ -757,7 +783,7 @@ static int map_delete_elem(union bpf_attr *attr) rcu_read_unlock(); __this_cpu_dec(bpf_prog_active); preempt_enable(); - +out: if (!err) trace_bpf_map_delete_elem(map, ufd, key); kfree(key); @@ -807,9 +833,15 @@ static int map_get_next_key(union bpf_attr *attr) if (!next_key) goto free_key; + if (bpf_map_is_dev_bound(map)) { + err = bpf_map_offload_get_next_key(map, key, next_key); + goto out; + } + rcu_read_lock(); err = map->ops->map_get_next_key(map, key, next_key); rcu_read_unlock(); +out: if (err) goto free_next_key; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 48b61caa94cb..ceabb394d2dc 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -4816,6 +4816,13 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env, return -EINVAL; } } + + if ((bpf_prog_is_dev_bound(prog->aux) || bpf_map_is_dev_bound(map)) && + !bpf_offload_dev_match(prog, map)) { + verbose(env, "offload device mismatch between prog and map\n"); + return -EINVAL; + } + return 0; } diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 
4e8c60acfa32..69f96af4a569 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -245,6 +245,7 @@ union bpf_attr { * BPF_F_NUMA_NODE is set). */ char map_name[BPF_OBJ_NAME_LEN]; + __u32 map_ifindex; /* ifindex of netdev to create on */ }; struct { /* anonymous struct used by BPF_MAP_*_ELEM commands */ -- cgit v1.2.3 From 6106c0f82481e686b337ee0c403821fb5c3c17ef Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 11 Jan 2018 15:06:40 +1100 Subject: staging: lustre: lnet: convert selftest to use workqueues Instead of the cfs workitem library, use workqueues. As lnet wants to provide a cpu mask of allowed cpus, it needs to be a WQ_UNBOUND work queue so that tasks can run on cpus other than where they were submitted. This patch also exports apply_workqueue_attrs(), a documented part of the workqueue API that isn't currently exported. Lustre needs it to allow a workqueue's threads to be limited to a subset of CPUs. Acked-by: Tejun Heo (for export of apply_workqueue_attrs) Signed-off-by: NeilBrown Signed-off-by: Greg Kroah-Hartman --- drivers/staging/lustre/lnet/selftest/framework.c | 10 ++-- drivers/staging/lustre/lnet/selftest/module.c | 39 ++++++++------- drivers/staging/lustre/lnet/selftest/rpc.c | 61 ++++++++++-------------- drivers/staging/lustre/lnet/selftest/selftest.h | 40 +++++++--------- kernel/workqueue.c | 1 + 5 files changed, 69 insertions(+), 82 deletions(-) (limited to 'kernel') diff --git a/drivers/staging/lustre/lnet/selftest/framework.c b/drivers/staging/lustre/lnet/selftest/framework.c index 2e1126552e18..c7697f66f663 100644 --- a/drivers/staging/lustre/lnet/selftest/framework.c +++ b/drivers/staging/lustre/lnet/selftest/framework.c @@ -941,15 +941,13 @@ sfw_create_test_rpc(struct sfw_test_unit *tsu, struct lnet_process_id peer, return 0; } -static int +static void sfw_run_test(struct swi_workitem *wi) { struct sfw_test_unit *tsu = container_of(wi, struct sfw_test_unit, tsu_worker); struct sfw_test_instance *tsi = tsu->tsu_instance; struct srpc_client_rpc *rpc = NULL; - LASSERT(wi == &tsu->tsu_worker); - if (tsi->tsi_ops->tso_prep_rpc(tsu, tsu->tsu_dest, &rpc)) { LASSERT(!rpc); goto test_done; @@ -975,7 +973,7 @@ sfw_run_test(struct swi_workitem *wi) rpc->crpc_timeout = rpc_timeout; srpc_post_rpc(rpc); spin_unlock(&rpc->crpc_lock); - return 0; + return; test_done: /* @@ -985,9 +983,7 @@ test_done: * - my batch is still active; no one can run it again now.
* Cancel pending schedules and prevent future schedule attempts: */ - swi_exit_workitem(wi); sfw_test_unit_done(tsu); - return 1; } static int @@ -1017,7 +1013,7 @@ sfw_run_batch(struct sfw_batch *tsb) tsu->tsu_loop = tsi->tsi_loop; wi = &tsu->tsu_worker; swi_init_workitem(wi, sfw_run_test, - lst_sched_test[lnet_cpt_of_nid(tsu->tsu_dest.nid)]); + lst_test_wq[lnet_cpt_of_nid(tsu->tsu_dest.nid)]); swi_schedule_workitem(wi); } } diff --git a/drivers/staging/lustre/lnet/selftest/module.c b/drivers/staging/lustre/lnet/selftest/module.c index ba4b6145c953..aa6bfd5baf2f 100644 --- a/drivers/staging/lustre/lnet/selftest/module.c +++ b/drivers/staging/lustre/lnet/selftest/module.c @@ -47,8 +47,8 @@ enum { static int lst_init_step = LST_INIT_NONE; -struct cfs_wi_sched *lst_sched_serial; -struct cfs_wi_sched **lst_sched_test; +struct workqueue_struct *lst_serial_wq; +struct workqueue_struct **lst_test_wq; static void lnet_selftest_exit(void) @@ -68,16 +68,16 @@ lnet_selftest_exit(void) case LST_INIT_WI_TEST: for (i = 0; i < cfs_cpt_number(lnet_cpt_table()); i++) { - if (!lst_sched_test[i]) + if (!lst_test_wq[i]) continue; - cfs_wi_sched_destroy(lst_sched_test[i]); + destroy_workqueue(lst_test_wq[i]); } - kvfree(lst_sched_test); - lst_sched_test = NULL; + kvfree(lst_test_wq); + lst_test_wq = NULL; /* fall through */ case LST_INIT_WI_SERIAL: - cfs_wi_sched_destroy(lst_sched_serial); - lst_sched_serial = NULL; + destroy_workqueue(lst_serial_wq); + lst_serial_wq = NULL; case LST_INIT_NONE: break; default: @@ -92,33 +92,40 @@ lnet_selftest_init(void) int rc; int i; - rc = cfs_wi_sched_create("lst_s", lnet_cpt_table(), CFS_CPT_ANY, - 1, &lst_sched_serial); - if (rc) { + lst_serial_wq = alloc_ordered_workqueue("lst_s", 0); + if (!lst_serial_wq) { CERROR("Failed to create serial WI scheduler for LST\n"); return rc; } lst_init_step = LST_INIT_WI_SERIAL; nscheds = cfs_cpt_number(lnet_cpt_table()); - lst_sched_test = kvmalloc_array(nscheds, sizeof(lst_sched_test[0]), + lst_test_wq = kvmalloc_array(nscheds, sizeof(lst_test_wq[0]), GFP_KERNEL | __GFP_ZERO); - if (!lst_sched_test) + if (!lst_test_wq) goto error; lst_init_step = LST_INIT_WI_TEST; for (i = 0; i < nscheds; i++) { int nthrs = cfs_cpt_weight(lnet_cpt_table(), i); + struct workqueue_attrs attrs; /* reserve at least one CPU for LND */ nthrs = max(nthrs - 1, 1); - rc = cfs_wi_sched_create("lst_t", lnet_cpt_table(), i, - nthrs, &lst_sched_test[i]); - if (rc) { + lst_test_wq[i] = alloc_workqueue("lst_t", WQ_UNBOUND, nthrs); + if (!lst_test_wq[i]) { CWARN("Failed to create CPU partition affinity WI scheduler %d for LST\n", i); goto error; } + attrs.nice = 0; + #ifdef CONFIG_CPUMASK_OFFSTACK + attrs.cpumask = lnet_cpt_table()->ctb_parts[i].cpt_cpumask; + #else + cpumask_copy(attrs.cpumask, lnet_cpt_table()->ctb_parts[i].cpt_cpumask); + #endif + attrs.no_numa = false; + apply_workqueue_attrs(lst_test_wq[i], &attrs); } rc = srpc_startup(); diff --git a/drivers/staging/lustre/lnet/selftest/rpc.c b/drivers/staging/lustre/lnet/selftest/rpc.c index b6c9ab92c288..f8198ad1046e 100644 --- a/drivers/staging/lustre/lnet/selftest/rpc.c +++ b/drivers/staging/lustre/lnet/selftest/rpc.c @@ -68,7 +68,7 @@ srpc_serv_portal(int svc_id) } /* forward ref's */ -int srpc_handle_rpc(struct swi_workitem *wi); +void srpc_handle_rpc(struct swi_workitem *wi); void srpc_get_counters(struct srpc_counters *cnt) { @@ -178,7 +178,7 @@ srpc_init_server_rpc(struct srpc_server_rpc *rpc, memset(rpc, 0, sizeof(*rpc)); swi_init_workitem(&rpc->srpc_wi, srpc_handle_rpc, 
srpc_serv_is_framework(scd->scd_svc) ? - lst_sched_serial : lst_sched_test[scd->scd_cpt]); + lst_serial_wq : lst_test_wq[scd->scd_cpt]); rpc->srpc_ev.ev_fired = 1; /* no event expected now */ @@ -242,7 +242,7 @@ srpc_service_nrpcs(struct srpc_service *svc) max(nrpcs, SFW_FRWK_WI_MIN) : max(nrpcs, SFW_TEST_WI_MIN); } -int srpc_add_buffer(struct swi_workitem *wi); +void srpc_add_buffer(struct swi_workitem *wi); static int srpc_service_init(struct srpc_service *svc) @@ -277,11 +277,11 @@ srpc_service_init(struct srpc_service *svc) scd->scd_ev.ev_type = SRPC_REQUEST_RCVD; /* - * NB: don't use lst_sched_serial for adding buffer, + * NB: don't use lst_serial_wq for adding buffer, * see details in srpc_service_add_buffers() */ swi_init_workitem(&scd->scd_buf_wi, - srpc_add_buffer, lst_sched_test[i]); + srpc_add_buffer, lst_test_wq[i]); if (i && srpc_serv_is_framework(svc)) { /* @@ -513,7 +513,7 @@ __must_hold(&scd->scd_lock) return rc; } -int +void srpc_add_buffer(struct swi_workitem *wi) { struct srpc_service_cd *scd = container_of(wi, struct srpc_service_cd, scd_buf_wi); @@ -572,7 +572,6 @@ srpc_add_buffer(struct swi_workitem *wi) } spin_unlock(&scd->scd_lock); - return 0; } int @@ -604,15 +603,15 @@ srpc_service_add_buffers(struct srpc_service *sv, int nbuffer) spin_lock(&scd->scd_lock); /* * NB: srpc_service_add_buffers() can be called inside - * thread context of lst_sched_serial, and we don't normally + * thread context of lst_serial_wq, and we don't normally * allow to sleep inside thread context of WI scheduler * because it will block current scheduler thread from doing * anything else, even worse, it could deadlock if it's * waiting on result from another WI of the same scheduler. * However, it's safe at here because scd_buf_wi is scheduled - * by thread in a different WI scheduler (lst_sched_test), + * by thread in a different WI scheduler (lst_test_wq), * so we don't have any risk of deadlock, though this could - * block all WIs pending on lst_sched_serial for a moment + * block all WIs pending on lst_serial_wq for a moment * which is not good but not fatal. 
*/ lst_wait_until(scd->scd_buf_err || @@ -659,11 +658,9 @@ srpc_finish_service(struct srpc_service *sv) LASSERT(sv->sv_shuttingdown); /* srpc_shutdown_service called */ cfs_percpt_for_each(scd, i, sv->sv_cpt_data) { + swi_cancel_workitem(&scd->scd_buf_wi); + spin_lock(&scd->scd_lock); - if (!swi_deschedule_workitem(&scd->scd_buf_wi)) { - spin_unlock(&scd->scd_lock); - return 0; - } if (scd->scd_buf_nposted > 0) { CDEBUG(D_NET, "waiting for %d posted buffers to unlink\n", @@ -679,11 +676,9 @@ srpc_finish_service(struct srpc_service *sv) rpc = list_entry(scd->scd_rpc_active.next, struct srpc_server_rpc, srpc_list); - CNETERR("Active RPC %p on shutdown: sv %s, peer %s, wi %s scheduled %d running %d, ev fired %d type %d status %d lnet %d\n", + CNETERR("Active RPC %p on shutdown: sv %s, peer %s, wi %s, ev fired %d type %d status %d lnet %d\n", rpc, sv->sv_name, libcfs_id2str(rpc->srpc_peer), swi_state2str(rpc->srpc_wi.swi_state), - rpc->srpc_wi.swi_workitem.wi_scheduled, - rpc->srpc_wi.swi_workitem.wi_running, rpc->srpc_ev.ev_fired, rpc->srpc_ev.ev_type, rpc->srpc_ev.ev_status, rpc->srpc_ev.ev_lnet); spin_unlock(&scd->scd_lock); @@ -946,7 +941,6 @@ srpc_server_rpc_done(struct srpc_server_rpc *rpc, int status) * Cancel pending schedules and prevent future schedule attempts: */ LASSERT(rpc->srpc_ev.ev_fired); - swi_exit_workitem(&rpc->srpc_wi); if (!sv->sv_shuttingdown && !list_empty(&scd->scd_buf_blocked)) { buffer = list_entry(scd->scd_buf_blocked.next, @@ -964,7 +958,7 @@ srpc_server_rpc_done(struct srpc_server_rpc *rpc, int status) } /* handles an incoming RPC */ -int +void srpc_handle_rpc(struct swi_workitem *wi) { struct srpc_server_rpc *rpc = container_of(wi, struct srpc_server_rpc, srpc_wi); @@ -986,9 +980,8 @@ srpc_handle_rpc(struct swi_workitem *wi) if (ev->ev_fired) { /* no more event, OK to finish */ srpc_server_rpc_done(rpc, -ESHUTDOWN); - return 1; } - return 0; + return; } spin_unlock(&scd->scd_lock); @@ -1006,7 +999,7 @@ srpc_handle_rpc(struct swi_workitem *wi) if (!msg->msg_magic) { /* moaned already in srpc_lnet_ev_handler */ srpc_server_rpc_done(rpc, EBADMSG); - return 1; + return; } srpc_unpack_msg_hdr(msg); @@ -1022,7 +1015,7 @@ srpc_handle_rpc(struct swi_workitem *wi) LASSERT(!reply->status || !rpc->srpc_bulk); if (rc) { srpc_server_rpc_done(rpc, rc); - return 1; + return; } } @@ -1031,7 +1024,7 @@ srpc_handle_rpc(struct swi_workitem *wi) if (rpc->srpc_bulk) { rc = srpc_do_bulk(rpc); if (!rc) - return 0; /* wait for bulk */ + return; /* wait for bulk */ LASSERT(ev->ev_fired); ev->ev_status = rc; @@ -1049,16 +1042,16 @@ srpc_handle_rpc(struct swi_workitem *wi) if (rc) { srpc_server_rpc_done(rpc, rc); - return 1; + return; } } wi->swi_state = SWI_STATE_REPLY_SUBMITTED; rc = srpc_send_reply(rpc); if (!rc) - return 0; /* wait for reply */ + return; /* wait for reply */ srpc_server_rpc_done(rpc, rc); - return 1; + return; case SWI_STATE_REPLY_SUBMITTED: if (!ev->ev_fired) { @@ -1071,10 +1064,8 @@ srpc_handle_rpc(struct swi_workitem *wi) wi->swi_state = SWI_STATE_DONE; srpc_server_rpc_done(rpc, ev->ev_status); - return 1; + return; } - - return 0; } static void @@ -1169,7 +1160,6 @@ srpc_client_rpc_done(struct srpc_client_rpc *rpc, int status) * Cancel pending schedules and prevent future schedule attempts: */ LASSERT(!srpc_event_pending(rpc)); - swi_exit_workitem(wi); spin_unlock(&rpc->crpc_lock); @@ -1177,7 +1167,7 @@ srpc_client_rpc_done(struct srpc_client_rpc *rpc, int status) } /* sends an outgoing RPC */ -int +void srpc_send_rpc(struct swi_workitem *wi) { int rc = 0; @@ 
-1213,7 +1203,7 @@ srpc_send_rpc(struct swi_workitem *wi) rc = srpc_prepare_reply(rpc); if (rc) { srpc_client_rpc_done(rpc, rc); - return 1; + return; } rc = srpc_prepare_bulk(rpc); @@ -1290,7 +1280,7 @@ srpc_send_rpc(struct swi_workitem *wi) wi->swi_state = SWI_STATE_DONE; srpc_client_rpc_done(rpc, rc); - return 1; + return; } if (rc) { @@ -1307,10 +1297,9 @@ abort: if (!srpc_event_pending(rpc)) { srpc_client_rpc_done(rpc, -EINTR); - return 1; + return; } } - return 0; } struct srpc_client_rpc * diff --git a/drivers/staging/lustre/lnet/selftest/selftest.h b/drivers/staging/lustre/lnet/selftest/selftest.h index 465417263ef1..ad04534f000c 100644 --- a/drivers/staging/lustre/lnet/selftest/selftest.h +++ b/drivers/staging/lustre/lnet/selftest/selftest.h @@ -169,11 +169,11 @@ struct srpc_buffer { }; struct swi_workitem; -typedef int (*swi_action_t) (struct swi_workitem *); +typedef void (*swi_action_t) (struct swi_workitem *); struct swi_workitem { - struct cfs_wi_sched *swi_sched; - struct cfs_workitem swi_workitem; + struct workqueue_struct *swi_wq; + struct work_struct swi_work; swi_action_t swi_action; int swi_state; }; @@ -444,7 +444,7 @@ void srpc_free_bulk(struct srpc_bulk *bk); struct srpc_bulk *srpc_alloc_bulk(int cpt, unsigned int off, unsigned int bulk_npg, unsigned int bulk_len, int sink); -int srpc_send_rpc(struct swi_workitem *wi); +void srpc_send_rpc(struct swi_workitem *wi); int srpc_send_reply(struct srpc_server_rpc *rpc); int srpc_add_service(struct srpc_service *sv); int srpc_remove_service(struct srpc_service *sv); @@ -456,8 +456,8 @@ void srpc_service_remove_buffers(struct srpc_service *sv, int nbuffer); void srpc_get_counters(struct srpc_counters *cnt); void srpc_set_counters(const struct srpc_counters *cnt); -extern struct cfs_wi_sched *lst_sched_serial; -extern struct cfs_wi_sched **lst_sched_test; +extern struct workqueue_struct *lst_serial_wq; +extern struct workqueue_struct **lst_test_wq; static inline int srpc_serv_is_framework(struct srpc_service *svc) @@ -465,42 +465,36 @@ srpc_serv_is_framework(struct srpc_service *svc) return svc->sv_id < SRPC_FRAMEWORK_SERVICE_MAX_ID; } -static inline int -swi_wi_action(struct cfs_workitem *wi) +static void +swi_wi_action(struct work_struct *wi) { struct swi_workitem *swi; - swi = container_of(wi, struct swi_workitem, swi_workitem); + swi = container_of(wi, struct swi_workitem, swi_work); - return swi->swi_action(swi); + swi->swi_action(swi); } static inline void swi_init_workitem(struct swi_workitem *swi, - swi_action_t action, struct cfs_wi_sched *sched) + swi_action_t action, struct workqueue_struct *wq) { - swi->swi_sched = sched; + swi->swi_wq = wq; swi->swi_action = action; swi->swi_state = SWI_STATE_NEWBORN; - cfs_wi_init(&swi->swi_workitem, swi_wi_action); + INIT_WORK(&swi->swi_work, swi_wi_action); } static inline void swi_schedule_workitem(struct swi_workitem *wi) { - cfs_wi_schedule(wi->swi_sched, &wi->swi_workitem); -} - -static inline void -swi_exit_workitem(struct swi_workitem *swi) -{ - cfs_wi_exit(swi->swi_sched, &swi->swi_workitem); + queue_work(wi->swi_wq, &wi->swi_work); } static inline int -swi_deschedule_workitem(struct swi_workitem *swi) +swi_cancel_workitem(struct swi_workitem *swi) { - return cfs_wi_deschedule(swi->swi_sched, &swi->swi_workitem); + return cancel_work_sync(&swi->swi_work); } int sfw_startup(void); @@ -534,7 +528,7 @@ srpc_init_client_rpc(struct srpc_client_rpc *rpc, struct lnet_process_id peer, INIT_LIST_HEAD(&rpc->crpc_list); swi_init_workitem(&rpc->crpc_wi, srpc_send_rpc, - 
lst_sched_test[lnet_cpt_of_nid(peer.nid)]); + lst_test_wq[lnet_cpt_of_nid(peer.nid)]); spin_lock_init(&rpc->crpc_lock); atomic_set(&rpc->crpc_refcount, 1); /* 1 ref for caller */ diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 43d18cb46308..d1d460bb9b02 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -3806,6 +3806,7 @@ int apply_workqueue_attrs(struct workqueue_struct *wq, return ret; } +EXPORT_SYMBOL_GPL(apply_workqueue_attrs); /** * wq_update_unbound_numa - update NUMA affinity of a wq for CPU hot[un]plug -- cgit v1.2.3 From 07dcd7fe89938934ddad65f738bc5aac89b8e54d Mon Sep 17 00:00:00 2001 From: David Windsor Date: Tue, 15 Aug 2017 16:45:00 -0700 Subject: fork: Define usercopy region in mm_struct slab caches In support of usercopy hardening, this patch defines a region in the mm_struct slab caches in which userspace copy operations are allowed. Only the auxv field is copied to userspace. cache object allocation: kernel/fork.c: #define allocate_mm() (kmem_cache_alloc(mm_cachep, GFP_KERNEL)) dup_mm(): ... mm = allocate_mm(); copy_mm(...): ... dup_mm(); copy_process(...): ... copy_mm(...) _do_fork(...): ... copy_process(...) example usage trace: fs/binfmt_elf.c: create_elf_tables(...): ... elf_info = (elf_addr_t *)current->mm->saved_auxv; ... copy_to_user(..., elf_info, ei_index * sizeof(elf_addr_t)) load_elf_binary(...): ... create_elf_tables(...); This region is known as the slab cache's usercopy region. Slab caches can now check that each dynamically sized copy operation involving cache-managed memory falls entirely within the slab's usercopy region. This patch is modified from Brad Spengler/PaX Team's PAX_USERCOPY whitelisting code in the last public patch of grsecurity/PaX based on my understanding of the code. Changes or omissions from the original code are mine and don't reflect the original grsecurity/PaX code. Signed-off-by: David Windsor [kees: adjust commit log, split patch, provide usage trace] Cc: Ingo Molnar Cc: Andrew Morton Cc: Thomas Gleixner Cc: Andy Lutomirski Signed-off-by: Kees Cook Acked-by: Rik van Riel --- kernel/fork.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 432eadf6b58c..82f2a0441d3b 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2225,9 +2225,11 @@ void __init proc_caches_init(void) * maximum number of CPU's we can ever have. The cpumask_allocation * is at the end of the structure, exactly for that reason. */ - mm_cachep = kmem_cache_create("mm_struct", + mm_cachep = kmem_cache_create_usercopy("mm_struct", sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN, SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, + offsetof(struct mm_struct, saved_auxv), + sizeof_field(struct mm_struct, saved_auxv), NULL); vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC|SLAB_ACCOUNT); mmap_init(); -- cgit v1.2.3 From f9d29946c56734e954459bc9a0e688a8ae9b4cbf Mon Sep 17 00:00:00 2001 From: David Windsor Date: Sat, 10 Jun 2017 22:50:41 -0400 Subject: fork: Define usercopy region in thread_stack slab caches In support of usercopy hardening, this patch defines a region in the thread_stack slab caches in which userspace copy operations are allowed. Since the entire thread_stack needs to be available to userspace, the entire slab contents are whitelisted. Note that the slab-based thread stack is only present on systems with THREAD_SIZE < PAGE_SIZE and !CONFIG_VMAP_STACK. 
cache object allocation: kernel/fork.c: alloc_thread_stack_node(...): return kmem_cache_alloc_node(thread_stack_cache, ...) dup_task_struct(...): ... stack = alloc_thread_stack_node(...) ... tsk->stack = stack; copy_process(...): ... dup_task_struct(...) _do_fork(...): ... copy_process(...) This region is known as the slab cache's usercopy region. Slab caches can now check that each dynamically sized copy operation involving cache-managed memory falls entirely within the slab's usercopy region. This patch is modified from Brad Spengler/PaX Team's PAX_USERCOPY whitelisting code in the last public patch of grsecurity/PaX based on my understanding of the code. Changes or omissions from the original code are mine and don't reflect the original grsecurity/PaX code. Signed-off-by: David Windsor [kees: adjust commit log, split patch, provide usage trace] Cc: Ingo Molnar Cc: Andrew Morton Cc: Thomas Gleixner Cc: Andy Lutomirski Signed-off-by: Kees Cook Acked-by: Rik van Riel --- kernel/fork.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 82f2a0441d3b..0e086af148f2 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -282,8 +282,9 @@ static void free_thread_stack(struct task_struct *tsk) void thread_stack_cache_init(void) { - thread_stack_cache = kmem_cache_create("thread_stack", THREAD_SIZE, - THREAD_SIZE, 0, NULL); + thread_stack_cache = kmem_cache_create_usercopy("thread_stack", + THREAD_SIZE, THREAD_SIZE, 0, 0, + THREAD_SIZE, NULL); BUG_ON(thread_stack_cache == NULL); } # endif -- cgit v1.2.3 From 5905429ad85657c28d93ec3d826ddeea1f44c3ce Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 16 Aug 2017 13:00:58 -0700 Subject: fork: Provide usercopy whitelisting for task_struct MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit While the blocked and saved_sigmask fields of task_struct are copied to userspace (via sigmask_to_save() and setup_rt_frame()), it is always copied with a static length (i.e. sizeof(sigset_t)). The only portion of task_struct that is potentially dynamically sized and may be copied to userspace is in the architecture-specific thread_struct at the end of task_struct. cache object allocation: kernel/fork.c: alloc_task_struct_node(...): return kmem_cache_alloc_node(task_struct_cachep, ...); dup_task_struct(...): ... tsk = alloc_task_struct_node(node); copy_process(...): ... dup_task_struct(...) _do_fork(...): ... copy_process(...) example usage trace: arch/x86/kernel/fpu/signal.c: __fpu__restore_sig(...): ... struct task_struct *tsk = current; struct fpu *fpu = &tsk->thread.fpu; ... __copy_from_user(&fpu->state.xsave, ..., state_size); fpu__restore_sig(...): ... return __fpu__restore_sig(...); arch/x86/kernel/signal.c: restore_sigcontext(...): ... fpu__restore_sig(...) This introduces arch_thread_struct_whitelist() to let an architecture declare specifically where the whitelist should be within thread_struct. If undefined, the entire thread_struct field is left whitelisted. 
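As a hedged sketch of the arch-side counterpart (modeled on what x86 provides separately, not part of this patch), an architecture selecting HAVE_ARCH_THREAD_STRUCT_WHITELIST could narrow the whitelist to its FPU register state; the field and symbol names below follow x86 conventions and are assumptions here.

    /* Illustrative only: whitelist nothing but the FPU state inside
     * thread_struct; fpu.state and fpu_kernel_xstate_size are assumed
     * x86-style names, not definitions made by this patch.
     */
    static inline void arch_thread_struct_whitelist(unsigned long *offset,
                                                    unsigned long *size)
    {
            *offset = offsetof(struct thread_struct, fpu.state);
            *size   = fpu_kernel_xstate_size;
    }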
Cc: Andrew Morton Cc: Nicholas Piggin Cc: Laura Abbott Cc: "Mickaël Salaün" Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Andy Lutomirski Signed-off-by: Kees Cook Acked-by: Rik van Riel --- arch/Kconfig | 11 +++++++++++ include/linux/sched/task.h | 14 ++++++++++++++ kernel/fork.c | 22 ++++++++++++++++++++-- 3 files changed, 45 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/arch/Kconfig b/arch/Kconfig index 400b9e1b2f27..8911ff37335a 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -242,6 +242,17 @@ config ARCH_INIT_TASK config ARCH_TASK_STRUCT_ALLOCATOR bool +config HAVE_ARCH_THREAD_STRUCT_WHITELIST + bool + depends on !ARCH_TASK_STRUCT_ALLOCATOR + help + An architecture should select this to provide hardened usercopy + knowledge about what region of the thread_struct should be + whitelisted for copying to userspace. Normally this is only the + FPU registers. Specifically, arch_thread_struct_whitelist() + should be implemented. Without this, the entire thread_struct + field in task_struct will be left whitelisted. + # Select if arch has its private alloc_thread_stack() function config ARCH_THREAD_STACK_ALLOCATOR bool diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h index 05b8650f06f5..5be31eb7b266 100644 --- a/include/linux/sched/task.h +++ b/include/linux/sched/task.h @@ -104,6 +104,20 @@ extern int arch_task_struct_size __read_mostly; # define arch_task_struct_size (sizeof(struct task_struct)) #endif +#ifndef CONFIG_HAVE_ARCH_THREAD_STRUCT_WHITELIST +/* + * If an architecture has not declared a thread_struct whitelist we + * must assume something there may need to be copied to userspace. + */ +static inline void arch_thread_struct_whitelist(unsigned long *offset, + unsigned long *size) +{ + *offset = 0; + /* Handle dynamically sized thread_struct. */ + *size = arch_task_struct_size - offsetof(struct task_struct, thread); +} +#endif + #ifdef CONFIG_VMAP_STACK static inline struct vm_struct *task_stack_vm_area(const struct task_struct *t) { diff --git a/kernel/fork.c b/kernel/fork.c index 0e086af148f2..5977e691c754 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -458,6 +458,21 @@ static void set_max_threads(unsigned int max_threads_suggested) int arch_task_struct_size __read_mostly; #endif +static void task_struct_whitelist(unsigned long *offset, unsigned long *size) +{ + /* Fetch thread_struct whitelist for the architecture. */ + arch_thread_struct_whitelist(offset, size); + + /* + * Handle zero-sized whitelist or empty thread_struct, otherwise + * adjust offset to position of thread_struct in task_struct. + */ + if (unlikely(*size == 0)) + *offset = 0; + else + *offset += offsetof(struct task_struct, thread); +} + void __init fork_init(void) { int i; @@ -466,11 +481,14 @@ void __init fork_init(void) #define ARCH_MIN_TASKALIGN 0 #endif int align = max_t(int, L1_CACHE_BYTES, ARCH_MIN_TASKALIGN); + unsigned long useroffset, usersize; /* create a slab on which task_structs can be allocated */ - task_struct_cachep = kmem_cache_create("task_struct", + task_struct_whitelist(&useroffset, &usersize); + task_struct_cachep = kmem_cache_create_usercopy("task_struct", arch_task_struct_size, align, - SLAB_PANIC|SLAB_ACCOUNT, NULL); + SLAB_PANIC|SLAB_ACCOUNT, + useroffset, usersize, NULL); #endif /* do the arch specific task caches init */ -- cgit v1.2.3 From 71ee78d538a7bcb7a876dcbd65eb73627425df6f Mon Sep 17 00:00:00 2001 From: "Eric W. 
Biederman" Date: Sat, 13 Jan 2018 18:25:09 -0600 Subject: signal/blackfin: Move the blackfin specific si_codes to asm-generic/siginfo.h Having si_codes in many different files simply encourages duplicate definitions that can cause problems later. To avoid that merge the blackfin specific si_codes into uapi/asm-generic/siginfo.h Update copy_siginfo_to_user to copy with the absence of BUS_MCEERR_AR that blackfin defines to be something else. Signed-off-by: "Eric W. Biederman" --- arch/blackfin/include/uapi/asm/siginfo.h | 32 -------------------------------- include/uapi/asm-generic/siginfo.h | 29 +++++++++++++++++++++++++++-- kernel/signal.c | 9 ++++++--- 3 files changed, 33 insertions(+), 37 deletions(-) (limited to 'kernel') diff --git a/arch/blackfin/include/uapi/asm/siginfo.h b/arch/blackfin/include/uapi/asm/siginfo.h index b1db506c8d2e..8c505170f4d0 100644 --- a/arch/blackfin/include/uapi/asm/siginfo.h +++ b/arch/blackfin/include/uapi/asm/siginfo.h @@ -15,36 +15,4 @@ #define si_uid16 _sifields._kill._uid -#define ILL_ILLPARAOP 2 /* illegal opcode combine ********** */ -#define ILL_ILLEXCPT 4 /* unrecoverable exception ********** */ -#define ILL_CPLB_VI 9 /* D/I CPLB protect violation ******** */ -#define ILL_CPLB_MISS 10 /* D/I CPLB miss ******** */ -#define ILL_CPLB_MULHIT 11 /* D/I CPLB multiple hit ******** */ -#undef NSIGILL -#define NSIGILL 11 - -/* - * SIGBUS si_codes - */ -#define BUS_OPFETCH 4 /* error from instruction fetch ******** */ -#undef NSIGBUS -#define NSIGBUS 4 - -/* - * SIGTRAP si_codes - */ -#define TRAP_STEP 1 /* single-step breakpoint************* */ -#define TRAP_TRACEFLOW 2 /* trace buffer overflow ************* */ -#define TRAP_WATCHPT 3 /* watchpoint match ************* */ -#define TRAP_ILLTRAP 4 /* illegal trap ************* */ -#undef NSIGTRAP -#define NSIGTRAP 4 - -/* - * SIGSEGV si_codes - */ -#define SEGV_STACKFLOW 3 /* stack overflow */ -#undef NSIGSEGV -#define NSIGSEGV 3 - #endif /* _UAPI_BFIN_SIGINFO_H */ diff --git a/include/uapi/asm-generic/siginfo.h b/include/uapi/asm-generic/siginfo.h index 7a268db1c44f..254afc31e3be 100644 --- a/include/uapi/asm-generic/siginfo.h +++ b/include/uapi/asm-generic/siginfo.h @@ -179,13 +179,24 @@ typedef struct siginfo { * SIGILL si_codes */ #define ILL_ILLOPC 1 /* illegal opcode */ +#ifdef __bfin__ +# define ILL_ILLPARAOP 2 /* illegal opcode combine */ +#endif #define ILL_ILLOPN 2 /* illegal operand */ #define ILL_ILLADR 3 /* illegal addressing mode */ #define ILL_ILLTRP 4 /* illegal trap */ +#ifdef __bfin__ +# define ILL_ILLEXCPT 4 /* unrecoverable exception */ +#endif #define ILL_PRVOPC 5 /* privileged opcode */ #define ILL_PRVREG 6 /* privileged register */ #define ILL_COPROC 7 /* coprocessor error */ #define ILL_BADSTK 8 /* internal stack error */ +#ifdef __bfin__ +# define ILL_CPLB_VI 9 /* D/I CPLB protect violation */ +# define ILL_CPLB_MISS 10 /* D/I CPLB miss */ +# define ILL_CPLB_MULHIT 11 /* D/I CPLB multiple hit */ +#endif #ifdef __tile__ # define ILL_DBLFLT 9 /* double fault */ # define ILL_HARDWALL 10 /* user networks hardwall violation */ @@ -225,7 +236,11 @@ typedef struct siginfo { */ #define SEGV_MAPERR 1 /* address not mapped to object */ #define SEGV_ACCERR 2 /* invalid permissions for mapped object */ -#define SEGV_BNDERR 3 /* failed address bound checks */ +#ifdef __bfin__ +# define SEGV_STACKFLOW 3 /* stack overflow */ +#else +# define SEGV_BNDERR 3 /* failed address bound checks */ +#endif #ifdef __ia64__ # define __SEGV_PSTKOVF 4 /* paragraph stack overflow */ #else @@ -239,8 +254,12 @@ typedef 
struct siginfo { #define BUS_ADRALN 1 /* invalid address alignment */ #define BUS_ADRERR 2 /* non-existent physical address */ #define BUS_OBJERR 3 /* object specific hardware error */ +#ifdef __bfin__ +# define BUS_OPFETCH 4 /* error from instruction fetch */ +#else /* hardware memory error consumed on a machine check: action required */ -#define BUS_MCEERR_AR 4 +# define BUS_MCEERR_AR 4 +#endif /* hardware memory error detected in process but not consumed: action optional*/ #define BUS_MCEERR_AO 5 #define NSIGBUS 5 @@ -252,6 +271,12 @@ typedef struct siginfo { #define TRAP_TRACE 2 /* process trace trap */ #define TRAP_BRANCH 3 /* process taken branch trap */ #define TRAP_HWBKPT 4 /* hardware breakpoint/watchpoint */ +#ifdef __bfin__ +# define TRAP_STEP 1 /* single-step breakpoint */ +# define TRAP_TRACEFLOW 2 /* trace buffer overflow */ +# define TRAP_WATCHPT 3 /* watchpoint match */ +# define TRAP_ILLTRAP 4 /* illegal trap */ +#endif #define NSIGTRAP 4 /* diff --git a/kernel/signal.c b/kernel/signal.c index 47c87b1d0b8a..4c3f4448c5f1 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2770,13 +2770,16 @@ int copy_siginfo_to_user(siginfo_t __user *to, const siginfo_t *from) err |= __put_user(from->si_flags, &to->si_flags); err |= __put_user(from->si_isr, &to->si_isr); #endif -#ifdef BUS_MCEERR_AO /* * Other callers might not initialize the si_lsb field, * so check explicitly for the right codes here. */ - if (from->si_signo == SIGBUS && - (from->si_code == BUS_MCEERR_AR || from->si_code == BUS_MCEERR_AO)) +#ifdef BUS_MCEERR_AR + if (from->si_signo == SIGBUS && from->si_code == BUS_MCEERR_AR) + err |= __put_user(from->si_addr_lsb, &to->si_addr_lsb); +#endif +#ifdef BUS_MCEERR_AO + if (from->si_signo == SIGBUS && from->si_code == BUS_MCEERR_AO) err |= __put_user(from->si_addr_lsb, &to->si_addr_lsb); #endif #ifdef SEGV_BNDERR -- cgit v1.2.3 From 212a36a17efe4d696d1e3c31ebd79a9fb0cbb14b Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 31 Jul 2017 17:15:31 -0500 Subject: signal: Unify and correct copy_siginfo_from_user32 The function copy_siginfo_from_user32 is used for two things, in ptrace since the dawn of siginfo for arbitrarily modifying a signal that user space sees, and in sigqueueinfo to send a signal with arbitrary siginfo data. Create a single copy of copy_siginfo_from_user32 that all architectures share, and teach it to handle all of the cases in the siginfo union. In the generic version of copy_siginfo_from_user32 ensure that all of the fields in siginfo are initialized so that the siginfo structure can be safely copied to userspace if necessary. When copying the embedded sigval union copy the si_int member. That ensures the 32bit values pass through the kernel unchanged. Signed-off-by: "Eric W.
Biederman" --- arch/arm64/kernel/signal32.c | 10 ----- arch/mips/kernel/signal32.c | 10 ----- arch/parisc/kernel/signal32.c | 44 ---------------------- arch/parisc/kernel/signal32.h | 1 - arch/powerpc/kernel/signal_32.c | 9 ----- arch/s390/kernel/compat_signal.c | 48 ------------------------ arch/sparc/kernel/signal32.c | 16 -------- arch/tile/kernel/compat_signal.c | 18 --------- arch/x86/kernel/signal_compat.c | 21 ----------- include/linux/compat.h | 2 +- kernel/signal.c | 81 ++++++++++++++++++++++++++++++++++++++++ 11 files changed, 82 insertions(+), 178 deletions(-) (limited to 'kernel') diff --git a/arch/arm64/kernel/signal32.c b/arch/arm64/kernel/signal32.c index 22711ee8e36c..4377907dbb70 100644 --- a/arch/arm64/kernel/signal32.c +++ b/arch/arm64/kernel/signal32.c @@ -195,16 +195,6 @@ int copy_siginfo_to_user32(compat_siginfo_t __user *to, const siginfo_t *from) return err; } -int copy_siginfo_from_user32(siginfo_t *to, compat_siginfo_t __user *from) -{ - if (copy_from_user(to, from, __ARCH_SI_PREAMBLE_SIZE) || - copy_from_user(to->_sifields._pad, - from->_sifields._pad, SI_PAD_SIZE)) - return -EFAULT; - - return 0; -} - /* * VFP save/restore code. * diff --git a/arch/mips/kernel/signal32.c b/arch/mips/kernel/signal32.c index cf5c7c05e5a3..500b5e4634ea 100644 --- a/arch/mips/kernel/signal32.c +++ b/arch/mips/kernel/signal32.c @@ -133,13 +133,3 @@ int copy_siginfo_to_user32(compat_siginfo_t __user *to, const siginfo_t *from) } return err; } - -int copy_siginfo_from_user32(siginfo_t *to, compat_siginfo_t __user *from) -{ - if (copy_from_user(to, from, 3*sizeof(int)) || - copy_from_user(to->_sifields._pad, - from->_sifields._pad, SI_PAD_SIZE32)) - return -EFAULT; - - return 0; -} diff --git a/arch/parisc/kernel/signal32.c b/arch/parisc/kernel/signal32.c index 41afa9cd1f55..558e32475c35 100644 --- a/arch/parisc/kernel/signal32.c +++ b/arch/parisc/kernel/signal32.c @@ -261,50 +261,6 @@ setup_sigcontext32(struct compat_sigcontext __user *sc, struct compat_regfile __ return err; } -int -copy_siginfo_from_user32 (siginfo_t *to, compat_siginfo_t __user *from) -{ - compat_uptr_t addr; - int err; - - if (!access_ok(VERIFY_READ, from, sizeof(compat_siginfo_t))) - return -EFAULT; - - err = __get_user(to->si_signo, &from->si_signo); - err |= __get_user(to->si_errno, &from->si_errno); - err |= __get_user(to->si_code, &from->si_code); - - if (to->si_code < 0) - err |= __copy_from_user(&to->_sifields._pad, &from->_sifields._pad, SI_PAD_SIZE); - else { - switch (siginfo_layout(to->si_signo, to->si_code)) { - case SIL_CHLD: - err |= __get_user(to->si_utime, &from->si_utime); - err |= __get_user(to->si_stime, &from->si_stime); - err |= __get_user(to->si_status, &from->si_status); - default: - case SIL_KILL: - err |= __get_user(to->si_pid, &from->si_pid); - err |= __get_user(to->si_uid, &from->si_uid); - break; - case SIL_FAULT: - err |= __get_user(addr, &from->si_addr); - to->si_addr = compat_ptr(addr); - break; - case SIL_POLL: - err |= __get_user(to->si_band, &from->si_band); - err |= __get_user(to->si_fd, &from->si_fd); - break; - case SIL_RT: - err |= __get_user(to->si_pid, &from->si_pid); - err |= __get_user(to->si_uid, &from->si_uid); - err |= __get_user(to->si_int, &from->si_int); - break; - } - } - return err; -} - int copy_siginfo_to_user32 (compat_siginfo_t __user *to, const siginfo_t *from) { diff --git a/arch/parisc/kernel/signal32.h b/arch/parisc/kernel/signal32.h index 719e7417732c..d25858e4db63 100644 --- a/arch/parisc/kernel/signal32.h +++ b/arch/parisc/kernel/signal32.h @@ -35,7 +35,6 
@@ struct compat_ucontext { /* ELF32 signal handling */ int copy_siginfo_to_user32 (compat_siginfo_t __user *to, const siginfo_t *from); -int copy_siginfo_from_user32 (siginfo_t *to, compat_siginfo_t __user *from); /* In a deft move of uber-hackery, we decide to carry the top half of all * 64-bit registers in a non-portable, non-ABI, hidden structure. diff --git a/arch/powerpc/kernel/signal_32.c b/arch/powerpc/kernel/signal_32.c index 9ffd73296f64..ee62ff7b296c 100644 --- a/arch/powerpc/kernel/signal_32.c +++ b/arch/powerpc/kernel/signal_32.c @@ -933,15 +933,6 @@ int copy_siginfo_to_user32(struct compat_siginfo __user *d, const siginfo_t *s) #define copy_siginfo_to_user copy_siginfo_to_user32 -int copy_siginfo_from_user32(siginfo_t *to, struct compat_siginfo __user *from) -{ - if (copy_from_user(to, from, 3*sizeof(int)) || - copy_from_user(to->_sifields._pad, - from->_sifields._pad, SI_PAD_SIZE32)) - return -EFAULT; - - return 0; -} #endif /* CONFIG_PPC64 */ /* diff --git a/arch/s390/kernel/compat_signal.c b/arch/s390/kernel/compat_signal.c index ef246940b44c..d77ce14ffa5c 100644 --- a/arch/s390/kernel/compat_signal.c +++ b/arch/s390/kernel/compat_signal.c @@ -102,54 +102,6 @@ int copy_siginfo_to_user32(compat_siginfo_t __user *to, const siginfo_t *from) return err ? -EFAULT : 0; } -int copy_siginfo_from_user32(siginfo_t *to, compat_siginfo_t __user *from) -{ - int err; - u32 tmp; - - err = __get_user(to->si_signo, &from->si_signo); - err |= __get_user(to->si_errno, &from->si_errno); - err |= __get_user(to->si_code, &from->si_code); - - if (to->si_code < 0) - err |= __copy_from_user(&to->_sifields._pad, &from->_sifields._pad, SI_PAD_SIZE); - else { - switch (siginfo_layout(to->si_signo, to->si_code)) { - case SIL_RT: - err |= __get_user(to->si_int, &from->si_int); - /* fallthrough */ - case SIL_KILL: - err |= __get_user(to->si_pid, &from->si_pid); - err |= __get_user(to->si_uid, &from->si_uid); - break; - case SIL_CHLD: - err |= __get_user(to->si_pid, &from->si_pid); - err |= __get_user(to->si_uid, &from->si_uid); - err |= __get_user(to->si_utime, &from->si_utime); - err |= __get_user(to->si_stime, &from->si_stime); - err |= __get_user(to->si_status, &from->si_status); - break; - case SIL_FAULT: - err |= __get_user(tmp, &from->si_addr); - to->si_addr = (void __force __user *) - (u64) (tmp & PSW32_ADDR_INSN); - break; - case SIL_POLL: - err |= __get_user(to->si_band, &from->si_band); - err |= __get_user(to->si_fd, &from->si_fd); - break; - case SIL_TIMER: - err |= __get_user(to->si_tid, &from->si_tid); - err |= __get_user(to->si_overrun, &from->si_overrun); - err |= __get_user(to->si_int, &from->si_int); - break; - default: - break; - } - } - return err ? 
-EFAULT : 0; -} - /* Store registers needed to create the signal frame */ static void store_sigregs(void) { diff --git a/arch/sparc/kernel/signal32.c b/arch/sparc/kernel/signal32.c index 54a6159b9cd8..8022bb4c65a5 100644 --- a/arch/sparc/kernel/signal32.c +++ b/arch/sparc/kernel/signal32.c @@ -123,22 +123,6 @@ int copy_siginfo_to_user32(compat_siginfo_t __user *to, const siginfo_t *from) return err; } -/* CAUTION: This is just a very minimalist implementation for the - * sake of compat_sys_rt_sigqueueinfo() - */ -int copy_siginfo_from_user32(siginfo_t *to, compat_siginfo_t __user *from) -{ - if (!access_ok(VERIFY_WRITE, from, sizeof(compat_siginfo_t))) - return -EFAULT; - - if (copy_from_user(to, from, 3*sizeof(int)) || - copy_from_user(to->_sifields._pad, from->_sifields._pad, - SI_PAD_SIZE)) - return -EFAULT; - - return 0; -} - /* Checks if the fp is valid. We always build signal frames which are * 16-byte aligned, therefore we can always enforce that the restore * frame has that property as well. diff --git a/arch/tile/kernel/compat_signal.c b/arch/tile/kernel/compat_signal.c index 971d87a1d8cf..4e7f40a10eb3 100644 --- a/arch/tile/kernel/compat_signal.c +++ b/arch/tile/kernel/compat_signal.c @@ -105,24 +105,6 @@ int copy_siginfo_to_user32(struct compat_siginfo __user *to, const siginfo_t *fr return err; } -int copy_siginfo_from_user32(siginfo_t *to, struct compat_siginfo __user *from) -{ - int err; - - if (!access_ok(VERIFY_READ, from, sizeof(struct compat_siginfo))) - return -EFAULT; - - err = __get_user(to->si_signo, &from->si_signo); - err |= __get_user(to->si_errno, &from->si_errno); - err |= __get_user(to->si_code, &from->si_code); - - err |= __get_user(to->si_pid, &from->si_pid); - err |= __get_user(to->si_uid, &from->si_uid); - err |= __get_user(to->si_int, &from->si_int); - - return err; -} - /* The assembly shim for this function arranges to ignore the return value. 
*/ long compat_sys_rt_sigreturn(void) { diff --git a/arch/x86/kernel/signal_compat.c b/arch/x86/kernel/signal_compat.c index feb3ac135d0c..59148de2d83f 100644 --- a/arch/x86/kernel/signal_compat.c +++ b/arch/x86/kernel/signal_compat.c @@ -207,24 +207,3 @@ int copy_siginfo_to_user32(compat_siginfo_t __user *to, const siginfo_t *from) return __copy_siginfo_to_user32(to, from, in_x32_syscall()); } -int copy_siginfo_from_user32(siginfo_t *to, compat_siginfo_t __user *from) -{ - int err = 0; - u32 ptr32; - - if (!access_ok(VERIFY_READ, from, sizeof(compat_siginfo_t))) - return -EFAULT; - - get_user_try { - get_user_ex(to->si_signo, &from->si_signo); - get_user_ex(to->si_errno, &from->si_errno); - get_user_ex(to->si_code, &from->si_code); - - get_user_ex(to->si_pid, &from->si_pid); - get_user_ex(to->si_uid, &from->si_uid); - get_user_ex(ptr32, &from->si_ptr); - to->si_ptr = compat_ptr(ptr32); - } get_user_catch(err); - - return err; -} diff --git a/include/linux/compat.h b/include/linux/compat.h index e698ec1908d9..8a9643857c4a 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h @@ -510,7 +510,7 @@ long compat_get_bitmap(unsigned long *mask, const compat_ulong_t __user *umask, unsigned long bitmap_size); long compat_put_bitmap(compat_ulong_t __user *umask, unsigned long *mask, unsigned long bitmap_size); -int copy_siginfo_from_user32(siginfo_t *to, struct compat_siginfo __user *from); +int copy_siginfo_from_user32(siginfo_t *to, const struct compat_siginfo __user *from); int copy_siginfo_to_user32(struct compat_siginfo __user *to, const siginfo_t *from); int get_compat_sigevent(struct sigevent *event, const struct compat_sigevent __user *u_event); diff --git a/kernel/signal.c b/kernel/signal.c index 4c3f4448c5f1..5211b1b57163 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2814,6 +2814,87 @@ int copy_siginfo_to_user(siginfo_t __user *to, const siginfo_t *from) return err; } +#ifdef CONFIG_COMPAT +int copy_siginfo_from_user32(struct siginfo *to, + const struct compat_siginfo __user *ufrom) +{ + struct compat_siginfo from; + + if (copy_from_user(&from, ufrom, sizeof(struct compat_siginfo))) + return -EFAULT; + + clear_siginfo(to); + to->si_signo = from.si_signo; + to->si_errno = from.si_errno; + to->si_code = from.si_code; + switch(siginfo_layout(from.si_signo, from.si_code)) { + case SIL_KILL: + to->si_pid = from.si_pid; + to->si_uid = from.si_uid; + break; + case SIL_TIMER: + to->si_tid = from.si_tid; + to->si_overrun = from.si_overrun; + to->si_int = from.si_int; + break; + case SIL_POLL: + to->si_band = from.si_band; + to->si_fd = from.si_fd; + break; + case SIL_FAULT: + to->si_addr = compat_ptr(from.si_addr); +#ifdef __ARCH_SI_TRAPNO + to->si_trapno = from.si_trapno; +#endif +#ifdef BUS_MCEERR_AR + if ((from.si_signo == SIGBUS) && (from.si_code == BUS_MCEERR_AR)) + to->si_addr_lsb = from.si_addr_lsb; +#endif +#ifdef BUS_MCEER_AO + if ((from.si_signo == SIGBUS) && (from.si_code == BUS_MCEERR_AO)) + to->si_addr_lsb = from.si_addr_lsb; +#endif +#ifdef SEGV_BNDERR + if ((from.si_signo == SIGSEGV) && (from.si_code == SEGV_BNDERR)) { + to->si_lower = compat_ptr(from.si_lower); + to->si_upper = compat_ptr(from.si_upper); + } +#endif +#ifdef SEGV_PKUERR + if ((from.si_signo == SIGSEGV) && (from.si_code == SEGV_PKUERR)) + to->si_pkey = from.si_pkey; +#endif + break; + case SIL_CHLD: + to->si_pid = from.si_pid; + to->si_uid = from.si_uid; + to->si_status = from.si_status; +#ifdef CONFIG_X86_X32_ABI + if (in_x32_syscall()) { + to->si_utime = from._sifields._sigchld_x32._utime; + 
to->si_stime = from._sifields._sigchld_x32._stime; + } else +#endif + { + to->si_utime = from.si_utime; + to->si_stime = from.si_stime; + } + break; + case SIL_RT: + to->si_pid = from.si_pid; + to->si_uid = from.si_uid; + to->si_int = from.si_int; + break; + case SIL_SYS: + to->si_call_addr = compat_ptr(from.si_call_addr); + to->si_syscall = from.si_syscall; + to->si_arch = from.si_arch; + break; + } + return 0; +} +#endif /* CONFIG_COMPAT */ + /** * do_sigtimedwait - wait for queued signals specified in @which * @which: queued signals to wait for -- cgit v1.2.3 From eb5346c379cb272eca77f63473de09103a22ebee Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 31 Jul 2017 17:18:40 -0500 Subject: signal: Remove the code to clear siginfo before calling copy_siginfo_from_user32 The new unified copy_siginfo_from_user32 takes care of this. Signed-off-by: "Eric W. Biederman" --- kernel/ptrace.c | 1 - kernel/signal.c | 4 ++-- 2 files changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 84b1367935e4..ec4365da9be8 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -1226,7 +1226,6 @@ int compat_ptrace_request(struct task_struct *child, compat_long_t request, break; case PTRACE_SETSIGINFO: - memset(&siginfo, 0, sizeof siginfo); if (copy_siginfo_from_user32( &siginfo, (struct compat_siginfo __user *) datap)) ret = -EFAULT; diff --git a/kernel/signal.c b/kernel/signal.c index 4c3f4448c5f1..5211b1b57163 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -3155,7 +3155,7 @@ COMPAT_SYSCALL_DEFINE3(rt_sigqueueinfo, int, sig, struct compat_siginfo __user *, uinfo) { - siginfo_t info = {}; + siginfo_t info; int ret = copy_siginfo_from_user32(&info, uinfo); if (unlikely(ret)) return ret; @@ -3199,7 +3199,7 @@ COMPAT_SYSCALL_DEFINE4(rt_tgsigqueueinfo, int, sig, struct compat_siginfo __user *, uinfo) { - siginfo_t info = {}; + siginfo_t info; if (copy_siginfo_from_user32(&info, uinfo)) return -EFAULT; -- cgit v1.2.3
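The unified helper above follows one defensive pattern throughout: zero the destination first, then copy the three generic ints plus only the union member that siginfo_layout() selects, so stale bytes never leak across the compat boundary. As a rough, self-contained userspace illustration of that pattern (a sketch only; the mini_siginfo type and layout enum are invented for this example and are not the kernel's uapi types):

	#include <stdio.h>
	#include <string.h>

	/* Hypothetical miniature of a siginfo-style union; just enough
	 * structure to show the clear-then-fill-by-layout pattern. */
	enum layout { LAYOUT_KILL, LAYOUT_FAULT };

	struct mini_siginfo {
		int signo, code;
		union {
			struct { int pid, uid; } kill;
			struct { unsigned long addr; } fault;
		} u;
	};

	static void convert(struct mini_siginfo *to,
			    const struct mini_siginfo *from, enum layout l)
	{
		memset(to, 0, sizeof(*to));	/* like clear_siginfo(): no stale bytes */
		to->signo = from->signo;
		to->code = from->code;
		switch (l) {			/* copy only the member in use */
		case LAYOUT_KILL:
			to->u.kill = from->u.kill;
			break;
		case LAYOUT_FAULT:
			to->u.fault = from->u.fault;
			break;
		}
	}

	int main(void)
	{
		struct mini_siginfo in = { .signo = 9, .code = 0,
					   .u.kill = { .pid = 42, .uid = 1000 } };
		struct mini_siginfo out;

		convert(&out, &in, LAYOUT_KILL);
		printf("signo=%d pid=%d uid=%d\n",
		       out.signo, out.u.kill.pid, out.u.kill.uid);
		return 0;
	}

This clear-then-fill shape is what allowed the previous patch to drop the callers' own defensive memset() and zero-initializers.

From ea64d5acc8f033cd586182ae31531246cdeaea73 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 15 Jan 2018 18:03:33 -0600 Subject: signal: Unify and correct copy_siginfo_to_user32 Among the existing architecture specific versions of copy_siginfo_to_user32 there are several different implementation problems. Some architectures fail to handle all of the cases in the siginfo union. Some architectures perform a blind copy of the siginfo union when the si_code is negative. A blind copy suggests the data is expected to be in 32bit siginfo format, which means that receiving such a signal via signalfd won't work, or that the data is in 64bit siginfo and the code is copying nonsense to userspace. Create a single instance of copy_siginfo_to_user32 that all of the architectures can share, and teach it to handle all of the cases in the siginfo union correctly, with the assumption that siginfo is stored internally to the kernel in 64bit siginfo format. A special case is made for x86 x32 format. This is needed as presence of both x32 and ia32 on x86_64 results in two different 32bit signal formats. By allowing this small special case there winds up being exactly one code base that needs to be maintained between all of the architectures, vastly increasing the testing base and the chances of finding bugs. As the x86 copy of copy_siginfo_to_user32 is removed, the call of the x86 signal_compat_build_tests was moved into sigaction_compat_abi, so that the tests will keep running. Signed-off-by: "Eric W.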
Biederman" --- arch/arm64/kernel/signal32.c | 70 ---------------------------- arch/mips/include/asm/compat.h | 2 - arch/mips/kernel/signal32.c | 57 ----------------------- arch/parisc/kernel/signal32.c | 62 ------------------------- arch/parisc/kernel/signal32.h | 2 - arch/powerpc/include/asm/compat.h | 2 - arch/powerpc/kernel/signal_32.c | 57 ----------------------- arch/s390/kernel/compat_signal.c | 52 --------------------- arch/sparc/include/asm/compat.h | 2 - arch/sparc/kernel/signal32.c | 53 --------------------- arch/tile/include/asm/compat.h | 2 - arch/tile/kernel/compat_signal.c | 55 ---------------------- arch/x86/include/asm/compat.h | 4 ++ arch/x86/include/asm/fpu/signal.h | 6 --- arch/x86/kernel/signal_compat.c | 96 +-------------------------------------- kernel/signal.c | 90 ++++++++++++++++++++++++++++++++++++ 16 files changed, 96 insertions(+), 516 deletions(-) (limited to 'kernel') diff --git a/arch/arm64/kernel/signal32.c b/arch/arm64/kernel/signal32.c index 4377907dbb70..cbc4edd1b1eb 100644 --- a/arch/arm64/kernel/signal32.c +++ b/arch/arm64/kernel/signal32.c @@ -125,76 +125,6 @@ static inline int get_sigset_t(sigset_t *set, return 0; } -int copy_siginfo_to_user32(compat_siginfo_t __user *to, const siginfo_t *from) -{ - int err; - - if (!access_ok(VERIFY_WRITE, to, sizeof(*to))) - return -EFAULT; - - /* If you change siginfo_t structure, please be sure - * this code is fixed accordingly. - * It should never copy any pad contained in the structure - * to avoid security leaks, but must copy the generic - * 3 ints plus the relevant union member. - * This routine must convert siginfo from 64bit to 32bit as well - * at the same time. - */ - err = __put_user(from->si_signo, &to->si_signo); - err |= __put_user(from->si_errno, &to->si_errno); - err |= __put_user(from->si_code, &to->si_code); - if (from->si_code < 0) - err |= __copy_to_user(&to->_sifields._pad, &from->_sifields._pad, - SI_PAD_SIZE); - else switch (siginfo_layout(from->si_signo, from->si_code)) { - case SIL_KILL: - err |= __put_user(from->si_pid, &to->si_pid); - err |= __put_user(from->si_uid, &to->si_uid); - break; - case SIL_TIMER: - err |= __put_user(from->si_tid, &to->si_tid); - err |= __put_user(from->si_overrun, &to->si_overrun); - err |= __put_user(from->si_int, &to->si_int); - break; - case SIL_POLL: - err |= __put_user(from->si_band, &to->si_band); - err |= __put_user(from->si_fd, &to->si_fd); - break; - case SIL_FAULT: - err |= __put_user((compat_uptr_t)(unsigned long)from->si_addr, - &to->si_addr); -#ifdef BUS_MCEERR_AO - /* - * Other callers might not initialize the si_lsb field, - * so check explicitly for the right codes here. 
- */ - if (from->si_signo == SIGBUS && - (from->si_code == BUS_MCEERR_AR || from->si_code == BUS_MCEERR_AO)) - err |= __put_user(from->si_addr_lsb, &to->si_addr_lsb); -#endif - break; - case SIL_CHLD: - err |= __put_user(from->si_pid, &to->si_pid); - err |= __put_user(from->si_uid, &to->si_uid); - err |= __put_user(from->si_status, &to->si_status); - err |= __put_user(from->si_utime, &to->si_utime); - err |= __put_user(from->si_stime, &to->si_stime); - break; - case SIL_RT: - err |= __put_user(from->si_pid, &to->si_pid); - err |= __put_user(from->si_uid, &to->si_uid); - err |= __put_user(from->si_int, &to->si_int); - break; - case SIL_SYS: - err |= __put_user((compat_uptr_t)(unsigned long) - from->si_call_addr, &to->si_call_addr); - err |= __put_user(from->si_syscall, &to->si_syscall); - err |= __put_user(from->si_arch, &to->si_arch); - break; - } - return err; -} - /* * VFP save/restore code. * diff --git a/arch/mips/include/asm/compat.h b/arch/mips/include/asm/compat.h index fe7d445f675f..946681db8dc3 100644 --- a/arch/mips/include/asm/compat.h +++ b/arch/mips/include/asm/compat.h @@ -126,8 +126,6 @@ typedef u32 compat_old_sigset_t; /* at least 32 bits */ typedef u32 compat_sigset_word; -#define SI_PAD_SIZE32 (128/sizeof(int) - 3) - #define COMPAT_OFF_T_MAX 0x7fffffff /* diff --git a/arch/mips/kernel/signal32.c b/arch/mips/kernel/signal32.c index 500b5e4634ea..c4db910a8794 100644 --- a/arch/mips/kernel/signal32.c +++ b/arch/mips/kernel/signal32.c @@ -76,60 +76,3 @@ SYSCALL_DEFINE3(32_sigaction, long, sig, const struct compat_sigaction __user *, return ret; } - -int copy_siginfo_to_user32(compat_siginfo_t __user *to, const siginfo_t *from) -{ - int err; - - if (!access_ok (VERIFY_WRITE, to, sizeof(compat_siginfo_t))) - return -EFAULT; - - /* If you change siginfo_t structure, please be sure - this code is fixed accordingly. - It should never copy any pad contained in the structure - to avoid security leaks, but must copy the generic - 3 ints plus the relevant union member. - This routine must convert siginfo from 64bit to 32bit as well - at the same time. 
*/ - err = __put_user(from->si_signo, &to->si_signo); - err |= __put_user(from->si_errno, &to->si_errno); - err |= __put_user(from->si_code, &to->si_code); - if (from->si_code < 0) - err |= __copy_to_user(&to->_sifields._pad, &from->_sifields._pad, SI_PAD_SIZE); - else { - switch (siginfo_layout(from->si_signo, from->si_code)) { - case SIL_TIMER: - err |= __put_user(from->si_tid, &to->si_tid); - err |= __put_user(from->si_overrun, &to->si_overrun); - err |= __put_user(from->si_int, &to->si_int); - break; - case SIL_CHLD: - err |= __put_user(from->si_utime, &to->si_utime); - err |= __put_user(from->si_stime, &to->si_stime); - err |= __put_user(from->si_status, &to->si_status); - case SIL_KILL: - err |= __put_user(from->si_pid, &to->si_pid); - err |= __put_user(from->si_uid, &to->si_uid); - break; - case SIL_FAULT: - err |= __put_user((unsigned long)from->si_addr, &to->si_addr); - break; - case SIL_POLL: - err |= __put_user(from->si_band, &to->si_band); - err |= __put_user(from->si_fd, &to->si_fd); - break; - case SIL_RT: - err |= __put_user(from->si_pid, &to->si_pid); - err |= __put_user(from->si_uid, &to->si_uid); - err |= __put_user(from->si_int, &to->si_int); - break; - case SIL_SYS: - err |= __copy_to_user(&to->si_call_addr, &from->si_call_addr, - sizeof(compat_uptr_t)); - err |= __put_user(from->si_syscall, &to->si_syscall); - err |= __put_user(from->si_arch, &to->si_arch); - break; - } - } - return err; -} diff --git a/arch/parisc/kernel/signal32.c b/arch/parisc/kernel/signal32.c index 558e32475c35..e8ef3eb69449 100644 --- a/arch/parisc/kernel/signal32.c +++ b/arch/parisc/kernel/signal32.c @@ -260,65 +260,3 @@ setup_sigcontext32(struct compat_sigcontext __user *sc, struct compat_regfile __ return err; } - -int -copy_siginfo_to_user32 (compat_siginfo_t __user *to, const siginfo_t *from) -{ - compat_uptr_t addr; - compat_int_t val; - int err; - - if (!access_ok(VERIFY_WRITE, to, sizeof(compat_siginfo_t))) - return -EFAULT; - - /* If you change siginfo_t structure, please be sure - this code is fixed accordingly. - It should never copy any pad contained in the structure - to avoid security leaks, but must copy the generic - 3 ints plus the relevant union member. - This routine must convert siginfo from 64bit to 32bit as well - at the same time. 
*/ - err = __put_user(from->si_signo, &to->si_signo); - err |= __put_user(from->si_errno, &to->si_errno); - err |= __put_user(from->si_code, &to->si_code); - if (from->si_code < 0) - err |= __copy_to_user(&to->_sifields._pad, &from->_sifields._pad, SI_PAD_SIZE); - else { - switch (siginfo_layout(from->si_signo, from->si_code)) { - case SIL_CHLD: - err |= __put_user(from->si_utime, &to->si_utime); - err |= __put_user(from->si_stime, &to->si_stime); - err |= __put_user(from->si_status, &to->si_status); - case SIL_KILL: - err |= __put_user(from->si_pid, &to->si_pid); - err |= __put_user(from->si_uid, &to->si_uid); - break; - case SIL_FAULT: - addr = ptr_to_compat(from->si_addr); - err |= __put_user(addr, &to->si_addr); - break; - case SIL_POLL: - err |= __put_user(from->si_band, &to->si_band); - err |= __put_user(from->si_fd, &to->si_fd); - break; - case SIL_TIMER: - err |= __put_user(from->si_tid, &to->si_tid); - err |= __put_user(from->si_overrun, &to->si_overrun); - val = (compat_int_t)from->si_int; - err |= __put_user(val, &to->si_int); - break; - case SIL_RT: - err |= __put_user(from->si_uid, &to->si_uid); - err |= __put_user(from->si_pid, &to->si_pid); - val = (compat_int_t)from->si_int; - err |= __put_user(val, &to->si_int); - break; - case SIL_SYS: - err |= __put_user(ptr_to_compat(from->si_call_addr), &to->si_call_addr); - err |= __put_user(from->si_syscall, &to->si_syscall); - err |= __put_user(from->si_arch, &to->si_arch); - break; - } - } - return err; -} diff --git a/arch/parisc/kernel/signal32.h b/arch/parisc/kernel/signal32.h index d25858e4db63..a271dc0976ce 100644 --- a/arch/parisc/kernel/signal32.h +++ b/arch/parisc/kernel/signal32.h @@ -34,8 +34,6 @@ struct compat_ucontext { /* ELF32 signal handling */ -int copy_siginfo_to_user32 (compat_siginfo_t __user *to, const siginfo_t *from); - /* In a deft move of uber-hackery, we decide to carry the top half of all * 64-bit registers in a non-portable, non-ABI, hidden structure. * Userspace can read the hidden structure if it *wants* but is never diff --git a/arch/powerpc/include/asm/compat.h b/arch/powerpc/include/asm/compat.h index 8a2363221b0c..62168e1158f1 100644 --- a/arch/powerpc/include/asm/compat.h +++ b/arch/powerpc/include/asm/compat.h @@ -119,8 +119,6 @@ typedef u32 compat_old_sigset_t; typedef u32 compat_sigset_word; -#define SI_PAD_SIZE32 (128/sizeof(int) - 3) - #define COMPAT_OFF_T_MAX 0x7fffffff /* diff --git a/arch/powerpc/kernel/signal_32.c b/arch/powerpc/kernel/signal_32.c index ee62ff7b296c..aded81169648 100644 --- a/arch/powerpc/kernel/signal_32.c +++ b/arch/powerpc/kernel/signal_32.c @@ -873,63 +873,6 @@ static long restore_tm_user_regs(struct pt_regs *regs, #endif #ifdef CONFIG_PPC64 -int copy_siginfo_to_user32(struct compat_siginfo __user *d, const siginfo_t *s) -{ - int err; - - if (!access_ok (VERIFY_WRITE, d, sizeof(*d))) - return -EFAULT; - - /* If you change siginfo_t structure, please be sure - * this code is fixed accordingly. - * It should never copy any pad contained in the structure - * to avoid security leaks, but must copy the generic - * 3 ints plus the relevant union member. - * This routine must convert siginfo from 64bit to 32bit as well - * at the same time. 
- */ - err = __put_user(s->si_signo, &d->si_signo); - err |= __put_user(s->si_errno, &d->si_errno); - err |= __put_user(s->si_code, &d->si_code); - if (s->si_code < 0) - err |= __copy_to_user(&d->_sifields._pad, &s->_sifields._pad, - SI_PAD_SIZE32); - else switch(siginfo_layout(s->si_signo, s->si_code)) { - case SIL_CHLD: - err |= __put_user(s->si_pid, &d->si_pid); - err |= __put_user(s->si_uid, &d->si_uid); - err |= __put_user(s->si_utime, &d->si_utime); - err |= __put_user(s->si_stime, &d->si_stime); - err |= __put_user(s->si_status, &d->si_status); - break; - case SIL_FAULT: - err |= __put_user((unsigned int)(unsigned long)s->si_addr, - &d->si_addr); - break; - case SIL_POLL: - err |= __put_user(s->si_band, &d->si_band); - err |= __put_user(s->si_fd, &d->si_fd); - break; - case SIL_TIMER: - err |= __put_user(s->si_tid, &d->si_tid); - err |= __put_user(s->si_overrun, &d->si_overrun); - err |= __put_user(s->si_int, &d->si_int); - break; - case SIL_SYS: - err |= __put_user(ptr_to_compat(s->si_call_addr), &d->si_call_addr); - err |= __put_user(s->si_syscall, &d->si_syscall); - err |= __put_user(s->si_arch, &d->si_arch); - break; - case SIL_RT: - err |= __put_user(s->si_int, &d->si_int); - /* fallthrough */ - case SIL_KILL: - err |= __put_user(s->si_pid, &d->si_pid); - err |= __put_user(s->si_uid, &d->si_uid); - break; - } - return err; -} #define copy_siginfo_to_user copy_siginfo_to_user32 diff --git a/arch/s390/kernel/compat_signal.c b/arch/s390/kernel/compat_signal.c index d77ce14ffa5c..18c1eeb847b2 100644 --- a/arch/s390/kernel/compat_signal.c +++ b/arch/s390/kernel/compat_signal.c @@ -50,58 +50,6 @@ typedef struct struct ucontext32 uc; } rt_sigframe32; -int copy_siginfo_to_user32(compat_siginfo_t __user *to, const siginfo_t *from) -{ - int err; - - /* If you change siginfo_t structure, please be sure - this code is fixed accordingly. - It should never copy any pad contained in the structure - to avoid security leaks, but must copy the generic - 3 ints plus the relevant union member. - This routine must convert siginfo from 64bit to 32bit as well - at the same time. */ - err = __put_user(from->si_signo, &to->si_signo); - err |= __put_user(from->si_errno, &to->si_errno); - err |= __put_user(from->si_code, &to->si_code); - if (from->si_code < 0) - err |= __copy_to_user(&to->_sifields._pad, &from->_sifields._pad, SI_PAD_SIZE); - else { - switch (siginfo_layout(from->si_signo, from->si_code)) { - case SIL_RT: - err |= __put_user(from->si_int, &to->si_int); - /* fallthrough */ - case SIL_KILL: - err |= __put_user(from->si_pid, &to->si_pid); - err |= __put_user(from->si_uid, &to->si_uid); - break; - case SIL_CHLD: - err |= __put_user(from->si_pid, &to->si_pid); - err |= __put_user(from->si_uid, &to->si_uid); - err |= __put_user(from->si_utime, &to->si_utime); - err |= __put_user(from->si_stime, &to->si_stime); - err |= __put_user(from->si_status, &to->si_status); - break; - case SIL_FAULT: - err |= __put_user((unsigned long) from->si_addr, - &to->si_addr); - break; - case SIL_POLL: - err |= __put_user(from->si_band, &to->si_band); - err |= __put_user(from->si_fd, &to->si_fd); - break; - case SIL_TIMER: - err |= __put_user(from->si_tid, &to->si_tid); - err |= __put_user(from->si_overrun, &to->si_overrun); - err |= __put_user(from->si_int, &to->si_int); - break; - default: - break; - } - } - return err ? 
-EFAULT : 0; -} - /* Store registers needed to create the signal frame */ static void store_sigregs(void) { diff --git a/arch/sparc/include/asm/compat.h b/arch/sparc/include/asm/compat.h index c3688ae98b90..615283e16f22 100644 --- a/arch/sparc/include/asm/compat.h +++ b/arch/sparc/include/asm/compat.h @@ -149,8 +149,6 @@ typedef u32 compat_old_sigset_t; typedef u32 compat_sigset_word; -#define SI_PAD_SIZE32 (128/sizeof(int) - 3) - #define COMPAT_OFF_T_MAX 0x7fffffff /* diff --git a/arch/sparc/kernel/signal32.c b/arch/sparc/kernel/signal32.c index 8022bb4c65a5..44d379db3f64 100644 --- a/arch/sparc/kernel/signal32.c +++ b/arch/sparc/kernel/signal32.c @@ -70,59 +70,6 @@ struct rt_signal_frame32 { /* __siginfo_rwin_t * */u32 rwin_save; } __attribute__((aligned(8))); -int copy_siginfo_to_user32(compat_siginfo_t __user *to, const siginfo_t *from) -{ - int err; - - if (!access_ok(VERIFY_WRITE, to, sizeof(compat_siginfo_t))) - return -EFAULT; - - /* If you change siginfo_t structure, please be sure - this code is fixed accordingly. - It should never copy any pad contained in the structure - to avoid security leaks, but must copy the generic - 3 ints plus the relevant union member. - This routine must convert siginfo from 64bit to 32bit as well - at the same time. */ - err = __put_user(from->si_signo, &to->si_signo); - err |= __put_user(from->si_errno, &to->si_errno); - err |= __put_user(from->si_code, &to->si_code); - if (from->si_code < 0) - err |= __copy_to_user(&to->_sifields._pad, &from->_sifields._pad, SI_PAD_SIZE); - else { - switch (siginfo_layout(from->si_signo, from->si_code)) { - case SIL_TIMER: - err |= __put_user(from->si_tid, &to->si_tid); - err |= __put_user(from->si_overrun, &to->si_overrun); - err |= __put_user(from->si_int, &to->si_int); - break; - case SIL_CHLD: - err |= __put_user(from->si_utime, &to->si_utime); - err |= __put_user(from->si_stime, &to->si_stime); - err |= __put_user(from->si_status, &to->si_status); - default: - case SIL_KILL: - err |= __put_user(from->si_pid, &to->si_pid); - err |= __put_user(from->si_uid, &to->si_uid); - break; - case SIL_FAULT: - err |= __put_user(from->si_trapno, &to->si_trapno); - err |= __put_user((unsigned long)from->si_addr, &to->si_addr); - break; - case SIL_POLL: - err |= __put_user(from->si_band, &to->si_band); - err |= __put_user(from->si_fd, &to->si_fd); - break; - case SIL_RT: - err |= __put_user(from->si_pid, &to->si_pid); - err |= __put_user(from->si_uid, &to->si_uid); - err |= __put_user(from->si_int, &to->si_int); - break; - } - } - return err; -} - /* Checks if the fp is valid. We always build signal frames which are * 16-byte aligned, therefore we can always enforce that the restore * frame has that property as well. 
diff --git a/arch/tile/include/asm/compat.h b/arch/tile/include/asm/compat.h index c6b7613256b4..769ff6ac0bf5 100644 --- a/arch/tile/include/asm/compat.h +++ b/arch/tile/include/asm/compat.h @@ -110,8 +110,6 @@ struct compat_flock64 { typedef u32 compat_sigset_word; -#define COMPAT_SI_PAD_SIZE (128/sizeof(int) - 3) - #define COMPAT_OFF_T_MAX 0x7fffffff struct compat_ipc64_perm { diff --git a/arch/tile/kernel/compat_signal.c b/arch/tile/kernel/compat_signal.c index 4e7f40a10eb3..a703bd0e0488 100644 --- a/arch/tile/kernel/compat_signal.c +++ b/arch/tile/kernel/compat_signal.c @@ -50,61 +50,6 @@ struct compat_rt_sigframe { struct compat_ucontext uc; }; -int copy_siginfo_to_user32(struct compat_siginfo __user *to, const siginfo_t *from) -{ - int err; - - if (!access_ok(VERIFY_WRITE, to, sizeof(struct compat_siginfo))) - return -EFAULT; - - /* If you change siginfo_t structure, please make sure that - this code is fixed accordingly. - It should never copy any pad contained in the structure - to avoid security leaks, but must copy the generic - 3 ints plus the relevant union member. */ - err = __put_user(from->si_signo, &to->si_signo); - err |= __put_user(from->si_errno, &to->si_errno); - err |= __put_user(from->si_code, &to->si_code); - - if (from->si_code < 0) { - err |= __put_user(from->si_pid, &to->si_pid); - err |= __put_user(from->si_uid, &to->si_uid); - err |= __put_user(from->si_int, &to->si_int); - } else { - /* - * First 32bits of unions are always present: - * si_pid === si_band === si_tid === si_addr(LS half) - */ - err |= __put_user(from->_sifields._pad[0], - &to->_sifields._pad[0]); - switch (siginfo_layout(from->si_signo, from->si_code)) { - case SIL_FAULT: - break; - case SIL_CHLD: - err |= __put_user(from->si_utime, &to->si_utime); - err |= __put_user(from->si_stime, &to->si_stime); - err |= __put_user(from->si_status, &to->si_status); - /* FALL THROUGH */ - default: - case SIL_KILL: - err |= __put_user(from->si_uid, &to->si_uid); - break; - case SIL_POLL: - err |= __put_user(from->si_fd, &to->si_fd); - break; - case SIL_TIMER: - err |= __put_user(from->si_overrun, &to->si_overrun); - err |= __put_user(from->si_int, &to->si_int); - break; - case SIL_RT: - err |= __put_user(from->si_uid, &to->si_uid); - err |= __put_user(from->si_int, &to->si_int); - break; - } - } - return err; -} - /* The assembly shim for this function arranges to ignore the return value. 
*/ long compat_sys_rt_sigreturn(void) { diff --git a/arch/x86/include/asm/compat.h b/arch/x86/include/asm/compat.h index 0b76fc91f672..e1c8dab86670 100644 --- a/arch/x86/include/asm/compat.h +++ b/arch/x86/include/asm/compat.h @@ -249,4 +249,8 @@ static inline bool in_compat_syscall(void) } #define in_compat_syscall in_compat_syscall /* override the generic impl */ +struct compat_siginfo; +int __copy_siginfo_to_user32(struct compat_siginfo __user *to, + const siginfo_t *from, bool x32_ABI); + #endif /* _ASM_X86_COMPAT_H */ diff --git a/arch/x86/include/asm/fpu/signal.h b/arch/x86/include/asm/fpu/signal.h index 4df2754ef380..44bbc39a57b3 100644 --- a/arch/x86/include/asm/fpu/signal.h +++ b/arch/x86/include/asm/fpu/signal.h @@ -20,12 +20,6 @@ int ia32_setup_frame(int sig, struct ksignal *ksig, # define ia32_setup_rt_frame __setup_rt_frame #endif -#ifdef CONFIG_COMPAT -int __copy_siginfo_to_user32(compat_siginfo_t __user *to, - const siginfo_t *from, bool x32_ABI); -#endif - - extern void convert_from_fxsr(struct user_i387_ia32_struct *env, struct task_struct *tsk); extern void convert_to_fxsr(struct task_struct *tsk, diff --git a/arch/x86/kernel/signal_compat.c b/arch/x86/kernel/signal_compat.c index 59148de2d83f..ac057f9b0763 100644 --- a/arch/x86/kernel/signal_compat.c +++ b/arch/x86/kernel/signal_compat.c @@ -98,6 +98,8 @@ static inline void signal_compat_build_tests(void) void sigaction_compat_abi(struct k_sigaction *act, struct k_sigaction *oact) { + signal_compat_build_tests(); + /* Don't leak in-kernel non-uapi flags to user-space */ if (oact) oact->sa.sa_flags &= ~(SA_IA32_ABI | SA_X32_ABI); @@ -113,97 +115,3 @@ void sigaction_compat_abi(struct k_sigaction *act, struct k_sigaction *oact) if (in_x32_syscall()) act->sa.sa_flags |= SA_X32_ABI; } - -int __copy_siginfo_to_user32(compat_siginfo_t __user *to, const siginfo_t *from, - bool x32_ABI) -{ - int err = 0; - - signal_compat_build_tests(); - - if (!access_ok(VERIFY_WRITE, to, sizeof(compat_siginfo_t))) - return -EFAULT; - - put_user_try { - /* If you change siginfo_t structure, please make sure that - this code is fixed accordingly. - It should never copy any pad contained in the structure - to avoid security leaks, but must copy the generic - 3 ints plus the relevant union member. 
*/ - put_user_ex(from->si_signo, &to->si_signo); - put_user_ex(from->si_errno, &to->si_errno); - put_user_ex(from->si_code, &to->si_code); - - if (from->si_code < 0) { - put_user_ex(from->si_pid, &to->si_pid); - put_user_ex(from->si_uid, &to->si_uid); - put_user_ex(ptr_to_compat(from->si_ptr), &to->si_ptr); - } else { - /* - * First 32bits of unions are always present: - * si_pid === si_band === si_tid === si_addr(LS half) - */ - put_user_ex(from->_sifields._pad[0], - &to->_sifields._pad[0]); - switch (siginfo_layout(from->si_signo, from->si_code)) { - case SIL_FAULT: - if (from->si_signo == SIGBUS && - (from->si_code == BUS_MCEERR_AR || - from->si_code == BUS_MCEERR_AO)) - put_user_ex(from->si_addr_lsb, &to->si_addr_lsb); - - if (from->si_signo == SIGSEGV) { - if (from->si_code == SEGV_BNDERR) { - compat_uptr_t lower = (unsigned long)from->si_lower; - compat_uptr_t upper = (unsigned long)from->si_upper; - put_user_ex(lower, &to->si_lower); - put_user_ex(upper, &to->si_upper); - } - if (from->si_code == SEGV_PKUERR) - put_user_ex(from->si_pkey, &to->si_pkey); - } - break; - case SIL_SYS: - put_user_ex(from->si_syscall, &to->si_syscall); - put_user_ex(from->si_arch, &to->si_arch); - break; - case SIL_CHLD: - if (!x32_ABI) { - put_user_ex(from->si_utime, &to->si_utime); - put_user_ex(from->si_stime, &to->si_stime); -#ifdef CONFIG_X86_X32_ABI - } else { - put_user_ex(from->si_utime, &to->_sifields._sigchld_x32._utime); - put_user_ex(from->si_stime, &to->_sifields._sigchld_x32._stime); -#endif - } - put_user_ex(from->si_status, &to->si_status); - /* FALL THROUGH */ - case SIL_KILL: - put_user_ex(from->si_uid, &to->si_uid); - break; - case SIL_POLL: - put_user_ex(from->si_fd, &to->si_fd); - break; - case SIL_TIMER: - put_user_ex(from->si_overrun, &to->si_overrun); - put_user_ex(ptr_to_compat(from->si_ptr), - &to->si_ptr); - break; - case SIL_RT: - put_user_ex(from->si_uid, &to->si_uid); - put_user_ex(from->si_int, &to->si_int); - break; - } - } - } put_user_catch(err); - - return err; -} - -/* from syscall's path, where we know the ABI */ -int copy_siginfo_to_user32(compat_siginfo_t __user *to, const siginfo_t *from) -{ - return __copy_siginfo_to_user32(to, from, in_x32_syscall()); -} - diff --git a/kernel/signal.c b/kernel/signal.c index bebe44265b8b..4976f05aa09b 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2815,6 +2815,96 @@ int copy_siginfo_to_user(siginfo_t __user *to, const siginfo_t *from) } #ifdef CONFIG_COMPAT +int copy_siginfo_to_user32(struct compat_siginfo __user *to, + const struct siginfo *from) +#if defined(CONFIG_X86_X32_ABI) || defined(CONFIG_IA32_EMULATION) +{ + return __copy_siginfo_to_user32(to, from, in_x32_syscall()); +} +int __copy_siginfo_to_user32(struct compat_siginfo __user *to, + const struct siginfo *from, bool x32_ABI) +#endif +{ + struct compat_siginfo new; + memset(&new, 0, sizeof(new)); + + new.si_signo = from->si_signo; + new.si_errno = from->si_errno; + new.si_code = from->si_code; + switch(siginfo_layout(from->si_signo, from->si_code)) { + case SIL_KILL: + new.si_pid = from->si_pid; + new.si_uid = from->si_uid; + break; + case SIL_TIMER: + new.si_tid = from->si_tid; + new.si_overrun = from->si_overrun; + new.si_int = from->si_int; + break; + case SIL_POLL: + new.si_band = from->si_band; + new.si_fd = from->si_fd; + break; + case SIL_FAULT: + new.si_addr = ptr_to_compat(from->si_addr); +#ifdef __ARCH_SI_TRAPNO + new.si_trapno = from->si_trapno; +#endif +#ifdef BUS_MCEERR_AR + if ((from->si_signo == SIGBUS) && (from->si_code == BUS_MCEERR_AR)) + 
new.si_addr_lsb = from->si_addr_lsb; +#endif +#ifdef BUS_MCEERR_AO + if ((from->si_signo == SIGBUS) && (from->si_code == BUS_MCEERR_AO)) + new.si_addr_lsb = from->si_addr_lsb; +#endif +#ifdef SEGV_BNDERR + if ((from->si_signo == SIGSEGV) && + (from->si_code == SEGV_BNDERR)) { + new.si_lower = ptr_to_compat(from->si_lower); + new.si_upper = ptr_to_compat(from->si_upper); + } +#endif +#ifdef SEGV_PKUERR + if ((from->si_signo == SIGSEGV) && + (from->si_code == SEGV_PKUERR)) + new.si_pkey = from->si_pkey; +#endif + + break; + case SIL_CHLD: + new.si_pid = from->si_pid; + new.si_uid = from->si_uid; + new.si_status = from->si_status; +#ifdef CONFIG_X86_X32_ABI + if (x32_ABI) { + new._sifields._sigchld_x32._utime = from->si_utime; + new._sifields._sigchld_x32._stime = from->si_stime; + } else +#endif + { + new.si_utime = from->si_utime; + new.si_stime = from->si_stime; + } + break; + case SIL_RT: + new.si_pid = from->si_pid; + new.si_uid = from->si_uid; + new.si_int = from->si_int; + break; + case SIL_SYS: + new.si_call_addr = ptr_to_compat(from->si_call_addr); + new.si_syscall = from->si_syscall; + new.si_arch = from->si_arch; + break; + } + + if (copy_to_user(to, &new, sizeof(struct compat_siginfo))) + return -EFAULT; + + return 0; +} + int copy_siginfo_from_user32(struct siginfo *to, const struct compat_siginfo __user *ufrom) { -- cgit v1.2.3 From d2279c9d7f7db7f97567368bfc4539b3411adf8d Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Fri, 5 Jan 2018 19:25:38 +0900 Subject: kallsyms: remove print_symbol() function No more print_symbol()/__print_symbol() users left, remove these symbols. It was a very old API that encouraged people to use continuous lines. It had been obsoleted by the %pS format specifier in a normal printk() call. Link: http://lkml.kernel.org/r/20180105102538.GC471@jagdpanzerIV Cc: Andrew Morton Cc: Russell King Cc: Catalin Marinas Cc: Mark Salter Cc: Tony Luck Cc: David Howells Cc: Yoshinori Sato Cc: Guan Xuetao Cc: Borislav Petkov Cc: Greg Kroah-Hartman Cc: Thomas Gleixner Cc: Peter Zijlstra Cc: Vineet Gupta Cc: Fengguang Wu Cc: Steven Rostedt Cc: LKML Cc: linux-arm-kernel@lists.infradead.org Cc: linux-c6x-dev@linux-c6x.org Cc: linux-ia64@vger.kernel.org Cc: linux-am33-list@redhat.com Cc: linux-sh@vger.kernel.org Cc: linux-edac@vger.kernel.org Cc: x86@kernel.org Cc: linux-snps-arc@lists.infradead.org Cc: Sergey Senozhatsky Signed-off-by: Sergey Senozhatsky Suggested-by: Joe Perches [pmladek@suse.com: updated commit message] Signed-off-by: Petr Mladek --- Documentation/filesystems/sysfs.txt | 4 ++-- Documentation/translations/zh_CN/filesystems/sysfs.txt | 4 ++-- include/linux/kallsyms.h | 18 ------------------ kernel/kallsyms.c | 11 ----------- 4 files changed, 4 insertions(+), 33 deletions(-) (limited to 'kernel') diff --git a/Documentation/filesystems/sysfs.txt b/Documentation/filesystems/sysfs.txt index 9a3658cc399e..a1426cabcef1 100644 --- a/Documentation/filesystems/sysfs.txt +++ b/Documentation/filesystems/sysfs.txt @@ -154,8 +154,8 @@ static ssize_t dev_attr_show(struct kobject *kobj, struct attribute *attr, if (dev_attr->show) ret = dev_attr->show(dev, dev_attr, buf); if (ret >= (ssize_t)PAGE_SIZE) { - print_symbol("dev_attr_show: %s returned bad count\n", - (unsigned long)dev_attr->show); + printk("dev_attr_show: %pS returned bad count\n", + dev_attr->show); } return ret; } diff --git a/Documentation/translations/zh_CN/filesystems/sysfs.txt b/Documentation/translations/zh_CN/filesystems/sysfs.txt index 7d3b05edb8ce..452271dda141 100644 ---
a/Documentation/translations/zh_CN/filesystems/sysfs.txt +++ b/Documentation/translations/zh_CN/filesystems/sysfs.txt @@ -167,8 +167,8 @@ static ssize_t dev_attr_show(struct kobject *kobj, struct attribute *attr, if (dev_attr->show) ret = dev_attr->show(dev, dev_attr, buf); if (ret >= (ssize_t)PAGE_SIZE) { - print_symbol("dev_attr_show: %s returned bad count\n", - (unsigned long)dev_attr->show); + printk("dev_attr_show: %pS returned bad count\n", + dev_attr->show); } return ret; } diff --git a/include/linux/kallsyms.h b/include/linux/kallsyms.h index 708f337d780b..c733941901b3 100644 --- a/include/linux/kallsyms.h +++ b/include/linux/kallsyms.h @@ -46,9 +46,6 @@ extern int sprint_symbol(char *buffer, unsigned long address); extern int sprint_symbol_no_offset(char *buffer, unsigned long address); extern int sprint_backtrace(char *buffer, unsigned long address); -/* Look up a kernel symbol and print it to the kernel messages. */ -extern void __print_symbol(const char *fmt, unsigned long address); - int lookup_symbol_name(unsigned long addr, char *symname); int lookup_symbol_attrs(unsigned long addr, unsigned long *size, unsigned long *offset, char *modname, char *name); @@ -118,23 +115,8 @@ static inline int kallsyms_show_value(void) return false; } -/* Stupid that this does nothing, but I didn't create this mess. */ -#define __print_symbol(fmt, addr) #endif /*CONFIG_KALLSYMS*/ -/* This macro allows us to keep printk typechecking */ -static __printf(1, 2) -void __check_printsym_format(const char *fmt, ...) -{ -} - -static inline void print_symbol(const char *fmt, unsigned long addr) -{ - __check_printsym_format(fmt, ""); - __print_symbol(fmt, (unsigned long) - __builtin_extract_return_addr((void *)addr)); -} - static inline void print_ip_sym(unsigned long ip) { printk("[<%p>] %pS\n", (void *) ip, (void *) ip); diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 531ffa984bc2..32ba256f0092 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -464,17 +464,6 @@ int sprint_backtrace(char *buffer, unsigned long address) return __sprint_symbol(buffer, address, -1, 1); } -/* Look up a kernel symbol and print it to the kernel messages. */ -void __print_symbol(const char *fmt, unsigned long address) -{ - char buffer[KSYM_SYMBOL_LEN]; - - sprint_symbol(buffer, address); - - printk(fmt, buffer); -} -EXPORT_SYMBOL(__print_symbol); - /* To avoid using get_symbol_offset for every symbol, we carry prefix along. */ struct kallsym_iter { loff_t pos; -- cgit v1.2.3 From dbdda842fe96f8932bae554f0adf463c27c42bc7 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Wed, 10 Jan 2018 14:24:17 +0100 Subject: printk: Add console owner and waiter logic to load balance console writes This patch implements what I discussed in Kernel Summit. I added lockdep annotation (hopefully correctly), and it hasn't had any splats (since I fixed some bugs in the first iterations). It did catch problems when I had the owner covering too much. But now that the owner is only set when actively calling the consoles, lockdep has stayed quiet. Here's the design again: I added a "console_owner" which is set to a task that is actively writing to the consoles. It is *not* the same as the owner of the console_lock. It is only set when doing the calls to the console functions. It is protected by a console_owner_lock which is a raw spin lock. There is a console_waiter. This is set when there is an active console owner that is not current, and waiter is not set. This too is protected by console_owner_lock. 
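As a loose userspace analogue of this owner/waiter handoff, consider the following hedged sketch. It uses pthreads and C11 atomics, every name in it is invented for the demo, and it models the console semaphore as nothing more than an atomic flag; it is not the kernel's code, just the shape of the idea:

	#include <pthread.h>
	#include <stdatomic.h>
	#include <stdbool.h>
	#include <stdio.h>

	#define MSGS 8

	static atomic_bool lock_held;	/* stands in for console_sem being taken */
	static atomic_bool waiter;	/* stands in for console_waiter */
	static atomic_int next_msg;	/* stands in for the logbuf position */

	static void print_loop(const char *who)
	{
		int n;

		while ((n = atomic_fetch_add(&next_msg, 1)) < MSGS) {
			printf("%s: message %d\n", who, n);
			/* Hand-off check: if a waiter appeared, transfer
			 * ownership without releasing the lock; the waiter
			 * becomes the new owner. */
			if (atomic_exchange(&waiter, false))
				return;
		}
		atomic_store(&lock_held, false);	/* no waiter: drop the lock */
	}

	static void *printer(void *arg)
	{
		const char *who = arg;
		bool expected = false;

		if (atomic_compare_exchange_strong(&lock_held, &expected, true)) {
			print_loop(who);	/* took the lock outright */
			return NULL;
		}
		/* Register as the waiter and busy-wait until the owner clears
		 * the flag (the hand-off); also retry the lock in case the
		 * owner finished before seeing us. */
		atomic_store(&waiter, true);
		while (atomic_load(&waiter)) {
			expected = false;
			if (atomic_compare_exchange_strong(&lock_held,
							   &expected, true)) {
				atomic_store(&waiter, false);
				break;
			}
		}
		print_loop(who);	/* we are the owner now */
		return NULL;
	}

	int main(void)
	{
		pthread_t a, b;

		pthread_create(&a, NULL, printer, "A");
		pthread_create(&b, NULL, printer, "B");
		pthread_join(a, NULL);
		pthread_join(b, NULL);
		return 0;
	}

The property mirrored here is the one the design relies on: ownership passes directly to the spinning waiter without the lock ever being released in between, which, as explained below, is legitimate precisely because a semaphore has no owner and any task may release it.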
In printk() when it tries to write to the consoles, we have:

	if (console_trylock())
		console_unlock();

Now I added an else, which will check if there is an active owner, and no current waiter. If that is the case, then console_waiter is set, and the task goes into a spin until it is no longer set. When the active console owner finishes writing the current message to the consoles, it grabs the console_owner_lock and sees if there is a waiter, and clears console_owner. If there is a waiter, then it breaks out of the loop, clears the waiter flag (because that will release the waiter from its spin), and exits. Note, it does *not* release the console semaphore. Because it is a semaphore, there is no owner. Another task may release it. This means that the waiter is guaranteed to be the new console owner! Which it becomes. Then the waiter calls console_unlock() and continues to write to the consoles. If another task comes along and does a printk(), it too can become the new waiter, and we wash, rinse, and repeat! By Petr Mladek about possible new deadlocks: The thing is that we move console_sem only to a printk() call that normally calls console_unlock() as well. It means that the transferred owner should not bring new types of dependencies. As Steven said somewhere: "If there is a deadlock, it was there even before." We could look at it from this side. The possible deadlock would look like:

	CPU0				CPU1

	console_unlock()

	  console_owner = current;

					spin_lockA()
					  printk()
					    spin = true;

	  while (...)
	    call_console_drivers()
	      spin_lockA()

This would be a deadlock. CPU0 would wait for the lock A, while CPU1 would own the lock A and would wait for CPU0 to finish calling the console drivers and pass the console_sem owner. But if the above is true then the following scenario was already possible before:

	CPU0

	spin_lockA()
	  printk()
	    console_unlock()
	      call_console_drivers()
	        spin_lockA()

In other words, this deadlock was there even before. Such deadlocks are prevented by using printk_deferred() in the sections guarded by the lock A. By Steven Rostedt: To demonstrate the issue, this module has been shown to lock up a system with 4 CPUs and a slow console (like a serial console). It is also able to lock up an 8 CPU system with only a fast (VGA) console, by passing in "loops=100". The changes in this commit prevent this module from locking up the system.

	#include
	#include
	#include
	#include
	#include
	#include

	static bool stop_testing;
	static unsigned int loops = 1;

	static void preempt_printk_workfn(struct work_struct *work)
	{
		int i;

		while (!READ_ONCE(stop_testing)) {
			for (i = 0; i < loops && !READ_ONCE(stop_testing); i++) {
				preempt_disable();
				pr_emerg("%5d%-75s\n", smp_processor_id(),
					 " XXX NOPREEMPT");
				preempt_enable();
			}
			msleep(1);
		}
	}

	static struct work_struct __percpu *works;

	static void finish(void)
	{
		int cpu;

		WRITE_ONCE(stop_testing, true);
		for_each_online_cpu(cpu)
			flush_work(per_cpu_ptr(works, cpu));
		free_percpu(works);
	}

	static int __init test_init(void)
	{
		int cpu;

		works = alloc_percpu(struct work_struct);
		if (!works)
			return -ENOMEM;

		/*
		 * This is just a test module. This will break if you
		 * do any CPU hot plugging between loading and
		 * unloading the module.
		 */
		for_each_online_cpu(cpu) {
			struct work_struct *work = per_cpu_ptr(works, cpu);

			INIT_WORK(work, &preempt_printk_workfn);
			schedule_work_on(cpu, work);
		}
		return 0;
	}

	static void __exit test_exit(void)
	{
		finish();
	}

	module_param(loops, uint, 0);
	module_init(test_init);
	module_exit(test_exit);
	MODULE_LICENSE("GPL");

Link: http://lkml.kernel.org/r/20180110132418.7080-2-pmladek@suse.com Cc: akpm@linux-foundation.org Cc: linux-mm@kvack.org Cc: Cong Wang Cc: Dave Hansen Cc: Johannes Weiner Cc: Mel Gorman Cc: Michal Hocko Cc: Vlastimil Babka Cc: Peter Zijlstra Cc: Linus Torvalds Cc: Jan Kara Cc: Mathieu Desnoyers Cc: Tetsuo Handa Cc: Byungchul Park Cc: Tejun Heo Cc: Pavel Machek Cc: linux-kernel@vger.kernel.org Signed-off-by: Steven Rostedt (VMware) [pmladek@suse.com: Commit message about possible deadlocks] Acked-by: Sergey Senozhatsky Signed-off-by: Petr Mladek --- kernel/printk/printk.c | 108 ++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 107 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 5d81206a572d..040fb948924e 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -86,8 +86,15 @@ EXPORT_SYMBOL_GPL(console_drivers); static struct lockdep_map console_lock_dep_map = { .name = "console_lock" }; +static struct lockdep_map console_owner_dep_map = { + .name = "console_owner" +}; #endif +static DEFINE_RAW_SPINLOCK(console_owner_lock); +static struct task_struct *console_owner; +static bool console_waiter; + enum devkmsg_log_bits { __DEVKMSG_LOG_BIT_ON = 0, __DEVKMSG_LOG_BIT_OFF, @@ -1753,8 +1760,56 @@ asmlinkage int vprintk_emit(int facility, int level, * semaphore. The release will print out buffers and wake up * /dev/kmsg and syslog() users. */ - if (console_trylock()) + if (console_trylock()) { console_unlock(); + } else { + struct task_struct *owner = NULL; + bool waiter; + bool spin = false; + + printk_safe_enter_irqsave(flags); + + raw_spin_lock(&console_owner_lock); + owner = READ_ONCE(console_owner); + waiter = READ_ONCE(console_waiter); + if (!waiter && owner && owner != current) { + WRITE_ONCE(console_waiter, true); + spin = true; + } + raw_spin_unlock(&console_owner_lock); + + /* + * If there is an active printk() writing to the + * consoles, instead of having it write our data too, + * see if we can offload that load from the active + * printer, and do some printing ourselves. + * Go into a spin only if there isn't already a waiter + * spinning, and there is an active printer, and + * that active printer isn't us (recursive printk?). + */ + if (spin) { + /* We spin waiting for the owner to release us */ + spin_acquire(&console_owner_dep_map, 0, 0, _THIS_IP_); + /* Owner will clear console_waiter on hand off */ + while (READ_ONCE(console_waiter)) + cpu_relax(); + + spin_release(&console_owner_dep_map, 1, _THIS_IP_); + printk_safe_exit_irqrestore(flags); + + /* + * The owner passed the console lock to us. + * Since we did not spin on console lock, annotate + * this as a trylock. Otherwise lockdep will + * complain.
+ */ + mutex_acquire(&console_lock_dep_map, 0, 1, _THIS_IP_); + console_unlock(); + printk_safe_enter_irqsave(flags); + } + printk_safe_exit_irqrestore(flags); + + } } return printed_len; @@ -2141,6 +2196,7 @@ void console_unlock(void) static u64 seen_seq; unsigned long flags; bool wake_klogd = false; + bool waiter = false; bool do_cond_resched, retry; if (console_suspended) { @@ -2229,14 +2285,64 @@ skip: console_seq++; raw_spin_unlock(&logbuf_lock); + /* + * While actively printing out messages, if another printk() + * were to occur on another CPU, it may wait for this one to + * finish. This task can not be preempted if there is a + * waiter waiting to take over. + */ + raw_spin_lock(&console_owner_lock); + console_owner = current; + raw_spin_unlock(&console_owner_lock); + + /* The waiter may spin on us after setting console_owner */ + spin_acquire(&console_owner_dep_map, 0, 0, _THIS_IP_); + stop_critical_timings(); /* don't trace print latency */ call_console_drivers(ext_text, ext_len, text, len); start_critical_timings(); + + raw_spin_lock(&console_owner_lock); + waiter = READ_ONCE(console_waiter); + console_owner = NULL; + raw_spin_unlock(&console_owner_lock); + + /* + * If there is a waiter waiting for us, then pass the + * rest of the work load over to that waiter. + */ + if (waiter) + break; + + /* There was no waiter, and nothing will spin on us here */ + spin_release(&console_owner_dep_map, 1, _THIS_IP_); + printk_safe_exit_irqrestore(flags); if (do_cond_resched) cond_resched(); } + + /* + * If there is an active waiter waiting on the console_lock. + * Pass off the printing to the waiter, and the waiter + * will continue printing on its CPU, and when all writing + * has finished, the last printer will wake up klogd. + */ + if (waiter) { + WRITE_ONCE(console_waiter, false); + /* The waiter is now free to continue */ + spin_release(&console_owner_dep_map, 1, _THIS_IP_); + /* + * Hand off console_lock to waiter. The waiter will perform + * the up(). After this, the waiter is the console_lock owner. + */ + mutex_release(&console_lock_dep_map, 1, _THIS_IP_); + printk_safe_exit_irqrestore(flags); + /* Note, if waiter is set, logbuf_lock is not held */ + return; + } + console_locked = 0; /* Release the exclusive_console once it is used */ -- cgit v1.2.3 From c162d5b4338d72deed61aa65ed0f2f4ba2bbc8ab Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Fri, 12 Jan 2018 17:08:37 +0100 Subject: printk: Hide console waiter logic into helpers The commit ("printk: Add console owner and waiter logic to load balance console writes") made vprintk_emit() and console_unlock() even more complicated. This patch extracts the new code into 3 helper functions. They should help to keep it rather self-contained. It will be easier to use and maintain. This patch just shuffles the existing code. It does not change the functionality. 
Link: http://lkml.kernel.org/r/20180112160837.GD24497@linux.suse Cc: akpm@linux-foundation.org Cc: linux-mm@kvack.org Cc: Cong Wang Cc: Dave Hansen Cc: Johannes Weiner Cc: Mel Gorman Cc: Michal Hocko Cc: Vlastimil Babka Cc: Peter Zijlstra Cc: Linus Torvalds Cc: Jan Kara Cc: Mathieu Desnoyers Cc: Tetsuo Handa Cc: rostedt@home.goodmis.org Cc: Byungchul Park Cc: Tejun Heo Cc: Pavel Machek Cc: linux-kernel@vger.kernel.org Reviewed-by: Steven Rostedt (VMware) Acked-by: Sergey Senozhatsky Signed-off-by: Petr Mladek --- kernel/printk/printk.c | 245 +++++++++++++++++++++++++++++-------------------- 1 file changed, 148 insertions(+), 97 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 040fb948924e..3a475f58b749 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -86,15 +86,8 @@ EXPORT_SYMBOL_GPL(console_drivers); static struct lockdep_map console_lock_dep_map = { .name = "console_lock" }; -static struct lockdep_map console_owner_dep_map = { - .name = "console_owner" -}; #endif -static DEFINE_RAW_SPINLOCK(console_owner_lock); -static struct task_struct *console_owner; -static bool console_waiter; - enum devkmsg_log_bits { __DEVKMSG_LOG_BIT_ON = 0, __DEVKMSG_LOG_BIT_OFF, @@ -1550,6 +1543,146 @@ SYSCALL_DEFINE3(syslog, int, type, char __user *, buf, int, len) return do_syslog(type, buf, len, SYSLOG_FROM_READER); } +/* + * Special console_lock variants that help to reduce the risk of soft-lockups. + * They allow to pass console_lock to another printk() call using a busy wait. + */ + +#ifdef CONFIG_LOCKDEP +static struct lockdep_map console_owner_dep_map = { + .name = "console_owner" +}; +#endif + +static DEFINE_RAW_SPINLOCK(console_owner_lock); +static struct task_struct *console_owner; +static bool console_waiter; + +/** + * console_lock_spinning_enable - mark beginning of code where another + * thread might safely busy wait + * + * This basically converts console_lock into a spinlock. This marks + * the section where the console_lock owner can not sleep, because + * there may be a waiter spinning (like a spinlock). Also it must be + * ready to hand over the lock at the end of the section. + */ +static void console_lock_spinning_enable(void) +{ + raw_spin_lock(&console_owner_lock); + console_owner = current; + raw_spin_unlock(&console_owner_lock); + + /* The waiter may spin on us after setting console_owner */ + spin_acquire(&console_owner_dep_map, 0, 0, _THIS_IP_); +} + +/** + * console_lock_spinning_disable_and_check - mark end of code where another + * thread was able to busy wait and check if there is a waiter + * + * This is called at the end of the section where spinning is allowed. + * It has two functions. First, it is a signal that it is no longer + * safe to start busy waiting for the lock. Second, it checks if + * there is a busy waiter and passes the lock rights to her. + * + * Important: Callers lose the lock if there was a busy waiter. + * They must not touch items synchronized by console_lock + * in this case. + * + * Return: 1 if the lock rights were passed, 0 otherwise. 
+ */ +static int console_lock_spinning_disable_and_check(void) +{ + int waiter; + + raw_spin_lock(&console_owner_lock); + waiter = READ_ONCE(console_waiter); + console_owner = NULL; + raw_spin_unlock(&console_owner_lock); + + if (!waiter) { + spin_release(&console_owner_dep_map, 1, _THIS_IP_); + return 0; + } + + /* The waiter is now free to continue */ + WRITE_ONCE(console_waiter, false); + + spin_release(&console_owner_dep_map, 1, _THIS_IP_); + + /* + * Hand off console_lock to waiter. The waiter will perform + * the up(). After this, the waiter is the console_lock owner. + */ + mutex_release(&console_lock_dep_map, 1, _THIS_IP_); + return 1; +} + +/** + * console_trylock_spinning - try to get console_lock by busy waiting + * + * This allows busy-waiting for the console_lock when the current + * owner is running in specially marked sections. It means that + * the current owner is running and cannot reschedule until it + * is ready to lose the lock. + * + * Return: 1 if we got the lock, 0 otherwise + */ +static int console_trylock_spinning(void) +{ + struct task_struct *owner = NULL; + bool waiter; + bool spin = false; + unsigned long flags; + + if (console_trylock()) + return 1; + + printk_safe_enter_irqsave(flags); + + raw_spin_lock(&console_owner_lock); + owner = READ_ONCE(console_owner); + waiter = READ_ONCE(console_waiter); + if (!waiter && owner && owner != current) { + WRITE_ONCE(console_waiter, true); + spin = true; + } + raw_spin_unlock(&console_owner_lock); + + /* + * If there is an active printk() writing to the + * consoles, instead of having it write our data too, + * see if we can offload that load from the active + * printer, and do some printing ourselves. + * Go into a spin only if there isn't already a waiter + * spinning, and there is an active printer, and + * that active printer isn't us (recursive printk?). + */ + if (!spin) { + printk_safe_exit_irqrestore(flags); + return 0; + } + + /* We spin waiting for the owner to release us */ + spin_acquire(&console_owner_dep_map, 0, 0, _THIS_IP_); + /* Owner will clear console_waiter on hand off */ + while (READ_ONCE(console_waiter)) + cpu_relax(); + spin_release(&console_owner_dep_map, 1, _THIS_IP_); + + printk_safe_exit_irqrestore(flags); + /* + * The owner passed the console lock to us. + * Since we did not spin on console lock, annotate + * this as a trylock. Otherwise lockdep will + * complain. + */ + mutex_acquire(&console_lock_dep_map, 0, 1, _THIS_IP_); + + return 1; +} + /* * Call the console drivers, asking them to write out * log_buf[start] to log_buf[end - 1]. @@ -1760,56 +1893,8 @@ asmlinkage int vprintk_emit(int facility, int level, * semaphore. The release will print out buffers and wake up * /dev/kmsg and syslog() users. */ - if (console_trylock()) { + if (console_trylock_spinning()) console_unlock(); - } else { - struct task_struct *owner = NULL; - bool waiter; - bool spin = false; - - printk_safe_enter_irqsave(flags); - - raw_spin_lock(&console_owner_lock); - owner = READ_ONCE(console_owner); - waiter = READ_ONCE(console_waiter); - if (!waiter && owner && owner != current) { - WRITE_ONCE(console_waiter, true); - spin = true; - } - raw_spin_unlock(&console_owner_lock); - - /* - * If there is an active printk() writing to the - * consoles, instead of having it write our data too, - * see if we can offload that load from the active - * printer, and do some printing ourselves.
- * Go into a spin only if there isn't already a waiter - * spinning, and there is an active printer, and - * that active printer isn't us (recursive printk?). - */ - if (spin) { - /* We spin waiting for the owner to release us */ - spin_acquire(&console_owner_dep_map, 0, 0, _THIS_IP_); - /* Owner will clear console_waiter on hand off */ - while (READ_ONCE(console_waiter)) - cpu_relax(); - - spin_release(&console_owner_dep_map, 1, _THIS_IP_); - printk_safe_exit_irqrestore(flags); - - /* - * The owner passed the console lock to us. - * Since we did not spin on console lock, annotate - * this as a trylock. Otherwise lockdep will - * complain. - */ - mutex_acquire(&console_lock_dep_map, 0, 1, _THIS_IP_); - console_unlock(); - printk_safe_enter_irqsave(flags); - } - printk_safe_exit_irqrestore(flags); - - } } return printed_len; @@ -1910,6 +1995,8 @@ static ssize_t msg_print_ext_header(char *buf, size_t size, static ssize_t msg_print_ext_body(char *buf, size_t size, char *dict, size_t dict_len, char *text, size_t text_len) { return 0; } +static void console_lock_spinning_enable(void) { } +static int console_lock_spinning_disable_and_check(void) { return 0; } static void call_console_drivers(const char *ext_text, size_t ext_len, const char *text, size_t len) {} static size_t msg_print_text(const struct printk_log *msg, @@ -2196,7 +2283,6 @@ void console_unlock(void) static u64 seen_seq; unsigned long flags; bool wake_klogd = false; - bool waiter = false; bool do_cond_resched, retry; if (console_suspended) { @@ -2291,31 +2377,16 @@ skip: * finish. This task can not be preempted if there is a * waiter waiting to take over. */ - raw_spin_lock(&console_owner_lock); - console_owner = current; - raw_spin_unlock(&console_owner_lock); - - /* The waiter may spin on us after setting console_owner */ - spin_acquire(&console_owner_dep_map, 0, 0, _THIS_IP_); + console_lock_spinning_enable(); stop_critical_timings(); /* don't trace print latency */ call_console_drivers(ext_text, ext_len, text, len); start_critical_timings(); - raw_spin_lock(&console_owner_lock); - waiter = READ_ONCE(console_waiter); - console_owner = NULL; - raw_spin_unlock(&console_owner_lock); - - /* - * If there is a waiter waiting for us, then pass the - * rest of the work load over to that waiter. - */ - if (waiter) - break; - - /* There was no waiter, and nothing will spin on us here */ - spin_release(&console_owner_dep_map, 1, _THIS_IP_); + if (console_lock_spinning_disable_and_check()) { + printk_safe_exit_irqrestore(flags); + return; + } printk_safe_exit_irqrestore(flags); @@ -2323,26 +2394,6 @@ skip: cond_resched(); } - /* - * If there is an active waiter waiting on the console_lock. - * Pass off the printing to the waiter, and the waiter - * will continue printing on its CPU, and when all writing - * has finished, the last printer will wake up klogd. - */ - if (waiter) { - WRITE_ONCE(console_waiter, false); - /* The waiter is now free to continue */ - spin_release(&console_owner_dep_map, 1, _THIS_IP_); - /* - * Hand off console_lock to waiter. The waiter will perform - * the up(). After this, the waiter is the console_lock owner. 
- */ - mutex_release(&console_lock_dep_map, 1, _THIS_IP_); - printk_safe_exit_irqrestore(flags); - /* Note, if waiter is set, logbuf_lock is not held */ - return; - } - console_locked = 0; /* Release the exclusive_console once it is used */ -- cgit v1.2.3 From fd5f7cde1b85d4c8e09ca46ce948e008a2377f64 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Tue, 16 Jan 2018 13:47:16 +0900 Subject: printk: Never set console_may_schedule in console_trylock() This patch, basically, reverts commit 6b97a20d3a79 ("printk: set may_schedule for some of console_trylock() callers"). That commit was a mistake: it introduced a big dependency on the scheduler, by enabling preemption under console_sem in the printk()->console_unlock() path, which is rather too critical. The patch did not significantly reduce the possibilities of printk() lockups, but made it possible to stall printk(), as has been reported by Tetsuo Handa [1]. Another issue is that preemption under console_sem also messes up Steven Rostedt's hand off scheme, by making it possible to sleep with console_sem both in console_unlock() and in vprintk_emit(), after acquiring the console_sem ownership (anywhere between printk_safe_exit_irqrestore() in console_trylock_spinning() and printk_safe_enter_irqsave() in console_unlock()). This makes hand off less likely and, at the same time, may result in a significant amount of pending logbuf messages. A preempted console_sem owner makes it impossible for other CPUs to emit logbuf messages, but does not make it impossible for other CPUs to append new messages to the logbuf. Reinstate the old behavior and make printk() non-preemptible. Should any printk() lockup reports arrive, they must be handled in a different way. [1] http://lkml.kernel.org/r/201603022101.CAH73907.OVOOMFHFFtQJSL%20()%20I-love%20!%20SAKURA%20!%20ne%20!%20jp Fixes: 6b97a20d3a79 ("printk: set may_schedule for some of console_trylock() callers") Link: http://lkml.kernel.org/r/20180116044716.GE6607@jagdpanzerIV To: Tetsuo Handa Cc: Sergey Senozhatsky Cc: Tejun Heo Cc: akpm@linux-foundation.org Cc: linux-mm@kvack.org Cc: Cong Wang Cc: Dave Hansen Cc: Johannes Weiner Cc: Mel Gorman Cc: Michal Hocko Cc: Vlastimil Babka Cc: Peter Zijlstra Cc: Linus Torvalds Cc: Jan Kara Cc: Mathieu Desnoyers Cc: Byungchul Park Cc: Pavel Machek Cc: linux-kernel@vger.kernel.org Signed-off-by: Sergey Senozhatsky Reported-by: Tetsuo Handa Reviewed-by: Steven Rostedt (VMware) Signed-off-by: Petr Mladek --- kernel/printk/printk.c | 22 ++++++++--------------- 1 file changed, 8 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 3a475f58b749..6b9d8d56e0e2 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -1888,6 +1888,12 @@ asmlinkage int vprintk_emit(int facility, int level, /* If called from the scheduler, we can not call up(). */ if (!in_sched) { + /* + * Disable preemption to avoid being preempted while holding + * console_sem which would prevent anyone from printing to + * console + */ + preempt_disable(); /* * Try to acquire and then immediately release the console * semaphore. The release will print out buffers and wake up @@ -1895,6 +1901,7 @@ asmlinkage int vprintk_emit(int facility, int level, */ if (console_trylock_spinning()) console_unlock(); + preempt_enable(); } return printed_len; @@ -2211,20 +2218,7 @@ int console_trylock(void) return 0; } console_locked = 1; - /* - * When PREEMPT_COUNT disabled we can't reliably detect if it's - * safe to schedule (e.g.
calling printk while holding a spin_lock), - * because preempt_disable()/preempt_enable() are just barriers there - * and preempt_count() is always 0. - * - * RCU read sections have a separate preemption counter when - * PREEMPT_RCU enabled thus we must take extra care and check - * rcu_preempt_depth(), otherwise RCU read sections modify - * preempt_count(). - */ - console_may_schedule = !oops_in_progress && - preemptible() && - !rcu_preempt_depth(); + console_may_schedule = 0; return 1; } EXPORT_SYMBOL(console_trylock); -- cgit v1.2.3 From 0752d7bf626ba90c68caff8be4e5638f6679cacf Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 24 Jul 2017 15:08:16 -0500 Subject: ptrace: Use copy_siginfo in setsiginfo and getsiginfo Now that copy_siginfo copies all of the fields this is safe, safer (as all of the bits are guaranteed to be copied), clearer, and less error prone than using a structure copy. Signed-off-by: "Eric W. Biederman" --- kernel/ptrace.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/ptrace.c b/kernel/ptrace.c index ec4365da9be8..f3c82e26b995 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -659,7 +659,7 @@ static int ptrace_getsiginfo(struct task_struct *child, siginfo_t *info) if (lock_task_sighand(child, &flags)) { error = -EINVAL; if (likely(child->last_siginfo != NULL)) { - *info = *child->last_siginfo; + copy_siginfo(info, child->last_siginfo); error = 0; } unlock_task_sighand(child, &flags); @@ -675,7 +675,7 @@ static int ptrace_setsiginfo(struct task_struct *child, const siginfo_t *info) if (lock_task_sighand(child, &flags)) { error = -EINVAL; if (likely(child->last_siginfo != NULL)) { - *child->last_siginfo = *info; + copy_siginfo(child->last_siginfo, info); error = 0; } unlock_task_sighand(child, &flags); -- cgit v1.2.3 From 0fe875c5c7bc67072b629f13cf1ededcb4da4fb2 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Tue, 16 Jan 2018 11:27:05 +0000 Subject: bpf: cpumap: make some functions static Fixes the following sparse warnings: kernel/bpf/cpumap.c:146:6: warning: symbol '__cpu_map_queue_destructor' was not declared. Should it be static? kernel/bpf/cpumap.c:225:16: warning: symbol 'cpu_map_build_skb' was not declared. Should it be static? kernel/bpf/cpumap.c:340:26: warning: symbol '__cpu_map_entry_alloc' was not declared. Should it be static? kernel/bpf/cpumap.c:398:6: warning: symbol '__cpu_map_entry_free' was not declared. Should it be static? kernel/bpf/cpumap.c:441:6: warning: symbol '__cpu_map_entry_replace' was not declared. Should it be static? kernel/bpf/cpumap.c:454:5: warning: symbol 'cpu_map_delete_elem' was not declared. Should it be static? kernel/bpf/cpumap.c:467:5: warning: symbol 'cpu_map_update_elem' was not declared. Should it be static? kernel/bpf/cpumap.c:505:6: warning: symbol 'cpu_map_free' was not declared. Should it be static? Signed-off-by: Wei Yongjun Signed-off-by: Daniel Borkmann --- kernel/bpf/cpumap.c | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index 192151ec9d12..fbfdada6caee 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -137,7 +137,7 @@ free_cmap: return ERR_PTR(err); } -void __cpu_map_queue_destructor(void *ptr) +static void __cpu_map_queue_destructor(void *ptr) { /* The tear-down procedure should have made sure that queue is * empty. 
See __cpu_map_entry_replace() and work-queue @@ -216,8 +216,8 @@ static struct xdp_pkt *convert_to_xdp_pkt(struct xdp_buff *xdp) return xdp_pkt; } -struct sk_buff *cpu_map_build_skb(struct bpf_cpu_map_entry *rcpu, - struct xdp_pkt *xdp_pkt) +static struct sk_buff *cpu_map_build_skb(struct bpf_cpu_map_entry *rcpu, + struct xdp_pkt *xdp_pkt) { unsigned int frame_size; void *pkt_data_start; @@ -331,7 +331,8 @@ static int cpu_map_kthread_run(void *data) return 0; } -struct bpf_cpu_map_entry *__cpu_map_entry_alloc(u32 qsize, u32 cpu, int map_id) +static struct bpf_cpu_map_entry *__cpu_map_entry_alloc(u32 qsize, u32 cpu, + int map_id) { gfp_t gfp = GFP_ATOMIC|__GFP_NOWARN; struct bpf_cpu_map_entry *rcpu; @@ -389,7 +390,7 @@ free_rcu: return NULL; } -void __cpu_map_entry_free(struct rcu_head *rcu) +static void __cpu_map_entry_free(struct rcu_head *rcu) { struct bpf_cpu_map_entry *rcpu; int cpu; @@ -432,8 +433,8 @@ void __cpu_map_entry_free(struct rcu_head *rcu) * cpu_map_kthread_stop, which waits for an RCU graze period before * stopping kthread, emptying the queue. */ -void __cpu_map_entry_replace(struct bpf_cpu_map *cmap, - u32 key_cpu, struct bpf_cpu_map_entry *rcpu) +static void __cpu_map_entry_replace(struct bpf_cpu_map *cmap, + u32 key_cpu, struct bpf_cpu_map_entry *rcpu) { struct bpf_cpu_map_entry *old_rcpu; @@ -445,7 +446,7 @@ static void __cpu_map_entry_replace(struct bpf_cpu_map *cmap, } } -int cpu_map_delete_elem(struct bpf_map *map, void *key) +static int cpu_map_delete_elem(struct bpf_map *map, void *key) { struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map); u32 key_cpu = *(u32 *)key; @@ -458,8 +459,8 @@ int cpu_map_delete_elem(struct bpf_map *map, void *key) return 0; } -int cpu_map_update_elem(struct bpf_map *map, void *key, void *value, - u64 map_flags) +static int cpu_map_update_elem(struct bpf_map *map, void *key, void *value, + u64 map_flags) { struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map); struct bpf_cpu_map_entry *rcpu; @@ -496,7 +497,7 @@ static int cpu_map_update_elem(struct bpf_map *map, void *key, void *value, return 0; } -void cpu_map_free(struct bpf_map *map) +static void cpu_map_free(struct bpf_map *map) { struct bpf_cpu_map *cmap = container_of(map, struct bpf_cpu_map, map); int cpu; -- cgit v1.2.3 From 0a2d28ff516c48a21c1e3be1ff7fba3e94248baa Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 16 Jan 2018 15:51:45 -0800 Subject: bpf: offload: make bpf_offload_dev_match() reject host+host case Daniel suggests it would be more logical for bpf_offload_dev_match() to return false if either the program or the map is not offloaded, rather than treating the both-not-offloaded case as a "matching CPU/host device". This makes no functional difference today, since the verifier only calls bpf_offload_dev_match() when one of the objects is offloaded.
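As an aside (not part of this patch), the before/after semantics can be sketched as a plain predicate; prog_bound/map_bound stand in for bpf_prog_is_dev_bound()/bpf_map_is_dev_bound(), and same_device() is a placeholder for the comparison actually done under bpf_devs_lock:

/* Old logic: two host objects counted as a "match". */
static bool match_old(bool prog_bound, bool map_bound)
{
	if (prog_bound != map_bound)
		return false;
	if (!prog_bound)
		return true;	/* both on the host CPU */
	return same_device();
}

/* New logic: anything on the host means no device match. */
static bool match_new(bool prog_bound, bool map_bound)
{
	if (!prog_bound || !map_bound)
		return false;
	return same_device();
}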
Signed-off-by: Jakub Kicinski Signed-off-by: Daniel Borkmann --- kernel/bpf/offload.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c index 453785fa1881..a88cebf368bf 100644 --- a/kernel/bpf/offload.c +++ b/kernel/bpf/offload.c @@ -395,10 +395,8 @@ bool bpf_offload_dev_match(struct bpf_prog *prog, struct bpf_map *map) struct bpf_prog_offload *offload; bool ret; - if (!!bpf_prog_is_dev_bound(prog->aux) != !!bpf_map_is_dev_bound(map)) + if (!bpf_prog_is_dev_bound(prog->aux) || !bpf_map_is_dev_bound(map)) return false; - if (!bpf_prog_is_dev_bound(prog->aux)) - return true; down_read(&bpf_devs_lock); offload = prog->aux->offload; -- cgit v1.2.3 From 48b325632641da27a241912d66b38a1221ab425f Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 16 Jan 2018 15:51:46 -0800 Subject: bpf: annotate bpf_insn_print_t with __printf Functions of type bpf_insn_print_t take a printf-like format string, so mark the type accordingly. Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet Signed-off-by: Daniel Borkmann --- kernel/bpf/disasm.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/disasm.h b/kernel/bpf/disasm.h index e0857d016f89..266fe8ee542b 100644 --- a/kernel/bpf/disasm.h +++ b/kernel/bpf/disasm.h @@ -29,8 +29,8 @@ extern const char *const bpf_class_string[8]; const char *func_id_name(int id); -typedef void (*bpf_insn_print_t)(struct bpf_verifier_env *env, - const char *, ...); +typedef __printf(2, 3) void (*bpf_insn_print_t)(struct bpf_verifier_env *env, + const char *, ...); typedef const char *(*bpf_insn_revmap_call_t)(void *private_data, const struct bpf_insn *insn); typedef const char *(*bpf_insn_print_imm_t)(void *private_data, -- cgit v1.2.3 From fcfb126defda3cee3f1d9460dbe9a2ccac4fbd21 Mon Sep 17 00:00:00 2001 From: Jiong Wang Date: Tue, 16 Jan 2018 16:05:19 -0800 Subject: bpf: add new jited info fields in bpf_dev_offload and bpf_prog_info For the host JIT, there are "jited_len"/"bpf_func" fields in struct bpf_prog used by all host JIT targets to get the jited image and its length. For offload, targets are likely to have different offload mechanisms, so this info is kept in device-private data fields. Therefore, the BPF_OBJ_GET_INFO_BY_FD syscall needs a unified way to get JIT length and contents info for offload targets. One way is to introduce a new callback to parse device-private data and then fill those fields in bpf_prog_info. This might be a little heavy; the other way is to add generic fields which will be initialized by all offload targets. This patch follows the second approach: introduce two new fields in struct bpf_dev_offload and teach bpf_prog_get_info_by_fd about them to fill the correct jited_prog_len and jited_prog_insns in bpf_prog_info.
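For context, user space consumes these fields through the usual two-pass BPF_OBJ_GET_INFO_BY_FD pattern; a rough userspace sketch (error handling trimmed, prog_fd assumed to be a valid program fd):

#include <stdlib.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/bpf.h>

static void *dump_jited(int prog_fd, __u32 *len)
{
	struct bpf_prog_info info = {};
	union bpf_attr attr = {};
	void *img;

	attr.info.bpf_fd = prog_fd;
	attr.info.info_len = sizeof(info);
	attr.info.info = (__u64)(unsigned long)&info;
	/* First pass: the kernel fills in jited_prog_len. */
	syscall(__NR_bpf, BPF_OBJ_GET_INFO_BY_FD, &attr, sizeof(attr));

	img = malloc(info.jited_prog_len);
	info.jited_prog_insns = (__u64)(unsigned long)img;
	/* Second pass: up to jited_prog_len bytes are copied out. */
	syscall(__NR_bpf, BPF_OBJ_GET_INFO_BY_FD, &attr, sizeof(attr));

	*len = info.jited_prog_len;
	return img;
}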
Reviewed-by: Jakub Kicinski Signed-off-by: Jiong Wang Acked-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- include/linux/bpf.h | 2 ++ kernel/bpf/offload.c | 23 +++++++++++++++++++++++ kernel/bpf/syscall.c | 31 ++++++++++++++++++------------- 3 files changed, 43 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 5c2c104dc2c5..025b1c2f8053 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -234,6 +234,8 @@ struct bpf_prog_offload { struct list_head offloads; bool dev_state; const struct bpf_prog_offload_ops *dev_ops; + void *jited_image; + u32 jited_len; }; struct bpf_prog_aux { diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c index a88cebf368bf..6c0baa1cf8f8 100644 --- a/kernel/bpf/offload.c +++ b/kernel/bpf/offload.c @@ -230,9 +230,12 @@ int bpf_prog_offload_info_fill(struct bpf_prog_info *info, .prog = prog, .info = info, }; + struct bpf_prog_aux *aux = prog->aux; struct inode *ns_inode; struct path ns_path; + char __user *uinsns; void *res; + u32 ulen; res = ns_get_path_cb(&ns_path, bpf_prog_offload_info_fill_ns, &args); if (IS_ERR(res)) { @@ -241,6 +244,26 @@ int bpf_prog_offload_info_fill(struct bpf_prog_info *info, return PTR_ERR(res); } + down_read(&bpf_devs_lock); + + if (!aux->offload) { + up_read(&bpf_devs_lock); + return -ENODEV; + } + + ulen = info->jited_prog_len; + info->jited_prog_len = aux->offload->jited_len; + if (info->jited_prog_len && ulen) { + uinsns = u64_to_user_ptr(info->jited_prog_insns); + ulen = min_t(u32, info->jited_prog_len, ulen); + if (copy_to_user(uinsns, aux->offload->jited_image, ulen)) { + up_read(&bpf_devs_lock); + return -EFAULT; + } + } + + up_read(&bpf_devs_lock); + ns_inode = ns_path.dentry->d_inode; info->netns_dev = new_encode_dev(ns_inode->i_sb->s_dev); info->netns_ino = ns_inode->i_ino; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index c691b9e972e3..c28524483bf4 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1724,19 +1724,6 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, goto done; } - ulen = info.jited_prog_len; - info.jited_prog_len = prog->jited_len; - if (info.jited_prog_len && ulen) { - if (bpf_dump_raw_ok()) { - uinsns = u64_to_user_ptr(info.jited_prog_insns); - ulen = min_t(u32, info.jited_prog_len, ulen); - if (copy_to_user(uinsns, prog->bpf_func, ulen)) - return -EFAULT; - } else { - info.jited_prog_insns = 0; - } - } - ulen = info.xlated_prog_len; info.xlated_prog_len = bpf_prog_insn_size(prog); if (info.xlated_prog_len && ulen) { @@ -1762,6 +1749,24 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog, err = bpf_prog_offload_info_fill(&info, prog); if (err) return err; + goto done; + } + + /* NOTE: the following code is supposed to be skipped for offload. + * bpf_prog_offload_info_fill() is the place to fill similar fields + * for offload.
+ */ + ulen = info.jited_prog_len; + info.jited_prog_len = prog->jited_len; + if (info.jited_prog_len && ulen) { + if (bpf_dump_raw_ok()) { + uinsns = u64_to_user_ptr(info.jited_prog_insns); + ulen = min_t(u32, info.jited_prog_len, ulen); + if (copy_to_user(uinsns, prog->bpf_func, ulen)) + return -EFAULT; + } else { + info.jited_prog_insns = 0; + } } done: -- cgit v1.2.3 From eefa864a81501161c05b35f14197677c937e5b9a Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Wed, 17 Jan 2018 09:19:32 -0800 Subject: bpf: change fake_ip for bpf_trace_printk helper Currently, for bpf_trace_printk helper, fake ip address 0x1 is used with comments saying that fake ip will not be printed. This is indeed true for 4.12 and earlier version, but for 4.13 and later version, the ip address will be printed if it cannot be resolved with kallsym. Running samples/bpf/tracex5 program and you will have the following in the debugfs trace_pipe output: ... <...>-1819 [003] .... 443.497877: 0x00000001: mmap <...>-1819 [003] .... 443.498289: 0x00000001: syscall=102 (one of get/set uid/pid/gid) ... The kernel commit changed this behavior is: commit feaf1283d11794b9d518fcfd54b6bf8bee1f0b4b Author: Steven Rostedt (VMware) Date: Thu Jun 22 17:04:55 2017 -0400 tracing: Show address when function names are not found ... This patch changed the comment and also altered the fake ip address to 0x0 as users may think 0x1 has some special meaning while it doesn't. The new output: ... <...>-1799 [002] .... 25.953576: 0: mmap <...>-1799 [002] .... 25.953865: 0: read(fd=0, buf=00000000053936b5, size=512) ... Signed-off-by: Yonghong Song Signed-off-by: Daniel Borkmann --- kernel/trace/bpf_trace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index f274468cbc45..fc2838ac8b78 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -245,7 +245,7 @@ BPF_CALL_5(bpf_trace_printk, char *, fmt, u32, fmt_size, u64, arg1, */ #define __BPF_TP_EMIT() __BPF_ARG3_TP() #define __BPF_TP(...) \ - __trace_printk(1 /* Fake ip will not be printed. */, \ + __trace_printk(0 /* Fake ip */, \ fmt, ##__VA_ARGS__) #define __BPF_ARG1_TP(...) \ -- cgit v1.2.3 From 61f3c964dfd287b05d7ac6660a4f4ddfef84786c Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 17 Jan 2018 16:52:02 -0800 Subject: bpf: allow socket_filter programs to use bpf_prog_test_run in order to improve test coverage allow socket_filter program type to be run via bpf_prog_test_run command. Since such programs can be loaded by non-root tighten permissions for bpf_prog_test_run to be root only to avoid surprises. 
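As an illustration of the now root-only interface, a minimal userspace sketch of running a socket filter against a fake packet (prog_fd and the packet buffer are assumed to exist):

#include <unistd.h>
#include <sys/syscall.h>
#include <linux/bpf.h>

/* Feed one packet through the program and return its verdict. */
static long test_run_once(int prog_fd, void *pkt, __u32 pkt_len)
{
	union bpf_attr attr = {};

	attr.test.prog_fd = prog_fd;
	attr.test.data_in = (__u64)(unsigned long)pkt;
	attr.test.data_size_in = pkt_len;
	attr.test.repeat = 1;

	if (syscall(__NR_bpf, BPF_PROG_TEST_RUN, &attr, sizeof(attr)))
		return -1;
	return attr.test.retval;
}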
Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- kernel/bpf/syscall.c | 2 ++ net/core/filter.c | 1 + 2 files changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index c28524483bf4..97a825ffc763 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1504,6 +1504,8 @@ static int bpf_prog_test_run(const union bpf_attr *attr, struct bpf_prog *prog; int ret = -ENOTSUPP; + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; if (CHECK_ATTR(BPF_PROG_TEST_RUN)) return -EINVAL; diff --git a/net/core/filter.c b/net/core/filter.c index db2ee8c7e1bd..30fafaaa90fa 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4526,6 +4526,7 @@ const struct bpf_verifier_ops sk_filter_verifier_ops = { }; const struct bpf_prog_ops sk_filter_prog_ops = { + .test_run = bpf_prog_test_run_skb, }; const struct bpf_verifier_ops tc_cls_act_verifier_ops = { -- cgit v1.2.3 From ad46061fca87c0ab6670af3a44e03237f99d7a1f Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 17 Jan 2018 19:13:25 -0800 Subject: bpf: arraymap: move checks out of alloc function Use the new callback to perform allocation checks for array maps. The fd maps don't need a special allocation callback, they only need a special check callback. Signed-off-by: Jakub Kicinski Reviewed-by: Quentin Monnet Signed-off-by: Daniel Borkmann --- kernel/bpf/arraymap.c | 42 ++++++++++++++++++++++++++++-------------- 1 file changed, 28 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index ab94d304a634..68336092bfb4 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -49,27 +49,35 @@ static int bpf_array_alloc_percpu(struct bpf_array *array) } /* Called from syscall */ -static struct bpf_map *array_map_alloc(union bpf_attr *attr) +static int array_map_alloc_check(union bpf_attr *attr) { bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY; int numa_node = bpf_map_attr_numa_node(attr); - u32 elem_size, index_mask, max_entries; - bool unpriv = !capable(CAP_SYS_ADMIN); - struct bpf_array *array; - u64 array_size, mask64; /* check sanity of attributes */ if (attr->max_entries == 0 || attr->key_size != 4 || attr->value_size == 0 || attr->map_flags & ~ARRAY_CREATE_FLAG_MASK || (percpu && numa_node != NUMA_NO_NODE)) - return ERR_PTR(-EINVAL); + return -EINVAL; if (attr->value_size > KMALLOC_MAX_SIZE) /* if value_size is bigger, the user space won't be able to * access the elements. 
*/ - return ERR_PTR(-E2BIG); + return -E2BIG; + + return 0; +} + +static struct bpf_map *array_map_alloc(union bpf_attr *attr) +{ + bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY; + int numa_node = bpf_map_attr_numa_node(attr); + u32 elem_size, index_mask, max_entries; + bool unpriv = !capable(CAP_SYS_ADMIN); + struct bpf_array *array; + u64 array_size, mask64; elem_size = round_up(attr->value_size, 8); @@ -327,6 +335,7 @@ static void array_map_free(struct bpf_map *map) } const struct bpf_map_ops array_map_ops = { + .map_alloc_check = array_map_alloc_check, .map_alloc = array_map_alloc, .map_free = array_map_free, .map_get_next_key = array_map_get_next_key, @@ -337,6 +346,7 @@ const struct bpf_map_ops array_map_ops = { }; const struct bpf_map_ops percpu_array_map_ops = { + .map_alloc_check = array_map_alloc_check, .map_alloc = array_map_alloc, .map_free = array_map_free, .map_get_next_key = array_map_get_next_key, @@ -345,12 +355,12 @@ const struct bpf_map_ops percpu_array_map_ops = { .map_delete_elem = array_map_delete_elem, }; -static struct bpf_map *fd_array_map_alloc(union bpf_attr *attr) +static int fd_array_map_alloc_check(union bpf_attr *attr) { /* only file descriptors can be stored in this type of map */ if (attr->value_size != sizeof(u32)) - return ERR_PTR(-EINVAL); - return array_map_alloc(attr); + return -EINVAL; + return array_map_alloc_check(attr); } static void fd_array_map_free(struct bpf_map *map) @@ -474,7 +484,8 @@ void bpf_fd_array_map_clear(struct bpf_map *map) } const struct bpf_map_ops prog_array_map_ops = { - .map_alloc = fd_array_map_alloc, + .map_alloc_check = fd_array_map_alloc_check, + .map_alloc = array_map_alloc, .map_free = fd_array_map_free, .map_get_next_key = array_map_get_next_key, .map_lookup_elem = fd_array_map_lookup_elem, @@ -561,7 +572,8 @@ static void perf_event_fd_array_release(struct bpf_map *map, } const struct bpf_map_ops perf_event_array_map_ops = { - .map_alloc = fd_array_map_alloc, + .map_alloc_check = fd_array_map_alloc_check, + .map_alloc = array_map_alloc, .map_free = fd_array_map_free, .map_get_next_key = array_map_get_next_key, .map_lookup_elem = fd_array_map_lookup_elem, @@ -592,7 +604,8 @@ static void cgroup_fd_array_free(struct bpf_map *map) } const struct bpf_map_ops cgroup_array_map_ops = { - .map_alloc = fd_array_map_alloc, + .map_alloc_check = fd_array_map_alloc_check, + .map_alloc = array_map_alloc, .map_free = cgroup_fd_array_free, .map_get_next_key = array_map_get_next_key, .map_lookup_elem = fd_array_map_lookup_elem, @@ -610,7 +623,7 @@ static struct bpf_map *array_of_map_alloc(union bpf_attr *attr) if (IS_ERR(inner_map_meta)) return inner_map_meta; - map = fd_array_map_alloc(attr); + map = array_map_alloc(attr); if (IS_ERR(map)) { bpf_map_meta_free(inner_map_meta); return map; @@ -673,6 +686,7 @@ static u32 array_of_map_gen_lookup(struct bpf_map *map, } const struct bpf_map_ops array_of_maps_map_ops = { + .map_alloc_check = fd_array_map_alloc_check, .map_alloc = array_of_map_alloc, .map_free = array_of_map_free, .map_get_next_key = array_map_get_next_key, -- cgit v1.2.3 From 32852649ba3f74aab10025f2e59ca2b49d5cccfa Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 17 Jan 2018 19:13:26 -0800 Subject: bpf: arraymap: use bpf_map_init_from_attr() Arraymap was not converted to use bpf_map_init_from_attr() to avoid merge conflicts with emergency fixes. Do it now. 
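For reference, judging from the hand-copied fields the removed lines contained, bpf_map_init_from_attr() presumably boils down to something like this sketch (not necessarily the exact kernel implementation):

void bpf_map_init_from_attr(struct bpf_map *map, union bpf_attr *attr)
{
	/* copy mandatory map attributes */
	map->map_type = attr->map_type;
	map->key_size = attr->key_size;
	map->value_size = attr->value_size;
	map->max_entries = attr->max_entries;
	map->map_flags = attr->map_flags;
	map->numa_node = bpf_map_attr_numa_node(attr);
}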
Signed-off-by: Jakub Kicinski Signed-off-by: Daniel Borkmann --- kernel/bpf/arraymap.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 68336092bfb4..b1f66480135b 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -120,12 +120,7 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr) array->map.unpriv_array = unpriv; /* copy mandatory map attributes */ - array->map.map_type = attr->map_type; - array->map.key_size = attr->key_size; - array->map.value_size = attr->value_size; - array->map.max_entries = attr->max_entries; - array->map.map_flags = attr->map_flags; - array->map.numa_node = numa_node; + bpf_map_init_from_attr(&array->map, attr); array->elem_size = elem_size; if (!percpu) -- cgit v1.2.3 From 7a0ef6939548b9eb74bf464daf55ad68a23602a2 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 17 Jan 2018 19:13:27 -0800 Subject: bpf: offload: allow array map offload The special handling of different map types is left to the driver. Allow offload of array maps by simply adding it to accepted types. For nfp we have to make sure array elements are not deleted. Signed-off-by: Jakub Kicinski Signed-off-by: Daniel Borkmann --- drivers/net/ethernet/netronome/nfp/bpf/offload.c | 2 ++ kernel/bpf/offload.c | 3 ++- 2 files changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/drivers/net/ethernet/netronome/nfp/bpf/offload.c b/drivers/net/ethernet/netronome/nfp/bpf/offload.c index c452bf9462e0..1a357aacc444 100644 --- a/drivers/net/ethernet/netronome/nfp/bpf/offload.c +++ b/drivers/net/ethernet/netronome/nfp/bpf/offload.c @@ -176,6 +176,8 @@ nfp_bpf_map_get_next_key(struct bpf_offloaded_map *offmap, static int nfp_bpf_map_delete_elem(struct bpf_offloaded_map *offmap, void *key) { + if (offmap->map.map_type == BPF_MAP_TYPE_ARRAY) + return -EINVAL; return nfp_bpf_ctrl_del_entry(offmap, key); } diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c index 6c0baa1cf8f8..2657976aec2a 100644 --- a/kernel/bpf/offload.c +++ b/kernel/bpf/offload.c @@ -299,7 +299,8 @@ struct bpf_map *bpf_map_offload_map_alloc(union bpf_attr *attr) if (!capable(CAP_SYS_ADMIN)) return ERR_PTR(-EPERM); - if (attr->map_type != BPF_MAP_TYPE_HASH) + if (attr->map_type != BPF_MAP_TYPE_ARRAY && + attr->map_type != BPF_MAP_TYPE_HASH) return ERR_PTR(-EINVAL); offmap = kzalloc(sizeof(*offmap), GFP_USER); -- cgit v1.2.3 From 52775b33bb5072fbc07b02c0cf4fe8da1f7ee7cd Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 17 Jan 2018 19:13:28 -0800 Subject: bpf: offload: report device information about offloaded maps Tell user space about device on which the map was created. Unfortunate reality of user ABI makes sharing this code with program offload difficult but the information is the same. 
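As with programs, user space can then query the binding via BPF_OBJ_GET_INFO_BY_FD on a map fd; a hedged userspace sketch (map_fd assumed to be valid):

#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/bpf.h>

static void print_map_binding(int map_fd)
{
	struct bpf_map_info info = {};
	union bpf_attr attr = {};

	attr.info.bpf_fd = map_fd;
	attr.info.info_len = sizeof(info);
	attr.info.info = (__u64)(unsigned long)&info;

	if (syscall(__NR_bpf, BPF_OBJ_GET_INFO_BY_FD, &attr, sizeof(attr)))
		return;
	/* ifindex == 0 means the map lives on the host CPU. */
	printf("ifindex %u netns dev %llu ino %llu\n", info.ifindex,
	       (unsigned long long)info.netns_dev,
	       (unsigned long long)info.netns_ino);
}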
Signed-off-by: Jakub Kicinski Acked-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann --- include/linux/bpf.h | 2 ++ include/uapi/linux/bpf.h | 3 +++ kernel/bpf/offload.c | 55 ++++++++++++++++++++++++++++++++++++++++++ kernel/bpf/syscall.c | 6 +++++ tools/include/uapi/linux/bpf.h | 3 +++ 5 files changed, 69 insertions(+) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 025b1c2f8053..66df387106de 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -586,6 +586,8 @@ void bpf_prog_offload_destroy(struct bpf_prog *prog); int bpf_prog_offload_info_fill(struct bpf_prog_info *info, struct bpf_prog *prog); +int bpf_map_offload_info_fill(struct bpf_map_info *info, struct bpf_map *map); + int bpf_map_offload_lookup_elem(struct bpf_map *map, void *key, void *value); int bpf_map_offload_update_elem(struct bpf_map *map, void *key, void *value, u64 flags); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 74dc4dc98681..406c19d6016b 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -938,6 +938,9 @@ struct bpf_map_info { __u32 max_entries; __u32 map_flags; char name[BPF_OBJ_NAME_LEN]; + __u32 ifindex; + __u64 netns_dev; + __u64 netns_ino; } __attribute__((aligned(8))); /* User bpf_sock_ops struct to access socket values and specify request ops diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c index 2657976aec2a..c9401075b58c 100644 --- a/kernel/bpf/offload.c +++ b/kernel/bpf/offload.c @@ -413,6 +413,61 @@ int bpf_map_offload_get_next_key(struct bpf_map *map, void *key, void *next_key) return ret; } +struct ns_get_path_bpf_map_args { + struct bpf_offloaded_map *offmap; + struct bpf_map_info *info; +}; + +static struct ns_common *bpf_map_offload_info_fill_ns(void *private_data) +{ + struct ns_get_path_bpf_map_args *args = private_data; + struct ns_common *ns; + struct net *net; + + rtnl_lock(); + down_read(&bpf_devs_lock); + + if (args->offmap->netdev) { + args->info->ifindex = args->offmap->netdev->ifindex; + net = dev_net(args->offmap->netdev); + get_net(net); + ns = &net->ns; + } else { + args->info->ifindex = 0; + ns = NULL; + } + + up_read(&bpf_devs_lock); + rtnl_unlock(); + + return ns; +} + +int bpf_map_offload_info_fill(struct bpf_map_info *info, struct bpf_map *map) +{ + struct ns_get_path_bpf_map_args args = { + .offmap = map_to_offmap(map), + .info = info, + }; + struct inode *ns_inode; + struct path ns_path; + void *res; + + res = ns_get_path_cb(&ns_path, bpf_map_offload_info_fill_ns, &args); + if (IS_ERR(res)) { + if (!info->ifindex) + return -ENODEV; + return PTR_ERR(res); + } + + ns_inode = ns_path.dentry->d_inode; + info->netns_dev = new_encode_dev(ns_inode->i_sb->s_dev); + info->netns_ino = ns_inode->i_ino; + path_put(&ns_path); + + return 0; +} + bool bpf_offload_dev_match(struct bpf_prog *prog, struct bpf_map *map) { struct bpf_offloaded_map *offmap; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 97a825ffc763..5bdb0cc84ad2 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1801,6 +1801,12 @@ static int bpf_map_get_info_by_fd(struct bpf_map *map, info.map_flags = map->map_flags; memcpy(info.name, map->name, sizeof(map->name)); + if (bpf_map_is_dev_bound(map)) { + err = bpf_map_offload_info_fill(&info, map); + if (err) + return err; + } + if (copy_to_user(uinfo, &info, info_len) || put_user(info_len, &uattr->info.info_len)) return -EFAULT; diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 7c2259e8bc54..af1f49ad8b88 100644 --- 
a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -938,6 +938,9 @@ struct bpf_map_info { __u32 max_entries; __u32 map_flags; char name[BPF_OBJ_NAME_LEN]; + __u32 ifindex; + __u64 netns_dev; + __u64 netns_ino; } __attribute__((aligned(8))); /* User bpf_sock_ops struct to access socket values and specify request ops -- cgit v1.2.3 From 08a77676f9c5fc69a681ccd2cd8140e65dcb26c7 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 9 Jan 2018 07:21:15 -0800 Subject: string: drop __must_check from strscpy() and restore strscpy() usages in cgroup MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit e7fd37ba1217 ("cgroup: avoid copying strings longer than the buffers") converted possibly unsafe strncpy() usages in cgroup to strscpy(). However, although the callsites are completely fine with truncated copies, because strscpy() is marked __must_check, it led to the following warnings. kernel/cgroup/cgroup.c: In function ‘cgroup_file_name’: kernel/cgroup/cgroup.c:1400:10: warning: ignoring return value of ‘strscpy’, declared with attribute warn_unused_result [-Wunused-result] strscpy(buf, cft->name, CGROUP_FILE_NAME_MAX); ^ To avoid the warnings, 50034ed49645 ("cgroup: use strlcpy() instead of strscpy() to avoid spurious warning") switched them to strlcpy(). strlcpy() is worse than strscpy() because it unconditionally runs strlen() on the source string, and the only reason we switched to strlcpy() here was because it was lacking __must_check, which doesn't reflect any material differences between the two functions. It's just that someone added __must_check to strscpy() and not to strlcpy(). These basic string copy operations are used in a variety of ways, and one not-so-uncommon use case is safely handling truncated copies, where the caller naturally doesn't care about the return value. The __must_check doesn't match the actual use cases and forces users to opt for inferior variants which lack __must_check by happenstance or spread ugly (void) casts. Remove __must_check from strscpy() and restore strscpy() usages in cgroup. Signed-off-by: Tejun Heo Suggested-by: Linus Torvalds Cc: Ma Shimiao Cc: Arnd Bergmann Cc: Chris Metcalf --- include/linux/string.h | 2 +- kernel/cgroup/cgroup.c | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/include/linux/string.h b/include/linux/string.h index 410ecf17de3c..dfdf8afeac64 100644 --- a/include/linux/string.h +++ b/include/linux/string.h @@ -28,7 +28,7 @@ extern char * strncpy(char *,const char *, __kernel_size_t); size_t strlcpy(char *, const char *, size_t); #endif #ifndef __HAVE_ARCH_STRSCPY -ssize_t __must_check strscpy(char *, const char *, size_t); +ssize_t strscpy(char *, const char *, size_t); #endif #ifndef __HAVE_ARCH_STRCAT extern char * strcat(char *, const char *); diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 7e4c44538119..8cda3bc3ae22 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -1397,7 +1397,7 @@ static char *cgroup_file_name(struct cgroup *cgrp, const struct cftype *cft, cgroup_on_dfl(cgrp) ?
ss->name : ss->legacy_name, cft->name); else - strlcpy(buf, cft->name, CGROUP_FILE_NAME_MAX); + strscpy(buf, cft->name, CGROUP_FILE_NAME_MAX); return buf; } @@ -1864,9 +1864,9 @@ void init_cgroup_root(struct cgroup_root *root, struct cgroup_sb_opts *opts) root->flags = opts->flags; if (opts->release_agent) - strlcpy(root->release_agent_path, opts->release_agent, PATH_MAX); + strscpy(root->release_agent_path, opts->release_agent, PATH_MAX); if (opts->name) - strlcpy(root->name, opts->name, MAX_CGROUP_ROOT_NAMELEN); + strscpy(root->name, opts->name, MAX_CGROUP_ROOT_NAMELEN); if (opts->cpuset_clone_children) set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags); } -- cgit v1.2.3 From b471f2f1de8b816f1e799b80aa92588f3566e4bd Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Thu, 18 Jan 2018 15:08:50 -0800 Subject: bpf: implement MAP_GET_NEXT_KEY command for LPM_TRIE map The current LPM_TRIE map type does not implement the MAP_GET_NEXT_KEY command. This command is handy when users want to enumerate keys. Otherwise, a different map which supports key enumeration may be required to store the keys. If the map data is sparse and all map data are to be deleted without closing the file descriptor, using MAP_GET_NEXT_KEY to find all keys is much faster than enumerating the whole key space. This patch implements the MAP_GET_NEXT_KEY command for the LPM_TRIE map. If the user-provided key pointer is NULL or the key does not have an exact match in the trie, the first key will be returned. Otherwise, the next key will be returned. In this implementation, key enumeration follows a postorder traversal of the internal trie. More specific keys will be returned before less specific ones, given a sequence of MAP_GET_NEXT_KEY syscalls. Signed-off-by: Yonghong Song Signed-off-by: Daniel Borkmann --- kernel/bpf/lpm_trie.c | 95 +++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 93 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index 584e02227671..d7ea96218516 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -591,9 +591,100 @@ unlock: raw_spin_unlock(&trie->lock); } -static int trie_get_next_key(struct bpf_map *map, void *key, void *next_key) +static int trie_get_next_key(struct bpf_map *map, void *_key, void *_next_key) { - return -ENOTSUPP; + struct lpm_trie *trie = container_of(map, struct lpm_trie, map); + struct bpf_lpm_trie_key *key = _key, *next_key = _next_key; + struct lpm_trie_node *node, *next_node = NULL, *parent; + struct lpm_trie_node **node_stack = NULL; + struct lpm_trie_node __rcu **root; + int err = 0, stack_ptr = -1; + unsigned int next_bit; + size_t matchlen; + + /* The get_next_key follows postorder. For the 4 node example in + * the top of this file, the trie_get_next_key() returns the following + * one after another: + * 192.168.0.0/24 + * 192.168.1.0/24 + * 192.168.128.0/24 + * 192.168.0.0/16 + * + * The idea is to return more specific keys before less specific ones.
+ */ + + /* Empty trie */ + if (!rcu_dereference(trie->root)) + return -ENOENT; + + /* For invalid key, find the leftmost node in the trie */ + if (!key || key->prefixlen > trie->max_prefixlen) { + root = &trie->root; + goto find_leftmost; + } + + node_stack = kmalloc(trie->max_prefixlen * sizeof(struct lpm_trie_node *), + GFP_USER | __GFP_NOWARN); + if (!node_stack) + return -ENOMEM; + + /* Try to find the exact node for the given key */ + for (node = rcu_dereference(trie->root); node;) { + node_stack[++stack_ptr] = node; + matchlen = longest_prefix_match(trie, node, key); + if (node->prefixlen != matchlen || + node->prefixlen == key->prefixlen) + break; + + next_bit = extract_bit(key->data, node->prefixlen); + node = rcu_dereference(node->child[next_bit]); + } + if (!node || node->prefixlen != key->prefixlen || + (node->flags & LPM_TREE_NODE_FLAG_IM)) { + root = &trie->root; + goto find_leftmost; + } + + /* The node with the exactly-matching key has been found, + * find the first node in postorder after the matched node. + */ + node = node_stack[stack_ptr]; + while (stack_ptr > 0) { + parent = node_stack[stack_ptr - 1]; + if (rcu_dereference(parent->child[0]) == node && + rcu_dereference(parent->child[1])) { + root = &parent->child[1]; + goto find_leftmost; + } + if (!(parent->flags & LPM_TREE_NODE_FLAG_IM)) { + next_node = parent; + goto do_copy; + } + + node = parent; + stack_ptr--; + } + + /* did not find anything */ + err = -ENOENT; + goto free_stack; + +find_leftmost: + /* Find the leftmost non-intermediate node, all intermediate nodes + * have exact two children, so this function will never return NULL. + */ + for (node = rcu_dereference(*root); node;) { + if (!(node->flags & LPM_TREE_NODE_FLAG_IM)) + next_node = node; + node = rcu_dereference(node->child[0]); + } +do_copy: + next_key->prefixlen = next_node->prefixlen; + memcpy((void *)next_key + offsetof(struct bpf_lpm_trie_key, data), + next_node->data, trie->data_size); +free_stack: + kfree(node_stack); + return err; } const struct bpf_map_ops trie_map_ops = { -- cgit v1.2.3 From 10a0cd6e4932b5078215b1ec2c896597eec0eff9 Mon Sep 17 00:00:00 2001 From: "Jan H. Schönherr" Date: Fri, 19 Jan 2018 16:27:54 -0800 Subject: mm: Fix memory size alignment in devm_memremap_pages_release() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The functions devm_memremap_pages() and devm_memremap_pages_release() use different ways to calculate the section-aligned amount of memory. The latter function may use an incorrect size if the memory region is small but straddles a section border. Use the same code for both. Cc: Fixes: 5f29a77cd957 ("mm: fix mixed zone detection in devm_memremap_pages") Signed-off-by: Jan H. Schönherr Signed-off-by: Dan Williams --- kernel/memremap.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/memremap.c b/kernel/memremap.c index ada31b0d76d4..4ef97525a4ff 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -293,7 +293,8 @@ static void devm_memremap_pages_release(void *data) /* pages are dead and unused, undo the arch mapping */ align_start = res->start & ~(SECTION_SIZE - 1); - align_size = ALIGN(resource_size(res), SECTION_SIZE); + align_size = ALIGN(res->start + resource_size(res), SECTION_SIZE) + - align_start; mem_hotplug_begin(); arch_remove_memory(align_start, align_size, pgmap->altmap_valid ? -- cgit v1.2.3 From 77dd66a3c67c93ab401ccc15efff25578be281fd Mon Sep 17 00:00:00 2001 From: "Jan H. 
Schönherr" Date: Fri, 19 Jan 2018 16:26:33 -0800 Subject: mm: Fix devm_memremap_pages() collision handling MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If devm_memremap_pages() detects a collision while adding entries to the radix-tree, we call pgmap_radix_release(). Unfortunately, the function removes *all* entries for the range -- including the entries that caused the collision in the first place. Modify pgmap_radix_release() to take an additional argument to indicate where to stop, so that only newly added entries are removed from the tree. Cc: Fixes: 9476df7d80df ("mm: introduce find_dev_pagemap()") Signed-off-by: Jan H. Schönherr Signed-off-by: Dan Williams --- kernel/memremap.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/memremap.c b/kernel/memremap.c index 4ef97525a4ff..4849be5f9b3c 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -241,13 +241,16 @@ int device_private_entry_fault(struct vm_area_struct *vma, EXPORT_SYMBOL(device_private_entry_fault); #endif /* CONFIG_DEVICE_PRIVATE */ -static void pgmap_radix_release(struct resource *res) +static void pgmap_radix_release(struct resource *res, unsigned long end_pgoff) { unsigned long pgoff, order; mutex_lock(&pgmap_lock); - foreach_order_pgoff(res, order, pgoff) + foreach_order_pgoff(res, order, pgoff) { + if (pgoff >= end_pgoff) + break; radix_tree_delete(&pgmap_radix, PHYS_PFN(res->start) + pgoff); + } mutex_unlock(&pgmap_lock); synchronize_rcu(); @@ -302,7 +305,7 @@ static void devm_memremap_pages_release(void *data) mem_hotplug_done(); untrack_pfn(NULL, PHYS_PFN(align_start), align_size); - pgmap_radix_release(res); + pgmap_radix_release(res, -1); dev_WARN_ONCE(dev, pgmap->altmap.alloc, "%s: failed to free all reserved pages\n", __func__); } @@ -418,7 +421,7 @@ void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) untrack_pfn(NULL, PHYS_PFN(align_start), align_size); err_pfn_remap: err_radix: - pgmap_radix_release(res); + pgmap_radix_release(res, pgoff); devres_free(pgmap); return ERR_PTR(error); } -- cgit v1.2.3 From 901334159419afc8c1b8556243fc53e9617472d2 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sat, 20 Jan 2018 01:24:29 +0100 Subject: bpf, verifier: detect misconfigured mem, size argument pair I've seen two patch proposals now for helper additions that used ARG_PTR_TO_MEM or similar in reg_X but no corresponding ARG_CONST_SIZE in reg_X+1. Verifier won't complain in such case, but it will omit verifying the memory passed to the helper thus ending up badly. Detect such buggy helper function signature and bail out during verification rather than finding them through review. 
Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 79 +++++++++++++++++++++++++++++++++++---------------- 1 file changed, 55 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 2e7a43edf264..5eeb200e82c4 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1837,6 +1837,19 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, } } +static bool arg_type_is_mem_ptr(enum bpf_arg_type type) +{ + return type == ARG_PTR_TO_MEM || + type == ARG_PTR_TO_MEM_OR_NULL || + type == ARG_PTR_TO_UNINIT_MEM; +} + +static bool arg_type_is_mem_size(enum bpf_arg_type type) +{ + return type == ARG_CONST_SIZE || + type == ARG_CONST_SIZE_OR_ZERO; +} + static int check_func_arg(struct bpf_verifier_env *env, u32 regno, enum bpf_arg_type arg_type, struct bpf_call_arg_meta *meta) @@ -1886,9 +1899,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, expected_type = PTR_TO_CTX; if (type != expected_type) goto err_type; - } else if (arg_type == ARG_PTR_TO_MEM || - arg_type == ARG_PTR_TO_MEM_OR_NULL || - arg_type == ARG_PTR_TO_UNINIT_MEM) { + } else if (arg_type_is_mem_ptr(arg_type)) { expected_type = PTR_TO_STACK; /* One exception here. In case function allows for NULL to be * passed in as argument, it's a SCALAR_VALUE type. Final test @@ -1949,25 +1960,12 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, err = check_stack_boundary(env, regno, meta->map_ptr->value_size, false, NULL); - } else if (arg_type == ARG_CONST_SIZE || - arg_type == ARG_CONST_SIZE_OR_ZERO) { + } else if (arg_type_is_mem_size(arg_type)) { bool zero_size_allowed = (arg_type == ARG_CONST_SIZE_OR_ZERO); - /* bpf_xxx(..., buf, len) call will access 'len' bytes - * from stack pointer 'buf'. Check it - * note: regno == len, regno - 1 == buf - */ - if (regno == 0) { - /* kernel subsystem misconfigured verifier */ - verbose(env, - "ARG_CONST_SIZE cannot be first argument\n"); - return -EACCES; - } - /* The register is SCALAR_VALUE; the access check * happens using its boundaries. */ - if (!tnum_is_const(reg->var_off)) /* For unprivileged variable accesses, disable raw * mode so that the program is required to @@ -2111,7 +2109,7 @@ error: return -EINVAL; } -static int check_raw_mode(const struct bpf_func_proto *fn) +static bool check_raw_mode_ok(const struct bpf_func_proto *fn) { int count = 0; @@ -2126,7 +2124,44 @@ static int check_raw_mode(const struct bpf_func_proto *fn) if (fn->arg5_type == ARG_PTR_TO_UNINIT_MEM) count++; - return count > 1 ? -EINVAL : 0; + /* We only support one arg being in raw mode at the moment, + * which is sufficient for the helper functions we have + * right now. + */ + return count <= 1; +} + +static bool check_args_pair_invalid(enum bpf_arg_type arg_curr, + enum bpf_arg_type arg_next) +{ + return (arg_type_is_mem_ptr(arg_curr) && + !arg_type_is_mem_size(arg_next)) || + (!arg_type_is_mem_ptr(arg_curr) && + arg_type_is_mem_size(arg_next)); +} + +static bool check_arg_pair_ok(const struct bpf_func_proto *fn) +{ + /* bpf_xxx(..., buf, len) call will access 'len' + * bytes from memory 'buf'. Both arg types need + * to be paired, so make sure there's no buggy + * helper function specification. 
+ */ + if (arg_type_is_mem_size(fn->arg1_type) || + arg_type_is_mem_ptr(fn->arg5_type) || + check_args_pair_invalid(fn->arg1_type, fn->arg2_type) || + check_args_pair_invalid(fn->arg2_type, fn->arg3_type) || + check_args_pair_invalid(fn->arg3_type, fn->arg4_type) || + check_args_pair_invalid(fn->arg4_type, fn->arg5_type)) + return false; + + return true; +} + +static int check_func_proto(const struct bpf_func_proto *fn) +{ + return check_raw_mode_ok(fn) && + check_arg_pair_ok(fn) ? 0 : -EINVAL; } /* Packet data might have moved, any old PTR_TO_PACKET[_META,_END] @@ -2282,7 +2317,6 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn if (env->ops->get_func_proto) fn = env->ops->get_func_proto(func_id); - if (!fn) { verbose(env, "unknown func %s#%d\n", func_id_name(func_id), func_id); @@ -2306,10 +2340,7 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn memset(&meta, 0, sizeof(meta)); meta.pkt_access = fn->pkt_access; - /* We only support one arg being in raw mode at the moment, which - * is sufficient for the helper functions we have right now. - */ - err = check_raw_mode(fn); + err = check_func_proto(fn); if (err) { verbose(env, "kernel subsystem misconfigured func %s#%d\n", func_id_name(func_id), func_id); -- cgit v1.2.3 From fa9dd599b4dae841924b022768354cfde9affecb Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sat, 20 Jan 2018 01:24:33 +0100 Subject: bpf: get rid of pure_initcall dependency to enable jits Having a pure_initcall() callback just to permanently enable BPF JITs under CONFIG_BPF_JIT_ALWAYS_ON is unnecessary and could leave a small race window in future where JIT is still disabled on boot. Since we know about the setting at compilation time anyway, just initialize it properly there. Also consolidate all the individual bpf_jit_enable variables into a single one and move them under one location. Moreover, don't allow for setting unspecified garbage values on them. 
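For context, the consolidated knob is still read through the same proc interface; a small illustrative userspace probe, keeping in mind that with CONFIG_BPF_JIT_ALWAYS_ON the value is pinned to 1:

#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/sys/net/core/bpf_jit_enable", "r");
	int val = -1;

	/* 0 = off, 1 = on, 2 = on with JIT debug output */
	if (f) {
		fscanf(f, "%d", &val);
		fclose(f);
	}
	printf("bpf_jit_enable = %d\n", val);
	return val < 0;
}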
Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: Alexei Starovoitov --- arch/arm/net/bpf_jit_32.c | 2 -- arch/arm64/net/bpf_jit_comp.c | 2 -- arch/mips/net/bpf_jit.c | 2 -- arch/mips/net/ebpf_jit.c | 2 -- arch/powerpc/net/bpf_jit_comp.c | 2 -- arch/powerpc/net/bpf_jit_comp64.c | 2 -- arch/s390/net/bpf_jit_comp.c | 2 -- arch/sparc/net/bpf_jit_comp_32.c | 2 -- arch/sparc/net/bpf_jit_comp_64.c | 2 -- arch/x86/net/bpf_jit_comp.c | 2 -- kernel/bpf/core.c | 19 ++++++++++++------- net/core/sysctl_net_core.c | 18 ++++++++++++------ net/socket.c | 9 --------- 13 files changed, 24 insertions(+), 42 deletions(-) (limited to 'kernel') diff --git a/arch/arm/net/bpf_jit_32.c b/arch/arm/net/bpf_jit_32.c index 4425189bb24c..a15e7cdf8754 100644 --- a/arch/arm/net/bpf_jit_32.c +++ b/arch/arm/net/bpf_jit_32.c @@ -25,8 +25,6 @@ #include "bpf_jit_32.h" -int bpf_jit_enable __read_mostly; - #define STACK_OFFSET(k) (k) #define TMP_REG_1 (MAX_BPF_JIT_REG + 0) /* TEMP Register 1 */ #define TMP_REG_2 (MAX_BPF_JIT_REG + 1) /* TEMP Register 2 */ diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c index acaa935ed977..8d456ee6dddc 100644 --- a/arch/arm64/net/bpf_jit_comp.c +++ b/arch/arm64/net/bpf_jit_comp.c @@ -31,8 +31,6 @@ #include "bpf_jit.h" -int bpf_jit_enable __read_mostly; - #define TMP_REG_1 (MAX_BPF_JIT_REG + 0) #define TMP_REG_2 (MAX_BPF_JIT_REG + 1) #define TCALL_CNT (MAX_BPF_JIT_REG + 2) diff --git a/arch/mips/net/bpf_jit.c b/arch/mips/net/bpf_jit.c index 44b925005dd3..4d8cb9bb8365 100644 --- a/arch/mips/net/bpf_jit.c +++ b/arch/mips/net/bpf_jit.c @@ -1207,8 +1207,6 @@ jmp_cmp: return 0; } -int bpf_jit_enable __read_mostly; - void bpf_jit_compile(struct bpf_prog *fp) { struct jit_ctx ctx; diff --git a/arch/mips/net/ebpf_jit.c b/arch/mips/net/ebpf_jit.c index 97069a1b6f43..4e347030ed2c 100644 --- a/arch/mips/net/ebpf_jit.c +++ b/arch/mips/net/ebpf_jit.c @@ -177,8 +177,6 @@ static u32 b_imm(unsigned int tgt, struct jit_ctx *ctx) (ctx->idx * 4) - 4; } -int bpf_jit_enable __read_mostly; - enum which_ebpf_reg { src_reg, src_reg_no_fp, diff --git a/arch/powerpc/net/bpf_jit_comp.c b/arch/powerpc/net/bpf_jit_comp.c index f9941b3b5770..872d1f6dd11e 100644 --- a/arch/powerpc/net/bpf_jit_comp.c +++ b/arch/powerpc/net/bpf_jit_comp.c @@ -18,8 +18,6 @@ #include "bpf_jit32.h" -int bpf_jit_enable __read_mostly; - static inline void bpf_flush_icache(void *start, void *end) { smp_wmb(); diff --git a/arch/powerpc/net/bpf_jit_comp64.c b/arch/powerpc/net/bpf_jit_comp64.c index 6771c63b2bec..217a78e84865 100644 --- a/arch/powerpc/net/bpf_jit_comp64.c +++ b/arch/powerpc/net/bpf_jit_comp64.c @@ -21,8 +21,6 @@ #include "bpf_jit64.h" -int bpf_jit_enable __read_mostly; - static void bpf_jit_fill_ill_insns(void *area, unsigned int size) { memset32(area, BREAKPOINT_INSTRUCTION, size/4); diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c index 1dfadbd126f3..e50188773ff3 100644 --- a/arch/s390/net/bpf_jit_comp.c +++ b/arch/s390/net/bpf_jit_comp.c @@ -28,8 +28,6 @@ #include #include "bpf_jit.h" -int bpf_jit_enable __read_mostly; - struct bpf_jit { u32 seen; /* Flags to remember seen eBPF instructions */ u32 seen_reg[16]; /* Array to remember which registers are used */ diff --git a/arch/sparc/net/bpf_jit_comp_32.c b/arch/sparc/net/bpf_jit_comp_32.c index 09e318eb34ee..3bd8ca95e521 100644 --- a/arch/sparc/net/bpf_jit_comp_32.c +++ b/arch/sparc/net/bpf_jit_comp_32.c @@ -11,8 +11,6 @@ #include "bpf_jit_32.h" -int bpf_jit_enable __read_mostly; - static inline bool 
is_simm13(unsigned int value) { return value + 0x1000 < 0x2000; diff --git a/arch/sparc/net/bpf_jit_comp_64.c b/arch/sparc/net/bpf_jit_comp_64.c index 635fdefd4ae2..50a24d7bd4c5 100644 --- a/arch/sparc/net/bpf_jit_comp_64.c +++ b/arch/sparc/net/bpf_jit_comp_64.c @@ -12,8 +12,6 @@ #include "bpf_jit_64.h" -int bpf_jit_enable __read_mostly; - static inline bool is_simm13(unsigned int value) { return value + 0x1000 < 0x2000; diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 87f214fbe66e..b881a979efe1 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -15,8 +15,6 @@ #include #include -int bpf_jit_enable __read_mostly; - /* * assembly code in arch/x86/net/bpf_jit.S */ diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 25e723b0dfd4..bc9c5b11d6a9 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -300,6 +300,11 @@ struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off, } #ifdef CONFIG_BPF_JIT +/* All BPF JIT sysctl knobs here. */ +int bpf_jit_enable __read_mostly = IS_BUILTIN(CONFIG_BPF_JIT_ALWAYS_ON); +int bpf_jit_harden __read_mostly; +int bpf_jit_kallsyms __read_mostly; + static __always_inline void bpf_get_prog_addr_region(const struct bpf_prog *prog, unsigned long *symbol_start, @@ -381,8 +386,6 @@ static DEFINE_SPINLOCK(bpf_lock); static LIST_HEAD(bpf_kallsyms); static struct latch_tree_root bpf_tree __cacheline_aligned; -int bpf_jit_kallsyms __read_mostly; - static void bpf_prog_ksym_node_add(struct bpf_prog_aux *aux) { WARN_ON_ONCE(!list_empty(&aux->ksym_lnode)); @@ -563,8 +566,6 @@ void __weak bpf_jit_free(struct bpf_prog *fp) bpf_prog_unlock_free(fp); } -int bpf_jit_harden __read_mostly; - static int bpf_jit_blind_insn(const struct bpf_insn *from, const struct bpf_insn *aux, struct bpf_insn *to_buff) @@ -1379,9 +1380,13 @@ void bpf_patch_call_args(struct bpf_insn *insn, u32 stack_depth) } #else -static unsigned int __bpf_prog_ret0(const void *ctx, - const struct bpf_insn *insn) +static unsigned int __bpf_prog_ret0_warn(const void *ctx, + const struct bpf_insn *insn) { + /* If this handler ever gets executed, then BPF_JIT_ALWAYS_ON + * is not working properly, so warn about it! 
+ */ + WARN_ON_ONCE(1); return 0; } #endif @@ -1441,7 +1446,7 @@ struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err) fp->bpf_func = interpreters[(round_up(stack_depth, 32) / 32) - 1]; #else - fp->bpf_func = __bpf_prog_ret0; + fp->bpf_func = __bpf_prog_ret0_warn; #endif /* eBPF JITs can rewrite the program in case constant diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index a47ad6cd41c0..6d39b4c01fc6 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -25,6 +25,7 @@ static int zero = 0; static int one = 1; +static int two __maybe_unused = 2; static int min_sndbuf = SOCK_MIN_SNDBUF; static int min_rcvbuf = SOCK_MIN_RCVBUF; static int max_skb_frags = MAX_SKB_FRAGS; @@ -325,13 +326,14 @@ static struct ctl_table net_core_table[] = { .data = &bpf_jit_enable, .maxlen = sizeof(int), .mode = 0644, -#ifndef CONFIG_BPF_JIT_ALWAYS_ON - .proc_handler = proc_dointvec -#else .proc_handler = proc_dointvec_minmax, +# ifdef CONFIG_BPF_JIT_ALWAYS_ON .extra1 = &one, .extra2 = &one, -#endif +# else + .extra1 = &zero, + .extra2 = &two, +# endif }, # ifdef CONFIG_HAVE_EBPF_JIT { @@ -339,14 +341,18 @@ static struct ctl_table net_core_table[] = { .data = &bpf_jit_harden, .maxlen = sizeof(int), .mode = 0600, - .proc_handler = proc_dointvec, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &two, }, { .procname = "bpf_jit_kallsyms", .data = &bpf_jit_kallsyms, .maxlen = sizeof(int), .mode = 0600, - .proc_handler = proc_dointvec, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &one, }, # endif #endif diff --git a/net/socket.c b/net/socket.c index fbfae1ed3ff5..1536515b6437 100644 --- a/net/socket.c +++ b/net/socket.c @@ -2613,15 +2613,6 @@ out_fs: core_initcall(sock_init); /* early initcall */ -static int __init jit_init(void) -{ -#ifdef CONFIG_BPF_JIT_ALWAYS_ON - bpf_jit_enable = 1; -#endif - return 0; -} -pure_initcall(jit_init); - #ifdef CONFIG_PROC_FS void socket_seq_show(struct seq_file *seq) { -- cgit v1.2.3 From 4bd95f4b99e921f51783bfddcd9738e9d3eef2b5 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sat, 20 Jan 2018 01:24:36 +0100 Subject: bpf: add upper complexity limit to verifier log Given the limit could potentially get further adjustments in the future, add it to the log so it becomes obvious what the current limit is w/o having to check the source first. This may also be helpful for debugging complexity related issues on kernels that backport from upstream. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 5eeb200e82c4..caae4955fbdb 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -4810,7 +4810,8 @@ process_bpf_exit: insn_idx++; } - verbose(env, "processed %d insns, stack depth ", insn_processed); + verbose(env, "processed %d insns (limit %d), stack depth ", + insn_processed, BPF_COMPLEXITY_LIMIT_INSNS); for (i = 0; i < env->subprog_cnt + 1; i++) { u32 depth = env->subprog_stack_depth[i]; -- cgit v1.2.3 From 6fd78a1a99c9580da49ee8f951fdce9846256375 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Fri, 19 Jan 2018 13:39:01 +0900 Subject: printk: drop redundant devkmsg_log_str memsets We copy in null terminated strings "on" and "off", no need to zero out devkmsg_log_str in control_devkmsg(). 
Link: http://lkml.kernel.org/r/20180119043901.1728-1-sergey.senozhatsky@gmail.com Cc: linux-kernel@vger.kernel.org Signed-off-by: Sergey Senozhatsky Reviewed-by: Steven Rostedt (VMware) Signed-off-by: Petr Mladek --- kernel/printk/printk.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 568729e0dc2c..bf2e6741ec12 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -131,13 +131,10 @@ static int __init control_devkmsg(char *str) /* * Set sysctl string accordingly: */ - if (devkmsg_log == DEVKMSG_LOG_MASK_ON) { - memset(devkmsg_log_str, 0, DEVKMSG_STR_MAX_SIZE); - strncpy(devkmsg_log_str, "on", 2); - } else if (devkmsg_log == DEVKMSG_LOG_MASK_OFF) { - memset(devkmsg_log_str, 0, DEVKMSG_STR_MAX_SIZE); - strncpy(devkmsg_log_str, "off", 3); - } + if (devkmsg_log == DEVKMSG_LOG_MASK_ON) + strcpy(devkmsg_log_str, "on"); + else if (devkmsg_log == DEVKMSG_LOG_MASK_OFF) + strcpy(devkmsg_log_str, "off"); /* else "ratelimit" which is set by default. */ /* -- cgit v1.2.3 From 5f74972ce69fdc6473f74253283408af75a3be15 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 22 Jan 2018 14:58:57 -0600 Subject: signal: Don't use structure initializers for struct siginfo The siginfo structure has all manners of holes with the result that a structure initializer is not guaranteed to initialize all of the bits. As we have to copy the structure to userspace don't even try to use a structure initializer. Instead use clear_siginfo followed by initializing selected fields. This gives a guarantee that uninitialized kernel memory is not copied to userspace. Signed-off-by: "Eric W. Biederman" --- arch/arc/kernel/traps.c | 14 +++++++----- arch/arm64/kernel/debug-monitors.c | 13 ++++++----- arch/arm64/kernel/ptrace.c | 13 ++++++----- arch/m68k/mm/fault.c | 3 ++- arch/mips/kernel/traps.c | 29 ++++++++++++++++-------- arch/tile/kernel/single_step.c | 24 +++++++++++--------- arch/tile/kernel/traps.c | 4 +++- arch/tile/kernel/unaligned.c | 46 +++++++++++++++++++++----------------- kernel/signal.c | 3 ++- 9 files changed, 89 insertions(+), 60 deletions(-) (limited to 'kernel') diff --git a/arch/arc/kernel/traps.c b/arch/arc/kernel/traps.c index bcd7c9fc5d0f..c7206789e9ce 100644 --- a/arch/arc/kernel/traps.c +++ b/arch/arc/kernel/traps.c @@ -65,12 +65,14 @@ unhandled_exception(const char *str, struct pt_regs *regs, siginfo_t *info) #define DO_ERROR_INFO(signr, str, name, sicode) \ int name(unsigned long address, struct pt_regs *regs) \ { \ - siginfo_t info = { \ - .si_signo = signr, \ - .si_errno = 0, \ - .si_code = sicode, \ - .si_addr = (void __user *)address, \ - }; \ + siginfo_t info; \ + \ + clear_siginfo(&info); \ + info.si_signo = signr; \ + info.si_errno = 0; \ + info.si_code = sicode; \ + info.si_addr = (void __user *)address; \ + \ return unhandled_exception(str, regs, &info);\ } diff --git a/arch/arm64/kernel/debug-monitors.c b/arch/arm64/kernel/debug-monitors.c index a88b6ccebbb4..53781f5687c5 100644 --- a/arch/arm64/kernel/debug-monitors.c +++ b/arch/arm64/kernel/debug-monitors.c @@ -209,12 +209,13 @@ NOKPROBE_SYMBOL(call_step_hook); static void send_user_sigtrap(int si_code) { struct pt_regs *regs = current_pt_regs(); - siginfo_t info = { - .si_signo = SIGTRAP, - .si_errno = 0, - .si_code = si_code, - .si_addr = (void __user *)instruction_pointer(regs), - }; + siginfo_t info; + + clear_siginfo(&info); + info.si_signo = SIGTRAP; + info.si_errno = 0; + info.si_code = si_code; + 
info.si_addr = (void __user *)instruction_pointer(regs); if (WARN_ON(!user_mode(regs))) return; diff --git a/arch/arm64/kernel/ptrace.c b/arch/arm64/kernel/ptrace.c index 0a1cf830e4b3..95daa1478a7c 100644 --- a/arch/arm64/kernel/ptrace.c +++ b/arch/arm64/kernel/ptrace.c @@ -180,12 +180,13 @@ static void ptrace_hbptriggered(struct perf_event *bp, struct pt_regs *regs) { struct arch_hw_breakpoint *bkpt = counter_arch_bp(bp); - siginfo_t info = { - .si_signo = SIGTRAP, - .si_errno = 0, - .si_code = TRAP_HWBKPT, - .si_addr = (void __user *)(bkpt->trigger), - }; + siginfo_t info; + + clear_siginfo(&info); + info.si_signo = SIGTRAP; + info.si_errno = 0; + info.si_code = TRAP_HWBKPT; + info.si_addr = (void __user *)(bkpt->trigger); #ifdef CONFIG_COMPAT if (is_compat_task()) { diff --git a/arch/m68k/mm/fault.c b/arch/m68k/mm/fault.c index 127d7c1f2090..03253c4f8e6a 100644 --- a/arch/m68k/mm/fault.c +++ b/arch/m68k/mm/fault.c @@ -21,8 +21,9 @@ extern void die_if_kernel(char *, struct pt_regs *, long); int send_fault_sig(struct pt_regs *regs) { - siginfo_t siginfo = { 0, 0, 0, }; + siginfo_t siginfo; + clear_siginfo(&siginfo); siginfo.si_signo = current->thread.signo; siginfo.si_code = current->thread.code; siginfo.si_addr = (void *)current->thread.faddr; diff --git a/arch/mips/kernel/traps.c b/arch/mips/kernel/traps.c index 5d19ed07e99d..0ae4a731cc12 100644 --- a/arch/mips/kernel/traps.c +++ b/arch/mips/kernel/traps.c @@ -699,11 +699,12 @@ static int simulate_sync(struct pt_regs *regs, unsigned int opcode) asmlinkage void do_ov(struct pt_regs *regs) { enum ctx_state prev_state; - siginfo_t info = { - .si_signo = SIGFPE, - .si_code = FPE_INTOVF, - .si_addr = (void __user *)regs->cp0_epc, - }; + siginfo_t info; + + clear_siginfo(&info); + info.si_signo = SIGFPE; + info.si_code = FPE_INTOVF; + info.si_addr = (void __user *)regs->cp0_epc; prev_state = exception_enter(); die_if_kernel("Integer overflow", regs); @@ -721,7 +722,11 @@ asmlinkage void do_ov(struct pt_regs *regs) void force_fcr31_sig(unsigned long fcr31, void __user *fault_addr, struct task_struct *tsk) { - struct siginfo si = { .si_addr = fault_addr, .si_signo = SIGFPE }; + struct siginfo si; + + clear_siginfo(&si); + si.si_addr = fault_addr; + si.si_signo = SIGFPE; if (fcr31 & FPU_CSR_INV_X) si.si_code = FPE_FLTINV; @@ -739,9 +744,10 @@ void force_fcr31_sig(unsigned long fcr31, void __user *fault_addr, int process_fpemu_return(int sig, void __user *fault_addr, unsigned long fcr31) { - struct siginfo si = { 0 }; + struct siginfo si; struct vm_area_struct *vma; + clear_siginfo(&si); switch (sig) { case 0: return 0; @@ -890,9 +896,10 @@ out: void do_trap_or_bp(struct pt_regs *regs, unsigned int code, int si_code, const char *str) { - siginfo_t info = { 0 }; + siginfo_t info; char b[40]; + clear_siginfo(&info); #ifdef CONFIG_KGDB_LOW_LEVEL_TRAP if (kgdb_ll_trap(DIE_TRAP, str, regs, code, current->thread.trap_nr, SIGTRAP) == NOTIFY_STOP) @@ -1499,9 +1506,13 @@ asmlinkage void do_mdmx(struct pt_regs *regs) */ asmlinkage void do_watch(struct pt_regs *regs) { - siginfo_t info = { .si_signo = SIGTRAP, .si_code = TRAP_HWBKPT }; + siginfo_t info; enum ctx_state prev_state; + clear_siginfo(&info); + info.si_signo = SIGTRAP; + info.si_code = TRAP_HWBKPT; + prev_state = exception_enter(); /* * Clear WP (bit 22) bit of cause register so we don't loop diff --git a/arch/tile/kernel/single_step.c b/arch/tile/kernel/single_step.c index de3eae813e52..479d8033a801 100644 --- a/arch/tile/kernel/single_step.c +++ b/arch/tile/kernel/single_step.c @@ -163,11 
+163,13 @@ static tilepro_bundle_bits rewrite_load_store_unaligned( * actual bad address in an SPR, which it doesn't. */ if (align_ctl == 0) { - siginfo_t info = { - .si_signo = SIGBUS, - .si_code = BUS_ADRALN, - .si_addr = addr - }; + siginfo_t info; + + clear_siginfo(&info); + info.si_signo = SIGBUS; + info.si_code = BUS_ADRALN; + info.si_addr = addr; + trace_unhandled_signal("unaligned trap", regs, (unsigned long)addr, SIGBUS); force_sig_info(info.si_signo, &info, current); @@ -210,11 +212,13 @@ static tilepro_bundle_bits rewrite_load_store_unaligned( } if (err) { - siginfo_t info = { - .si_signo = SIGBUS, - .si_code = BUS_ADRALN, - .si_addr = addr - }; + siginfo_t info; + + clear_siginfo(&info); + info.si_signo = SIGBUS; + info.si_code = BUS_ADRALN; + info.si_addr = addr; + trace_unhandled_signal("bad address for unaligned fixup", regs, (unsigned long)addr, SIGBUS); force_sig_info(info.si_signo, &info, current); diff --git a/arch/tile/kernel/traps.c b/arch/tile/kernel/traps.c index 9b08c6055f15..83a7186198d7 100644 --- a/arch/tile/kernel/traps.c +++ b/arch/tile/kernel/traps.c @@ -256,12 +256,14 @@ static int do_bpt(struct pt_regs *regs) void __kprobes do_trap(struct pt_regs *regs, int fault_num, unsigned long reason) { - siginfo_t info = { 0 }; + siginfo_t info; int signo, code; unsigned long address = 0; tile_bundle_bits instr; int is_kernel = !user_mode(regs); + clear_siginfo(&info); + /* Handle breakpoints, etc. */ if (is_kernel && fault_num == INT_ILL && do_bpt(regs)) return; diff --git a/arch/tile/kernel/unaligned.c b/arch/tile/kernel/unaligned.c index 8149c38f67b6..77a0b6b6a2a1 100644 --- a/arch/tile/kernel/unaligned.c +++ b/arch/tile/kernel/unaligned.c @@ -980,11 +980,13 @@ void jit_bundle_gen(struct pt_regs *regs, tilegx_bundle_bits bundle, } if ((align_ctl == 0) || unexpected) { - siginfo_t info = { - .si_signo = SIGBUS, - .si_code = BUS_ADRALN, - .si_addr = (unsigned char __user *)0 - }; + siginfo_t info; + + clear_siginfo(&info); + info.si_signo = SIGBUS; + info.si_code = BUS_ADRALN; + info.si_addr = (unsigned char __user *)0; + if (unaligned_printk) pr_info("Unalign bundle: unexp @%llx, %llx\n", (unsigned long long)regs->pc, @@ -1396,11 +1398,12 @@ void jit_bundle_gen(struct pt_regs *regs, tilegx_bundle_bits bundle, &frag, sizeof(frag)); if (status) { /* Fail to copy JIT into user land. send SIGSEGV. */ - siginfo_t info = { - .si_signo = SIGSEGV, - .si_code = SEGV_MAPERR, - .si_addr = (void __user *)&jit_code_area[idx] - }; + siginfo_t info; + + clear_siginfo(&info); + info.si_signo = SIGSEGV; + info.si_code = SEGV_MAPERR; + info.si_addr = (void __user *)&jit_code_area[idx]; pr_warn("Unalign fixup: pid=%d %s jit_code_area=%llx\n", current->pid, current->comm, @@ -1511,11 +1514,12 @@ void do_unaligned(struct pt_regs *regs, int vecnum) * If so, we will trigger SIGBUS. 
*/ if ((regs->sp & 0x7) || (regs->ex1) || (align_ctl < 0)) { - siginfo_t info = { - .si_signo = SIGBUS, - .si_code = BUS_ADRALN, - .si_addr = (unsigned char __user *)0 - }; + siginfo_t info; + + clear_siginfo(&info); + info.si_signo = SIGBUS; + info.si_code = BUS_ADRALN; + info.si_addr = (unsigned char __user *)0; if (unaligned_printk) pr_info("Unalign fixup: %d %llx @%llx\n", @@ -1535,11 +1539,13 @@ void do_unaligned(struct pt_regs *regs, int vecnum) pc = (tilegx_bundle_bits __user *)(regs->pc); if (get_user(bundle, pc) != 0) { /* Probably never be here since pc is valid user address.*/ - siginfo_t info = { - .si_signo = SIGSEGV, - .si_code = SEGV_MAPERR, - .si_addr = (void __user *)pc - }; + siginfo_t info; + + clear_siginfo(&info); + info.si_signo = SIGSEGV; + info.si_code = SEGV_MAPERR; + info.si_addr = (void __user *)pc; + pr_err("Couldn't read instruction at %p trying to step\n", pc); trace_unhandled_signal("segfault in unalign fixup", regs, (unsigned long)info.si_addr, SIGSEGV); diff --git a/kernel/signal.c b/kernel/signal.c index 4976f05aa09b..f14492ff976f 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -3163,8 +3163,9 @@ do_send_specific(pid_t tgid, pid_t pid, int sig, struct siginfo *info) static int do_tkill(pid_t tgid, pid_t pid, int sig) { - struct siginfo info = {}; + struct siginfo info; + clear_siginfo(&info); info.si_signo = sig; info.si_errno = 0; info.si_code = SI_TKILL; -- cgit v1.2.3 From 3b10db2b06e2f6191aabb14babe28dcaa657a947 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 18 Aug 2017 19:56:27 -0500 Subject: signal: Replace memset(info,...) with clear_siginfo for clarity The function clear_siginfo is just a nice wrapper around memset so this results in no functional change. This change makes mistakes a little more difficult and it makes it clearer what is going on. Signed-off-by: "Eric W. 
Biederman" --- arch/um/kernel/trap.c | 2 +- drivers/usb/core/devio.c | 4 ++-- kernel/seccomp.c | 2 +- kernel/time/posix-timers.c | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/arch/um/kernel/trap.c b/arch/um/kernel/trap.c index 428644175956..b2b02df9896e 100644 --- a/arch/um/kernel/trap.c +++ b/arch/um/kernel/trap.c @@ -306,7 +306,7 @@ void relay_signal(int sig, struct siginfo *si, struct uml_pt_regs *regs) arch_examine_signal(sig, regs); - memset(&clean_si, 0, sizeof(clean_si)); + clear_siginfo(&clean_si); clean_si.si_signo = si->si_signo; clean_si.si_errno = si->si_errno; clean_si.si_code = si->si_code; diff --git a/drivers/usb/core/devio.c b/drivers/usb/core/devio.c index a3fad4ec9870..3f9bd3e4c373 100644 --- a/drivers/usb/core/devio.c +++ b/drivers/usb/core/devio.c @@ -595,7 +595,7 @@ static void async_completed(struct urb *urb) as->status = urb->status; signr = as->signr; if (signr) { - memset(&sinfo, 0, sizeof(sinfo)); + clear_siginfo(&sinfo); sinfo.si_signo = as->signr; sinfo.si_errno = as->status; sinfo.si_code = SI_ASYNCIO; @@ -2613,7 +2613,7 @@ static void usbdev_remove(struct usb_device *udev) wake_up_all(&ps->wait); list_del_init(&ps->list); if (ps->discsignr) { - memset(&sinfo, 0, sizeof(sinfo)); + clear_siginfo(&sinfo); sinfo.si_signo = ps->discsignr; sinfo.si_errno = EPIPE; sinfo.si_code = SI_ASYNCIO; diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 5f0dfb2abb8d..3153c9ea51bf 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -515,7 +515,7 @@ void put_seccomp_filter(struct task_struct *tsk) static void seccomp_init_siginfo(siginfo_t *info, int syscall, int reason) { - memset(info, 0, sizeof(*info)); + clear_siginfo(info); info->si_signo = SIGSYS; info->si_code = SYS_SECCOMP; info->si_call_addr = (void __user *)KSTK_EIP(current); diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index ec999f32c840..75043046914e 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -462,7 +462,7 @@ static struct k_itimer * alloc_posix_timer(void) kmem_cache_free(posix_timers_cache, tmr); return NULL; } - memset(&tmr->sigq->info, 0, sizeof(siginfo_t)); + clear_siginfo(&tmr->sigq->info); return tmr; } -- cgit v1.2.3 From f8ec66014ffd95a783b1f9f3b62d7cadb96b78d5 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Thu, 18 Jan 2018 14:54:54 -0600 Subject: signal: Add send_sig_fault and force_sig_fault The vast majority of signals sent from architecture specific code are simple faults. Encapsulate this reality with two helper functions so that the nit-picky implementation of preparing a siginfo does not need to be repeated many times on each architecture. As only some architectures support the trapno field, make the trapno arguement only present on those architectures. Similary as ia64 has three fields: imm, flags, and isr that are specific to it. Have those arguments always present on ia64 and no where else. This ensures the architecture specific code always remembers which fields it needs to pass into the siginfo structure. Signed-off-by: "Eric W. 
Biederman" --- include/linux/sched/signal.h | 20 +++++++++++++++++++ kernel/signal.c | 47 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 67 insertions(+) (limited to 'kernel') diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h index 0aa4548fb492..375f31eb3b6b 100644 --- a/include/linux/sched/signal.h +++ b/include/linux/sched/signal.h @@ -285,6 +285,26 @@ static inline void kernel_signal_stop(void) schedule(); } +#ifdef __ARCH_SI_TRAPNO +# define ___ARCH_SI_TRAPNO(_a1) , _a1 +#else +# define ___ARCH_SI_TRAPNO(_a1) +#endif +#ifdef __ia64__ +# define ___ARCH_SI_IA64(_a1, _a2, _a3) , _a1, _a2, _a3 +#else +# define ___ARCH_SI_IA64(_a1, _a2, _a3) +#endif + +int force_sig_fault(int sig, int code, void __user *addr + ___ARCH_SI_TRAPNO(int trapno) + ___ARCH_SI_IA64(int imm, unsigned int flags, unsigned long isr) + , struct task_struct *t); +int send_sig_fault(int sig, int code, void __user *addr + ___ARCH_SI_TRAPNO(int trapno) + ___ARCH_SI_IA64(int imm, unsigned int flags, unsigned long isr) + , struct task_struct *t); + extern int send_sig_info(int, struct siginfo *, struct task_struct *); extern int force_sigsegv(int, struct task_struct *); extern int force_sig_info(int, struct siginfo *, struct task_struct *); diff --git a/kernel/signal.c b/kernel/signal.c index f14492ff976f..15ec7b3cbe69 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1491,6 +1491,53 @@ force_sigsegv(int sig, struct task_struct *p) return 0; } +int force_sig_fault(int sig, int code, void __user *addr + ___ARCH_SI_TRAPNO(int trapno) + ___ARCH_SI_IA64(int imm, unsigned int flags, unsigned long isr) + , struct task_struct *t) +{ + struct siginfo info; + + clear_siginfo(&info); + info.si_signo = sig; + info.si_errno = 0; + info.si_code = code; + info.si_addr = addr; +#ifdef __ARCH_SI_TRAPNO + info.si_trapno = trapno; +#endif +#ifdef __ia64__ + info.si_imm = imm; + info.si_flags = flags; + info.si_isr = isr; +#endif + return force_sig_info(info.si_signo, &info, t); +} + +int send_sig_fault(int sig, int code, void __user *addr + ___ARCH_SI_TRAPNO(int trapno) + ___ARCH_SI_IA64(int imm, unsigned int flags, unsigned long isr) + , struct task_struct *t) +{ + struct siginfo info; + + clear_siginfo(&info); + info.si_signo = sig; + info.si_errno = 0; + info.si_code = code; + info.si_addr = addr; +#ifdef __ARCH_SI_TRAPNO + info.si_trapno = trapno; +#endif +#ifdef __ia64__ + info.si_imm = imm; + info.si_flags = flags; + info.si_isr = isr; +#endif + return send_sig_info(info.si_signo, &info, t); +} + + int kill_pgrp(struct pid *pid, int sig, int priv) { int ret; -- cgit v1.2.3 From 382467358ac9675b1b6814400a9a9e36dc7da14f Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Thu, 18 Jan 2018 18:54:31 -0600 Subject: signal: Helpers for faults with specialized siginfo layouts The helpers added are: send_sig_mceerr force_sig_mceerr force_sig_bnderr force_sig_pkuerr Filling out siginfo properly can ge tricky. Especially for these specialized cases where the temptation is to share code with other cases which use a different subset of siginfo fields. Unfortunately that code sharing frequently results in bugs with the wrong siginfo fields filled in, and makes it harder to verify that the siginfo structure was properly initialized. Provide these helpers instead that get all of the details right, and guarantee that siginfo is properly initialized. send_sig_mceerr and force_sig_mceer are a little special as two si codes BUS_MCEERR_AO and BUS_MCEER_AR both use the same extended signinfo layout. 
Signed-off-by: "Eric W. Biederman" --- include/linux/sched/signal.h | 6 +++++ kernel/signal.c | 61 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 67 insertions(+) (limited to 'kernel') diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h index 375f31eb3b6b..944fe6356f4a 100644 --- a/include/linux/sched/signal.h +++ b/include/linux/sched/signal.h @@ -305,6 +305,12 @@ int send_sig_fault(int sig, int code, void __user *addr ___ARCH_SI_IA64(int imm, unsigned int flags, unsigned long isr) , struct task_struct *t); +int force_sig_mceerr(int code, void __user *, short, struct task_struct *); +int send_sig_mceerr(int code, void __user *, short, struct task_struct *); + +int force_sig_bnderr(void __user *addr, void __user *lower, void __user *upper); +int force_sig_pkuerr(void __user *addr, u32 pkey); + extern int send_sig_info(int, struct siginfo *, struct task_struct *); extern int force_sigsegv(int, struct task_struct *); extern int force_sig_info(int, struct siginfo *, struct task_struct *); diff --git a/kernel/signal.c b/kernel/signal.c index 15ec7b3cbe69..4f6300ef8062 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1537,6 +1537,67 @@ int send_sig_fault(int sig, int code, void __user *addr return send_sig_info(info.si_signo, &info, t); } +#if defined(BUS_MCEERR_AO) && defined(BUS_MCEERR_AR) +int force_sig_mceerr(int code, void __user *addr, short lsb, struct task_struct *t) +{ + struct siginfo info; + + WARN_ON((code != BUS_MCEERR_AO) && (code != BUS_MCEERR_AR)); + clear_siginfo(&info); + info.si_signo = SIGBUS; + info.si_errno = 0; + info.si_code = code; + info.si_addr = addr; + info.si_addr_lsb = lsb; + return force_sig_info(info.si_signo, &info, t); +} + +int send_sig_mceerr(int code, void __user *addr, short lsb, struct task_struct *t) +{ + struct siginfo info; + + WARN_ON((code != BUS_MCEERR_AO) && (code != BUS_MCEERR_AR)); + clear_siginfo(&info); + info.si_signo = SIGBUS; + info.si_errno = 0; + info.si_code = code; + info.si_addr = addr; + info.si_addr_lsb = lsb; + return send_sig_info(info.si_signo, &info, t); +} +EXPORT_SYMBOL(send_sig_mceerr); +#endif + +#ifdef SEGV_BNDERR +int force_sig_bnderr(void __user *addr, void __user *lower, void __user *upper) +{ + struct siginfo info; + + clear_siginfo(&info); + info.si_signo = SIGSEGV; + info.si_errno = 0; + info.si_code = SEGV_BNDERR; + info.si_addr = addr; + info.si_lower = lower; + info.si_upper = upper; + return force_sig_info(info.si_signo, &info, current); +} +#endif + +#ifdef SEGV_PKUERR +int force_sig_pkuerr(void __user *addr, u32 pkey) +{ + struct siginfo info; + + clear_siginfo(&info); + info.si_signo = SIGSEGV; + info.si_errno = 0; + info.si_code = SEGV_PKUERR; + info.si_addr = addr; + info.si_pkey = pkey; + return force_sig_info(info.si_signo, &info, current); +} +#endif int kill_pgrp(struct pid *pid, int sig, int priv) { -- cgit v1.2.3 From f71dd7dc2dc989dc712b246a74d243e4b2c5f8a7 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 22 Jan 2018 14:37:25 -0600 Subject: signal/ptrace: Add force_sig_ptrace_errno_trap and use it where needed There are so many places that build struct siginfo by hand that at least one of them is bound to get it wrong. A handful of cases in the kernel arguably did just that when using the errno field of siginfo to pass no errno values to userspace. The usage is limited to a single si_code so at least does not mess up anything else. Encapsulate this questionable pattern in a helper function so that the userspace ABI is preserved. 
Update all of the places that use this pattern to use the new helper function. Signed-off-by: "Eric W. Biederman" --- arch/arm/kernel/ptrace.c | 8 +------- arch/arm64/kernel/ptrace.c | 6 ++++-- arch/powerpc/kernel/process.c | 9 ++------- arch/xtensa/kernel/ptrace.c | 8 +------- include/linux/sched/signal.h | 2 ++ kernel/signal.c | 15 +++++++++++++++ 6 files changed, 25 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/arch/arm/kernel/ptrace.c b/arch/arm/kernel/ptrace.c index 58e3771e4c5b..7724b0f661b3 100644 --- a/arch/arm/kernel/ptrace.c +++ b/arch/arm/kernel/ptrace.c @@ -390,7 +390,6 @@ static void ptrace_hbptriggered(struct perf_event *bp, struct arch_hw_breakpoint *bkpt = counter_arch_bp(bp); long num; int i; - siginfo_t info; for (i = 0; i < ARM_MAX_HBP_SLOTS; ++i) if (current->thread.debug.hbp[i] == bp) @@ -398,12 +397,7 @@ static void ptrace_hbptriggered(struct perf_event *bp, num = (i == ARM_MAX_HBP_SLOTS) ? 0 : ptrace_hbp_idx_to_num(i); - info.si_signo = SIGTRAP; - info.si_errno = (int)num; - info.si_code = TRAP_HWBKPT; - info.si_addr = (void __user *)(bkpt->trigger); - - force_sig_info(SIGTRAP, &info, current); + force_sig_ptrace_errno_trap((int)num, (void __user *)(bkpt->trigger)); } /* diff --git a/arch/arm64/kernel/ptrace.c b/arch/arm64/kernel/ptrace.c index 95daa1478a7c..6618036ae6d4 100644 --- a/arch/arm64/kernel/ptrace.c +++ b/arch/arm64/kernel/ptrace.c @@ -190,21 +190,23 @@ static void ptrace_hbptriggered(struct perf_event *bp, #ifdef CONFIG_COMPAT if (is_compat_task()) { + int si_errno = 0; int i; for (i = 0; i < ARM_MAX_BRP; ++i) { if (current->thread.debug.hbp_break[i] == bp) { - info.si_errno = (i << 1) + 1; + si_errno = (i << 1) + 1; break; } } for (i = 0; i < ARM_MAX_WRP; ++i) { if (current->thread.debug.hbp_watch[i] == bp) { - info.si_errno = -((i << 1) + 1); + si_errno = -((i << 1) + 1); break; } } + force_sig_ptrace_errno_trap(si_errno, (void __user *)bkpt->trigger); } #endif force_sig_info(SIGTRAP, &info, current); diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index bfb48cf56bc3..4208cbe2fb7f 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -603,19 +603,14 @@ EXPORT_SYMBOL(flush_all_to_thread); void do_send_trap(struct pt_regs *regs, unsigned long address, unsigned long error_code, int breakpt) { - siginfo_t info; - current->thread.trap_nr = TRAP_HWBKPT; if (notify_die(DIE_DABR_MATCH, "dabr_match", regs, error_code, 11, SIGSEGV) == NOTIFY_STOP) return; /* Deliver the signal to userspace */ - info.si_signo = SIGTRAP; - info.si_errno = breakpt; /* breakpoint or watchpoint id */ - info.si_code = TRAP_HWBKPT; - info.si_addr = (void __user *)address; - force_sig_info(SIGTRAP, &info, current); + force_sig_ptrace_errno_trap(breakpt, /* breakpoint or watchpoint id */ + (void __user *)address); } #else /* !CONFIG_PPC_ADV_DEBUG_REGS */ void do_break (struct pt_regs *regs, unsigned long address, diff --git a/arch/xtensa/kernel/ptrace.c b/arch/xtensa/kernel/ptrace.c index e2461968efb2..c0845cb1cbb9 100644 --- a/arch/xtensa/kernel/ptrace.c +++ b/arch/xtensa/kernel/ptrace.c @@ -278,7 +278,6 @@ static void ptrace_hbptriggered(struct perf_event *bp, struct pt_regs *regs) { int i; - siginfo_t info; struct arch_hw_breakpoint *bkpt = counter_arch_bp(bp); if (bp->attr.bp_type & HW_BREAKPOINT_X) { @@ -293,12 +292,7 @@ static void ptrace_hbptriggered(struct perf_event *bp, i = (i << 1) | 1; } - info.si_signo = SIGTRAP; - info.si_errno = i; - info.si_code = TRAP_HWBKPT; - info.si_addr = (void __user 
*)bkpt->address; - - force_sig_info(SIGTRAP, &info, current); + force_sig_ptrace_errno_trap(i, (void __user *)bkpt->address); } static struct perf_event *ptrace_hbp_create(struct task_struct *tsk, int type) diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h index 944fe6356f4a..23b4f9cb82db 100644 --- a/include/linux/sched/signal.h +++ b/include/linux/sched/signal.h @@ -311,6 +311,8 @@ int send_sig_mceerr(int code, void __user *, short, struct task_struct *); int force_sig_bnderr(void __user *addr, void __user *lower, void __user *upper); int force_sig_pkuerr(void __user *addr, u32 pkey); +int force_sig_ptrace_errno_trap(int errno, void __user *addr); + extern int send_sig_info(int, struct siginfo *, struct task_struct *); extern int force_sigsegv(int, struct task_struct *); extern int force_sig_info(int, struct siginfo *, struct task_struct *); diff --git a/kernel/signal.c b/kernel/signal.c index 4f6300ef8062..e549174c0831 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1599,6 +1599,21 @@ int force_sig_pkuerr(void __user *addr, u32 pkey) } #endif +/* For the crazy architectures that include trap information in + * the errno field, instead of an actual errno value. + */ +int force_sig_ptrace_errno_trap(int errno, void __user *addr) +{ + struct siginfo info; + + clear_siginfo(&info); + info.si_signo = SIGTRAP; + info.si_errno = errno; + info.si_code = TRAP_HWBKPT; + info.si_addr = addr; + return force_sig_info(info.si_signo, &info, current); +} + int kill_pgrp(struct pid *pid, int sig, int priv) { int ret; -- cgit v1.2.3 From 2310035fa03f651dd5b03f19a26a97512aa8842c Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Mon, 22 Jan 2018 22:53:51 -0800 Subject: bpf: fix incorrect kmalloc usage in lpm_trie MAP_GET_NEXT_KEY rcu region In commit b471f2f1de8b ("bpf: implement MAP_GET_NEXT_KEY command for LPM_TRIE map"), the implemented MAP_GET_NEXT_KEY callback function is guarded with rcu read lock. In the function body, "kmalloc(size, GFP_USER | __GFP_NOWARN)" is used, which may sleep and violate the rcu read lock region requirements. This patch fixes the issue by using GFP_ATOMIC instead, to avoid a blocking kmalloc. Tested with CONFIG_DEBUG_ATOMIC_SLEEP=y as suggested by Eric Dumazet. Fixes: b471f2f1de8b ("bpf: implement MAP_GET_NEXT_KEY command for LPM_TRIE map") Signed-off-by: Yonghong Song Reported-by: syzbot Reviewed-by: Eric Dumazet Signed-off-by: Daniel Borkmann --- kernel/bpf/lpm_trie.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index d7ea96218516..8f083ea9d4b9 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -624,7 +624,7 @@ static int trie_get_next_key(struct bpf_map *map, void *_key, void *_next_key) } node_stack = kmalloc(trie->max_prefixlen * sizeof(struct lpm_trie_node *), - GFP_USER | __GFP_NOWARN); + GFP_ATOMIC | __GFP_NOWARN); if (!node_stack) return -ENOMEM; -- cgit v1.2.3 From 921a7acd85ebbab1b3cd99828e6842fd3e78df24 Mon Sep 17 00:00:00 2001 From: Changbin Du Date: Tue, 16 Jan 2018 17:02:28 +0800 Subject: tracing: Detect the string nul character when parsing user input string User space can pass in a C nul character '\0' along with its input. The function trace_get_user() will try to process it as a normal character, and that will fail to parse. open("/sys/kernel/debug/tracing//set_ftrace_pid", O_WRONLY|O_TRUNC) = 3 write(3, " \0", 2) = -1 EINVAL (Invalid argument) The parser can handle spaces, however, so the following work:
$ echo "" > set_ftrace_pid $ echo " " > set_ftrace_pid $ echo -n " " > set_ftrace_pid Have the parser stop on '\0' and cease any further parsing. Only process the characters up to the nul '\0' character and do not process it. Link: http://lkml.kernel.org/r/1516093350-12045-2-git-send-email-changbin.du@intel.com Acked-by: Namhyung Kim Signed-off-by: Changbin Du Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 8e3f20a18a06..c00a31d18f8a 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1237,7 +1237,7 @@ int trace_get_user(struct trace_parser *parser, const char __user *ubuf, } /* only spaces were written */ - if (isspace(ch)) { + if (isspace(ch) || !ch) { *ppos += read; ret = read; goto out; @@ -1247,7 +1247,7 @@ int trace_get_user(struct trace_parser *parser, const char __user *ubuf, } /* read the non-space input */ - while (cnt && !isspace(ch)) { + while (cnt && !isspace(ch) && ch) { if (parser->idx < parser->size - 1) parser->buffer[parser->idx++] = ch; else { @@ -1262,7 +1262,7 @@ int trace_get_user(struct trace_parser *parser, const char __user *ubuf, } /* We either got finished input or we have to wait for another call. */ - if (isspace(ch)) { + if (isspace(ch) || !ch) { parser->buffer[parser->idx] = 0; parser->cont = false; } else if (parser->idx < parser->size - 1) { -- cgit v1.2.3 From 76638d96502744b0d593f2386b75ae5a017c13bb Mon Sep 17 00:00:00 2001 From: Changbin Du Date: Tue, 16 Jan 2018 17:02:29 +0800 Subject: tracing: Clear parser->idx if only spaces are read If only spaces were read while parsing the next string, then parser->idx should be cleared in order to make trace_parser_loaded() return false. Link: http://lkml.kernel.org/r/1516093350-12045-3-git-send-email-changbin.du@intel.com Acked-by: Namhyung Kim Signed-off-by: Changbin Du Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index c00a31d18f8a..cb90435e63da 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1236,14 +1236,14 @@ int trace_get_user(struct trace_parser *parser, const char __user *ubuf, cnt--; } + parser->idx = 0; + /* only spaces were written */ if (isspace(ch) || !ch) { *ppos += read; ret = read; goto out; } - - parser->idx = 0; } /* read the non-space input */ -- cgit v1.2.3 From f4d0706cde27f29ff89e6bf94ded4113f8fe6e80 Mon Sep 17 00:00:00 2001 From: Changbin Du Date: Tue, 16 Jan 2018 17:02:30 +0800 Subject: tracing: Make sure the parsed string always terminates with '\0' Always mark the parsed string with a terminated nul '\0' character. This removes the need for the users to have to append the '\0' before using the parsed string. 
Link: http://lkml.kernel.org/r/1516093350-12045-4-git-send-email-changbin.du@intel.com Acked-by: Namhyung Kim Signed-off-by: Changbin Du Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/ftrace.c | 2 -- kernel/trace/trace.c | 4 ++-- kernel/trace/trace_events.c | 2 -- 3 files changed, 2 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 554b517c61a0..dabd9d167d42 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -5015,7 +5015,6 @@ int ftrace_regex_release(struct inode *inode, struct file *file) parser = &iter->parser; if (trace_parser_loaded(parser)) { - parser->buffer[parser->idx] = 0; ftrace_match_records(iter->hash, parser->buffer, parser->idx); } @@ -5329,7 +5328,6 @@ ftrace_graph_release(struct inode *inode, struct file *file) parser = &fgd->parser; if (trace_parser_loaded((parser))) { - parser->buffer[parser->idx] = 0; ret = ftrace_graph_set_hash(fgd->new_hash, parser->buffer); } diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index cb90435e63da..58de825df19c 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -530,8 +530,6 @@ int trace_pid_write(struct trace_pid_list *filtered_pids, ubuf += ret; cnt -= ret; - parser.buffer[parser.idx] = 0; - ret = -EINVAL; if (kstrtoul(parser.buffer, 0, &val)) break; @@ -1268,6 +1266,8 @@ int trace_get_user(struct trace_parser *parser, const char __user *ubuf, } else if (parser->idx < parser->size - 1) { parser->cont = true; parser->buffer[parser->idx++] = ch; + /* Make sure the parsed string always terminates with '\0'. */ + parser->buffer[parser->idx] = 0; } else { ret = -EINVAL; goto out; diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 1b87157edbff..05c7172c6667 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -885,8 +885,6 @@ ftrace_event_write(struct file *file, const char __user *ubuf, if (*parser.buffer == '!') set = 0; - parser.buffer[parser.idx] = 0; - ret = ftrace_set_clr_event(tr, parser.buffer + !set, set); if (ret) goto out_put; -- cgit v1.2.3 From 0e4d819d0893dc043ea7b7cb6baf4be1e310bd96 Mon Sep 17 00:00:00 2001 From: Ravi Bangoria Date: Sat, 6 Jan 2018 11:12:46 +0530 Subject: trace_uprobe: Display correct offset in uprobe_events Recently, how pointers are printed with %p was changed by commit ad67b74d2469 ("printk: hash addresses printed with %p"). This causes a regression when showing the offset in the uprobe_events file. Instead of %p, use %px to display the offset.
Before patch: # perf probe -vv -x /tmp/a.out main Opening /sys/kernel/debug/tracing//uprobe_events write=1 Writing event: p:probe_a/main /tmp/a.out:0x58c # cat /sys/kernel/debug/tracing/uprobe_events p:probe_a/main /tmp/a.out:0x0000000049a0f352 After patch: # cat /sys/kernel/debug/tracing/uprobe_events p:probe_a/main /tmp/a.out:0x000000000000058c Link: http://lkml.kernel.org/r/20180106054246.15375-1-ravi.bangoria@linux.vnet.ibm.com Cc: stable@vger.kernel.org Fixes: ad67b74d2469 ("printk: hash addresses printed with %p") Acked-by: Srikar Dronamraju Signed-off-by: Ravi Bangoria Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_uprobe.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 40592e7b3568..268029ae1be6 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -608,7 +608,7 @@ static int probes_seq_show(struct seq_file *m, void *v) /* Don't print "0x (null)" when offset is 0 */ if (tu->offset) { - seq_printf(m, "0x%p", (void *)tu->offset); + seq_printf(m, "0x%px", (void *)tu->offset); } else { switch (sizeof(void *)) { case 4: -- cgit v1.2.3 From dd3dad0d716dbb9fccadcc8709900d9cac6ca252 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 21 Dec 2017 15:37:32 -0800 Subject: ftrace: Mark function tracer test functions noinline/noclone The ftrace function tracer self tests call some functions to verify that they get traced. This relies on them not being inlined. Previously this was ensured by putting them into another file, but with LTO the compiler can inline across files, which makes the tests fail. Mark these functions as noinline and noclone. Link: http://lkml.kernel.org/r/20171221233732.31896-1-andi@firstfloor.org Signed-off-by: Andi Kleen Signed-off-by: Steven Rostedt (VMware) --- kernel/trace/trace_selftest_dynamic.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_selftest_dynamic.c b/kernel/trace/trace_selftest_dynamic.c index 8cda06a10d66..c364cf777e1a 100644 --- a/kernel/trace/trace_selftest_dynamic.c +++ b/kernel/trace/trace_selftest_dynamic.c @@ -1,13 +1,14 @@ // SPDX-License-Identifier: GPL-2.0 +#include #include "trace.h" -int DYN_FTRACE_TEST_NAME(void) +noinline __noclone int DYN_FTRACE_TEST_NAME(void) { /* used to call mcount */ return 0; } -int DYN_FTRACE_TEST_NAME2(void) +noinline __noclone int DYN_FTRACE_TEST_NAME2(void) { /* used to call mcount */ return 0; -- cgit v1.2.3 From 9c147b56fc7165856da9c510463fafc2f0d58d5f Mon Sep 17 00:00:00 2001 From: Mickaël Salaün Date: Fri, 26 Jan 2018 00:54:02 +0100 Subject: bpf: Use the IS_FD_ARRAY() macro in map_update_elem() Make the code more readable.
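For reference, IS_FD_ARRAY() expands to the same four-way map-type test that the open-coded condition performs; a sketch of the macro as defined elsewhere in kernel/bpf/syscall.c (quoted from memory, not from this patch):

    #define IS_FD_ARRAY(map) ((map)->map_type == BPF_MAP_TYPE_PROG_ARRAY || \
                              (map)->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || \
                              (map)->map_type == BPF_MAP_TYPE_CGROUP_ARRAY || \
                              (map)->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS)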
Signed-off-by: Mickaël Salaün Cc: Alexei Starovoitov Cc: Daniel Borkmann Signed-off-by: Alexei Starovoitov --- kernel/bpf/syscall.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 5bdb0cc84ad2..e24aa3241387 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -709,10 +709,7 @@ static int map_update_elem(union bpf_attr *attr) err = bpf_percpu_hash_update(map, key, value, attr->flags); } else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) { err = bpf_percpu_array_update(map, key, value, attr->flags); - } else if (map->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || - map->map_type == BPF_MAP_TYPE_PROG_ARRAY || - map->map_type == BPF_MAP_TYPE_CGROUP_ARRAY || - map->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS) { + } else if (IS_FD_ARRAY(map)) { rcu_read_lock(); err = bpf_fd_array_map_update_elem(map, f.file, key, value, attr->flags); -- cgit v1.2.3 From 2a5418a13fcfbb1f13a847eedb9a8e30a9ead765 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 26 Jan 2018 23:33:37 +0100 Subject: bpf: improve dead code sanitizing Given we recently had c131187db2d3 ("bpf: fix branch pruning logic") and 95a762e2c8c9 ("bpf: fix incorrect sign extension in check_alu_op()") in particular, where before the verifier skipped verification of the wrongly assumed dead branch, we should not just replace the dead code parts with nops (mov r0,r0). If there is again a bug such as the one fixed in 95a762e2c8c9 in the future, where the runtime could execute those insns, then one of the potential issues with the current setting would be that given the nops would be at the end of the program, we could execute out of bounds at some point. The best option in such a case would be to just exit the BPF program altogether and return an exception code. However, given this would require two instructions, and such a dead code gap could just be a single insn long, we would need to place an 'r0 = X; ret' snippet at the very end after the user program or at the start before the program (where we'd skip that region on prog entry), and then place unconditional ja's into the dead code gap. While more complex, this would be possible; however, there is still another roadblock that currently prevents it, namely BPF to BPF calls. The issue here is that such an exception could be returned from a callee, but the caller would not know that it's an exception that needs to be propagated further down. An alternative that has little complexity is to just use a 'ja -1' code for now, which will trap the execution here instead of silently doing bad things if we ever get there due to bugs. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index dfb138b46488..8365259a7c5a 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5064,14 +5064,21 @@ static struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 of return new_prog; } -/* The verifier does more data flow analysis than llvm and will not explore - * branches that are dead at run time. Malicious programs can have dead code - * too. Therefore replace all dead at-run-time code with nops. +/* The verifier does more data flow analysis than llvm and will not + * explore branches that are dead at run time. Malicious programs can + * have dead code too.
Therefore replace all dead at-run-time code + * with 'ja -1'. + * + * Just nops are not optimal, e.g. if they would sit at the end of the + * program and through another bug we would manage to jump there, then + * we'd execute beyond program memory otherwise. Returning exception + * code also wouldn't work since we can have subprogs where the dead + * code could be located. */ static void sanitize_dead_code(struct bpf_verifier_env *env) { struct bpf_insn_aux_data *aux_data = env->insn_aux_data; - struct bpf_insn nop = BPF_MOV64_REG(BPF_REG_0, BPF_REG_0); + struct bpf_insn trap = BPF_JMP_IMM(BPF_JA, 0, 0, -1); struct bpf_insn *insn = env->prog->insnsi; const int insn_cnt = env->prog->len; int i; @@ -5079,7 +5086,7 @@ static void sanitize_dead_code(struct bpf_verifier_env *env) for (i = 0; i < insn_cnt; i++) { if (aux_data[i].seen) continue; - memcpy(insn + i, &nop, sizeof(nop)); + memcpy(insn + i, &trap, sizeof(trap)); } } -- cgit v1.2.3 From 5e581dad4fec0e6d062740dc35b8dc248b39d224 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 26 Jan 2018 23:33:38 +0100 Subject: bpf: make unknown opcode handling more robust Recent findings by syzcaller fixed in 7891a87efc71 ("bpf: arsh is not supported in 32 bit alu thus reject it") triggered a warning in the interpreter due to unknown opcode not being rejected by the verifier. The 'return 0' for an unknown opcode is really not optimal, since with BPF to BPF calls, this would go untracked by the verifier. Do two things here to improve the situation: i) perform basic insn sanity check early on in the verification phase and reject every non-uapi insn right there. The bpf_opcode_in_insntable() table reuses the same mapping as the jumptable in ___bpf_prog_run() sans the non-public mappings. And ii) in ___bpf_prog_run() we do need to BUG in the case where the verifier would ever create an unknown opcode due to some rewrites. Note that JITs do not have such issues since they would punt to interpreter in these situations. Moreover, the BPF_JIT_ALWAYS_ON would also help to avoid such unknown opcodes in the first place. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: Alexei Starovoitov --- include/linux/filter.h | 2 + kernel/bpf/core.c | 250 ++++++++++++++++++++++++++++--------------------- kernel/bpf/verifier.c | 7 ++ 3 files changed, 154 insertions(+), 105 deletions(-) (limited to 'kernel') diff --git a/include/linux/filter.h b/include/linux/filter.h index 20384c4bed25..276932d75975 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -688,6 +688,8 @@ static inline int sk_filter(struct sock *sk, struct sk_buff *skb) struct bpf_prog *bpf_prog_select_runtime(struct bpf_prog *fp, int *err); void bpf_prog_free(struct bpf_prog *fp); +bool bpf_opcode_in_insntable(u8 code); + struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags); struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size, gfp_t gfp_extra_flags); diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 3aa0658add76..8301de2d1f96 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -782,6 +782,137 @@ noinline u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) } EXPORT_SYMBOL_GPL(__bpf_call_base); +/* All UAPI available opcodes. */ +#define BPF_INSN_MAP(INSN_2, INSN_3) \ + /* 32 bit ALU operations. */ \ + /* Register based. 
*/ \ + INSN_3(ALU, ADD, X), \ + INSN_3(ALU, SUB, X), \ + INSN_3(ALU, AND, X), \ + INSN_3(ALU, OR, X), \ + INSN_3(ALU, LSH, X), \ + INSN_3(ALU, RSH, X), \ + INSN_3(ALU, XOR, X), \ + INSN_3(ALU, MUL, X), \ + INSN_3(ALU, MOV, X), \ + INSN_3(ALU, DIV, X), \ + INSN_3(ALU, MOD, X), \ + INSN_2(ALU, NEG), \ + INSN_3(ALU, END, TO_BE), \ + INSN_3(ALU, END, TO_LE), \ + /* Immediate based. */ \ + INSN_3(ALU, ADD, K), \ + INSN_3(ALU, SUB, K), \ + INSN_3(ALU, AND, K), \ + INSN_3(ALU, OR, K), \ + INSN_3(ALU, LSH, K), \ + INSN_3(ALU, RSH, K), \ + INSN_3(ALU, XOR, K), \ + INSN_3(ALU, MUL, K), \ + INSN_3(ALU, MOV, K), \ + INSN_3(ALU, DIV, K), \ + INSN_3(ALU, MOD, K), \ + /* 64 bit ALU operations. */ \ + /* Register based. */ \ + INSN_3(ALU64, ADD, X), \ + INSN_3(ALU64, SUB, X), \ + INSN_3(ALU64, AND, X), \ + INSN_3(ALU64, OR, X), \ + INSN_3(ALU64, LSH, X), \ + INSN_3(ALU64, RSH, X), \ + INSN_3(ALU64, XOR, X), \ + INSN_3(ALU64, MUL, X), \ + INSN_3(ALU64, MOV, X), \ + INSN_3(ALU64, ARSH, X), \ + INSN_3(ALU64, DIV, X), \ + INSN_3(ALU64, MOD, X), \ + INSN_2(ALU64, NEG), \ + /* Immediate based. */ \ + INSN_3(ALU64, ADD, K), \ + INSN_3(ALU64, SUB, K), \ + INSN_3(ALU64, AND, K), \ + INSN_3(ALU64, OR, K), \ + INSN_3(ALU64, LSH, K), \ + INSN_3(ALU64, RSH, K), \ + INSN_3(ALU64, XOR, K), \ + INSN_3(ALU64, MUL, K), \ + INSN_3(ALU64, MOV, K), \ + INSN_3(ALU64, ARSH, K), \ + INSN_3(ALU64, DIV, K), \ + INSN_3(ALU64, MOD, K), \ + /* Call instruction. */ \ + INSN_2(JMP, CALL), \ + /* Exit instruction. */ \ + INSN_2(JMP, EXIT), \ + /* Jump instructions. */ \ + /* Register based. */ \ + INSN_3(JMP, JEQ, X), \ + INSN_3(JMP, JNE, X), \ + INSN_3(JMP, JGT, X), \ + INSN_3(JMP, JLT, X), \ + INSN_3(JMP, JGE, X), \ + INSN_3(JMP, JLE, X), \ + INSN_3(JMP, JSGT, X), \ + INSN_3(JMP, JSLT, X), \ + INSN_3(JMP, JSGE, X), \ + INSN_3(JMP, JSLE, X), \ + INSN_3(JMP, JSET, X), \ + /* Immediate based. */ \ + INSN_3(JMP, JEQ, K), \ + INSN_3(JMP, JNE, K), \ + INSN_3(JMP, JGT, K), \ + INSN_3(JMP, JLT, K), \ + INSN_3(JMP, JGE, K), \ + INSN_3(JMP, JLE, K), \ + INSN_3(JMP, JSGT, K), \ + INSN_3(JMP, JSLT, K), \ + INSN_3(JMP, JSGE, K), \ + INSN_3(JMP, JSLE, K), \ + INSN_3(JMP, JSET, K), \ + INSN_2(JMP, JA), \ + /* Store instructions. */ \ + /* Register based. */ \ + INSN_3(STX, MEM, B), \ + INSN_3(STX, MEM, H), \ + INSN_3(STX, MEM, W), \ + INSN_3(STX, MEM, DW), \ + INSN_3(STX, XADD, W), \ + INSN_3(STX, XADD, DW), \ + /* Immediate based. */ \ + INSN_3(ST, MEM, B), \ + INSN_3(ST, MEM, H), \ + INSN_3(ST, MEM, W), \ + INSN_3(ST, MEM, DW), \ + /* Load instructions. */ \ + /* Register based. */ \ + INSN_3(LDX, MEM, B), \ + INSN_3(LDX, MEM, H), \ + INSN_3(LDX, MEM, W), \ + INSN_3(LDX, MEM, DW), \ + /* Immediate based. */ \ + INSN_3(LD, IMM, DW), \ + /* Misc (old cBPF carry-over). */ \ + INSN_3(LD, ABS, B), \ + INSN_3(LD, ABS, H), \ + INSN_3(LD, ABS, W), \ + INSN_3(LD, IND, B), \ + INSN_3(LD, IND, H), \ + INSN_3(LD, IND, W) + +bool bpf_opcode_in_insntable(u8 code) +{ +#define BPF_INSN_2_TBL(x, y) [BPF_##x | BPF_##y] = true +#define BPF_INSN_3_TBL(x, y, z) [BPF_##x | BPF_##y | BPF_##z] = true + static const bool public_insntable[256] = { + [0 ... 255] = false, + /* Now overwrite non-defaults ... 
*/ + BPF_INSN_MAP(BPF_INSN_2_TBL, BPF_INSN_3_TBL), + }; +#undef BPF_INSN_3_TBL +#undef BPF_INSN_2_TBL + return public_insntable[code]; +} + #ifndef CONFIG_BPF_JIT_ALWAYS_ON /** * __bpf_prog_run - run eBPF program on a given context @@ -793,115 +924,18 @@ EXPORT_SYMBOL_GPL(__bpf_call_base); static u64 ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn, u64 *stack) { u64 tmp; +#define BPF_INSN_2_LBL(x, y) [BPF_##x | BPF_##y] = &&x##_##y +#define BPF_INSN_3_LBL(x, y, z) [BPF_##x | BPF_##y | BPF_##z] = &&x##_##y##_##z static const void *jumptable[256] = { [0 ... 255] = &&default_label, /* Now overwrite non-defaults ... */ - /* 32 bit ALU operations */ - [BPF_ALU | BPF_ADD | BPF_X] = &&ALU_ADD_X, - [BPF_ALU | BPF_ADD | BPF_K] = &&ALU_ADD_K, - [BPF_ALU | BPF_SUB | BPF_X] = &&ALU_SUB_X, - [BPF_ALU | BPF_SUB | BPF_K] = &&ALU_SUB_K, - [BPF_ALU | BPF_AND | BPF_X] = &&ALU_AND_X, - [BPF_ALU | BPF_AND | BPF_K] = &&ALU_AND_K, - [BPF_ALU | BPF_OR | BPF_X] = &&ALU_OR_X, - [BPF_ALU | BPF_OR | BPF_K] = &&ALU_OR_K, - [BPF_ALU | BPF_LSH | BPF_X] = &&ALU_LSH_X, - [BPF_ALU | BPF_LSH | BPF_K] = &&ALU_LSH_K, - [BPF_ALU | BPF_RSH | BPF_X] = &&ALU_RSH_X, - [BPF_ALU | BPF_RSH | BPF_K] = &&ALU_RSH_K, - [BPF_ALU | BPF_XOR | BPF_X] = &&ALU_XOR_X, - [BPF_ALU | BPF_XOR | BPF_K] = &&ALU_XOR_K, - [BPF_ALU | BPF_MUL | BPF_X] = &&ALU_MUL_X, - [BPF_ALU | BPF_MUL | BPF_K] = &&ALU_MUL_K, - [BPF_ALU | BPF_MOV | BPF_X] = &&ALU_MOV_X, - [BPF_ALU | BPF_MOV | BPF_K] = &&ALU_MOV_K, - [BPF_ALU | BPF_DIV | BPF_X] = &&ALU_DIV_X, - [BPF_ALU | BPF_DIV | BPF_K] = &&ALU_DIV_K, - [BPF_ALU | BPF_MOD | BPF_X] = &&ALU_MOD_X, - [BPF_ALU | BPF_MOD | BPF_K] = &&ALU_MOD_K, - [BPF_ALU | BPF_NEG] = &&ALU_NEG, - [BPF_ALU | BPF_END | BPF_TO_BE] = &&ALU_END_TO_BE, - [BPF_ALU | BPF_END | BPF_TO_LE] = &&ALU_END_TO_LE, - /* 64 bit ALU operations */ - [BPF_ALU64 | BPF_ADD | BPF_X] = &&ALU64_ADD_X, - [BPF_ALU64 | BPF_ADD | BPF_K] = &&ALU64_ADD_K, - [BPF_ALU64 | BPF_SUB | BPF_X] = &&ALU64_SUB_X, - [BPF_ALU64 | BPF_SUB | BPF_K] = &&ALU64_SUB_K, - [BPF_ALU64 | BPF_AND | BPF_X] = &&ALU64_AND_X, - [BPF_ALU64 | BPF_AND | BPF_K] = &&ALU64_AND_K, - [BPF_ALU64 | BPF_OR | BPF_X] = &&ALU64_OR_X, - [BPF_ALU64 | BPF_OR | BPF_K] = &&ALU64_OR_K, - [BPF_ALU64 | BPF_LSH | BPF_X] = &&ALU64_LSH_X, - [BPF_ALU64 | BPF_LSH | BPF_K] = &&ALU64_LSH_K, - [BPF_ALU64 | BPF_RSH | BPF_X] = &&ALU64_RSH_X, - [BPF_ALU64 | BPF_RSH | BPF_K] = &&ALU64_RSH_K, - [BPF_ALU64 | BPF_XOR | BPF_X] = &&ALU64_XOR_X, - [BPF_ALU64 | BPF_XOR | BPF_K] = &&ALU64_XOR_K, - [BPF_ALU64 | BPF_MUL | BPF_X] = &&ALU64_MUL_X, - [BPF_ALU64 | BPF_MUL | BPF_K] = &&ALU64_MUL_K, - [BPF_ALU64 | BPF_MOV | BPF_X] = &&ALU64_MOV_X, - [BPF_ALU64 | BPF_MOV | BPF_K] = &&ALU64_MOV_K, - [BPF_ALU64 | BPF_ARSH | BPF_X] = &&ALU64_ARSH_X, - [BPF_ALU64 | BPF_ARSH | BPF_K] = &&ALU64_ARSH_K, - [BPF_ALU64 | BPF_DIV | BPF_X] = &&ALU64_DIV_X, - [BPF_ALU64 | BPF_DIV | BPF_K] = &&ALU64_DIV_K, - [BPF_ALU64 | BPF_MOD | BPF_X] = &&ALU64_MOD_X, - [BPF_ALU64 | BPF_MOD | BPF_K] = &&ALU64_MOD_K, - [BPF_ALU64 | BPF_NEG] = &&ALU64_NEG, - /* Call instruction */ - [BPF_JMP | BPF_CALL] = &&JMP_CALL, + BPF_INSN_MAP(BPF_INSN_2_LBL, BPF_INSN_3_LBL), + /* Non-UAPI available opcodes. 
*/ [BPF_JMP | BPF_CALL_ARGS] = &&JMP_CALL_ARGS, [BPF_JMP | BPF_TAIL_CALL] = &&JMP_TAIL_CALL, - /* Jumps */ - [BPF_JMP | BPF_JA] = &&JMP_JA, - [BPF_JMP | BPF_JEQ | BPF_X] = &&JMP_JEQ_X, - [BPF_JMP | BPF_JEQ | BPF_K] = &&JMP_JEQ_K, - [BPF_JMP | BPF_JNE | BPF_X] = &&JMP_JNE_X, - [BPF_JMP | BPF_JNE | BPF_K] = &&JMP_JNE_K, - [BPF_JMP | BPF_JGT | BPF_X] = &&JMP_JGT_X, - [BPF_JMP | BPF_JGT | BPF_K] = &&JMP_JGT_K, - [BPF_JMP | BPF_JLT | BPF_X] = &&JMP_JLT_X, - [BPF_JMP | BPF_JLT | BPF_K] = &&JMP_JLT_K, - [BPF_JMP | BPF_JGE | BPF_X] = &&JMP_JGE_X, - [BPF_JMP | BPF_JGE | BPF_K] = &&JMP_JGE_K, - [BPF_JMP | BPF_JLE | BPF_X] = &&JMP_JLE_X, - [BPF_JMP | BPF_JLE | BPF_K] = &&JMP_JLE_K, - [BPF_JMP | BPF_JSGT | BPF_X] = &&JMP_JSGT_X, - [BPF_JMP | BPF_JSGT | BPF_K] = &&JMP_JSGT_K, - [BPF_JMP | BPF_JSLT | BPF_X] = &&JMP_JSLT_X, - [BPF_JMP | BPF_JSLT | BPF_K] = &&JMP_JSLT_K, - [BPF_JMP | BPF_JSGE | BPF_X] = &&JMP_JSGE_X, - [BPF_JMP | BPF_JSGE | BPF_K] = &&JMP_JSGE_K, - [BPF_JMP | BPF_JSLE | BPF_X] = &&JMP_JSLE_X, - [BPF_JMP | BPF_JSLE | BPF_K] = &&JMP_JSLE_K, - [BPF_JMP | BPF_JSET | BPF_X] = &&JMP_JSET_X, - [BPF_JMP | BPF_JSET | BPF_K] = &&JMP_JSET_K, - /* Program return */ - [BPF_JMP | BPF_EXIT] = &&JMP_EXIT, - /* Store instructions */ - [BPF_STX | BPF_MEM | BPF_B] = &&STX_MEM_B, - [BPF_STX | BPF_MEM | BPF_H] = &&STX_MEM_H, - [BPF_STX | BPF_MEM | BPF_W] = &&STX_MEM_W, - [BPF_STX | BPF_MEM | BPF_DW] = &&STX_MEM_DW, - [BPF_STX | BPF_XADD | BPF_W] = &&STX_XADD_W, - [BPF_STX | BPF_XADD | BPF_DW] = &&STX_XADD_DW, - [BPF_ST | BPF_MEM | BPF_B] = &&ST_MEM_B, - [BPF_ST | BPF_MEM | BPF_H] = &&ST_MEM_H, - [BPF_ST | BPF_MEM | BPF_W] = &&ST_MEM_W, - [BPF_ST | BPF_MEM | BPF_DW] = &&ST_MEM_DW, - /* Load instructions */ - [BPF_LDX | BPF_MEM | BPF_B] = &&LDX_MEM_B, - [BPF_LDX | BPF_MEM | BPF_H] = &&LDX_MEM_H, - [BPF_LDX | BPF_MEM | BPF_W] = &&LDX_MEM_W, - [BPF_LDX | BPF_MEM | BPF_DW] = &&LDX_MEM_DW, - [BPF_LD | BPF_ABS | BPF_W] = &&LD_ABS_W, - [BPF_LD | BPF_ABS | BPF_H] = &&LD_ABS_H, - [BPF_LD | BPF_ABS | BPF_B] = &&LD_ABS_B, - [BPF_LD | BPF_IND | BPF_W] = &&LD_IND_W, - [BPF_LD | BPF_IND | BPF_H] = &&LD_IND_H, - [BPF_LD | BPF_IND | BPF_B] = &&LD_IND_B, - [BPF_LD | BPF_IMM | BPF_DW] = &&LD_IMM_DW, }; +#undef BPF_INSN_3_LBL +#undef BPF_INSN_2_LBL u32 tail_call_cnt = 0; void *ptr; int off; @@ -1302,8 +1336,14 @@ load_byte: goto load_byte; default_label: - /* If we ever reach this, we have a bug somewhere. */ - WARN_RATELIMIT(1, "unknown opcode %02x\n", insn->code); + /* If we ever reach this, we have a bug somewhere. Die hard here + * instead of just returning 0; we could be somewhere in a subprog, + * so execution could continue otherwise which we do /not/ want. + * + * Note, verifier whitelists all opcodes in bpf_opcode_in_insntable(). + */ + pr_warn("BPF interpreter: unknown opcode %02x\n", insn->code); + BUG_ON(1); return 0; } STACK_FRAME_NON_STANDARD(___bpf_prog_run); /* jump table */ diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 8365259a7c5a..0c5269415090 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -4981,6 +4981,13 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env) next_insn: insn++; i++; + continue; + } + + /* Basic sanity check before we invest more work here. 
*/ + if (!bpf_opcode_in_insntable(insn->code)) { + verbose(env, "unknown opcode %02x\n", insn->code); + return -EINVAL; } } -- cgit v1.2.3 From f6b1b3bf0d5f681631a293cfe1ca934b81716f1e Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 26 Jan 2018 23:33:39 +0100 Subject: bpf: fix subprog verifier bypass by div/mod by 0 exception One of the ugly leftovers from the early eBPF days is that div/mod operations based on registers have a hard-coded src_reg == 0 test in the interpreter as well as in JIT code generators that would return from the BPF program with exit code 0. This was basically adopted from cBPF interpreter for historical reasons. There are multiple reasons why this is very suboptimal and prone to bugs. To name one: the return code mapping for such abnormal program exit of 0 does not always match with a suitable program type's exit code mapping. For example, '0' in tc means action 'ok' where the packet gets passed further up the stack, which is just undesirable for such cases (e.g. when implementing policy) and also does not match with other program types. While trying to work out an exception handling scheme, I also noticed that programs crafted like the following will currently pass the verifier: 0: (bf) r6 = r1 1: (85) call pc+8 caller: R6=ctx(id=0,off=0,imm=0) R10=fp0,call_-1 callee: frame1: R1=ctx(id=0,off=0,imm=0) R10=fp0,call_1 10: (b4) (u32) r2 = (u32) 0 11: (b4) (u32) r3 = (u32) 1 12: (3c) (u32) r3 /= (u32) r2 13: (61) r0 = *(u32 *)(r1 +76) 14: (95) exit returning from callee: frame1: R0_w=pkt(id=0,off=0,r=0,imm=0) R1=ctx(id=0,off=0,imm=0) R2_w=inv0 R3_w=inv(id=0,umax_value=4294967295,var_off=(0x0; 0xffffffff)) R10=fp0,call_1 to caller at 2: R0_w=pkt(id=0,off=0,r=0,imm=0) R6=ctx(id=0,off=0,imm=0) R10=fp0,call_-1 from 14 to 2: R0=pkt(id=0,off=0,r=0,imm=0) R6=ctx(id=0,off=0,imm=0) R10=fp0,call_-1 2: (bf) r1 = r6 3: (61) r1 = *(u32 *)(r1 +80) 4: (bf) r2 = r0 5: (07) r2 += 8 6: (2d) if r2 > r1 goto pc+1 R0=pkt(id=0,off=0,r=8,imm=0) R1=pkt_end(id=0,off=0,imm=0) R2=pkt(id=0,off=8,r=8,imm=0) R6=ctx(id=0,off=0,imm=0) R10=fp0,call_-1 7: (71) r0 = *(u8 *)(r0 +0) 8: (b7) r0 = 1 9: (95) exit from 6 to 8: safe processed 16 insns (limit 131072), stack depth 0+0 Basically what happens is that in the subprog we make use of a div/mod by 0 exception and in the 'normal' subprog's exit path we just return skb->data back to the main prog. This has the implication that the verifier thinks we always get a pkt pointer in R0 while we still have the implicit 'return 0' from the div as an alternative unconditional return path earlier. Thus, R0 then contains 0, meaning back in the parent prog we get the address range of [0x0, skb->data_end] as read and writeable. Similar can be crafted with other pointer register types. Since i) BPF_ABS/IND is not allowed in programs that contain BPF to BPF calls (and generally it's also disadvised to use in native eBPF context), ii) unknown opcodes don't return zero anymore, iii) we don't return an exception code in dead branches, the only last missing case affected and to fix is the div/mod handling. What we would really need is some infrastructure to propagate exceptions all the way to the original prog unwinding the current stack and returning that code to the caller of the BPF program. In user space such exception handling for similar runtimes is typically implemented with setjmp(3) and longjmp(3) as one possibility which is not available in the kernel, though (kgdb used to implement it in kernel long time ago). 
I implemented a PoC exception handling mechanism into the BPF interpreter by porting setjmp()/longjmp() to x86_64 and adding a new internal BPF_ABRT opcode that can use a program-specific exception code for all exception cases we have (e.g. div/mod by 0, unknown opcodes, etc). While this seems to work in the constrained BPF environment (meaning, here, we don't need to deal with state e.g. from memory allocations that we would need to undo before going into exception state), it still has various drawbacks: i) we would need to implement the setjmp()/longjmp() for every arch supported in the kernel and for the x86_64, arm64, sparc64 JITs currently supporting calls, ii) it has unconditional additional cost on main program entry to store CPU register state in the initial setjmp() call, and we would need some way to pass the jmp_buf down into ___bpf_prog_run() for the main prog and all subprogs, but also storing on stack is not really nice (the other option would be per-cpu storage for this, but it also has the drawback that we need to disable preemption for every BPF program type). All in all this approach would add a lot of complexity. Another poor-man's solution would be to have some sort of additional shared register or scratch buffer to hold state for exceptions, and test that after every call return to chain returns and pass R0 all the way down to the BPF prog caller. This is also problematic in various ways: i) an additional register doesn't map well into JITs, and some other scratch space could only be on per-cpu storage, which, again, has the side-effect that this only works when we disable preemption, or somewhere in the input context which is not available everywhere either, and ii) this adds significant runtime overhead by putting conditionals after each and every call, as well as implementation complexity. Yet another option is to teach the verifier that div/mod can return an integer, which however is also complex to implement as the verifier would need to walk such a fake 'mov r0,; exit;' sequence and there would still be no guarantee that this would be propagated further down to the BPF caller as a proper exception code. For the parent prog, it is also not distinguishable from a normal return of a constant scalar value. The approach taken here is a completely different one with little complexity and no additional overhead involved in that we make use of the fact that a div/mod by 0 is undefined behavior. Instead of bailing out, we adapt the same behavior as on some major archs like ARMv8 [0] into eBPF as well: X div 0 results in 0, and X mod 0 results in X. The aarch64 and aarch32 ISA do not generate any traps or otherwise abort program execution for unsigned divides. I verified this also with a test program compiled by gcc and clang, and the behavior matches the spec. Going forward we adapt the eBPF verifier to emit such rewrites once div/mod by a register is seen. cBPF is not touched and will keep existing 'return 0' semantics. Given the options, this seems the most suitable of them all, also since major archs have similar schemes in place. Given this is all in the realm of undefined behavior, we still have the option to adapt if deemed necessary, and this way we would also have the option of more flexibility from the LLVM code generation side (which is then fully visible to the verifier). Thus, this patch i) fixes the panic seen in the above program and ii) doesn't bypass the verifier observations.
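In C terms, the runtime semantics that the verifier rewrite enforces are, as a sketch (dst and src standing for the destination and source registers):

    dst = (src == 0) ? 0   : dst / src;    /* BPF_DIV: X div 0 -> 0 */
    dst = (src == 0) ? dst : dst % src;    /* BPF_MOD: X mod 0 -> X */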
[0] ARM Architecture Reference Manual, ARMv8 [ARM DDI 0487B.b] http://infocenter.arm.com/help/topic/com.arm.doc.ddi0487b.b/DDI0487B_b_armv8_arm.pdf 1) aarch64 instruction set: section C3.4.7 and C6.2.279 (UDIV) "A division by zero results in a zero being written to the destination register, without any indication that the division by zero occurred." 2) aarch32 instruction set: section F1.4.8 and F5.1.263 (UDIV) "For the SDIV and UDIV instructions, division by zero always returns a zero result." Fixes: f4d7e40a5b71 ("bpf: introduce function calls (verification)") Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: Alexei Starovoitov --- kernel/bpf/core.c | 8 -------- kernel/bpf/verifier.c | 38 ++++++++++++++++++++++++++++++-------- net/core/filter.c | 9 ++++++++- 3 files changed, 38 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 8301de2d1f96..5f35f93dcab2 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -999,14 +999,10 @@ select_insn: (*(s64 *) &DST) >>= IMM; CONT; ALU64_MOD_X: - if (unlikely(SRC == 0)) - return 0; div64_u64_rem(DST, SRC, &tmp); DST = tmp; CONT; ALU_MOD_X: - if (unlikely((u32)SRC == 0)) - return 0; tmp = (u32) DST; DST = do_div(tmp, (u32) SRC); CONT; @@ -1019,13 +1015,9 @@ select_insn: DST = do_div(tmp, (u32) IMM); CONT; ALU64_DIV_X: - if (unlikely(SRC == 0)) - return 0; DST = div64_u64(DST, SRC); CONT; ALU_DIV_X: - if (unlikely((u32)SRC == 0)) - return 0; tmp = (u32) DST; do_div(tmp, (u32) SRC); DST = (u32) tmp; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 0c5269415090..5fb69a85d967 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5400,15 +5400,37 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) int i, cnt, delta = 0; for (i = 0; i < insn_cnt; i++, insn++) { - if (insn->code == (BPF_ALU | BPF_MOD | BPF_X) || + if (insn->code == (BPF_ALU64 | BPF_MOD | BPF_X) || + insn->code == (BPF_ALU64 | BPF_DIV | BPF_X) || + insn->code == (BPF_ALU | BPF_MOD | BPF_X) || insn->code == (BPF_ALU | BPF_DIV | BPF_X)) { - /* due to JIT bugs clear upper 32-bits of src register - * before div/mod operation - */ - insn_buf[0] = BPF_MOV32_REG(insn->src_reg, insn->src_reg); - insn_buf[1] = *insn; - cnt = 2; - new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); + bool is64 = BPF_CLASS(insn->code) == BPF_ALU64; + struct bpf_insn mask_and_div[] = { + BPF_MOV32_REG(insn->src_reg, insn->src_reg), + /* Rx div 0 -> 0 */ + BPF_JMP_IMM(BPF_JNE, insn->src_reg, 0, 2), + BPF_ALU32_REG(BPF_XOR, insn->dst_reg, insn->dst_reg), + BPF_JMP_IMM(BPF_JA, 0, 0, 1), + *insn, + }; + struct bpf_insn mask_and_mod[] = { + BPF_MOV32_REG(insn->src_reg, insn->src_reg), + /* Rx mod 0 -> Rx */ + BPF_JMP_IMM(BPF_JEQ, insn->src_reg, 0, 1), + *insn, + }; + struct bpf_insn *patchlet; + + if (insn->code == (BPF_ALU64 | BPF_DIV | BPF_X) || + insn->code == (BPF_ALU | BPF_DIV | BPF_X)) { + patchlet = mask_and_div + (is64 ? 1 : 0); + cnt = ARRAY_SIZE(mask_and_div) - (is64 ? 1 : 0); + } else { + patchlet = mask_and_mod + (is64 ? 1 : 0); + cnt = ARRAY_SIZE(mask_and_mod) - (is64 ? 
1 : 0); + } + + new_prog = bpf_patch_insn_data(env, i + delta, patchlet, cnt); if (!new_prog) return -ENOMEM; diff --git a/net/core/filter.c b/net/core/filter.c index 60d8c8712652..08ab4c65a998 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -459,8 +459,15 @@ do_pass: break; if (fp->code == (BPF_ALU | BPF_DIV | BPF_X) || - fp->code == (BPF_ALU | BPF_MOD | BPF_X)) + fp->code == (BPF_ALU | BPF_MOD | BPF_X)) { *insn++ = BPF_MOV32_REG(BPF_REG_X, BPF_REG_X); + /* Error with exception code on div/mod by 0. * For cBPF programs, this was always return 0. */ + *insn++ = BPF_JMP_IMM(BPF_JNE, BPF_REG_X, 0, 2); + *insn++ = BPF_ALU32_REG(BPF_XOR, BPF_REG_A, BPF_REG_A); + *insn++ = BPF_EXIT_INSN(); + } *insn = BPF_RAW_INSN(fp->code, BPF_REG_A, BPF_REG_X, 0, fp->k); break; -- cgit v1.2.3 From 6dd1ec6c7a2c304e9e2e2edd9e7ccb8e8791d36a Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Fri, 26 Jan 2018 15:06:07 -0800 Subject: bpf: fix kernel page fault in lpm map trie_get_next_key Commit b471f2f1de8b ("bpf: implement MAP_GET_NEXT_KEY command for LPM_TRIE map") introduces a bug like the one below: if (!rcu_dereference(trie->root)) return -ENOENT; if (!key || key->prefixlen > trie->max_prefixlen) { root = &trie->root; goto find_leftmost; } ...... find_leftmost: for (node = rcu_dereference(*root); node;) { In the code after the label find_leftmost, it is assumed that *root should not be NULL, but that is not true, as it is possible for trie->root to be changed to NULL by an asynchronous delete operation. The issue was reported by syzbot and Eric Dumazet with the below error log: ...... kasan: CONFIG_KASAN_INLINE enabled kasan: GPF could be caused by NULL-ptr deref or user memory access general protection fault: 0000 [#1] SMP KASAN Dumping ftrace buffer: (ftrace buffer empty) Modules linked in: CPU: 1 PID: 8033 Comm: syz-executor3 Not tainted 4.15.0-rc8+ #4 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 RIP: 0010:trie_get_next_key+0x3c2/0xf10 kernel/bpf/lpm_trie.c:682 ...... This patch fixes the issue by using a local rcu_dereference()'d pointer instead of *(&trie->root) later on.
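The bug class can be modeled in user-space C11 as follows (a schematic sketch with made-up type and function names; atomic loads stand in for rcu_dereference()):

	#include <stdatomic.h>
	#include <stddef.h>

	struct node { _Atomic(struct node *) child[2]; };
	struct trie { _Atomic(struct node *) root; };

	/* Buggy shape: the shared root is loaded twice. An asynchronous delete
	 * can store NULL between the two loads, so the traversal may start from
	 * NULL and crash even though the emptiness check passed.
	 */
	struct node *leftmost_buggy(struct trie *trie)
	{
		struct node *node;

		if (!atomic_load(&trie->root))		/* load #1: non-NULL */
			return NULL;
		node = atomic_load(&trie->root);	/* load #2: may be NULL now */
		while (atomic_load(&node->child[0]))	/* potential NULL dereference */
			node = atomic_load(&node->child[0]);
		return node;
	}

	/* Fixed shape: load the pointer once and use only the local snapshot. */
	struct node *leftmost_fixed(struct trie *trie)
	{
		struct node *node = atomic_load(&trie->root);

		if (!node)
			return NULL;
		while (atomic_load(&node->child[0]))
			node = atomic_load(&node->child[0]);
		return node;
	}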
Fixes: b471f2f1de8b ("bpf: implement MAP_GET_NEXT_KEY command for LPM_TRIE map") Reported-by: syzbot Reported-by: Eric Dumazet Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov --- kernel/bpf/lpm_trie.c | 26 +++++++++++--------------- 1 file changed, 11 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index 8f083ea9d4b9..7b469d10d0e9 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -593,11 +593,10 @@ unlock: static int trie_get_next_key(struct bpf_map *map, void *_key, void *_next_key) { + struct lpm_trie_node *node, *next_node = NULL, *parent, *search_root; struct lpm_trie *trie = container_of(map, struct lpm_trie, map); struct bpf_lpm_trie_key *key = _key, *next_key = _next_key; - struct lpm_trie_node *node, *next_node = NULL, *parent; struct lpm_trie_node **node_stack = NULL; - struct lpm_trie_node __rcu **root; int err = 0, stack_ptr = -1; unsigned int next_bit; size_t matchlen; @@ -614,14 +613,13 @@ static int trie_get_next_key(struct bpf_map *map, void *_key, void *_next_key) */ /* Empty trie */ - if (!rcu_dereference(trie->root)) + search_root = rcu_dereference(trie->root); + if (!search_root) return -ENOENT; /* For invalid key, find the leftmost node in the trie */ - if (!key || key->prefixlen > trie->max_prefixlen) { - root = &trie->root; + if (!key || key->prefixlen > trie->max_prefixlen) goto find_leftmost; - } node_stack = kmalloc(trie->max_prefixlen * sizeof(struct lpm_trie_node *), GFP_ATOMIC | __GFP_NOWARN); @@ -629,7 +627,7 @@ static int trie_get_next_key(struct bpf_map *map, void *_key, void *_next_key) return -ENOMEM; /* Try to find the exact node for the given key */ - for (node = rcu_dereference(trie->root); node;) { + for (node = search_root; node;) { node_stack[++stack_ptr] = node; matchlen = longest_prefix_match(trie, node, key); if (node->prefixlen != matchlen || @@ -640,10 +638,8 @@ static int trie_get_next_key(struct bpf_map *map, void *_key, void *_next_key) node = rcu_dereference(node->child[next_bit]); } if (!node || node->prefixlen != key->prefixlen || - (node->flags & LPM_TREE_NODE_FLAG_IM)) { - root = &trie->root; + (node->flags & LPM_TREE_NODE_FLAG_IM)) goto find_leftmost; - } /* The node with the exactly-matching key has been found, * find the first node in postorder after the matched node. */ node = node_stack[stack_ptr]; while (stack_ptr > 0) { parent = node_stack[stack_ptr - 1]; - if (rcu_dereference(parent->child[0]) == node && - rcu_dereference(parent->child[1])) { - root = &parent->child[1]; - goto find_leftmost; + if (rcu_dereference(parent->child[0]) == node) { + search_root = rcu_dereference(parent->child[1]); + if (search_root) + goto find_leftmost; } if (!(parent->flags & LPM_TREE_NODE_FLAG_IM)) { next_node = parent; @@ -673,7 +669,7 @@ find_leftmost: /* Find the leftmost non-intermediate node, all intermediate nodes * have exact two children, so this function will never return NULL. */ - for (node = rcu_dereference(*root); node;) { + for (node = search_root; node;) { if (!(node->flags & LPM_TREE_NODE_FLAG_IM)) next_node = node; node = rcu_dereference(node->child[0]); -- cgit v1.2.3 From d70f2a14b72a4bc094cf3a92e4794644a7adc590 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 31 Jan 2018 16:15:51 -0800 Subject: include/linux/sched/mm.h: uninline mmdrop_async(), etc mmdrop_async() is only used in fork.c.
Move that and its support functions into fork.c, uninline it all. Quite a lot of code gets moved around to avoid forward declarations. Cc: Ingo Molnar Cc: Michal Hocko Cc: Peter Zijlstra Cc: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched/mm.h | 24 +-- kernel/fork.c | 448 +++++++++++++++++++++++++---------------------- 2 files changed, 238 insertions(+), 234 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h index 3d49b91b674d..bd422561a75e 100644 --- a/include/linux/sched/mm.h +++ b/include/linux/sched/mm.h @@ -11,7 +11,7 @@ /* * Routines for handling mm_structs */ -extern struct mm_struct * mm_alloc(void); +extern struct mm_struct *mm_alloc(void); /** * mmgrab() - Pin a &struct mm_struct. @@ -35,27 +35,7 @@ static inline void mmgrab(struct mm_struct *mm) atomic_inc(&mm->mm_count); } -/* mmdrop drops the mm and the page tables */ -extern void __mmdrop(struct mm_struct *); -static inline void mmdrop(struct mm_struct *mm) -{ - if (unlikely(atomic_dec_and_test(&mm->mm_count))) - __mmdrop(mm); -} - -static inline void mmdrop_async_fn(struct work_struct *work) -{ - struct mm_struct *mm = container_of(work, struct mm_struct, async_put_work); - __mmdrop(mm); -} - -static inline void mmdrop_async(struct mm_struct *mm) -{ - if (unlikely(atomic_dec_and_test(&mm->mm_count))) { - INIT_WORK(&mm->async_put_work, mmdrop_async_fn); - schedule_work(&mm->async_put_work); - } -} +extern void mmdrop(struct mm_struct *mm); /** * mmget() - Pin the address space associated with a &struct mm_struct. diff --git a/kernel/fork.c b/kernel/fork.c index 2295fc69717f..5e6cf0dd031c 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -77,6 +77,7 @@ #include #include #include +#include #include #include #include @@ -390,6 +391,241 @@ void free_task(struct task_struct *tsk) } EXPORT_SYMBOL(free_task); +#ifdef CONFIG_MMU +static __latent_entropy int dup_mmap(struct mm_struct *mm, + struct mm_struct *oldmm) +{ + struct vm_area_struct *mpnt, *tmp, *prev, **pprev; + struct rb_node **rb_link, *rb_parent; + int retval; + unsigned long charge; + LIST_HEAD(uf); + + uprobe_start_dup_mmap(); + if (down_write_killable(&oldmm->mmap_sem)) { + retval = -EINTR; + goto fail_uprobe_end; + } + flush_cache_dup_mm(oldmm); + uprobe_dup_mmap(oldmm, mm); + /* + * Not linked in yet - no deadlock potential: + */ + down_write_nested(&mm->mmap_sem, SINGLE_DEPTH_NESTING); + + /* No ordering required: file already has been exposed. 
*/ + RCU_INIT_POINTER(mm->exe_file, get_mm_exe_file(oldmm)); + + mm->total_vm = oldmm->total_vm; + mm->data_vm = oldmm->data_vm; + mm->exec_vm = oldmm->exec_vm; + mm->stack_vm = oldmm->stack_vm; + + rb_link = &mm->mm_rb.rb_node; + rb_parent = NULL; + pprev = &mm->mmap; + retval = ksm_fork(mm, oldmm); + if (retval) + goto out; + retval = khugepaged_fork(mm, oldmm); + if (retval) + goto out; + + prev = NULL; + for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) { + struct file *file; + + if (mpnt->vm_flags & VM_DONTCOPY) { + vm_stat_account(mm, mpnt->vm_flags, -vma_pages(mpnt)); + continue; + } + charge = 0; + if (mpnt->vm_flags & VM_ACCOUNT) { + unsigned long len = vma_pages(mpnt); + + if (security_vm_enough_memory_mm(oldmm, len)) /* sic */ + goto fail_nomem; + charge = len; + } + tmp = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); + if (!tmp) + goto fail_nomem; + *tmp = *mpnt; + INIT_LIST_HEAD(&tmp->anon_vma_chain); + retval = vma_dup_policy(mpnt, tmp); + if (retval) + goto fail_nomem_policy; + tmp->vm_mm = mm; + retval = dup_userfaultfd(tmp, &uf); + if (retval) + goto fail_nomem_anon_vma_fork; + if (tmp->vm_flags & VM_WIPEONFORK) { + /* VM_WIPEONFORK gets a clean slate in the child. */ + tmp->anon_vma = NULL; + if (anon_vma_prepare(tmp)) + goto fail_nomem_anon_vma_fork; + } else if (anon_vma_fork(tmp, mpnt)) + goto fail_nomem_anon_vma_fork; + tmp->vm_flags &= ~(VM_LOCKED | VM_LOCKONFAULT); + tmp->vm_next = tmp->vm_prev = NULL; + file = tmp->vm_file; + if (file) { + struct inode *inode = file_inode(file); + struct address_space *mapping = file->f_mapping; + + get_file(file); + if (tmp->vm_flags & VM_DENYWRITE) + atomic_dec(&inode->i_writecount); + i_mmap_lock_write(mapping); + if (tmp->vm_flags & VM_SHARED) + atomic_inc(&mapping->i_mmap_writable); + flush_dcache_mmap_lock(mapping); + /* insert tmp into the share list, just after mpnt */ + vma_interval_tree_insert_after(tmp, mpnt, + &mapping->i_mmap); + flush_dcache_mmap_unlock(mapping); + i_mmap_unlock_write(mapping); + } + + /* + * Clear hugetlb-related page reserves for children. This only + * affects MAP_PRIVATE mappings. Faults generated by the child + * are not guaranteed to succeed, even if read-only + */ + if (is_vm_hugetlb_page(tmp)) + reset_vma_resv_huge_pages(tmp); + + /* + * Link in the new vma and copy the page table entries. 
+ */ + *pprev = tmp; + pprev = &tmp->vm_next; + tmp->vm_prev = prev; + prev = tmp; + + __vma_link_rb(mm, tmp, rb_link, rb_parent); + rb_link = &tmp->vm_rb.rb_right; + rb_parent = &tmp->vm_rb; + + mm->map_count++; + if (!(tmp->vm_flags & VM_WIPEONFORK)) + retval = copy_page_range(mm, oldmm, mpnt); + + if (tmp->vm_ops && tmp->vm_ops->open) + tmp->vm_ops->open(tmp); + + if (retval) + goto out; + } + /* a new mm has just been created */ + arch_dup_mmap(oldmm, mm); + retval = 0; +out: + up_write(&mm->mmap_sem); + flush_tlb_mm(oldmm); + up_write(&oldmm->mmap_sem); + dup_userfaultfd_complete(&uf); +fail_uprobe_end: + uprobe_end_dup_mmap(); + return retval; +fail_nomem_anon_vma_fork: + mpol_put(vma_policy(tmp)); +fail_nomem_policy: + kmem_cache_free(vm_area_cachep, tmp); +fail_nomem: + retval = -ENOMEM; + vm_unacct_memory(charge); + goto out; +} + +static inline int mm_alloc_pgd(struct mm_struct *mm) +{ + mm->pgd = pgd_alloc(mm); + if (unlikely(!mm->pgd)) + return -ENOMEM; + return 0; +} + +static inline void mm_free_pgd(struct mm_struct *mm) +{ + pgd_free(mm, mm->pgd); +} +#else +static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) +{ + down_write(&oldmm->mmap_sem); + RCU_INIT_POINTER(mm->exe_file, get_mm_exe_file(oldmm)); + up_write(&oldmm->mmap_sem); + return 0; +} +#define mm_alloc_pgd(mm) (0) +#define mm_free_pgd(mm) +#endif /* CONFIG_MMU */ + +static void check_mm(struct mm_struct *mm) +{ + int i; + + for (i = 0; i < NR_MM_COUNTERS; i++) { + long x = atomic_long_read(&mm->rss_stat.count[i]); + + if (unlikely(x)) + printk(KERN_ALERT "BUG: Bad rss-counter state " + "mm:%p idx:%d val:%ld\n", mm, i, x); + } + + if (mm_pgtables_bytes(mm)) + pr_alert("BUG: non-zero pgtables_bytes on freeing mm: %ld\n", + mm_pgtables_bytes(mm)); + +#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS + VM_BUG_ON_MM(mm->pmd_huge_pte, mm); +#endif +} + +#define allocate_mm() (kmem_cache_alloc(mm_cachep, GFP_KERNEL)) +#define free_mm(mm) (kmem_cache_free(mm_cachep, (mm))) + +/* + * Called when the last reference to the mm + * is dropped: either by a lazy thread or by + * mmput. Free the page directory and the mm. 
+ */ +static void __mmdrop(struct mm_struct *mm) +{ + BUG_ON(mm == &init_mm); + mm_free_pgd(mm); + destroy_context(mm); + hmm_mm_destroy(mm); + mmu_notifier_mm_destroy(mm); + check_mm(mm); + put_user_ns(mm->user_ns); + free_mm(mm); +} + +void mmdrop(struct mm_struct *mm) +{ + if (unlikely(atomic_dec_and_test(&mm->mm_count))) + __mmdrop(mm); +} +EXPORT_SYMBOL_GPL(mmdrop); + +static void mmdrop_async_fn(struct work_struct *work) +{ + struct mm_struct *mm; + + mm = container_of(work, struct mm_struct, async_put_work); + __mmdrop(mm); +} + +static void mmdrop_async(struct mm_struct *mm) +{ + if (unlikely(atomic_dec_and_test(&mm->mm_count))) { + INIT_WORK(&mm->async_put_work, mmdrop_async_fn); + schedule_work(&mm->async_put_work); + } +} + static inline void free_signal_struct(struct signal_struct *sig) { taskstats_tgid_free(sig); @@ -594,181 +830,8 @@ free_tsk: return NULL; } -#ifdef CONFIG_MMU -static __latent_entropy int dup_mmap(struct mm_struct *mm, - struct mm_struct *oldmm) -{ - struct vm_area_struct *mpnt, *tmp, *prev, **pprev; - struct rb_node **rb_link, *rb_parent; - int retval; - unsigned long charge; - LIST_HEAD(uf); - - uprobe_start_dup_mmap(); - if (down_write_killable(&oldmm->mmap_sem)) { - retval = -EINTR; - goto fail_uprobe_end; - } - flush_cache_dup_mm(oldmm); - uprobe_dup_mmap(oldmm, mm); - /* - * Not linked in yet - no deadlock potential: - */ - down_write_nested(&mm->mmap_sem, SINGLE_DEPTH_NESTING); - - /* No ordering required: file already has been exposed. */ - RCU_INIT_POINTER(mm->exe_file, get_mm_exe_file(oldmm)); - - mm->total_vm = oldmm->total_vm; - mm->data_vm = oldmm->data_vm; - mm->exec_vm = oldmm->exec_vm; - mm->stack_vm = oldmm->stack_vm; - - rb_link = &mm->mm_rb.rb_node; - rb_parent = NULL; - pprev = &mm->mmap; - retval = ksm_fork(mm, oldmm); - if (retval) - goto out; - retval = khugepaged_fork(mm, oldmm); - if (retval) - goto out; - - prev = NULL; - for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) { - struct file *file; - - if (mpnt->vm_flags & VM_DONTCOPY) { - vm_stat_account(mm, mpnt->vm_flags, -vma_pages(mpnt)); - continue; - } - charge = 0; - if (mpnt->vm_flags & VM_ACCOUNT) { - unsigned long len = vma_pages(mpnt); - - if (security_vm_enough_memory_mm(oldmm, len)) /* sic */ - goto fail_nomem; - charge = len; - } - tmp = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); - if (!tmp) - goto fail_nomem; - *tmp = *mpnt; - INIT_LIST_HEAD(&tmp->anon_vma_chain); - retval = vma_dup_policy(mpnt, tmp); - if (retval) - goto fail_nomem_policy; - tmp->vm_mm = mm; - retval = dup_userfaultfd(tmp, &uf); - if (retval) - goto fail_nomem_anon_vma_fork; - if (tmp->vm_flags & VM_WIPEONFORK) { - /* VM_WIPEONFORK gets a clean slate in the child. 
*/ - tmp->anon_vma = NULL; - if (anon_vma_prepare(tmp)) - goto fail_nomem_anon_vma_fork; - } else if (anon_vma_fork(tmp, mpnt)) - goto fail_nomem_anon_vma_fork; - tmp->vm_flags &= ~(VM_LOCKED | VM_LOCKONFAULT); - tmp->vm_next = tmp->vm_prev = NULL; - file = tmp->vm_file; - if (file) { - struct inode *inode = file_inode(file); - struct address_space *mapping = file->f_mapping; - - get_file(file); - if (tmp->vm_flags & VM_DENYWRITE) - atomic_dec(&inode->i_writecount); - i_mmap_lock_write(mapping); - if (tmp->vm_flags & VM_SHARED) - atomic_inc(&mapping->i_mmap_writable); - flush_dcache_mmap_lock(mapping); - /* insert tmp into the share list, just after mpnt */ - vma_interval_tree_insert_after(tmp, mpnt, - &mapping->i_mmap); - flush_dcache_mmap_unlock(mapping); - i_mmap_unlock_write(mapping); - } - - /* - * Clear hugetlb-related page reserves for children. This only - * affects MAP_PRIVATE mappings. Faults generated by the child - * are not guaranteed to succeed, even if read-only - */ - if (is_vm_hugetlb_page(tmp)) - reset_vma_resv_huge_pages(tmp); - - /* - * Link in the new vma and copy the page table entries. - */ - *pprev = tmp; - pprev = &tmp->vm_next; - tmp->vm_prev = prev; - prev = tmp; - - __vma_link_rb(mm, tmp, rb_link, rb_parent); - rb_link = &tmp->vm_rb.rb_right; - rb_parent = &tmp->vm_rb; - - mm->map_count++; - if (!(tmp->vm_flags & VM_WIPEONFORK)) - retval = copy_page_range(mm, oldmm, mpnt); - - if (tmp->vm_ops && tmp->vm_ops->open) - tmp->vm_ops->open(tmp); - - if (retval) - goto out; - } - /* a new mm has just been created */ - retval = arch_dup_mmap(oldmm, mm); -out: - up_write(&mm->mmap_sem); - flush_tlb_mm(oldmm); - up_write(&oldmm->mmap_sem); - dup_userfaultfd_complete(&uf); -fail_uprobe_end: - uprobe_end_dup_mmap(); - return retval; -fail_nomem_anon_vma_fork: - mpol_put(vma_policy(tmp)); -fail_nomem_policy: - kmem_cache_free(vm_area_cachep, tmp); -fail_nomem: - retval = -ENOMEM; - vm_unacct_memory(charge); - goto out; -} - -static inline int mm_alloc_pgd(struct mm_struct *mm) -{ - mm->pgd = pgd_alloc(mm); - if (unlikely(!mm->pgd)) - return -ENOMEM; - return 0; -} - -static inline void mm_free_pgd(struct mm_struct *mm) -{ - pgd_free(mm, mm->pgd); -} -#else -static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) -{ - down_write(&oldmm->mmap_sem); - RCU_INIT_POINTER(mm->exe_file, get_mm_exe_file(oldmm)); - up_write(&oldmm->mmap_sem); - return 0; -} -#define mm_alloc_pgd(mm) (0) -#define mm_free_pgd(mm) -#endif /* CONFIG_MMU */ - __cacheline_aligned_in_smp DEFINE_SPINLOCK(mmlist_lock); -#define allocate_mm() (kmem_cache_alloc(mm_cachep, GFP_KERNEL)) -#define free_mm(mm) (kmem_cache_free(mm_cachep, (mm))) - static unsigned long default_dump_filter = MMF_DUMP_FILTER_DEFAULT; static int __init coredump_filter_setup(char *s) @@ -858,27 +921,6 @@ fail_nopgd: return NULL; } -static void check_mm(struct mm_struct *mm) -{ - int i; - - for (i = 0; i < NR_MM_COUNTERS; i++) { - long x = atomic_long_read(&mm->rss_stat.count[i]); - - if (unlikely(x)) - printk(KERN_ALERT "BUG: Bad rss-counter state " - "mm:%p idx:%d val:%ld\n", mm, i, x); - } - - if (mm_pgtables_bytes(mm)) - pr_alert("BUG: non-zero pgtables_bytes on freeing mm: %ld\n", - mm_pgtables_bytes(mm)); - -#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS - VM_BUG_ON_MM(mm->pmd_huge_pte, mm); -#endif -} - /* * Allocate and initialize an mm_struct. 
*/ @@ -894,24 +936,6 @@ struct mm_struct *mm_alloc(void) return mm_init(mm, current, current_user_ns()); } -/* - * Called when the last reference to the mm - * is dropped: either by a lazy thread or by - * mmput. Free the page directory and the mm. - */ -void __mmdrop(struct mm_struct *mm) -{ - BUG_ON(mm == &init_mm); - mm_free_pgd(mm); - destroy_context(mm); - hmm_mm_destroy(mm); - mmu_notifier_mm_destroy(mm); - check_mm(mm); - put_user_ns(mm->user_ns); - free_mm(mm); -} -EXPORT_SYMBOL_GPL(__mmdrop); - static inline void __mmput(struct mm_struct *mm) { VM_BUG_ON(atomic_read(&mm->mm_users)); -- cgit v1.2.3 From d6cb41cc44c63492702281b1d329955ca767d399 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 31 Jan 2018 16:17:10 -0800 Subject: mm, hugetlb: remove hugepages_treat_as_movable sysctl hugepages_treat_as_movable was introduced by 396faf0303d2 ("Allow huge page allocations to use GFP_HIGH_MOVABLE") to allow hugetlb allocations from ZONE_MOVABLE even when hugetlb pages were not migrateable. The purpose of the movable zone was different at the time. It aimed at reducing memory fragmentation, and hugetlb pages, being long lived and large, were not contributing to the fragmentation, so it was acceptable to use the zone back then. Things have changed though, and the primary purpose of the zone became the migratability guarantee. If we allow non-migrateable hugetlb pages to be in ZONE_MOVABLE, memory hotplug might fail to offline the memory. Remove the knob and only rely on hugepage_migration_supported to allow movable zones. Mel said: : Primarily it was aimed at allowing the hugetlb pool to safely shrink with : the ability to grow it again. The use case was for batched jobs, some of : which needed huge pages and others that did not but didn't want the memory : useless pinned in the huge pages pool. : : I suspect that more users rely on THP than hugetlbfs for flexible use of : huge pages with fallback options so I think that removing the option : should be ok. Link: http://lkml.kernel.org/r/20171003072619.8654-1-mhocko@kernel.org Signed-off-by: Michal Hocko Reported-by: Alexandru Moise <00moses.alexander00@gmail.com> Acked-by: Mel Gorman Cc: Alexandru Moise <00moses.alexander00@gmail.com> Cc: Mike Kravetz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/sysctl/vm.txt | 25 ------------------------- include/linux/hugetlb.h | 1 - kernel/sysctl.c | 7 ------- mm/hugetlb.c | 4 +--- 4 files changed, 1 insertion(+), 36 deletions(-) (limited to 'kernel') diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt index 5025ff9307e6..ff234d229cbb 100644 --- a/Documentation/sysctl/vm.txt +++ b/Documentation/sysctl/vm.txt @@ -30,7 +30,6 @@ Currently, these files are in /proc/sys/vm: - dirty_writeback_centisecs - drop_caches - extfrag_threshold -- hugepages_treat_as_movable - hugetlb_shm_group - laptop_mode - legacy_va_layout @@ -261,30 +260,6 @@ any throttling. ============================================================== -hugepages_treat_as_movable - -This parameter controls whether we can allocate hugepages from ZONE_MOVABLE -or not. If set to non-zero, hugepages can be allocated from ZONE_MOVABLE. -ZONE_MOVABLE is created when kernel boot parameter kernelcore= is specified, -so this parameter has no effect if used without kernelcore=. - -Hugepage migration is now available in some situations which depend on the -architecture and/or the hugepage size.
If a hugepage supports migration, -allocation from ZONE_MOVABLE is always enabled for the hugepage regardless -of the value of this parameter. -IOW, this parameter affects only non-migratable hugepages. - -Assuming that hugepages are not migratable in your system, one usecase of -this parameter is that users can make hugepage pool more extensible by -enabling the allocation from ZONE_MOVABLE. This is because on ZONE_MOVABLE -page reclaim/migration/compaction work more and you can get contiguous -memory more likely. Note that using ZONE_MOVABLE for non-migratable -hugepages can do harm to other features like memory hotremove (because -memory hotremove expects that memory blocks on ZONE_MOVABLE are always -removable,) so it's a trade-off responsible for the users. - -============================================================== - hugetlb_shm_group hugetlb_shm_group contains group id that is allowed to create SysV diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 82a25880714a..6fcf140188d0 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -129,7 +129,6 @@ u32 hugetlb_fault_mutex_hash(struct hstate *h, struct mm_struct *mm, pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud); -extern int hugepages_treat_as_movable; extern int sysctl_hugetlb_shm_group; extern struct list_head huge_boot_pages; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 557d46728577..2fb4e27c636a 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1374,13 +1374,6 @@ static struct ctl_table vm_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, - { - .procname = "hugepages_treat_as_movable", - .data = &hugepages_treat_as_movable, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, { .procname = "nr_overcommit_hugepages", .data = NULL, diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 1e6a5ad0d420..4137fb67cd79 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -36,8 +36,6 @@ #include #include "internal.h" -int hugepages_treat_as_movable; - int hugetlb_max_hstate __read_mostly; unsigned int default_hstate_idx; struct hstate hstates[HUGE_MAX_HSTATE]; @@ -926,7 +924,7 @@ retry_cpuset: /* Movability of hugepages depends on migration support. */ static inline gfp_t htlb_alloc_mask(struct hstate *h) { - if (hugepages_treat_as_movable || hugepage_migration_supported(h)) + if (hugepage_migration_supported(h)) return GFP_HIGHUSER_MOVABLE; else return GFP_HIGHUSER; -- cgit v1.2.3 From 1beaeacdc88b537703d04d5536235d0bbb36db93 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 30 Jan 2018 19:36:32 +0100 Subject: genirq: Make legacy autoprobing work again Meelis reported the following warning on a quad P3 HP NetServer museum piece: WARNING: CPU: 3 PID: 258 at kernel/irq/chip.c:244 __irq_startup+0x80/0x100 EIP: __irq_startup+0x80/0x100 irq_startup+0x7e/0x170 probe_irq_on+0x128/0x2b0 parport_irq_probe.constprop.18+0x8d/0x1af [parport_pc] parport_pc_probe_port+0xf11/0x1260 [parport_pc] parport_pc_init+0x78a/0xf10 [parport_pc] parport_parse_param.constprop.16+0xf0/0xf0 [parport_pc] do_one_initcall+0x45/0x1e0 This is caused by the rewrite of the irq activation/startup sequence which missed converting a callsite in the irq legacy autoprobing code. To fix this, irq_activate_and_startup() needs to gain a return value so that the pending logic can work properly.
Fixes: c942cee46bba ("genirq: Separate activation and startup") Reported-by: Meelis Roos Signed-off-by: Thomas Gleixner Tested-by: Meelis Roos Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/alpine.DEB.2.20.1801301935410.1797@nanos --- kernel/irq/autoprobe.c | 2 +- kernel/irq/chip.c | 6 +++--- kernel/irq/internals.h | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c index 4e8089b319ae..8c82ea26e837 100644 --- a/kernel/irq/autoprobe.c +++ b/kernel/irq/autoprobe.c @@ -71,7 +71,7 @@ unsigned long probe_irq_on(void) raw_spin_lock_irq(&desc->lock); if (!desc->action && irq_settings_can_probe(desc)) { desc->istate |= IRQS_AUTODETECT | IRQS_WAITING; - if (irq_startup(desc, IRQ_NORESEND, IRQ_START_FORCE)) + if (irq_activate_and_startup(desc, IRQ_NORESEND)) desc->istate |= IRQS_PENDING; } raw_spin_unlock_irq(&desc->lock); diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 043bfc35b353..c69357a43849 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -294,11 +294,11 @@ int irq_activate(struct irq_desc *desc) return 0; } -void irq_activate_and_startup(struct irq_desc *desc, bool resend) +int irq_activate_and_startup(struct irq_desc *desc, bool resend) { if (WARN_ON(irq_activate(desc))) - return; - irq_startup(desc, resend, IRQ_START_FORCE); + return 0; + return irq_startup(desc, resend, IRQ_START_FORCE); } static void __irq_disable(struct irq_desc *desc, bool mask); diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index ab19371eab9b..ca6afa267070 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -76,7 +76,7 @@ extern void __enable_irq(struct irq_desc *desc); #define IRQ_START_COND false extern int irq_activate(struct irq_desc *desc); -extern void irq_activate_and_startup(struct irq_desc *desc, bool resend); +extern int irq_activate_and_startup(struct irq_desc *desc, bool resend); extern int irq_startup(struct irq_desc *desc, bool resend, bool force); extern void irq_shutdown(struct irq_desc *desc); -- cgit v1.2.3 From 328008a72d38b5bde6491e463405c34a81a65d3e Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 2 Feb 2018 15:56:18 +0100 Subject: x86/power: Fix swsusp_arch_resume prototype The declaration for swsusp_arch_resume marks it as 'asmlinkage', but the definition in x86-32 does not, and it fails to include the header with the declaration. This leads to a warning when building with link-time-optimizations: kernel/power/power.h:108:23: error: type of 'swsusp_arch_resume' does not match original declaration [-Werror=lto-type-mismatch] extern asmlinkage int swsusp_arch_resume(void); ^ arch/x86/power/hibernate_32.c:148:0: note: 'swsusp_arch_resume' was previously declared here int swsusp_arch_resume(void) This moves the declaration into a globally visible header file and fixes up both x86 definitions to match it. Signed-off-by: Arnd Bergmann Signed-off-by: Thomas Gleixner Cc: Len Brown Cc: Andi Kleen Cc: Nicolas Pitre Cc: linux-pm@vger.kernel.org Cc: "Rafael J. 
Wysocki" Cc: Pavel Machek Cc: Bart Van Assche Link: https://lkml.kernel.org/r/20180202145634.200291-2-arnd@arndb.de --- arch/x86/power/hibernate_32.c | 2 +- arch/x86/power/hibernate_64.c | 2 +- include/linux/suspend.h | 2 ++ kernel/power/power.h | 3 --- 4 files changed, 4 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/arch/x86/power/hibernate_32.c b/arch/x86/power/hibernate_32.c index c35fdb585c68..afc4ed7b1578 100644 --- a/arch/x86/power/hibernate_32.c +++ b/arch/x86/power/hibernate_32.c @@ -145,7 +145,7 @@ static inline void resume_init_first_level_page_table(pgd_t *pg_dir) #endif } -int swsusp_arch_resume(void) +asmlinkage int swsusp_arch_resume(void) { int error; diff --git a/arch/x86/power/hibernate_64.c b/arch/x86/power/hibernate_64.c index f910c514438f..0ef5e5204968 100644 --- a/arch/x86/power/hibernate_64.c +++ b/arch/x86/power/hibernate_64.c @@ -174,7 +174,7 @@ out: return 0; } -int swsusp_arch_resume(void) +asmlinkage int swsusp_arch_resume(void) { int error; diff --git a/include/linux/suspend.h b/include/linux/suspend.h index cc22a24516d6..440b62f7502e 100644 --- a/include/linux/suspend.h +++ b/include/linux/suspend.h @@ -384,6 +384,8 @@ extern int swsusp_page_is_forbidden(struct page *); extern void swsusp_set_page_free(struct page *); extern void swsusp_unset_page_free(struct page *); extern unsigned long get_safe_page(gfp_t gfp_mask); +extern asmlinkage int swsusp_arch_suspend(void); +extern asmlinkage int swsusp_arch_resume(void); extern void hibernation_set_ops(const struct platform_hibernation_ops *ops); extern int hibernate(void); diff --git a/kernel/power/power.h b/kernel/power/power.h index f29cd178df90..9e58bdc8a562 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -104,9 +104,6 @@ extern int in_suspend; extern dev_t swsusp_resume_device; extern sector_t swsusp_resume_block; -extern asmlinkage int swsusp_arch_suspend(void); -extern asmlinkage int swsusp_arch_resume(void); - extern int create_basic_memory_bitmaps(void); extern void free_basic_memory_bitmaps(void); extern int hibernate_preallocate_memory(void); -- cgit v1.2.3