From 33ba43ed0afc13a29b1314e3e45a9938d310ba13 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 23 Aug 2017 00:06:09 +0200 Subject: bpf: fix map value attribute for hash of maps Currently, iproute2's BPF ELF loader works fine with array of maps when retrieving the fd from a pinned node and doing a selfcheck against the provided map attributes from the object file, but we fail to do the same for hash of maps and thus refuse to get the map from pinned node. Reason is that when allocating hash of maps, fd_htab_map_alloc() will set the value size to sizeof(void *), and any user space map creation requests are forced to set 4 bytes as value size. Thus, selfcheck will complain about exposed 8 bytes on 64 bit archs vs. 4 bytes from object file as value size. Contract is that fdinfo or BPF_MAP_GET_FD_BY_ID returns the value size used to create the map. Fix it by handling it the same way as we do for array of maps, which means that we leave value size at 4 bytes and in the allocation phase round up value size to 8 bytes. alloc_htab_elem() needs an adjustment in order to copy rounded up 8 bytes due to bpf_fd_htab_map_update_elem() calling into htab_map_update_elem() with the pointer of the map pointer as value. Unlike array of maps where we just xchg(), we're using the generic htab_map_update_elem() callback also used from helper calls, which published the key/value already on return, so we need to ensure to memcpy() the right size. Fixes: bcc6b1b7ebf8 ("bpf: Add hash of maps support") Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Signed-off-by: David S. Miller --- kernel/bpf/hashtab.c | 30 +++++++++++++++++------------- 1 file changed, 17 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 4fb463172aa8..d11c8181f4c5 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -652,12 +652,27 @@ static void pcpu_copy_value(struct bpf_htab *htab, void __percpu *pptr, } } +static bool fd_htab_map_needs_adjust(const struct bpf_htab *htab) +{ + return htab->map.map_type == BPF_MAP_TYPE_HASH_OF_MAPS && + BITS_PER_LONG == 64; +} + +static u32 htab_size_value(const struct bpf_htab *htab, bool percpu) +{ + u32 size = htab->map.value_size; + + if (percpu || fd_htab_map_needs_adjust(htab)) + size = round_up(size, 8); + return size; +} + static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, void *value, u32 key_size, u32 hash, bool percpu, bool onallcpus, struct htab_elem *old_elem) { - u32 size = htab->map.value_size; + u32 size = htab_size_value(htab, percpu); bool prealloc = htab_is_prealloc(htab); struct htab_elem *l_new, **pl_new; void __percpu *pptr; @@ -696,9 +711,6 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, memcpy(l_new->key, key, key_size); if (percpu) { - /* round up value_size to 8 bytes */ - size = round_up(size, 8); - if (prealloc) { pptr = htab_elem_get_ptr(l_new, key_size); } else { @@ -1209,17 +1221,9 @@ const struct bpf_map_ops htab_lru_percpu_map_ops = { static struct bpf_map *fd_htab_map_alloc(union bpf_attr *attr) { - struct bpf_map *map; - if (attr->value_size != sizeof(u32)) return ERR_PTR(-EINVAL); - - /* pointer is stored internally */ - attr->value_size = sizeof(void *); - map = htab_map_alloc(attr); - attr->value_size = sizeof(u32); - - return map; + return htab_map_alloc(attr); } static void fd_htab_map_free(struct bpf_map *map) -- cgit v1.2.3 From 2fe59f507a65dbd734b990a11ebc7488f6f87a24 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Tue, 22 Aug 2017 18:43:48 +1000 Subject: timers: Fix excessive granularity of new timers after a nohz idle When a timer base is idle, it is forwarded when a new timer is added to ensure that granularity does not become excessive. When not idle, the timer tick is expected to increment the base. However there are several problems: - If an existing timer is modified, the base is forwarded only after the index is calculated. - The base is not forwarded by add_timer_on. - There is a window after a timer is restarted from a nohz idle, after it is marked not-idle and before the timer tick on this CPU, where a timer may be added but the ancient base does not get forwarded. These result in excessive granularity (a 1 jiffy timeout can blow out to 100s of jiffies), which cause the rcu lockup detector to trigger, among other things. Fix this by keeping track of whether the timer base has been idle since it was last run or forwarded, and if so then forward it before adding a new timer. There is still a case where mod_timer optimises the case of a pending timer mod with the same expiry time, where the timer can see excessive granularity relative to the new, shorter interval. A comment is added, but it's not changed because it is an important fastpath for networking. This has been tested and found to fix the RCU softlockup messages. Testing was also done with tracing to measure requested versus achieved wakeup latencies for all non-deferrable timers in an idle system (with no lockup watchdogs running). Wakeup latency relative to absolute latency is calculated (note this suffers from round-up skew at low absolute times) and analysed: max avg std upstream 506.0 1.20 4.68 patched 2.0 1.08 0.15 The bug was noticed due to the lockup detector Kconfig changes dropping it out of people's .configs and resulting in larger base clk skew When the lockup detectors are enabled, no CPU can go idle for longer than 4 seconds, which limits the granularity errors. Sub-optimal timer behaviour is observable on a smaller scale in that case: max avg std upstream 9.0 1.05 0.19 patched 2.0 1.04 0.11 Fixes: Fixes: a683f390b93f ("timers: Forward the wheel clock whenever possible") Signed-off-by: Nicholas Piggin Signed-off-by: Thomas Gleixner Tested-by: Jonathan Cameron Tested-by: David Miller Cc: dzickus@redhat.com Cc: sfr@canb.auug.org.au Cc: mpe@ellerman.id.au Cc: Stephen Boyd Cc: linuxarm@huawei.com Cc: abdhalee@linux.vnet.ibm.com Cc: John Stultz Cc: akpm@linux-foundation.org Cc: paulmck@linux.vnet.ibm.com Cc: torvalds@linux-foundation.org Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/20170822084348.21436-1-npiggin@gmail.com --- kernel/time/timer.c | 50 +++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 41 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 8f5d1bf18854..f2674a056c26 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -203,6 +203,7 @@ struct timer_base { bool migration_enabled; bool nohz_active; bool is_idle; + bool must_forward_clk; DECLARE_BITMAP(pending_map, WHEEL_SIZE); struct hlist_head vectors[WHEEL_SIZE]; } ____cacheline_aligned; @@ -856,13 +857,19 @@ get_target_base(struct timer_base *base, unsigned tflags) static inline void forward_timer_base(struct timer_base *base) { - unsigned long jnow = READ_ONCE(jiffies); + unsigned long jnow; /* - * We only forward the base when it's idle and we have a delta between - * base clock and jiffies. + * We only forward the base when we are idle or have just come out of + * idle (must_forward_clk logic), and have a delta between base clock + * and jiffies. In the common case, run_timers will take care of it. */ - if (!base->is_idle || (long) (jnow - base->clk) < 2) + if (likely(!base->must_forward_clk)) + return; + + jnow = READ_ONCE(jiffies); + base->must_forward_clk = base->is_idle; + if ((long)(jnow - base->clk) < 2) return; /* @@ -938,6 +945,11 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only) * same array bucket then just return: */ if (timer_pending(timer)) { + /* + * The downside of this optimization is that it can result in + * larger granularity than you would get from adding a new + * timer with this expiry. + */ if (timer->expires == expires) return 1; @@ -948,6 +960,7 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only) * dequeue/enqueue dance. */ base = lock_timer_base(timer, &flags); + forward_timer_base(base); clk = base->clk; idx = calc_wheel_index(expires, clk); @@ -964,6 +977,7 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only) } } else { base = lock_timer_base(timer, &flags); + forward_timer_base(base); } ret = detach_if_pending(timer, base, false); @@ -991,12 +1005,10 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only) raw_spin_lock(&base->lock); WRITE_ONCE(timer->flags, (timer->flags & ~TIMER_BASEMASK) | base->cpu); + forward_timer_base(base); } } - /* Try to forward a stale timer base clock */ - forward_timer_base(base); - timer->expires = expires; /* * If 'idx' was calculated above and the base time did not advance @@ -1112,6 +1124,7 @@ void add_timer_on(struct timer_list *timer, int cpu) WRITE_ONCE(timer->flags, (timer->flags & ~TIMER_BASEMASK) | cpu); } + forward_timer_base(base); debug_activate(timer, timer->expires); internal_add_timer(base, timer); @@ -1497,10 +1510,16 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem) if (!is_max_delta) expires = basem + (u64)(nextevt - basej) * TICK_NSEC; /* - * If we expect to sleep more than a tick, mark the base idle: + * If we expect to sleep more than a tick, mark the base idle. + * Also the tick is stopped so any added timer must forward + * the base clk itself to keep granularity small. This idle + * logic is only maintained for the BASE_STD base, deferrable + * timers may still see large granularity skew (by design). */ - if ((expires - basem) > TICK_NSEC) + if ((expires - basem) > TICK_NSEC) { + base->must_forward_clk = true; base->is_idle = true; + } } raw_spin_unlock(&base->lock); @@ -1611,6 +1630,19 @@ static __latent_entropy void run_timer_softirq(struct softirq_action *h) { struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]); + /* + * must_forward_clk must be cleared before running timers so that any + * timer functions that call mod_timer will not try to forward the + * base. idle trcking / clock forwarding logic is only used with + * BASE_STD timers. + * + * The deferrable base does not do idle tracking at all, so we do + * not forward it. This can result in very large variations in + * granularity for deferrable timers, but they can be deferred for + * long periods due to idle. + */ + base->must_forward_clk = false; + __run_timers(base); if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && base->nohz_active) __run_timers(this_cpu_ptr(&timer_bases[BASE_DEF])); -- cgit v1.2.3 From 1c08c22c874ac88799cab1f78c40f46110274915 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Thu, 24 Aug 2017 12:04:29 -0400 Subject: cpuset: Fix incorrect memory_pressure control file mapping The memory_pressure control file was incorrectly set up without a private value (0, by default). As a result, this control file was treated like memory_migrate on read. By adding back the FILE_MEMORY_PRESSURE private value, the correct memory pressure value will be returned. Signed-off-by: Waiman Long Signed-off-by: Tejun Heo Fixes: 7dbdb199d3bf ("cgroup: replace cftype->mode with CFTYPE_WORLD_WRITABLE") Cc: stable@vger.kernel.org # v4.4+ --- kernel/cgroup/cpuset.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index ca8376e5008c..8362bac0d179 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -1891,6 +1891,7 @@ static struct cftype files[] = { { .name = "memory_pressure", .read_u64 = cpuset_read_u64, + .private = FILE_MEMORY_PRESSURE, }, { -- cgit v1.2.3 From 64aee2a965cf2954a038b5522f11d2cd2f0f8f3e Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Thu, 22 Jun 2017 15:41:38 +0100 Subject: perf/core: Fix group {cpu,task} validation Regardless of which events form a group, it does not make sense for the events to target different tasks and/or CPUs, as this leaves the group inconsistent and impossible to schedule. The core perf code assumes that these are consistent across (successfully intialised) groups. Core perf code only verifies this when moving SW events into a HW context. Thus, we can violate this requirement for pure SW groups and pure HW groups, unless the relevant PMU driver happens to perform this verification itself. These mismatched groups subsequently wreak havoc elsewhere. For example, we handle watchpoints as SW events, and reserve watchpoint HW on a per-CPU basis at pmu::event_init() time to ensure that any event that is initialised is guaranteed to have a slot at pmu::add() time. However, the core code only checks the group leader's cpu filter (via event_filter_match()), and can thus install follower events onto CPUs violating thier (mismatched) CPU filters, potentially installing them into a CPU without sufficient reserved slots. This can be triggered with the below test case, resulting in warnings from arch backends. #define _GNU_SOURCE #include #include #include #include #include #include #include static int perf_event_open(struct perf_event_attr *attr, pid_t pid, int cpu, int group_fd, unsigned long flags) { return syscall(__NR_perf_event_open, attr, pid, cpu, group_fd, flags); } char watched_char; struct perf_event_attr wp_attr = { .type = PERF_TYPE_BREAKPOINT, .bp_type = HW_BREAKPOINT_RW, .bp_addr = (unsigned long)&watched_char, .bp_len = 1, .size = sizeof(wp_attr), }; int main(int argc, char *argv[]) { int leader, ret; cpu_set_t cpus; /* * Force use of CPU0 to ensure our CPU0-bound events get scheduled. */ CPU_ZERO(&cpus); CPU_SET(0, &cpus); ret = sched_setaffinity(0, sizeof(cpus), &cpus); if (ret) { printf("Unable to set cpu affinity\n"); return 1; } /* open leader event, bound to this task, CPU0 only */ leader = perf_event_open(&wp_attr, 0, 0, -1, 0); if (leader < 0) { printf("Couldn't open leader: %d\n", leader); return 1; } /* * Open a follower event that is bound to the same task, but a * different CPU. This means that the group should never be possible to * schedule. */ ret = perf_event_open(&wp_attr, 0, 1, leader, 0); if (ret < 0) { printf("Couldn't open mismatched follower: %d\n", ret); return 1; } else { printf("Opened leader/follower with mismastched CPUs\n"); } /* * Open as many independent events as we can, all bound to the same * task, CPU0 only. */ do { ret = perf_event_open(&wp_attr, 0, 0, -1, 0); } while (ret >= 0); /* * Force enable/disble all events to trigger the erronoeous * installation of the follower event. */ printf("Opened all events. Toggling..\n"); for (;;) { prctl(PR_TASK_PERF_EVENTS_DISABLE, 0, 0, 0, 0); prctl(PR_TASK_PERF_EVENTS_ENABLE, 0, 0, 0, 0); } return 0; } Fix this by validating this requirement regardless of whether we're moving events. Signed-off-by: Mark Rutland Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Zhou Chengming Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/1498142498-15758-1-git-send-email-mark.rutland@arm.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 39 +++++++++++++++++++-------------------- 1 file changed, 19 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index ee20d4c546b5..3504125871d2 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -10032,28 +10032,27 @@ SYSCALL_DEFINE5(perf_event_open, goto err_context; /* - * Do not allow to attach to a group in a different - * task or CPU context: + * Make sure we're both events for the same CPU; + * grouping events for different CPUs is broken; since + * you can never concurrently schedule them anyhow. */ - if (move_group) { - /* - * Make sure we're both on the same task, or both - * per-cpu events. - */ - if (group_leader->ctx->task != ctx->task) - goto err_context; + if (group_leader->cpu != event->cpu) + goto err_context; - /* - * Make sure we're both events for the same CPU; - * grouping events for different CPUs is broken; since - * you can never concurrently schedule them anyhow. - */ - if (group_leader->cpu != event->cpu) - goto err_context; - } else { - if (group_leader->ctx != ctx) - goto err_context; - } + /* + * Make sure we're both on the same task, or both + * per-CPU events. + */ + if (group_leader->ctx->task != ctx->task) + goto err_context; + + /* + * Do not allow to attach to a group in a different task + * or CPU context. If we're moving SW events, we'll fix + * this up later, so allow that. + */ + if (!move_group && group_leader->ctx != ctx) + goto err_context; /* * Only a group leader can be exclusive or pinned -- cgit v1.2.3 From 2b7e8665b4ff51c034c55df3cff76518d1a9ee3a Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 25 Aug 2017 15:55:43 -0700 Subject: fork: fix incorrect fput of ->exe_file causing use-after-free Commit 7c051267931a ("mm, fork: make dup_mmap wait for mmap_sem for write killable") made it possible to kill a forking task while it is waiting to acquire its ->mmap_sem for write, in dup_mmap(). However, it was overlooked that this introduced an new error path before a reference is taken on the mm_struct's ->exe_file. Since the ->exe_file of the new mm_struct was already set to the old ->exe_file by the memcpy() in dup_mm(), it was possible for the mmput() in the error path of dup_mm() to drop a reference to ->exe_file which was never taken. This caused the struct file to later be freed prematurely. Fix it by updating mm_init() to NULL out the ->exe_file, in the same place it clears other things like the list of mmaps. This bug was found by syzkaller. It can be reproduced using the following C program: #define _GNU_SOURCE #include #include #include #include #include #include static void *mmap_thread(void *_arg) { for (;;) { mmap(NULL, 0x1000000, PROT_READ, MAP_POPULATE|MAP_ANONYMOUS|MAP_PRIVATE, -1, 0); } } static void *fork_thread(void *_arg) { usleep(rand() % 10000); fork(); } int main(void) { fork(); fork(); fork(); for (;;) { if (fork() == 0) { pthread_t t; pthread_create(&t, NULL, mmap_thread, NULL); pthread_create(&t, NULL, fork_thread, NULL); usleep(rand() % 10000); syscall(__NR_exit_group, 0); } wait(NULL); } } No special kernel config options are needed. It usually causes a NULL pointer dereference in __remove_shared_vm_struct() during exit, or in dup_mmap() (which is usually inlined into copy_process()) during fork. Both are due to a vm_area_struct's ->vm_file being used after it's already been freed. Google Bug Id: 64772007 Link: http://lkml.kernel.org/r/20170823211408.31198-1-ebiggers3@gmail.com Fixes: 7c051267931a ("mm, fork: make dup_mmap wait for mmap_sem for write killable") Signed-off-by: Eric Biggers Tested-by: Mark Rutland Acked-by: Michal Hocko Cc: Dmitry Vyukov Cc: Ingo Molnar Cc: Konstantin Khlebnikov Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Vlastimil Babka Cc: [v4.7+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index e075b7780421..cbbea277b3fb 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -806,6 +806,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, mm_init_cpumask(mm); mm_init_aio(mm); mm_init_owner(mm, p); + RCU_INIT_POINTER(mm->exe_file, NULL); mmu_notifier_mm_init(mm); init_tlb_flush_pending(mm); #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS -- cgit v1.2.3 From 0bcdc0987cce9880436b70836c6a92bb8e744fd1 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Fri, 25 Aug 2017 15:57:04 -0700 Subject: time: Fix ktime_get_raw() incorrect base accumulation In comqit fc6eead7c1e2 ("time: Clean up CLOCK_MONOTONIC_RAW time handling"), the following code got mistakenly added to the update of the raw timekeeper: /* Update the monotonic raw base */ seconds = tk->raw_sec; nsec = (u32)(tk->tkr_raw.xtime_nsec >> tk->tkr_raw.shift); tk->tkr_raw.base = ns_to_ktime(seconds * NSEC_PER_SEC + nsec); Which adds the raw_sec value and the shifted down raw xtime_nsec to the base value. But the read function adds the shifted down tk->tkr_raw.xtime_nsec value another time, The result of this is that ktime_get_raw() users (which are all internal users) see the raw time move faster then it should (the rate at which can vary with the current size of tkr_raw.xtime_nsec), which has resulted in at least problems with graphics rendering performance. The change tried to match the monotonic base update logic: seconds = (u64)(tk->xtime_sec + tk->wall_to_monotonic.tv_sec); nsec = (u32) tk->wall_to_monotonic.tv_nsec; tk->tkr_mono.base = ns_to_ktime(seconds * NSEC_PER_SEC + nsec); Which adds the wall_to_monotonic.tv_nsec value, but not the tk->tkr_mono.xtime_nsec value to the base. To fix this, simplify the tkr_raw.base accumulation to only accumulate the raw_sec portion, and do not include the tkr_raw.xtime_nsec portion, which will be added at read time. Fixes: fc6eead7c1e2 ("time: Clean up CLOCK_MONOTONIC_RAW time handling") Reported-and-tested-by: Chris Wilson Signed-off-by: John Stultz Signed-off-by: Thomas Gleixner Cc: Prarit Bhargava Cc: Kevin Brodsky Cc: Richard Cochran Cc: Stephen Boyd Cc: Will Deacon Cc: Miroslav Lichvar Cc: Daniel Mentz Link: http://lkml.kernel.org/r/1503701824-1645-1-git-send-email-john.stultz@linaro.org --- kernel/time/timekeeping.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index cedafa008de5..7e7e61c00d61 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -637,9 +637,7 @@ static inline void tk_update_ktime_data(struct timekeeper *tk) tk->ktime_sec = seconds; /* Update the monotonic raw base */ - seconds = tk->raw_sec; - nsec = (u32)(tk->tkr_raw.xtime_nsec >> tk->tkr_raw.shift); - tk->tkr_raw.base = ns_to_ktime(seconds * NSEC_PER_SEC + nsec); + tk->tkr_raw.base = ns_to_ktime(tk->raw_sec * NSEC_PER_SEC); } /* must hold timekeeper_lock */ -- cgit v1.2.3 From 3510ca20ece0150af6b10c77a74ff1b5c198e3e2 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 27 Aug 2017 13:55:12 -0700 Subject: Minor page waitqueue cleanups Tim Chen and Kan Liang have been battling a customer load that shows extremely long page wakeup lists. The cause seems to be constant NUMA migration of a hot page that is shared across a lot of threads, but the actual root cause for the exact behavior has not been found. Tim has a patch that batches the wait list traversal at wakeup time, so that we at least don't get long uninterruptible cases where we traverse and wake up thousands of processes and get nasty latency spikes. That is likely 4.14 material, but we're still discussing the page waitqueue specific parts of it. In the meantime, I've tried to look at making the page wait queues less expensive, and failing miserably. If you have thousands of threads waiting for the same page, it will be painful. We'll need to try to figure out the NUMA balancing issue some day, in addition to avoiding the excessive spinlock hold times. That said, having tried to rewrite the page wait queues, I can at least fix up some of the braindamage in the current situation. In particular: (a) we don't want to continue walking the page wait list if the bit we're waiting for already got set again (which seems to be one of the patterns of the bad load). That makes no progress and just causes pointless cache pollution chasing the pointers. (b) we don't want to put the non-locking waiters always on the front of the queue, and the locking waiters always on the back. Not only is that unfair, it means that we wake up thousands of reading threads that will just end up being blocked by the writer later anyway. Also add a comment about the layout of 'struct wait_page_key' - there is an external user of it in the cachefiles code that means that it has to match the layout of 'struct wait_bit_key' in the two first members. It so happens to match, because 'struct page *' and 'unsigned long *' end up having the same values simply because the page flags are the first member in struct page. Cc: Tim Chen Cc: Kan Liang Cc: Mel Gorman Cc: Christopher Lameter Cc: Andi Kleen Cc: Davidlohr Bueso Cc: Peter Zijlstra Signed-off-by: Linus Torvalds --- kernel/sched/wait.c | 7 ++++--- mm/filemap.c | 11 ++++++----- 2 files changed, 10 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index 17f11c6b0a9f..d6afed6d0752 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c @@ -70,9 +70,10 @@ static void __wake_up_common(struct wait_queue_head *wq_head, unsigned int mode, list_for_each_entry_safe(curr, next, &wq_head->head, entry) { unsigned flags = curr->flags; - - if (curr->func(curr, mode, wake_flags, key) && - (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive) + int ret = curr->func(curr, mode, wake_flags, key); + if (ret < 0) + break; + if (ret && (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive) break; } } diff --git a/mm/filemap.c b/mm/filemap.c index a49702445ce0..baba290c276b 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -885,6 +885,7 @@ void __init pagecache_init(void) page_writeback_init(); } +/* This has the same layout as wait_bit_key - see fs/cachefiles/rdwr.c */ struct wait_page_key { struct page *page; int bit_nr; @@ -909,8 +910,10 @@ static int wake_page_function(wait_queue_entry_t *wait, unsigned mode, int sync, if (wait_page->bit_nr != key->bit_nr) return 0; + + /* Stop walking if it's locked */ if (test_bit(key->bit_nr, &key->page->flags)) - return 0; + return -1; return autoremove_wake_function(wait, mode, sync, key); } @@ -964,6 +967,7 @@ static inline int wait_on_page_bit_common(wait_queue_head_t *q, int ret = 0; init_wait(wait); + wait->flags = lock ? WQ_FLAG_EXCLUSIVE : 0; wait->func = wake_page_function; wait_page.page = page; wait_page.bit_nr = bit_nr; @@ -972,10 +976,7 @@ static inline int wait_on_page_bit_common(wait_queue_head_t *q, spin_lock_irq(&q->lock); if (likely(list_empty(&wait->entry))) { - if (lock) - __add_wait_queue_entry_tail_exclusive(q, wait); - else - __add_wait_queue(q, wait); + __add_wait_queue_entry_tail(q, wait); SetPageWaiters(page); } -- cgit v1.2.3 From f12f42acdbb577a12eecfcebbbec41c81505c4dc Mon Sep 17 00:00:00 2001 From: Meng Xu Date: Wed, 23 Aug 2017 17:07:50 -0400 Subject: perf/core: Fix potential double-fetch bug While examining the kernel source code, I found a dangerous operation that could turn into a double-fetch situation (a race condition bug) where the same userspace memory region are fetched twice into kernel with sanity checks after the first fetch while missing checks after the second fetch. 1. The first fetch happens in line 9573 get_user(size, &uattr->size). 2. Subsequently the 'size' variable undergoes a few sanity checks and transformations (line 9577 to 9584). 3. The second fetch happens in line 9610 copy_from_user(attr, uattr, size) 4. Given that 'uattr' can be fully controlled in userspace, an attacker can race condition to override 'uattr->size' to arbitrary value (say, 0xFFFFFFFF) after the first fetch but before the second fetch. The changed value will be copied to 'attr->size'. 5. There is no further checks on 'attr->size' until the end of this function, and once the function returns, we lose the context to verify that 'attr->size' conforms to the sanity checks performed in step 2 (line 9577 to 9584). 6. My manual analysis shows that 'attr->size' is not used elsewhere later, so, there is no working exploit against it right now. However, this could easily turns to an exploitable one if careless developers start to use 'attr->size' later. To fix this, override 'attr->size' from the second fetch to the one from the first fetch, regardless of what is actually copied in. In this way, it is assured that 'attr->size' is consistent with the checks performed after the first fetch. Signed-off-by: Meng Xu Acked-by: Peter Zijlstra Cc: Linus Torvalds Cc: Thomas Gleixner Cc: acme@kernel.org Cc: alexander.shishkin@linux.intel.com Cc: meng.xu@gatech.edu Cc: sanidhya@gatech.edu Cc: taesoo@gatech.edu Link: http://lkml.kernel.org/r/1503522470-35531-1-git-send-email-meng.xu@gatech.edu Signed-off-by: Ingo Molnar --- kernel/events/core.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 3504125871d2..ce131d25622a 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -9611,6 +9611,8 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr, if (ret) return -EFAULT; + attr->size = size; + if (attr->__reserved_1) return -EINVAL; -- cgit v1.2.3 From 75e8387685f6c65feb195a4556110b58f852b848 Mon Sep 17 00:00:00 2001 From: Zhou Chengming Date: Fri, 25 Aug 2017 21:49:37 +0800 Subject: perf/ftrace: Fix double traces of perf on ftrace:function When running perf on the ftrace:function tracepoint, there is a bug which can be reproduced by: perf record -e ftrace:function -a sleep 20 & perf record -e ftrace:function ls perf script ls 10304 [005] 171.853235: ftrace:function: perf_output_begin ls 10304 [005] 171.853237: ftrace:function: perf_output_begin ls 10304 [005] 171.853239: ftrace:function: task_tgid_nr_ns ls 10304 [005] 171.853240: ftrace:function: task_tgid_nr_ns ls 10304 [005] 171.853242: ftrace:function: __task_pid_nr_ns ls 10304 [005] 171.853244: ftrace:function: __task_pid_nr_ns We can see that all the function traces are doubled. The problem is caused by the inconsistency of the register function perf_ftrace_event_register() with the probe function perf_ftrace_function_call(). The former registers one probe for every perf_event. And the latter handles all perf_events on the current cpu. So when two perf_events on the current cpu, the traces of them will be doubled. So this patch adds an extra parameter "event" for perf_tp_event, only send sample data to this event when it's not NULL. Signed-off-by: Zhou Chengming Reviewed-by: Jiri Olsa Acked-by: Steven Rostedt (VMware) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: acme@kernel.org Cc: alexander.shishkin@linux.intel.com Cc: huawei.libin@huawei.com Link: http://lkml.kernel.org/r/1503668977-12526-1-git-send-email-zhouchengming1@huawei.com Signed-off-by: Ingo Molnar --- include/linux/perf_event.h | 2 +- include/linux/trace_events.h | 4 ++-- kernel/events/core.c | 13 +++++++++---- kernel/trace/trace_event_perf.c | 4 +++- kernel/trace/trace_kprobe.c | 4 ++-- kernel/trace/trace_syscalls.c | 4 ++-- kernel/trace/trace_uprobe.c | 2 +- 7 files changed, 20 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index b14095bcf4bb..c00cd4b02f32 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1201,7 +1201,7 @@ extern void perf_event_init(void); extern void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size, struct pt_regs *regs, struct hlist_head *head, int rctx, - struct task_struct *task); + struct task_struct *task, struct perf_event *event); extern void perf_bp_event(struct perf_event *event, void *data); #ifndef perf_misc_flags diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 536c80ff7ad9..5012b524283d 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -508,9 +508,9 @@ void perf_trace_run_bpf_submit(void *raw_data, int size, int rctx, static inline void perf_trace_buf_submit(void *raw_data, int size, int rctx, u16 type, u64 count, struct pt_regs *regs, void *head, - struct task_struct *task) + struct task_struct *task, struct perf_event *event) { - perf_tp_event(type, count, raw_data, size, regs, head, rctx, task); + perf_tp_event(type, count, raw_data, size, regs, head, rctx, task, event); } #endif diff --git a/kernel/events/core.c b/kernel/events/core.c index ce131d25622a..03ac9c8b02fb 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7906,16 +7906,15 @@ void perf_trace_run_bpf_submit(void *raw_data, int size, int rctx, } } perf_tp_event(call->event.type, count, raw_data, size, regs, head, - rctx, task); + rctx, task, NULL); } EXPORT_SYMBOL_GPL(perf_trace_run_bpf_submit); void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size, struct pt_regs *regs, struct hlist_head *head, int rctx, - struct task_struct *task) + struct task_struct *task, struct perf_event *event) { struct perf_sample_data data; - struct perf_event *event; struct perf_raw_record raw = { .frag = { @@ -7929,9 +7928,15 @@ void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size, perf_trace_buf_update(record, event_type); - hlist_for_each_entry_rcu(event, head, hlist_entry) { + /* Use the given event instead of the hlist */ + if (event) { if (perf_tp_event_match(event, &data, regs)) perf_swevent_event(event, count, &data, regs); + } else { + hlist_for_each_entry_rcu(event, head, hlist_entry) { + if (perf_tp_event_match(event, &data, regs)) + perf_swevent_event(event, count, &data, regs); + } } /* diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index 562fa69df5d3..13ba2d3f6a91 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -306,6 +306,7 @@ static void perf_ftrace_function_call(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *ops, struct pt_regs *pt_regs) { + struct perf_event *event; struct ftrace_entry *entry; struct hlist_head *head; struct pt_regs regs; @@ -329,8 +330,9 @@ perf_ftrace_function_call(unsigned long ip, unsigned long parent_ip, entry->ip = ip; entry->parent_ip = parent_ip; + event = container_of(ops, struct perf_event, ftrace_ops); perf_trace_buf_submit(entry, ENTRY_SIZE, rctx, TRACE_FN, - 1, ®s, head, NULL); + 1, ®s, head, NULL, event); #undef ENTRY_SIZE } diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index c9b5aa10fbf9..8a907e12b6b9 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1200,7 +1200,7 @@ kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs) memset(&entry[1], 0, dsize); store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize); perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs, - head, NULL); + head, NULL, NULL); } NOKPROBE_SYMBOL(kprobe_perf_func); @@ -1236,7 +1236,7 @@ kretprobe_perf_func(struct trace_kprobe *tk, struct kretprobe_instance *ri, entry->ret_ip = (unsigned long)ri->ret_addr; store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize); perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs, - head, NULL); + head, NULL, NULL); } NOKPROBE_SYMBOL(kretprobe_perf_func); #endif /* CONFIG_PERF_EVENTS */ diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 5e10395da88e..74d9a86eccc0 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -596,7 +596,7 @@ static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) (unsigned long *)&rec->args); perf_trace_buf_submit(rec, size, rctx, sys_data->enter_event->event.type, 1, regs, - head, NULL); + head, NULL, NULL); } static int perf_sysenter_enable(struct trace_event_call *call) @@ -667,7 +667,7 @@ static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret) rec->nr = syscall_nr; rec->ret = syscall_get_return_value(current, regs); perf_trace_buf_submit(rec, size, rctx, sys_data->exit_event->event.type, - 1, regs, head, NULL); + 1, regs, head, NULL, NULL); } static int perf_sysexit_enable(struct trace_event_call *call) diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index a7581fec9681..4525e0271a53 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -1156,7 +1156,7 @@ static void __uprobe_perf_func(struct trace_uprobe *tu, } perf_trace_buf_submit(entry, size, rctx, call->event.type, 1, regs, - head, NULL); + head, NULL, NULL); out: preempt_enable(); } -- cgit v1.2.3 From 22cf8bc6cb0d48c6d31da8eccbd4201ab9b0c4c6 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Thu, 31 Aug 2017 16:15:23 -0700 Subject: kernel/kthread.c: kthread_worker: don't hog the cpu If the worker thread continues getting work, it will hog the cpu and rcu stall complains. Make it a good citizen. This is triggered in a loop block device test. Link: http://lkml.kernel.org/r/5de0a179b3184e1a2183fc503448b0269f24d75b.1503697127.git.shli@fb.com Signed-off-by: Shaohua Li Cc: Petr Mladek Cc: Thomas Gleixner Cc: Tejun Heo Cc: Oleg Nesterov Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kthread.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/kthread.c b/kernel/kthread.c index 26db528c1d88..1c19edf82427 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -637,6 +637,7 @@ repeat: schedule(); try_to_freeze(); + cond_resched(); goto repeat; } EXPORT_SYMBOL_GPL(kthread_worker_fn); -- cgit v1.2.3 From 355627f518978b5167256d27492fe0b343aaf2f2 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Thu, 31 Aug 2017 16:15:26 -0700 Subject: mm, uprobes: fix multiple free of ->uprobes_state.xol_area Commit 7c051267931a ("mm, fork: make dup_mmap wait for mmap_sem for write killable") made it possible to kill a forking task while it is waiting to acquire its ->mmap_sem for write, in dup_mmap(). However, it was overlooked that this introduced an new error path before the new mm_struct's ->uprobes_state.xol_area has been set to NULL after being copied from the old mm_struct by the memcpy in dup_mm(). For a task that has previously hit a uprobe tracepoint, this resulted in the 'struct xol_area' being freed multiple times if the task was killed at just the right time while forking. Fix it by setting ->uprobes_state.xol_area to NULL in mm_init() rather than in uprobe_dup_mmap(). With CONFIG_UPROBE_EVENTS=y, the bug can be reproduced by the same C program given by commit 2b7e8665b4ff ("fork: fix incorrect fput of ->exe_file causing use-after-free"), provided that a uprobe tracepoint has been set on the fork_thread() function. For example: $ gcc reproducer.c -o reproducer -lpthread $ nm reproducer | grep fork_thread 0000000000400719 t fork_thread $ echo "p $PWD/reproducer:0x719" > /sys/kernel/debug/tracing/uprobe_events $ echo 1 > /sys/kernel/debug/tracing/events/uprobes/enable $ ./reproducer Here is the use-after-free reported by KASAN: BUG: KASAN: use-after-free in uprobe_clear_state+0x1c4/0x200 Read of size 8 at addr ffff8800320a8b88 by task reproducer/198 CPU: 1 PID: 198 Comm: reproducer Not tainted 4.13.0-rc7-00015-g36fde05f3fb5 #255 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-20170228_101828-anatol 04/01/2014 Call Trace: dump_stack+0xdb/0x185 print_address_description+0x7e/0x290 kasan_report+0x23b/0x350 __asan_report_load8_noabort+0x19/0x20 uprobe_clear_state+0x1c4/0x200 mmput+0xd6/0x360 do_exit+0x740/0x1670 do_group_exit+0x13f/0x380 get_signal+0x597/0x17d0 do_signal+0x99/0x1df0 exit_to_usermode_loop+0x166/0x1e0 syscall_return_slowpath+0x258/0x2c0 entry_SYSCALL_64_fastpath+0xbc/0xbe ... Allocated by task 199: save_stack_trace+0x1b/0x20 kasan_kmalloc+0xfc/0x180 kmem_cache_alloc_trace+0xf3/0x330 __create_xol_area+0x10f/0x780 uprobe_notify_resume+0x1674/0x2210 exit_to_usermode_loop+0x150/0x1e0 prepare_exit_to_usermode+0x14b/0x180 retint_user+0x8/0x20 Freed by task 199: save_stack_trace+0x1b/0x20 kasan_slab_free+0xa8/0x1a0 kfree+0xba/0x210 uprobe_clear_state+0x151/0x200 mmput+0xd6/0x360 copy_process.part.8+0x605f/0x65d0 _do_fork+0x1a5/0xbd0 SyS_clone+0x19/0x20 do_syscall_64+0x22f/0x660 return_from_SYSCALL_64+0x0/0x7a Note: without KASAN, you may instead see a "Bad page state" message, or simply a general protection fault. Link: http://lkml.kernel.org/r/20170830033303.17927-1-ebiggers3@gmail.com Fixes: 7c051267931a ("mm, fork: make dup_mmap wait for mmap_sem for write killable") Signed-off-by: Eric Biggers Reported-by: Oleg Nesterov Acked-by: Oleg Nesterov Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Dmitry Vyukov Cc: Ingo Molnar Cc: Konstantin Khlebnikov Cc: Mark Rutland Cc: Michal Hocko Cc: Peter Zijlstra Cc: Vlastimil Babka Cc: [4.7+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/events/uprobes.c | 2 -- kernel/fork.c | 8 ++++++++ 2 files changed, 8 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 0e137f98a50c..267f6ef91d97 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1262,8 +1262,6 @@ void uprobe_end_dup_mmap(void) void uprobe_dup_mmap(struct mm_struct *oldmm, struct mm_struct *newmm) { - newmm->uprobes_state.xol_area = NULL; - if (test_bit(MMF_HAS_UPROBES, &oldmm->flags)) { set_bit(MMF_HAS_UPROBES, &newmm->flags); /* unconditionally, dup_mmap() skips VM_DONTCOPY vmas */ diff --git a/kernel/fork.c b/kernel/fork.c index cbbea277b3fb..b7e9e57b71ea 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -785,6 +785,13 @@ static void mm_init_owner(struct mm_struct *mm, struct task_struct *p) #endif } +static void mm_init_uprobes_state(struct mm_struct *mm) +{ +#ifdef CONFIG_UPROBES + mm->uprobes_state.xol_area = NULL; +#endif +} + static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, struct user_namespace *user_ns) { @@ -812,6 +819,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS mm->pmd_huge_pte = NULL; #endif + mm_init_uprobes_state(mm); if (current->mm) { mm->flags = current->mm->flags & MMF_INIT_MASK; -- cgit v1.2.3