From 488603b815a7514c7009e6fc339d74ed4a30f343 Mon Sep 17 00:00:00 2001 From: Scott Wood Date: Sat, 11 Jan 2020 04:53:38 -0500 Subject: sched/core: Don't skip remote tick for idle CPUs This will be used in the next patch to get a loadavg update from nohz cpus. The delta check is skipped because idle_sched_class doesn't update se.exec_start. Signed-off-by: Scott Wood Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lkml.kernel.org/r/1578736419-14628-2-git-send-email-swood@redhat.com --- kernel/sched/core.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index fc1dfc007604..cf8b33dc4513 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3669,22 +3669,24 @@ static void sched_tick_remote(struct work_struct *work) * statistics and checks timeslices in a time-independent way, regardless * of when exactly it is running. */ - if (idle_cpu(cpu) || !tick_nohz_tick_stopped_cpu(cpu)) + if (!tick_nohz_tick_stopped_cpu(cpu)) goto out_requeue; rq_lock_irq(rq, &rf); curr = rq->curr; - if (is_idle_task(curr) || cpu_is_offline(cpu)) + if (cpu_is_offline(cpu)) goto out_unlock; update_rq_clock(rq); - delta = rq_clock_task(rq) - curr->se.exec_start; - /* - * Make sure the next tick runs within a reasonable - * amount of time. - */ - WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3); + if (!is_idle_task(curr)) { + /* + * Make sure the next tick runs within a reasonable + * amount of time. + */ + delta = rq_clock_task(rq) - curr->se.exec_start; + WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3); + } curr->sched_class->task_tick(rq, curr, 0); out_unlock: -- cgit v1.2.3 From ebc0f83c78a2d26384401ecf2d2fa48063c0ee27 Mon Sep 17 00:00:00 2001 From: "Peter Zijlstra (Intel)" Date: Sat, 11 Jan 2020 04:53:39 -0500 Subject: timers/nohz: Update NOHZ load in remote tick The way loadavg is tracked during nohz only pays attention to the load upon entering nohz. This can be particularly noticeable if full nohz is entered while non-idle, and then the cpu goes idle and stays that way for a long time. Use the remote tick to ensure that full nohz cpus report their deltas within a reasonable time. [ swood: Added changelog and removed recheck of stopped tick. 
] Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Scott Wood Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lkml.kernel.org/r/1578736419-14628-3-git-send-email-swood@redhat.com --- include/linux/sched/nohz.h | 2 ++ kernel/sched/core.c | 4 +++- kernel/sched/loadavg.c | 33 +++++++++++++++++++++++---------- 3 files changed, 28 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched/nohz.h b/include/linux/sched/nohz.h index 1abe91ff6e4a..6d67e9a5af6b 100644 --- a/include/linux/sched/nohz.h +++ b/include/linux/sched/nohz.h @@ -15,9 +15,11 @@ static inline void nohz_balance_enter_idle(int cpu) { } #ifdef CONFIG_NO_HZ_COMMON void calc_load_nohz_start(void); +void calc_load_nohz_remote(struct rq *rq); void calc_load_nohz_stop(void); #else static inline void calc_load_nohz_start(void) { } +static inline void calc_load_nohz_remote(struct rq *rq) { } static inline void calc_load_nohz_stop(void) { } #endif /* CONFIG_NO_HZ_COMMON */ diff --git a/kernel/sched/core.c b/kernel/sched/core.c index cf8b33dc4513..4ff03c27779e 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3677,6 +3677,7 @@ static void sched_tick_remote(struct work_struct *work) if (cpu_is_offline(cpu)) goto out_unlock; + curr = rq->curr; update_rq_clock(rq); if (!is_idle_task(curr)) { @@ -3689,10 +3690,11 @@ static void sched_tick_remote(struct work_struct *work) } curr->sched_class->task_tick(rq, curr, 0); + calc_load_nohz_remote(rq); out_unlock: rq_unlock_irq(rq, &rf); - out_requeue: + /* * Run the remote tick once per second (1Hz). This arbitrary * frequency is large enough to avoid overload but short enough diff --git a/kernel/sched/loadavg.c b/kernel/sched/loadavg.c index 28a516575c18..de22da666ac7 100644 --- a/kernel/sched/loadavg.c +++ b/kernel/sched/loadavg.c @@ -231,16 +231,11 @@ static inline int calc_load_read_idx(void) return calc_load_idx & 1; } -void calc_load_nohz_start(void) +static void calc_load_nohz_fold(struct rq *rq) { - struct rq *this_rq = this_rq(); long delta; - /* - * We're going into NO_HZ mode, if there's any pending delta, fold it - * into the pending NO_HZ delta. - */ - delta = calc_load_fold_active(this_rq, 0); + delta = calc_load_fold_active(rq, 0); if (delta) { int idx = calc_load_write_idx(); @@ -248,6 +243,24 @@ void calc_load_nohz_start(void) } } +void calc_load_nohz_start(void) +{ + /* + * We're going into NO_HZ mode, if there's any pending delta, fold it + * into the pending NO_HZ delta. + */ + calc_load_nohz_fold(this_rq()); +} + +/* + * Keep track of the load for NOHZ_FULL, must be called between + * calc_load_nohz_{start,stop}(). + */ +void calc_load_nohz_remote(struct rq *rq) +{ + calc_load_nohz_fold(rq); +} + void calc_load_nohz_stop(void) { struct rq *this_rq = this_rq(); @@ -268,7 +281,7 @@ void calc_load_nohz_stop(void) this_rq->calc_load_update += LOAD_FREQ; } -static long calc_load_nohz_fold(void) +static long calc_load_nohz_read(void) { int idx = calc_load_read_idx(); long delta = 0; @@ -323,7 +336,7 @@ static void calc_global_nohz(void) } #else /* !CONFIG_NO_HZ_COMMON */ -static inline long calc_load_nohz_fold(void) { return 0; } +static inline long calc_load_nohz_read(void) { return 0; } static inline void calc_global_nohz(void) { } #endif /* CONFIG_NO_HZ_COMMON */ @@ -346,7 +359,7 @@ void calc_global_load(unsigned long ticks) /* * Fold the 'old' NO_HZ-delta to include all NO_HZ CPUs. 
*/ - delta = calc_load_nohz_fold(); + delta = calc_load_nohz_read(); if (delta) atomic_long_add(delta, &calc_load_tasks); -- cgit v1.2.3 From b396f52326de20ec974471b7b19168867b365cbf Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 14 Jan 2020 10:13:20 +0000 Subject: sched/fair: Allow a small load imbalance between low utilisation SD_NUMA domains The CPU load balancer balances between different domains to spread load and strives to have equal balance everywhere. Communicating tasks can migrate so they are topologically close to each other but these decisions are independent. On a lightly loaded NUMA machine, two communicating tasks pulled together at wakeup time can be pushed apart by the load balancer. In isolation, the load balancer decision is fine but it ignores the tasks' data locality and the wakeup/LB paths continually conflict. NUMA balancing is also a factor but it, too, simply conflicts with the load balancer. This patch allows a fixed degree of imbalance of two tasks to exist between NUMA domains regardless of utilisation levels. In many cases, this prevents communicating tasks from being pulled apart. It was evaluated whether the imbalance should be scaled to the domain size. However, no additional benefit was measured across a range of workloads and machines and scaling adds the risk that lower domains have to be rebalanced. While this could change again in the future, such a change should specify the use case and benefit. The most obvious impact is on netperf TCP_STREAM -- two simple communicating tasks with some softirq offload depending on the transmission rate. 2-socket Haswell machine 48 core, HT enabled netperf-tcp -- mmtests config config-network-netperf-unbound baseline lbnuma-v3 Hmean 64 568.73 ( 0.00%) 577.56 * 1.55%* Hmean 128 1089.98 ( 0.00%) 1128.06 * 3.49%* Hmean 256 2061.72 ( 0.00%) 2104.39 * 2.07%* Hmean 1024 7254.27 ( 0.00%) 7557.52 * 4.18%* Hmean 2048 11729.20 ( 0.00%) 13350.67 * 13.82%* Hmean 3312 15309.08 ( 0.00%) 18058.95 * 17.96%* Hmean 4096 17338.75 ( 0.00%) 20483.66 * 18.14%* Hmean 8192 25047.12 ( 0.00%) 27806.84 * 11.02%* Hmean 16384 27359.55 ( 0.00%) 33071.88 * 20.88%* Stddev 64 2.16 ( 0.00%) 2.02 ( 6.53%) Stddev 128 2.31 ( 0.00%) 2.19 ( 5.05%) Stddev 256 11.88 ( 0.00%) 3.22 ( 72.88%) Stddev 1024 23.68 ( 0.00%) 7.24 ( 69.43%) Stddev 2048 79.46 ( 0.00%) 71.49 ( 10.03%) Stddev 3312 26.71 ( 0.00%) 57.80 (-116.41%) Stddev 4096 185.57 ( 0.00%) 96.15 ( 48.19%) Stddev 8192 245.80 ( 0.00%) 100.73 ( 59.02%) Stddev 16384 207.31 ( 0.00%) 141.65 ( 31.67%) In this case, there was a sizable improvement to performance and a general reduction in variance. However, this is not universal. For most machines, the impact was roughly a 3% performance gain. Ops NUMA base-page range updates 19796.00 292.00 Ops NUMA PTE updates 19796.00 292.00 Ops NUMA PMD updates 0.00 0.00 Ops NUMA hint faults 16113.00 143.00 Ops NUMA hint local faults % 8407.00 142.00 Ops NUMA hint local percent 52.18 99.30 Ops NUMA pages migrated 4244.00 1.00 Without the patch, only 52.18% of sampled accesses are local. In an earlier changelog, 100% of sampled accesses were local and indeed on most machines, this was still the case. In this specific case, the local sampled rate was 99.3% but note the "base-page range updates" and "PTE updates". The activity with the patch is negligible, as was the number of faults. The small number of pages migrated were related to shared libraries.
A 2-socket Broadwell showed better results on average but they are not presented for brevity as the performance was similar, except that it showed 100% of the sampled NUMA hints were local. The patch holds up for a 4-socket Haswell, an AMD EPYC and an AMD EPYC 2 machine. For dbench, the impact depends on the filesystem used and the number of clients. On XFS, there is little difference as the clients typically communicate with workqueues which have a separate class of scheduler problem at the moment. For ext4, performance is generally better, particularly for small numbers of clients as NUMA balancing activity is negligible with the patch applied. A more interesting example is the Facebook schbench which uses a number of messaging threads to communicate with worker threads. In this configuration, one messaging thread is used per NUMA node and the number of worker threads is varied. The 50, 75, 90, 95, 99, 99.5 and 99.9 percentiles for response latency are then reported. Lat 50.00th-qrtle-1 44.00 ( 0.00%) 37.00 ( 15.91%) Lat 75.00th-qrtle-1 53.00 ( 0.00%) 41.00 ( 22.64%) Lat 90.00th-qrtle-1 57.00 ( 0.00%) 42.00 ( 26.32%) Lat 95.00th-qrtle-1 63.00 ( 0.00%) 43.00 ( 31.75%) Lat 99.00th-qrtle-1 76.00 ( 0.00%) 51.00 ( 32.89%) Lat 99.50th-qrtle-1 89.00 ( 0.00%) 52.00 ( 41.57%) Lat 99.90th-qrtle-1 98.00 ( 0.00%) 55.00 ( 43.88%) Lat 50.00th-qrtle-2 42.00 ( 0.00%) 42.00 ( 0.00%) Lat 75.00th-qrtle-2 48.00 ( 0.00%) 47.00 ( 2.08%) Lat 90.00th-qrtle-2 53.00 ( 0.00%) 52.00 ( 1.89%) Lat 95.00th-qrtle-2 55.00 ( 0.00%) 53.00 ( 3.64%) Lat 99.00th-qrtle-2 62.00 ( 0.00%) 60.00 ( 3.23%) Lat 99.50th-qrtle-2 63.00 ( 0.00%) 63.00 ( 0.00%) Lat 99.90th-qrtle-2 68.00 ( 0.00%) 66.00 ( 2.94%) For higher worker thread counts, the differences become negligible but it's interesting to note the difference in wakeup latency at low utilisation, and mpstat confirms that activity was almost all on one node until the number of worker threads increases. Hackbench generally showed neutral results across a range of machines. This is different to earlier versions of the patch which allowed imbalances for higher degrees of utilisation. perf bench pipe showed negligible differences in overall performance as the differences are very close to the noise. An earlier prototype of the patch showed major regressions for NAS C-class when running with only half of the available CPUs -- 20-30% performance hits were measured at the time. With this version of the patch, the impact is negligible with small gains/losses within the measured noise. This is because the number of threads far exceeds the small imbalance the patch cares about. Similarly, there were reports of regressions for the autonuma benchmark against earlier versions but again, normal load balancing now applies for that workload. In general, the patch simply seeks to avoid unnecessary cross-node migrations in the basic case where imbalances are very small. For low utilisation communicating workloads, this patch generally behaves better with less NUMA balancing activity. For high utilisation, there is no change in behaviour.
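In isolation, the heuristic can be modelled as below. This is a standalone sketch, not the kernel code: the function name and scaffolding are made up for illustration, and only the imbalance_min = 2 cut-off and the busiest->sum_nr_running comparison come from the diff below.

/* Standalone model of the SD_NUMA special case (illustrative names;
 * the real change is in calculate_imbalance(), see the diff below).
 * When the busiest group runs no more than two tasks (a presumed
 * pair of communicating tasks), the computed imbalance is ignored
 * so the pair is left in place.
 */
#include <stdio.h>

static long adjust_numa_imbalance(long imbalance, unsigned int busiest_nr_running)
{
	const unsigned int imbalance_min = 2;	/* fixed; deliberately not scaled by domain size */

	if (busiest_nr_running <= imbalance_min)
		return 0;	/* tolerate the small imbalance */
	return imbalance;
}

int main(void)
{
	printf("%ld\n", adjust_numa_imbalance(1, 2));	/* 0: pair left alone */
	printf("%ld\n", adjust_numa_imbalance(4, 8));	/* 4: normal balancing */
	return 0;
}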
Signed-off-by: Mel Gorman Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Reviewed-by: Valentin Schneider Reviewed-by: Vincent Guittot Reviewed-by: Srikar Dronamraju Acked-by: Phil Auld Tested-by: Phil Auld Link: https://lkml.kernel.org/r/20200114101319.GO3466@techsingularity.net --- kernel/sched/fair.c | 41 +++++++++++++++++++++++++++++------------ 1 file changed, 29 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index fe4e0d775375..25dffc03f0f6 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -8658,10 +8658,6 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s /* * Try to use spare capacity of local group without overloading it or * emptying busiest. - * XXX Spreading tasks across NUMA nodes is not always the best policy - * and special care should be taken for SD_NUMA domain level before - * spreading the tasks. For now, load_balance() fully relies on - * NUMA_BALANCING and fbq_classify_group/rq to override the decision. */ if (local->group_type == group_has_spare) { if (busiest->group_type > group_fully_busy) { @@ -8701,16 +8697,37 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s env->migration_type = migrate_task; lsub_positive(&nr_diff, local->sum_nr_running); env->imbalance = nr_diff >> 1; - return; - } + } else { - /* - * If there is no overload, we just want to even the number of - * idle cpus. - */ - env->migration_type = migrate_task; - env->imbalance = max_t(long, 0, (local->idle_cpus - + /* + * If there is no overload, we just want to even the number of + * idle cpus. + */ + env->migration_type = migrate_task; + env->imbalance = max_t(long, 0, (local->idle_cpus - busiest->idle_cpus) >> 1); + } + + /* Consider allowing a small imbalance between NUMA groups */ + if (env->sd->flags & SD_NUMA) { + unsigned int imbalance_min; + + /* + * Compute an allowed imbalance based on a simple + * pair of communicating tasks that should remain + * local and ignore them. + * + * NOTE: Generally this would have been based on + * the domain size and this was evaluated. However, + * the benefit is similar across a range of workloads + * and machines but scaling by the domain size adds + * the risk that lower domains have to be rebalanced. + */ + imbalance_min = 2; + if (busiest->sum_nr_running <= imbalance_min) + env->imbalance = 0; + } + return; } -- cgit v1.2.3 From b562d140649966d4daedd0483a8fe59ad3bb465a Mon Sep 17 00:00:00 2001 From: Qais Yousef Date: Tue, 14 Jan 2020 21:09:47 +0000 Subject: sched/uclamp: Reject negative values in cpu_uclamp_write() The check to ensure that the new written value into cpu.uclamp.{min,max} is within range, [0:100], wasn't working because of the signed comparison 7301 if (req.percent > UCLAMP_PERCENT_SCALE) { 7302 req.ret = -ERANGE; 7303 return req; 7304 } # echo -1 > cpu.uclamp.min # cat cpu.uclamp.min 42949671.96 Cast req.percent into u64 to force the comparison to be unsigned and work as intended in capacity_from_percent(). 
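The pitfall is easy to reproduce in plain userspace C; a minimal sketch follows (the UCLAMP_PERCENT_SCALE value of 10000, i.e. 100.00%, is assumed here purely for illustration):

/* Minimal reproduction of the signed-comparison bug: a negative
 * percent value slips past the range check until it is compared as
 * unsigned.
 */
#include <stdio.h>
#include <stdint.h>

#define UCLAMP_PERCENT_SCALE 10000	/* assumed scale, for illustration */

int main(void)
{
	int64_t percent = -1;	/* what "echo -1" ends up as after parsing */

	if (percent > UCLAMP_PERCENT_SCALE)
		puts("signed: rejected");	/* never printed: -1 < 10000 */

	if ((uint64_t)percent > UCLAMP_PERCENT_SCALE)
		puts("unsigned: rejected");	/* printed: ~0ULL > 10000 */

	return 0;
}

With the cast applied, the same write is rejected, as shown below.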
# echo -1 > cpu.uclamp.min sh: write error: Numerical result out of range Fixes: 2480c093130f ("sched/uclamp: Extend CPU's cgroup controller") Signed-off-by: Qais Yousef Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lkml.kernel.org/r/20200114210947.14083-1-qais.yousef@arm.com --- kernel/sched/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 4ff03c27779e..55b9a9c53b91 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7264,7 +7264,7 @@ capacity_from_percent(char *buf) &req.percent); if (req.ret) return req; - if (req.percent > UCLAMP_PERCENT_SCALE) { + if ((u64)req.percent > UCLAMP_PERCENT_SCALE) { req.ret = -ERANGE; return req; } -- cgit v1.2.3 From e938b9c94164e4d981039f1cf6007d7453883e5a Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Mon, 13 Jan 2020 08:50:27 +0800 Subject: sched/nohz: Optimize get_nohz_timer_target() On one machine, CPU 0 is used for housekeeping and the other 39 CPUs in the same socket are in nohz_full mode. With ftrace, we can observe a large amount of time burned in the loop searching for the nearest busy housekeeping CPU. 2) | get_nohz_timer_target() { 2) 0.240 us | housekeeping_test_cpu(); 2) 0.458 us | housekeeping_test_cpu(); ... 2) 0.292 us | housekeeping_test_cpu(); 2) 0.240 us | housekeeping_test_cpu(); 2) 0.227 us | housekeeping_any_cpu(); 2) + 43.460 us | } This patch optimizes the search logic by looking for the nearest housekeeping CPU directly in the housekeeping cpumask, which reduces the worst-case search time from ~44us to < 10us in my testing. In addition, the last iterated busy housekeeper was a fairly random fallback candidate, while the current CPU is a better fallback if it is a housekeeper. Signed-off-by: Wanpeng Li Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Reviewed-by: Frederic Weisbecker Link: https://lkml.kernel.org/r/1578876627-11938-1-git-send-email-wanpengli@tencent.com --- kernel/sched/core.c | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 55b9a9c53b91..a8a5d5b6f5cf 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -552,27 +552,32 @@ void resched_cpu(int cpu) */ int get_nohz_timer_target(void) { - int i, cpu = smp_processor_id(); + int i, cpu = smp_processor_id(), default_cpu = -1; struct sched_domain *sd; - if (!idle_cpu(cpu) && housekeeping_cpu(cpu, HK_FLAG_TIMER)) - return cpu; + if (housekeeping_cpu(cpu, HK_FLAG_TIMER)) { + if (!idle_cpu(cpu)) + return cpu; + default_cpu = cpu; + } rcu_read_lock(); for_each_domain(cpu, sd) { - for_each_cpu(i, sched_domain_span(sd)) { + for_each_cpu_and(i, sched_domain_span(sd), + housekeeping_cpumask(HK_FLAG_TIMER)) { if (cpu == i) continue; - if (!idle_cpu(i) && housekeeping_cpu(i, HK_FLAG_TIMER)) { + if (!idle_cpu(i)) { cpu = i; goto unlock; } } } - if (!housekeeping_cpu(cpu, HK_FLAG_TIMER)) - cpu = housekeeping_any_cpu(HK_FLAG_TIMER); + if (default_cpu == -1) + default_cpu = housekeeping_any_cpu(HK_FLAG_TIMER); + cpu = default_cpu; unlock: rcu_read_unlock(); return cpu; -- cgit v1.2.3 From 2a4b03ffc69f2dedc6388e9a6438b5f4c133a40d Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Tue, 14 Jan 2020 15:13:56 +0100 Subject: sched/fair: Prevent unlimited runtime on throttled group When a running task is moved onto a throttled task group and there is no other task enqueued on the CPU, the task can keep running using 100% CPU whatever the allocated bandwidth for the group and
although its cfs_rq is throttled. Furthermore, the group entity of the cfs_rq and its parents are not enqueued but only set as curr on their respective cfs_rqs. We have the following sequence: sched_move_task -dequeue_task: dequeue task and group_entities. -put_prev_task: put task and group entities. -sched_change_group: move task to new group. -enqueue_task: enqueue only task but not group entities because cfs_rq is throttled. -set_next_task: set task and group_entities as current sched_entity of their cfs_rq. Another impact is that the runnable_load_avg at the root rq stays null because the group_entities are not enqueued. This situation will stay the same until an "external" event triggers a reschedule. Let's trigger it immediately instead. Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Acked-by: Ben Segall Link: https://lkml.kernel.org/r/1579011236-31256-1-git-send-email-vincent.guittot@linaro.org --- kernel/sched/core.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index a8a5d5b6f5cf..89e54f3ed571 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7072,8 +7072,15 @@ void sched_move_task(struct task_struct *tsk) if (queued) enqueue_task(rq, tsk, queue_flags); - if (running) + if (running) { set_next_task(rq, tsk); + /* + * After changing group, the running task may have joined a + * throttled one but it's still the running task. Trigger a + * resched to make sure that task can still run. + */ + resched_curr(rq); + } task_rq_unlock(rq, tsk, &rf); } -- cgit v1.2.3 From 8c8c5a4994a306c217fd061cbfc5903399fd4c1c Mon Sep 17 00:00:00 2001 From: Nicolas Saenz Julienne Date: Fri, 10 Jan 2020 18:19:33 +0100 Subject: dma-contiguous: CMA: give precedence to cmdline Although the device tree might contain a reserved-memory DT node dedicated as the default CMA pool, users might want to change CMA's parameters using the kernel command line for debugging purposes and whatnot. Honor this by bypassing the reserved memory CMA setup, which will ultimately end up freeing the memblock and allowing the command line CMA configuration routine to run. Signed-off-by: Nicolas Saenz Julienne Reviewed-by: Phil Elwell Signed-off-by: Christoph Hellwig --- kernel/dma/contiguous.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/dma/contiguous.c b/kernel/dma/contiguous.c index daa4e6eefdde..8bc6f2d670f9 100644 --- a/kernel/dma/contiguous.c +++ b/kernel/dma/contiguous.c @@ -302,9 +302,16 @@ static int __init rmem_cma_setup(struct reserved_mem *rmem) phys_addr_t align = PAGE_SIZE << max(MAX_ORDER - 1, pageblock_order); phys_addr_t mask = align - 1; unsigned long node = rmem->fdt_node; + bool default_cma = of_get_flat_dt_prop(node, "linux,cma-default", NULL); struct cma *cma; int err; + if (size_cmdline != -1 && default_cma) { + pr_info("Reserved memory: bypass %s node, using cmdline CMA params instead\n", + rmem->name); + return -EBUSY; + } + if (!of_get_flat_dt_prop(node, "reusable", NULL) || of_get_flat_dt_prop(node, "no-map", NULL)) return -EINVAL; @@ -322,7 +329,7 @@ static int __init rmem_cma_setup(struct reserved_mem *rmem) /* Architecture specific contiguous memory fixup.
*/ dma_contiguous_early_fixup(rmem->base, rmem->size); - if (of_get_flat_dt_prop(node, "linux,cma-default", NULL)) + if (default_cma) dma_contiguous_set_default(cma); rmem->ops = &rmem_cma_ops; -- cgit v1.2.3 From 91ef26f914171cf753330f13724fd9142b5b1640 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 3 Feb 2020 18:11:10 +0100 Subject: dma-direct: relax addressability checks in dma_direct_supported dma_direct_supported tries to find the minimum addressable bitmask based on the end pfn and optional magic that architectures can use to communicate the size of the magic ZONE_DMA that can be used for bounce buffering. But between the DMA offsets that can change per device (or sometimes even per region), the fact that ZONE_DMA isn't even guaranteed to be the lowest addresses, and the failure to have proper interfaces to the MM code, this fails for at least one arm subarchitecture. As all the legacy DMA implementations have supported 32-bit DMA masks, and 32-bit masks are guaranteed to always work by the API contract (using bounce buffers if needed), we can short-circuit the complicated check and always return true without breaking existing assumptions. Hopefully we can properly clean up the interaction with the arch-defined zones and the bootmem allocator eventually. Fixes: ad3c7b18c5b3 ("arm: use swiotlb for bounce buffering on LPAE configs") Reported-by: Peter Ujfalusi Signed-off-by: Christoph Hellwig Tested-by: Peter Ujfalusi --- kernel/dma/direct.c | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index 6af7ae83c4ad..32ec69cdba54 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -472,28 +472,26 @@ int dma_direct_mmap(struct device *dev, struct vm_area_struct *vma, } #endif /* CONFIG_MMU */ -/* - * Because 32-bit DMA masks are so common we expect every architecture to be - * able to satisfy them - either by not supporting more physical memory, or by - * providing a ZONE_DMA32. If neither is the case, the architecture needs to - * use an IOMMU instead of the direct mapping. - */ int dma_direct_supported(struct device *dev, u64 mask) { - u64 min_mask; - - if (IS_ENABLED(CONFIG_ZONE_DMA)) - min_mask = DMA_BIT_MASK(zone_dma_bits); - else - min_mask = DMA_BIT_MASK(32); + u64 min_mask = (max_pfn - 1) << PAGE_SHIFT; - min_mask = min_t(u64, min_mask, (max_pfn - 1) << PAGE_SHIFT); + /* + * Because 32-bit DMA masks are so common we expect every architecture + * to be able to satisfy them - either by not supporting more physical + * memory, or by providing a ZONE_DMA32. If neither is the case, the + * architecture needs to use an IOMMU instead of the direct mapping. + */ + if (mask >= DMA_BIT_MASK(32)) + return 1; /* * This check needs to be against the actual bit mask value, so * use __phys_to_dma() here so that the SME encryption mask isn't * part of the check. */ + if (IS_ENABLED(CONFIG_ZONE_DMA)) + min_mask = min_t(u64, min_mask, DMA_BIT_MASK(zone_dma_bits)); return mask >= __phys_to_dma(dev, min_mask); } -- cgit v1.2.3 From 4a47cbae04844f0c5e2365aa6c217b61850bb832 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 3 Feb 2020 14:44:38 +0100 Subject: dma-direct: improve swiotlb error reporting Untangle how dma_direct_map_page calls into swiotlb to be able to properly report errors where the swiotlb DMA address overflows the mask separately from overflows in the !swiotlb case.
This means that swiotlb_map now has to do a little more work that duplicates dma_direct_map_page, but doing so greatly simplifies the calling convention. Signed-off-by: Christoph Hellwig Reviewed-by: Konrad Rzeszutek Wilk --- include/linux/swiotlb.h | 11 +++-------- kernel/dma/direct.c | 16 +++++++--------- kernel/dma/swiotlb.c | 42 +++++++++++++++++++++++------------------- 3 files changed, 33 insertions(+), 36 deletions(-) (limited to 'kernel') diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h index cde3dc18e21a..046bb94bd4d6 100644 --- a/include/linux/swiotlb.h +++ b/include/linux/swiotlb.h @@ -64,6 +64,9 @@ extern void swiotlb_tbl_sync_single(struct device *hwdev, size_t size, enum dma_data_direction dir, enum dma_sync_target target); +dma_addr_t swiotlb_map(struct device *dev, phys_addr_t phys, + size_t size, enum dma_data_direction dir, unsigned long attrs); + #ifdef CONFIG_SWIOTLB extern enum swiotlb_force swiotlb_force; extern phys_addr_t io_tlb_start, io_tlb_end; @@ -73,8 +76,6 @@ static inline bool is_swiotlb_buffer(phys_addr_t paddr) return paddr >= io_tlb_start && paddr < io_tlb_end; } -bool swiotlb_map(struct device *dev, phys_addr_t *phys, dma_addr_t *dma_addr, - size_t size, enum dma_data_direction dir, unsigned long attrs); void __init swiotlb_exit(void); unsigned int swiotlb_max_segment(void); size_t swiotlb_max_mapping_size(struct device *dev); @@ -85,12 +86,6 @@ static inline bool is_swiotlb_buffer(phys_addr_t paddr) { return false; } -static inline bool swiotlb_map(struct device *dev, phys_addr_t *phys, - dma_addr_t *dma_addr, size_t size, enum dma_data_direction dir, - unsigned long attrs) -{ - return false; -} static inline void swiotlb_exit(void) { } diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index 32ec69cdba54..594bddd04e01 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -357,13 +357,6 @@ void dma_direct_unmap_sg(struct device *dev, struct scatterlist *sgl, EXPORT_SYMBOL(dma_direct_unmap_sg); #endif -static inline bool dma_direct_possible(struct device *dev, dma_addr_t dma_addr, - size_t size) -{ - return swiotlb_force != SWIOTLB_FORCE && - dma_capable(dev, dma_addr, size, true); -} - dma_addr_t dma_direct_map_page(struct device *dev, struct page *page, unsigned long offset, size_t size, enum dma_data_direction dir, unsigned long attrs) @@ -371,8 +364,13 @@ dma_addr_t dma_direct_map_page(struct device *dev, struct page *page, phys_addr_t phys = page_to_phys(page) + offset; dma_addr_t dma_addr = phys_to_dma(dev, phys); - if (unlikely(!dma_direct_possible(dev, dma_addr, size)) && - !swiotlb_map(dev, &phys, &dma_addr, size, dir, attrs)) { + if (unlikely(swiotlb_force == SWIOTLB_FORCE)) + return swiotlb_map(dev, phys, size, dir, attrs); + + if (unlikely(!dma_capable(dev, dma_addr, size, true))) { + if (swiotlb_force != SWIOTLB_NO_FORCE) + return swiotlb_map(dev, phys, size, dir, attrs); + report_addr(dev, dma_addr, size); return DMA_MAPPING_ERROR; } diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index 9280d6f8271e..c19379fabd20 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -22,6 +22,7 @@ #include #include +#include #include #include #include @@ -656,35 +657,38 @@ void swiotlb_tbl_sync_single(struct device *hwdev, phys_addr_t tlb_addr, } /* - * Create a swiotlb mapping for the buffer at @phys, and in case of DMAing + * Create a swiotlb mapping for the buffer at @paddr, and in case of DMAing * to the device copy the data into it as well.
*/ -bool swiotlb_map(struct device *dev, phys_addr_t *phys, dma_addr_t *dma_addr, - size_t size, enum dma_data_direction dir, unsigned long attrs) +dma_addr_t swiotlb_map(struct device *dev, phys_addr_t paddr, size_t size, + enum dma_data_direction dir, unsigned long attrs) { - trace_swiotlb_bounced(dev, *dma_addr, size, swiotlb_force); + phys_addr_t swiotlb_addr; + dma_addr_t dma_addr; - if (unlikely(swiotlb_force == SWIOTLB_NO_FORCE)) { - dev_warn_ratelimited(dev, - "Cannot do DMA to address %pa\n", phys); - return false; - } + trace_swiotlb_bounced(dev, phys_to_dma(dev, paddr), size, + swiotlb_force); - /* Oh well, have to allocate and map a bounce buffer. */ - *phys = swiotlb_tbl_map_single(dev, __phys_to_dma(dev, io_tlb_start), - *phys, size, size, dir, attrs); - if (*phys == (phys_addr_t)DMA_MAPPING_ERROR) - return false; + swiotlb_addr = swiotlb_tbl_map_single(dev, + __phys_to_dma(dev, io_tlb_start), + paddr, size, size, dir, attrs); + if (swiotlb_addr == (phys_addr_t)DMA_MAPPING_ERROR) + return DMA_MAPPING_ERROR; /* Ensure that the address returned is DMA'ble */ - *dma_addr = __phys_to_dma(dev, *phys); - if (unlikely(!dma_capable(dev, *dma_addr, size, true))) { - swiotlb_tbl_unmap_single(dev, *phys, size, size, dir, + dma_addr = __phys_to_dma(dev, swiotlb_addr); + if (unlikely(!dma_capable(dev, dma_addr, size, true))) { + swiotlb_tbl_unmap_single(dev, swiotlb_addr, size, size, dir, attrs | DMA_ATTR_SKIP_CPU_SYNC); - return false; + dev_WARN_ONCE(dev, 1, + "swiotlb addr %pad+%zu overflow (mask %llx, bus limit %llx).\n", + &dma_addr, size, *dev->dma_mask, dev->bus_dma_limit); + return DMA_MAPPING_ERROR; } - return true; + if (!dev_is_dma_coherent(dev) && !(attrs & DMA_ATTR_SKIP_CPU_SYNC)) + arch_sync_dma_for_device(swiotlb_addr, size, dir); + return dma_addr; } size_t swiotlb_max_mapping_size(struct device *dev) -- cgit v1.2.3 From 75467ee48a5e04cf3ae3cb39aea6adee73aeff91 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 3 Feb 2020 14:54:50 +0100 Subject: dma-direct: improve DMA mask overflow reporting Remove the unset dma_mask case as that won't get into mapping calls anymore, and also report the other errors unconditionally and with a slightly improved message. Remove the now pointless report_addr helper.
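From a driver's point of view the contract is unchanged: failures still surface only as DMA_MAPPING_ERROR. A hypothetical caller-side sketch (the function name is made up; the API calls are the standard DMA-mapping interface):

/* Hypothetical driver snippet: the overflow warning added in the
 * diff below fires inside the mapping path; callers keep checking
 * for failure via dma_mapping_error() as before.
 */
#include <linux/dma-mapping.h>

static int example_map_buf(struct device *dev, void *buf, size_t len,
			   dma_addr_t *dma)
{
	dma_addr_t addr = dma_map_single(dev, buf, len, DMA_TO_DEVICE);

	if (dma_mapping_error(dev, addr))
		return -ENOMEM;	/* mask/bus-limit overflow was already logged */

	*dma = addr;
	return 0;
}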
Signed-off-by: Christoph Hellwig Reviewed-by: Konrad Rzeszutek Wilk --- kernel/dma/direct.c | 21 +++++++-------------- 1 file changed, 7 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c index 594bddd04e01..ac7956c38f69 100644 --- a/kernel/dma/direct.c +++ b/kernel/dma/direct.c @@ -23,18 +23,6 @@ */ unsigned int zone_dma_bits __ro_after_init = 24; -static void report_addr(struct device *dev, dma_addr_t dma_addr, size_t size) -{ - if (!dev->dma_mask) { - dev_err_once(dev, "DMA map on device without dma_mask\n"); - } else if (*dev->dma_mask >= DMA_BIT_MASK(32) || dev->bus_dma_limit) { - dev_err_once(dev, - "overflow %pad+%zu of DMA mask %llx bus limit %llx\n", - &dma_addr, size, *dev->dma_mask, dev->bus_dma_limit); - } - WARN_ON_ONCE(1); -} - static inline dma_addr_t phys_to_dma_direct(struct device *dev, phys_addr_t phys) { @@ -371,7 +359,9 @@ dma_addr_t dma_direct_map_page(struct device *dev, struct page *page, if (swiotlb_force != SWIOTLB_NO_FORCE) return swiotlb_map(dev, phys, size, dir, attrs); - report_addr(dev, dma_addr, size); + dev_WARN_ONCE(dev, 1, + "DMA addr %pad+%zu overflow (mask %llx, bus limit %llx).\n", + &dma_addr, size, *dev->dma_mask, dev->bus_dma_limit); return DMA_MAPPING_ERROR; } @@ -409,7 +399,10 @@ dma_addr_t dma_direct_map_resource(struct device *dev, phys_addr_t paddr, dma_addr_t dma_addr = paddr; if (unlikely(!dma_capable(dev, dma_addr, size, false))) { - report_addr(dev, dma_addr, size); + dev_err_once(dev, + "DMA addr %pad+%zu overflow (mask %llx, bus limit %llx).\n", + &dma_addr, size, *dev->dma_mask, dev->bus_dma_limit); + WARN_ON_ONCE(1); return DMA_MAPPING_ERROR; } -- cgit v1.2.3 From 52262ee567ad14c9606be25f3caddcefa3c514e4 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 28 Jan 2020 15:40:06 +0000 Subject: sched/fair: Allow a per-CPU kthread waking a task to stack on the same CPU, to fix XFS performance regression The following XFS commit: 8ab39f11d974 ("xfs: prevent CIL push holdoff in log recovery") changed the logic from using bound workqueues to using unbound workqueues. Functionally this makes sense but it was observed at the time that the dbench performance dropped quite a lot and CPU migrations were increased. The current pattern of the task migration is straightforward. With XFS, an IO issuer delegates work to xlog_cil_push_work() on an unbound kworker. This runs on a nearby CPU and on completion, dbench wakes up on its old CPU as it is still idle and no migration occurs. dbench then queues the real IO on the blk_mq_requeue_work() work item which runs on a bound kworker which is forced to run on the same CPU as dbench. When IO completes, the bound kworker wakes dbench but as the kworker is a bound, real task, the CPU is not considered idle and dbench gets migrated by select_idle_sibling() to a new CPU. dbench may ping-pong between two CPUs for a while but ultimately it starts a round-robin of all CPUs sharing the same LLC. High-frequency migration on each IO completion has poor performance overall. It has negative implications both in communication costs and power management. mpstat confirmed that at low thread counts, all CPUs sharing an LLC have a low level of activity. Note that even if the CIL patch was reverted, there would still be migrations but the impact is less noticeable. It turns out that individually the scheduler, XFS, blk-mq and workqueues all made sensible decisions but in combination, the overall effect was sub-optimal.
This patch special cases the IO issue/completion pattern and allows a bound kworker waker and a task wakee to stack on the same CPU if there is a strong chance they are directly related. The expectation is that the kworker is likely going back to sleep shortly. This is not guaranteed as the IO could be queued asynchronously but there is a very strong relationship between the task and kworker in this case that would justify stacking on the same CPU instead of migrating. There should be few concerns about kworker starvation given that the special casing is only when the kworker is the waker. DBench on XFS MMTests config: io-dbench4-async modified to run on a fresh XFS filesystem UMA machine with 8 cores sharing LLC 5.5.0-rc7 5.5.0-rc7 tipsched-20200124 kworkerstack Amean 1 22.63 ( 0.00%) 20.54 * 9.23%* Amean 2 25.56 ( 0.00%) 23.40 * 8.44%* Amean 4 28.63 ( 0.00%) 27.85 * 2.70%* Amean 8 37.66 ( 0.00%) 37.68 ( -0.05%) Amean 64 469.47 ( 0.00%) 468.26 ( 0.26%) Stddev 1 1.00 ( 0.00%) 0.72 ( 28.12%) Stddev 2 1.62 ( 0.00%) 1.97 ( -21.54%) Stddev 4 2.53 ( 0.00%) 3.58 ( -41.19%) Stddev 8 5.30 ( 0.00%) 5.20 ( 1.92%) Stddev 64 86.36 ( 0.00%) 94.53 ( -9.46%) NUMA machine, 48 CPUs total, 24 CPUs share cache 5.5.0-rc7 5.5.0-rc7 tipsched-20200124 kworkerstack-v1r2 Amean 1 58.69 ( 0.00%) 30.21 * 48.53%* Amean 2 60.90 ( 0.00%) 35.29 * 42.05%* Amean 4 66.77 ( 0.00%) 46.55 * 30.28%* Amean 8 81.41 ( 0.00%) 68.46 * 15.91%* Amean 16 113.29 ( 0.00%) 107.79 * 4.85%* Amean 32 199.10 ( 0.00%) 198.22 * 0.44%* Amean 64 478.99 ( 0.00%) 477.06 * 0.40%* Amean 128 1345.26 ( 0.00%) 1372.64 * -2.04%* Stddev 1 2.64 ( 0.00%) 4.17 ( -58.08%) Stddev 2 4.35 ( 0.00%) 5.38 ( -23.73%) Stddev 4 6.77 ( 0.00%) 6.56 ( 3.00%) Stddev 8 11.61 ( 0.00%) 10.91 ( 6.04%) Stddev 16 18.63 ( 0.00%) 19.19 ( -3.01%) Stddev 32 38.71 ( 0.00%) 38.30 ( 1.06%) Stddev 64 100.28 ( 0.00%) 91.24 ( 9.02%) Stddev 128 186.87 ( 0.00%) 160.34 ( 14.20%) Dbench has been modified to report the time to complete a single "load file". This is a more meaningful metric for dbench than a throughput metric as the benchmark makes many different system calls that are not throughput-related. The patch shows a 9.23% and 48.53% reduction in the time to process a load file, with the difference partially explained by the number of CPUs sharing an LLC. In a separate run, task migrations were almost eliminated by the patch for low client counts. In case people take issue with the metric used for the benchmark, this is a comparison of the throughputs as reported by dbench on the NUMA machine. dbench4 Throughput (misleading but traditional) 5.5.0-rc7 5.5.0-rc7 tipsched-20200124 kworkerstack-v1r2 Hmean 1 321.41 ( 0.00%) 617.82 * 92.22%* Hmean 2 622.87 ( 0.00%) 1066.80 * 71.27%* Hmean 4 1134.56 ( 0.00%) 1623.74 * 43.12%* Hmean 8 1869.96 ( 0.00%) 2212.67 * 18.33%* Hmean 16 2673.11 ( 0.00%) 2806.13 * 4.98%* Hmean 32 3032.74 ( 0.00%) 3039.54 ( 0.22%) Hmean 64 2514.25 ( 0.00%) 2498.96 * -0.61%* Hmean 128 1778.49 ( 0.00%) 1746.05 * -1.82%* Note that this is somewhat specific to XFS; ext4 shows no performance difference as it does not rely on kworkers in the same way. No major problem was observed running other workloads on different machines, although not all tests have completed yet.
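Condensed into a standalone model, the new wakeup check looks like this. The names are illustrative; the real logic is the select_idle_sibling() hunk in the diff below.

/* Toy model of the stacking condition: allow the wakee to stay on
 * this CPU only when the waker is a CPU-bound kthread (a per-cpu
 * kworker), the wakee last ran here anyway, and nothing else is
 * runnable, i.e. the kworker is about to go back to sleep.
 */
#include <stdbool.h>
#include <stdio.h>

struct task {
	bool kthread;		/* PF_KTHREAD in the kernel */
	int nr_cpus_allowed;
};

static bool is_per_cpu_kthread(const struct task *p)
{
	return p->kthread && p->nr_cpus_allowed == 1;
}

static bool stack_on_waker(const struct task *waker, int wakee_prev_cpu,
			   int this_cpu, int this_rq_nr_running)
{
	return is_per_cpu_kthread(waker) &&
	       wakee_prev_cpu == this_cpu &&
	       this_rq_nr_running <= 1;
}

int main(void)
{
	struct task kworker = { .kthread = true, .nr_cpus_allowed = 1 };

	printf("%d\n", stack_on_waker(&kworker, 3, 3, 1));	/* 1: stack */
	printf("%d\n", stack_on_waker(&kworker, 3, 5, 1));	/* 0: migrate */
	return 0;
}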
Signed-off-by: Mel Gorman Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20200128154006.GD3466@techsingularity.net Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 11 ----------- kernel/sched/fair.c | 14 ++++++++++++++ kernel/sched/sched.h | 13 +++++++++++++ 3 files changed, 27 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 89e54f3ed571..1a9983da4408 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1447,17 +1447,6 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) #ifdef CONFIG_SMP -static inline bool is_per_cpu_kthread(struct task_struct *p) -{ - if (!(p->flags & PF_KTHREAD)) - return false; - - if (p->nr_cpus_allowed != 1) - return false; - - return true; -} - /* * Per-CPU kthreads are allowed to run on !active && online CPUs, see * __set_cpus_allowed_ptr() and select_fallback_rq(). diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 25dffc03f0f6..94c3b8469cf6 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5912,6 +5912,20 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) (available_idle_cpu(prev) || sched_idle_cpu(prev))) return prev; + /* + * Allow a per-cpu kthread to stack with the wakee if the + * kworker thread and the tasks previous CPUs are the same. + * The assumption is that the wakee queued work for the + * per-cpu kthread that is now complete and the wakeup is + * essentially a sync wakeup. An obvious example of this + * pattern is IO completions. + */ + if (is_per_cpu_kthread(current) && + prev == smp_processor_id() && + this_rq()->nr_running <= 1) { + return prev; + } + /* Check a recently used CPU as a potential idle candidate: */ recent_used_cpu = p->recent_used_cpu; if (recent_used_cpu != prev && diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 1a88dc8ad11b..5876e6ba5903 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2479,3 +2479,16 @@ static inline void membarrier_switch_mm(struct rq *rq, { } #endif + +#ifdef CONFIG_SMP +static inline bool is_per_cpu_kthread(struct task_struct *p) +{ + if (!(p->flags & PF_KTHREAD)) + return false; + + if (p->nr_cpus_allowed != 1) + return false; + + return true; +} +#endif -- cgit v1.2.3 From 2bf0eb9b3b0d099b20b2c4736436b666d78b94d5 Mon Sep 17 00:00:00 2001 From: Hongbo Yao Date: Mon, 10 Feb 2020 09:14:41 +0800 Subject: bpf: Make btf_check_func_type_match() static Fix the following sparse warning: kernel/bpf/btf.c:4131:5: warning: symbol 'btf_check_func_type_match' was not declared. Should it be static? Reported-by: Hulk Robot Signed-off-by: Hongbo Yao Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20200210011441.147102-1-yaohongbo@huawei.com --- kernel/bpf/btf.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 805c43b083e9..787140095e58 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -4142,9 +4142,9 @@ int btf_distill_func_proto(struct bpf_verifier_log *log, * EFAULT - verifier bug * 0 - 99% match. The last 1% is validated by the verifier. 
*/ -int btf_check_func_type_match(struct bpf_verifier_log *log, - struct btf *btf1, const struct btf_type *t1, - struct btf *btf2, const struct btf_type *t2) +static int btf_check_func_type_match(struct bpf_verifier_log *log, + struct btf *btf1, const struct btf_type *t1, + struct btf *btf2, const struct btf_type *t2) { const struct btf_param *args1, *args2; const char *fn1, *fn2, *s1, *s2; -- cgit v1.2.3 From 6fcca0fa48118e6d63733eb4644c6cd880c15b8f Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Mon, 3 Feb 2020 13:22:16 -0800 Subject: sched/psi: Fix OOB write when writing 0 bytes to PSI files Issuing write() with count parameter set to 0 on any file under /proc/pressure/ will cause an OOB write because of the access to buf[buf_size-1] when NUL-termination is performed. Fix this by checking for buf_size to be non-zero. Signed-off-by: Suren Baghdasaryan Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Acked-by: Johannes Weiner Link: https://lkml.kernel.org/r/20200203212216.7076-1-surenb@google.com --- kernel/sched/psi.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index db7b50bba3f1..38ccd49b9bf6 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -1199,6 +1199,9 @@ static ssize_t psi_write(struct file *file, const char __user *user_buf, if (static_branch_likely(&psi_disabled)) return -EOPNOTSUPP; + if (!nbytes) + return -EINVAL; + buf_size = min(nbytes, sizeof(buf)); if (copy_from_user(buf, user_buf, buf_size)) return -EFAULT; -- cgit v1.2.3 From 4104a562e0ca62e971089db9d3c47794a0d7d4eb Mon Sep 17 00:00:00 2001 From: Madhuparna Bhowmik Date: Sat, 1 Feb 2020 18:28:03 +0530 Subject: sched/core: Annotate curr pointer in rq with __rcu This patch fixes the following sparse warnings in sched/core.c and sched/membarrier.c: kernel/sched/core.c:2372:27: error: incompatible types in comparison expression kernel/sched/core.c:4061:17: error: incompatible types in comparison expression kernel/sched/core.c:6067:9: error: incompatible types in comparison expression kernel/sched/membarrier.c:108:21: error: incompatible types in comparison expression kernel/sched/membarrier.c:177:21: error: incompatible types in comparison expression kernel/sched/membarrier.c:243:21: error: incompatible types in comparison expression Signed-off-by: Madhuparna Bhowmik Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lkml.kernel.org/r/20200201125803.20245-1-madhuparnabhowmik10@gmail.com --- kernel/sched/sched.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 5876e6ba5903..9ea647835fd6 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -896,7 +896,7 @@ struct rq { */ unsigned long nr_uninterruptible; - struct task_struct *curr; + struct task_struct __rcu *curr; struct task_struct *idle; struct task_struct *stop; unsigned long next_balance; -- cgit v1.2.3 From e9f5490c3574b435ce7fe7a71724aa3866babc7f Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sun, 9 Feb 2020 19:29:12 -0800 Subject: sched/fair: Fix kernel-doc warning in attach_entity_load_avg() Fix kernel-doc warning in kernel/sched/fair.c, caused by a recent function parameter removal: ../kernel/sched/fair.c:3526: warning: Excess function parameter 'flags' description in 'attach_entity_load_avg' Fixes: a4f9a0e51bbf ("sched/fair: Remove redundant call to cpufreq_update_util()") Signed-off-by: Randy Dunlap Signed-off-by: Peter Zijlstra 
(Intel) Signed-off-by: Ingo Molnar Reviewed-by: Vincent Guittot Link: https://lkml.kernel.org/r/cbe964e4-6879-fd08-41c9-ef1917414af4@infradead.org --- kernel/sched/fair.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 94c3b8469cf6..3c8a379c357e 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3516,7 +3516,6 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) * attach_entity_load_avg - attach this entity to its cfs_rq load avg * @cfs_rq: cfs_rq to attach to * @se: sched_entity to attach - * @flags: migration hints * * Must call update_cfs_rq_load_avg() before this, since we rely on * cfs_rq->avg.last_update_time being current. -- cgit v1.2.3 From e20d3a055a457a10a4c748ce5b7c2ed3173a1324 Mon Sep 17 00:00:00 2001 From: Johannes Krude Date: Wed, 12 Feb 2020 20:32:27 +0100 Subject: bpf, offload: Replace bitwise AND by logical AND in bpf_prog_offload_info_fill This if statement guards whether user-space wants a copy of the offload-jited bytecode and whether this bytecode exists. Erroneously doing a bitwise AND instead of a logical AND on the user- and kernel-space buffer sizes can lead to no data being copied to user-space, especially when the user-space size is a power of two and bigger than the kernel-space buffer. Fixes: fcfb126defda ("bpf: add new jited info fields in bpf_dev_offload and bpf_prog_info") Signed-off-by: Johannes Krude Signed-off-by: Daniel Borkmann Acked-by: Jakub Kicinski Link: https://lore.kernel.org/bpf/20200212193227.GA3769@phlox.h.transitiv.net --- kernel/bpf/offload.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/bpf/offload.c b/kernel/bpf/offload.c index 2c5dc6541ece..bd09290e3648 100644 --- a/kernel/bpf/offload.c +++ b/kernel/bpf/offload.c @@ -321,7 +321,7 @@ int bpf_prog_offload_info_fill(struct bpf_prog_info *info, ulen = info->jited_prog_len; info->jited_prog_len = aux->offload->jited_len; - if (info->jited_prog_len & ulen) { + if (info->jited_prog_len && ulen) { uinsns = u64_to_user_ptr(info->jited_prog_insns); ulen = min_t(u32, info->jited_prog_len, ulen); if (copy_to_user(uinsns, aux->offload->jited_image, ulen)) { -- cgit v1.2.3 From 492e0d0d6f2eb4badfd2868addf9da0f651eba0e Mon Sep 17 00:00:00 2001 From: Brian Vazquez Date: Tue, 18 Feb 2020 09:25:52 -0800 Subject: bpf: Do not grab the bucket spinlock by default on htab batch ops Grabbing the spinlock for every bucket, even if it is empty, was causing a significant performance cost when traversing htab maps that have only a few entries. This patch addresses the issue by checking bucket_cnt first: if the bucket has entries, we grab the spinlock and proceed with the batching. Tested with a htab of size 50K and different values of populated entries.
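Schematically, the pattern being benchmarked below reduces to the following toy pthread sketch (illustrative names only; unlike this toy, the kernel code also re-walks the bucket under the lock via its locked/again_nocopy retry, as the diff below shows):

/* Toy model: peek at the bucket first and only pay for the spinlock
 * when there is something to drain. The count is re-read under the
 * lock because it may have changed since the unlocked peek.
 */
#include <pthread.h>
#include <stdio.h>

struct bucket {
	pthread_spinlock_t lock;
	int cnt;		/* stand-in for walking the hlist */
};

static int drain_bucket(struct bucket *b)
{
	int copied;

	if (b->cnt == 0)	/* empty: skip the lock entirely */
		return 0;

	pthread_spin_lock(&b->lock);
	copied = b->cnt;	/* re-check and batch-copy under the lock */
	b->cnt = 0;
	pthread_spin_unlock(&b->lock);
	return copied;
}

int main(void)
{
	struct bucket b = { .cnt = 3 };

	pthread_spin_init(&b.lock, PTHREAD_PROCESS_PRIVATE);
	printf("copied %d\n", drain_bucket(&b));
	return 0;
}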
Before: Benchmark Time(ns) CPU(ns) --------------------------------------------- BM_DumpHashMap/1 2759655 2752033 BM_DumpHashMap/10 2933722 2930825 BM_DumpHashMap/200 3171680 3170265 BM_DumpHashMap/500 3639607 3635511 BM_DumpHashMap/1000 4369008 4364981 BM_DumpHashMap/5k 11171919 11134028 BM_DumpHashMap/20k 69150080 69033496 BM_DumpHashMap/39k 190501036 190226162 After: Benchmark Time(ns) CPU(ns) --------------------------------------------- BM_DumpHashMap/1 202707 200109 BM_DumpHashMap/10 213441 210569 BM_DumpHashMap/200 478641 472350 BM_DumpHashMap/500 980061 967102 BM_DumpHashMap/1000 1863835 1839575 BM_DumpHashMap/5k 8961836 8902540 BM_DumpHashMap/20k 69761497 69322756 BM_DumpHashMap/39k 187437830 186551111 Fixes: 057996380a42 ("bpf: Add batch ops to all htab bpf map") Signed-off-by: Brian Vazquez Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20200218172552.215077-1-brianvv@google.com --- kernel/bpf/hashtab.c | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 2d182c4ee9d9..9194479a2fa7 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -1259,7 +1259,8 @@ __htab_map_lookup_and_delete_batch(struct bpf_map *map, u64 elem_map_flags, map_flags; struct hlist_nulls_head *head; struct hlist_nulls_node *n; - unsigned long flags; + unsigned long flags = 0; + bool locked = false; struct htab_elem *l; struct bucket *b; int ret = 0; @@ -1319,15 +1320,25 @@ again_nocopy: dst_val = values; b = &htab->buckets[batch]; head = &b->head; - raw_spin_lock_irqsave(&b->lock, flags); + /* do not grab the lock unless need it (bucket_cnt > 0). */ + if (locked) + raw_spin_lock_irqsave(&b->lock, flags); bucket_cnt = 0; hlist_nulls_for_each_entry_rcu(l, n, head, hash_node) bucket_cnt++; + if (bucket_cnt && !locked) { + locked = true; + goto again_nocopy; + } + if (bucket_cnt > (max_count - total)) { if (total == 0) ret = -ENOSPC; + /* Note that since bucket_cnt > 0 here, it is implicit + * that the locked was grabbed, so release it. + */ raw_spin_unlock_irqrestore(&b->lock, flags); rcu_read_unlock(); this_cpu_dec(bpf_prog_active); @@ -1337,6 +1348,9 @@ again_nocopy: if (bucket_cnt > bucket_size) { bucket_size = bucket_cnt; + /* Note that since bucket_cnt > 0 here, it is implicit + * that the locked was grabbed, so release it. + */ raw_spin_unlock_irqrestore(&b->lock, flags); rcu_read_unlock(); this_cpu_dec(bpf_prog_active); @@ -1346,6 +1360,10 @@ again_nocopy: goto alloc; } + /* Next block is only safe to run if you have grabbed the lock */ + if (!locked) + goto next_batch; + hlist_nulls_for_each_entry_safe(l, n, head, hash_node) { memcpy(dst_key, l->key, key_size); @@ -1380,6 +1398,8 @@ again_nocopy: } raw_spin_unlock_irqrestore(&b->lock, flags); + locked = false; +next_batch: /* If we are not copying data, we can go to next bucket and avoid * unlocking the rcu. */ -- cgit v1.2.3 From b9aff38de2cb166476988020428985c5f7412ffc Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Wed, 19 Feb 2020 15:47:57 -0800 Subject: bpf: Fix a potential deadlock with bpf_map_do_batch Commit 057996380a42 ("bpf: Add batch ops to all htab bpf map") added lookup_and_delete batch operation for hash table. The current implementation has bpf_lru_push_free() inside the bucket lock, which may cause a deadlock. 
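Schematically, the inversion and the shape of the fix reduce to the following toy mutex model (illustrative names only; the syzbot trace below shows the real lock chains, and the hashtab.c diff further down shows the real fix):

/* Toy model of the fix: unlink elements onto a private list while
 * holding the bucket lock, and only call the LRU free path (which
 * takes the LRU-side lock) after the bucket lock is dropped, so the
 * two locks are never held in bucket -> lru order.
 */
#include <pthread.h>
#include <stdio.h>

struct elem {
	struct elem *batch_flink;
};

static pthread_mutex_t bucket_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t lru_lock = PTHREAD_MUTEX_INITIALIZER;

static void lru_push_free(struct elem *e)
{
	pthread_mutex_lock(&lru_lock);	/* no longer nests inside bucket_lock */
	/* ... return e to the LRU free list ... */
	pthread_mutex_unlock(&lru_lock);
	(void)e;
}

static void drain(struct elem *head)
{
	struct elem *node_to_free = NULL, *e;

	pthread_mutex_lock(&bucket_lock);
	while ((e = head) != NULL) {	/* unlink under bucket_lock only */
		head = e->batch_flink;
		e->batch_flink = node_to_free;
		node_to_free = e;
	}
	pthread_mutex_unlock(&bucket_lock);

	while (node_to_free) {		/* free with bucket_lock released */
		e = node_to_free;
		node_to_free = e->batch_flink;
		lru_push_free(e);
	}
}

int main(void)
{
	struct elem a = { NULL }, b = { &a };

	drain(&b);
	puts("drained");
	return 0;
}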
syzbot reports:

-> #2 (&htab->buckets[i].lock#2){....}:
       __raw_spin_lock_irqsave include/linux/spinlock_api_smp.h:110 [inline]
       _raw_spin_lock_irqsave+0x95/0xcd kernel/locking/spinlock.c:159
       htab_lru_map_delete_node+0xce/0x2f0 kernel/bpf/hashtab.c:593
       __bpf_lru_list_shrink_inactive kernel/bpf/bpf_lru_list.c:220 [inline]
       __bpf_lru_list_shrink+0xf9/0x470 kernel/bpf/bpf_lru_list.c:266
       bpf_lru_list_pop_free_to_local kernel/bpf/bpf_lru_list.c:340 [inline]
       bpf_common_lru_pop_free kernel/bpf/bpf_lru_list.c:447 [inline]
       bpf_lru_pop_free+0x87c/0x1670 kernel/bpf/bpf_lru_list.c:499
       prealloc_lru_pop+0x2c/0xa0 kernel/bpf/hashtab.c:132
       __htab_lru_percpu_map_update_elem+0x67e/0xa90 kernel/bpf/hashtab.c:1069
       bpf_percpu_hash_update+0x16e/0x210 kernel/bpf/hashtab.c:1585
       bpf_map_update_value.isra.0+0x2d7/0x8e0 kernel/bpf/syscall.c:181
       generic_map_update_batch+0x41f/0x610 kernel/bpf/syscall.c:1319
       bpf_map_do_batch+0x3f5/0x510 kernel/bpf/syscall.c:3348
       __do_sys_bpf+0x9b7/0x41e0 kernel/bpf/syscall.c:3460
       __se_sys_bpf kernel/bpf/syscall.c:3355 [inline]
       __x64_sys_bpf+0x73/0xb0 kernel/bpf/syscall.c:3355
       do_syscall_64+0xfa/0x790 arch/x86/entry/common.c:294
       entry_SYSCALL_64_after_hwframe+0x49/0xbe

-> #0 (&loc_l->lock){....}:
       check_prev_add kernel/locking/lockdep.c:2475 [inline]
       check_prevs_add kernel/locking/lockdep.c:2580 [inline]
       validate_chain kernel/locking/lockdep.c:2970 [inline]
       __lock_acquire+0x2596/0x4a00 kernel/locking/lockdep.c:3954
       lock_acquire+0x190/0x410 kernel/locking/lockdep.c:4484
       __raw_spin_lock_irqsave include/linux/spinlock_api_smp.h:110 [inline]
       _raw_spin_lock_irqsave+0x95/0xcd kernel/locking/spinlock.c:159
       bpf_common_lru_push_free kernel/bpf/bpf_lru_list.c:516 [inline]
       bpf_lru_push_free+0x250/0x5b0 kernel/bpf/bpf_lru_list.c:555
       __htab_map_lookup_and_delete_batch+0x8d4/0x1540 kernel/bpf/hashtab.c:1374
       htab_lru_map_lookup_and_delete_batch+0x34/0x40 kernel/bpf/hashtab.c:1491
       bpf_map_do_batch+0x3f5/0x510 kernel/bpf/syscall.c:3348
       __do_sys_bpf+0x1f7d/0x41e0 kernel/bpf/syscall.c:3456
       __se_sys_bpf kernel/bpf/syscall.c:3355 [inline]
       __x64_sys_bpf+0x73/0xb0 kernel/bpf/syscall.c:3355
       do_syscall_64+0xfa/0x790 arch/x86/entry/common.c:294
       entry_SYSCALL_64_after_hwframe+0x49/0xbe

Possible unsafe locking scenario:

      CPU0                    CPU2
      ----                    ----
 lock(&htab->buckets[i].lock#2);
                              lock(&l->lock);
                              lock(&htab->buckets[i].lock#2);
 lock(&loc_l->lock);

 *** DEADLOCK ***

To fix the issue, make htab_lru_map_lookup_and_delete_batch() (the CPU0
side above) call bpf_lru_push_free() outside the htab bucket lock. This
avoids the deadlock scenario above.

Fixes: 057996380a42 ("bpf: Add batch ops to all htab bpf map")
Reported-by: syzbot+a38ff3d9356388f2fb83@syzkaller.appspotmail.com
Reported-by: syzbot+122b5421d14e68f29cd1@syzkaller.appspotmail.com
Suggested-by: Hillf Danton
Suggested-by: Martin KaFai Lau
Signed-off-by: Yonghong Song
Signed-off-by: Alexei Starovoitov
Reviewed-by: Jakub Sitnicki
Acked-by: Brian Vazquez
Acked-by: Martin KaFai Lau
Link: https://lore.kernel.org/bpf/20200219234757.3544014-1-yhs@fb.com
---
 kernel/bpf/hashtab.c | 34 +++++++++++++++++++++++++++++++---
 1 file changed, 31 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 9194479a2fa7..a1468e3f5af2 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -56,6 +56,7 @@ struct htab_elem {
 		union {
 			struct bpf_htab *htab;
 			struct pcpu_freelist_node fnode;
+			struct htab_elem *batch_flink;
 		};
 	};
 };
@@ -126,6 +127,17 @@ free_elems:
 	bpf_map_area_free(htab->elems);
 }
 
+/* The LRU list has a lock (lru_lock). Each htab bucket has a lock
+ * (bucket_lock). If both locks need to be acquired together, the lock
+ * order is always lru_lock -> bucket_lock and this only happens in
+ * bpf_lru_list.c logic. For example, certain code path of
+ * bpf_lru_pop_free(), which is called by function prealloc_lru_pop(),
+ * will acquire lru_lock first followed by acquiring bucket_lock.
+ *
+ * In hashtab.c, to avoid deadlock, lock acquisition of
+ * bucket_lock followed by lru_lock is not allowed. In such cases,
+ * bucket_lock needs to be released first before acquiring lru_lock.
+ */
 static struct htab_elem *prealloc_lru_pop(struct bpf_htab *htab, void *key,
 					  u32 hash)
 {
@@ -1256,6 +1268,7 @@ __htab_map_lookup_and_delete_batch(struct bpf_map *map,
 	void __user *ukeys = u64_to_user_ptr(attr->batch.keys);
 	void *ubatch = u64_to_user_ptr(attr->batch.in_batch);
 	u32 batch, max_count, size, bucket_size;
+	struct htab_elem *node_to_free = NULL;
 	u64 elem_map_flags, map_flags;
 	struct hlist_nulls_head *head;
 	struct hlist_nulls_node *n;
@@ -1388,10 +1401,18 @@ again_nocopy:
 		}
 		if (do_delete) {
 			hlist_nulls_del_rcu(&l->hash_node);
-			if (is_lru_map)
-				bpf_lru_push_free(&htab->lru, &l->lru_node);
-			else
+
+			/* bpf_lru_push_free() will acquire lru_lock, which
+			 * may cause deadlock. See comments in function
+			 * prealloc_lru_pop(). Let us do bpf_lru_push_free()
+			 * after releasing the bucket lock.
+			 */
+			if (is_lru_map) {
+				l->batch_flink = node_to_free;
+				node_to_free = l;
+			} else {
 				free_htab_elem(htab, l);
+			}
 		}
 		dst_key += key_size;
 		dst_val += value_size;
@@ -1399,6 +1420,13 @@ again_nocopy:
 
 	raw_spin_unlock_irqrestore(&b->lock, flags);
 	locked = false;
+
+	while (node_to_free) {
+		l = node_to_free;
+		node_to_free = node_to_free->batch_flink;
+		bpf_lru_push_free(&htab->lru, &l->lru_node);
+	}
+
 next_batch:
 	/* If we are not copying data, we can go to next bucket and avoid
 	 * unlocking the rcu.
--
cgit v1.2.3


From 412c53a680a97cb1ae2c0ab60230e193bee86387 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann
Date: Thu, 20 Feb 2020 20:03:54 -0800
Subject: y2038: remove unused time32 interfaces

No users remain, so kill these off before we grow new ones.

Link: http://lkml.kernel.org/r/20200110154232.4104492-3-arnd@arndb.de
Signed-off-by: Arnd Bergmann
Acked-by: Thomas Gleixner
Cc: Deepa Dinamani
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/linux/compat.h        |  29 --------
 include/linux/time32.h        | 154 +-----------------------------------------
 include/linux/timekeeping32.h |  32 ---------
 include/linux/types.h         |   5 --
 kernel/compat.c               |  64 ------------------
 kernel/time/time.c            |  43 ------------
 6 files changed, 1 insertion(+), 326 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/compat.h b/include/linux/compat.h
index 11083d84eb23..df2475be134a 100644
--- a/include/linux/compat.h
+++ b/include/linux/compat.h
@@ -248,15 +248,6 @@ typedef struct compat_siginfo {
 	} _sifields;
 } compat_siginfo_t;
 
-/*
- * These functions operate on 32- or 64-bit specs depending on
- * COMPAT_USE_64BIT_TIME, hence the void user pointer arguments.
- */
-extern int compat_get_timespec(struct timespec *, const void __user *);
-extern int compat_put_timespec(const struct timespec *, void __user *);
-extern int compat_get_timeval(struct timeval *, const void __user *);
-extern int compat_put_timeval(const struct timeval *, void __user *);
-
 struct compat_iovec {
 	compat_uptr_t	iov_base;
 	compat_size_t	iov_len;
@@ -416,26 +407,6 @@ int copy_siginfo_to_user32(struct compat_siginfo __user *to, const kernel_siginf
 int get_compat_sigevent(struct sigevent *event,
 		const struct compat_sigevent __user *u_event);
 
-static inline int old_timeval32_compare(struct old_timeval32 *lhs,
-					struct old_timeval32 *rhs)
-{
-	if (lhs->tv_sec < rhs->tv_sec)
-		return -1;
-	if (lhs->tv_sec > rhs->tv_sec)
-		return 1;
-	return lhs->tv_usec - rhs->tv_usec;
-}
-
-static inline int old_timespec32_compare(struct old_timespec32 *lhs,
-					 struct old_timespec32 *rhs)
-{
-	if (lhs->tv_sec < rhs->tv_sec)
-		return -1;
-	if (lhs->tv_sec > rhs->tv_sec)
-		return 1;
-	return lhs->tv_nsec - rhs->tv_nsec;
-}
-
 extern int get_compat_sigset(sigset_t *set, const compat_sigset_t __user *compat);
 
 /*
diff --git a/include/linux/time32.h b/include/linux/time32.h
index cad4c3186002..cf9320cd2d0b 100644
--- a/include/linux/time32.h
+++ b/include/linux/time32.h
@@ -12,8 +12,6 @@
 #include <linux/time64.h>
 #include <linux/timex.h>
 
-#define TIME_T_MAX	(__kernel_old_time_t)((1UL << ((sizeof(__kernel_old_time_t) << 3) - 1)) - 1)
-
 typedef s32		old_time32_t;
 
 struct old_timespec32 {
@@ -73,162 +71,12 @@ struct __kernel_timex;
 int get_old_timex32(struct __kernel_timex *, const struct old_timex32 __user *);
 int put_old_timex32(struct old_timex32 __user *, const struct __kernel_timex *);
 
-#if __BITS_PER_LONG == 64
-
-/* timespec64 is defined as timespec here */
-static inline struct timespec timespec64_to_timespec(const struct timespec64 ts64)
-{
-	return *(const struct timespec *)&ts64;
-}
-
-static inline struct timespec64 timespec_to_timespec64(const struct timespec ts)
-{
-	return *(const struct timespec64 *)&ts;
-}
-
-#else
-static inline struct timespec timespec64_to_timespec(const struct timespec64 ts64)
-{
-	struct timespec ret;
-
-	ret.tv_sec = (time_t)ts64.tv_sec;
-	ret.tv_nsec = ts64.tv_nsec;
-	return ret;
-}
-
-static inline struct timespec64 timespec_to_timespec64(const struct timespec ts)
-{
-	struct timespec64 ret;
-
-	ret.tv_sec = ts.tv_sec;
-	ret.tv_nsec = ts.tv_nsec;
-	return ret;
-}
-#endif
-
-static inline int timespec_equal(const struct timespec *a,
-				 const struct timespec *b)
-{
-	return (a->tv_sec == b->tv_sec) && (a->tv_nsec == b->tv_nsec);
-}
-
-/*
- * lhs < rhs:  return <0
- * lhs == rhs: return 0
- * lhs > rhs:  return >0
- */
-static inline int timespec_compare(const struct timespec *lhs, const struct timespec *rhs)
-{
-	if (lhs->tv_sec < rhs->tv_sec)
-		return -1;
-	if (lhs->tv_sec > rhs->tv_sec)
-		return 1;
-	return lhs->tv_nsec - rhs->tv_nsec;
-}
-
-/*
- * Returns true if the timespec is norm, false if denorm:
- */
-static inline bool timespec_valid(const struct timespec *ts)
-{
-	/* Dates before 1970 are bogus */
-	if (ts->tv_sec < 0)
-		return false;
-	/* Can't have more nanoseconds then a second */
-	if ((unsigned long)ts->tv_nsec >= NSEC_PER_SEC)
-		return false;
-	return true;
-}
-
-/**
- * timespec_to_ns - Convert timespec to nanoseconds
- * @ts:		pointer to the timespec variable to be converted
- *
- * Returns the scalar nanosecond representation of the timespec
- * parameter.
- */
-static inline s64 timespec_to_ns(const struct timespec *ts)
-{
-	return ((s64) ts->tv_sec * NSEC_PER_SEC) + ts->tv_nsec;
-}
-
 /**
- * ns_to_timespec - Convert nanoseconds to timespec
- * @nsec:	the nanoseconds value to be converted
- *
- * Returns the timespec representation of the nsec parameter.
- */
-extern struct timespec ns_to_timespec(const s64 nsec);
-
-/**
- * timespec_add_ns - Adds nanoseconds to a timespec
- * @a:		pointer to timespec to be incremented
- * @ns:		unsigned nanoseconds value to be added
- *
- * This must always be inlined because its used from the x86-64 vdso,
- * which cannot call other kernel functions.
- */
-static __always_inline void timespec_add_ns(struct timespec *a, u64 ns)
-{
-	a->tv_sec += __iter_div_u64_rem(a->tv_nsec + ns, NSEC_PER_SEC, &ns);
-	a->tv_nsec = ns;
-}
-
-static inline unsigned long mktime(const unsigned int year,
-			const unsigned int mon, const unsigned int day,
-			const unsigned int hour, const unsigned int min,
-			const unsigned int sec)
-{
-	return mktime64(year, mon, day, hour, min, sec);
-}
-
-static inline bool timeval_valid(const struct timeval *tv)
-{
-	/* Dates before 1970 are bogus */
-	if (tv->tv_sec < 0)
-		return false;
-
-	/* Can't have more microseconds then a second */
-	if (tv->tv_usec < 0 || tv->tv_usec >= USEC_PER_SEC)
-		return false;
-
-	return true;
-}
-
-/**
- * timeval_to_ns - Convert timeval to nanoseconds
- * @ts:		pointer to the timeval variable to be converted
- *
- * Returns the scalar nanosecond representation of the timeval
- * parameter.
- */
-static inline s64 timeval_to_ns(const struct timeval *tv)
-{
-	return ((s64) tv->tv_sec * NSEC_PER_SEC) +
-		tv->tv_usec * NSEC_PER_USEC;
-}
-
-/**
- * ns_to_timeval - Convert nanoseconds to timeval
+ * ns_to_kernel_old_timeval - Convert nanoseconds to timeval
  * @nsec:	the nanoseconds value to be converted
  *
  * Returns the timeval representation of the nsec parameter.
  */
-extern struct timeval ns_to_timeval(const s64 nsec);
 extern struct __kernel_old_timeval ns_to_kernel_old_timeval(s64 nsec);
 
-/*
- * Old names for the 32-bit time_t interfaces, these will be removed
- * when everything uses the new names.
- */
-#define compat_time_t		old_time32_t
-#define compat_timeval		old_timeval32
-#define compat_timespec		old_timespec32
-#define compat_itimerspec	old_itimerspec32
-#define ns_to_compat_timeval	ns_to_old_timeval32
-#define get_compat_itimerspec64	get_old_itimerspec32
-#define put_compat_itimerspec64	put_old_itimerspec32
-#define compat_get_timespec64	get_old_timespec32
-#define compat_put_timespec64	put_old_timespec32
-
 #endif
diff --git a/include/linux/timekeeping32.h b/include/linux/timekeeping32.h
index cc59cc9e0e84..266017fc9ee9 100644
--- a/include/linux/timekeeping32.h
+++ b/include/linux/timekeeping32.h
@@ -11,36 +11,4 @@ static inline unsigned long get_seconds(void)
 	return ktime_get_real_seconds();
 }
 
-static inline void getnstimeofday(struct timespec *ts)
-{
-	struct timespec64 ts64;
-
-	ktime_get_real_ts64(&ts64);
-	*ts = timespec64_to_timespec(ts64);
-}
-
-static inline void ktime_get_ts(struct timespec *ts)
-{
-	struct timespec64 ts64;
-
-	ktime_get_ts64(&ts64);
-	*ts = timespec64_to_timespec(ts64);
-}
-
-static inline void getrawmonotonic(struct timespec *ts)
-{
-	struct timespec64 ts64;
-
-	ktime_get_raw_ts64(&ts64);
-	*ts = timespec64_to_timespec(ts64);
-}
-
-static inline void getboottime(struct timespec *ts)
-{
-	struct timespec64 ts64;
-
-	getboottime64(&ts64);
-	*ts = timespec64_to_timespec(ts64);
-}
-
 #endif
diff --git a/include/linux/types.h b/include/linux/types.h
index eb870ad42919..d3021c879179 100644
--- a/include/linux/types.h
+++ b/include/linux/types.h
@@ -65,11 +65,6 @@ typedef __kernel_ssize_t	ssize_t;
 typedef __kernel_ptrdiff_t	ptrdiff_t;
 #endif
 
-#ifndef _TIME_T
-#define _TIME_T
-typedef __kernel_old_time_t	time_t;
-#endif
-
 #ifndef _CLOCK_T
 #define _CLOCK_T
 typedef __kernel_clock_t	clock_t;
diff --git a/kernel/compat.c b/kernel/compat.c
index 95005f849c68..843dd17e6078 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -26,70 +26,6 @@
 
 #include <linux/uaccess.h>
 
-static int __compat_get_timeval(struct timeval *tv, const struct old_timeval32 __user *ctv)
-{
-	return (!access_ok(ctv, sizeof(*ctv)) ||
-		__get_user(tv->tv_sec, &ctv->tv_sec) ||
-		__get_user(tv->tv_usec, &ctv->tv_usec)) ? -EFAULT : 0;
-}
-
-static int __compat_put_timeval(const struct timeval *tv, struct old_timeval32 __user *ctv)
-{
-	return (!access_ok(ctv, sizeof(*ctv)) ||
-		__put_user(tv->tv_sec, &ctv->tv_sec) ||
-		__put_user(tv->tv_usec, &ctv->tv_usec)) ? -EFAULT : 0;
-}
-
-static int __compat_get_timespec(struct timespec *ts, const struct old_timespec32 __user *cts)
-{
-	return (!access_ok(cts, sizeof(*cts)) ||
-		__get_user(ts->tv_sec, &cts->tv_sec) ||
-		__get_user(ts->tv_nsec, &cts->tv_nsec)) ? -EFAULT : 0;
-}
-
-static int __compat_put_timespec(const struct timespec *ts, struct old_timespec32 __user *cts)
-{
-	return (!access_ok(cts, sizeof(*cts)) ||
-		__put_user(ts->tv_sec, &cts->tv_sec) ||
-		__put_user(ts->tv_nsec, &cts->tv_nsec)) ? -EFAULT : 0;
-}
-
-int compat_get_timeval(struct timeval *tv, const void __user *utv)
-{
-	if (COMPAT_USE_64BIT_TIME)
-		return copy_from_user(tv, utv, sizeof(*tv)) ? -EFAULT : 0;
-	else
-		return __compat_get_timeval(tv, utv);
-}
-EXPORT_SYMBOL_GPL(compat_get_timeval);
-
-int compat_put_timeval(const struct timeval *tv, void __user *utv)
-{
-	if (COMPAT_USE_64BIT_TIME)
-		return copy_to_user(utv, tv, sizeof(*tv)) ? -EFAULT : 0;
-	else
-		return __compat_put_timeval(tv, utv);
-}
-EXPORT_SYMBOL_GPL(compat_put_timeval);
-
-int compat_get_timespec(struct timespec *ts, const void __user *uts)
-{
-	if (COMPAT_USE_64BIT_TIME)
-		return copy_from_user(ts, uts, sizeof(*ts)) ? -EFAULT : 0;
-	else
-		return __compat_get_timespec(ts, uts);
-}
-EXPORT_SYMBOL_GPL(compat_get_timespec);
-
-int compat_put_timespec(const struct timespec *ts, void __user *uts)
-{
-	if (COMPAT_USE_64BIT_TIME)
-		return copy_to_user(uts, ts, sizeof(*ts)) ? -EFAULT : 0;
-	else
-		return __compat_put_timespec(ts, uts);
-}
-EXPORT_SYMBOL_GPL(compat_put_timespec);
-
 #ifdef __ARCH_WANT_SYS_SIGPROCMASK
 
 /*
diff --git a/kernel/time/time.c b/kernel/time/time.c
index cdd7386115ff..3985b2b32d08 100644
--- a/kernel/time/time.c
+++ b/kernel/time/time.c
@@ -449,49 +449,6 @@ time64_t mktime64(const unsigned int year0, const unsigned int mon0,
 }
 EXPORT_SYMBOL(mktime64);
 
-/**
- * ns_to_timespec - Convert nanoseconds to timespec
- * @nsec:	the nanoseconds value to be converted
- *
- * Returns the timespec representation of the nsec parameter.
- */
-struct timespec ns_to_timespec(const s64 nsec)
-{
-	struct timespec ts;
-	s32 rem;
-
-	if (!nsec)
-		return (struct timespec) {0, 0};
-
-	ts.tv_sec = div_s64_rem(nsec, NSEC_PER_SEC, &rem);
-	if (unlikely(rem < 0)) {
-		ts.tv_sec--;
-		rem += NSEC_PER_SEC;
-	}
-	ts.tv_nsec = rem;
-
-	return ts;
-}
-EXPORT_SYMBOL(ns_to_timespec);
-
-/**
- * ns_to_timeval - Convert nanoseconds to timeval
- * @nsec:	the nanoseconds value to be converted
- *
- * Returns the timeval representation of the nsec parameter.
- */
-struct timeval ns_to_timeval(const s64 nsec)
-{
-	struct timespec ts = ns_to_timespec(nsec);
-	struct timeval tv;
-
-	tv.tv_sec = ts.tv_sec;
-	tv.tv_usec = (suseconds_t) ts.tv_nsec / 1000;
-
-	return tv;
-}
-EXPORT_SYMBOL(ns_to_timeval);
-
 struct __kernel_old_timeval ns_to_kernel_old_timeval(const s64 nsec)
 {
 	struct timespec64 ts = ns_to_timespec64(nsec);
--
cgit v1.2.3
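
The hashtab.c change above is an instance of a generic deferred-free pattern:
while the bucket lock is held, deleted elements are only collected on a local
singly linked list (reusing union space for batch_flink), and the LRU free
list, whose lru_lock must never be taken under a bucket lock because the
shrink path takes the two in the opposite order, is touched only after
raw_spin_unlock_irqrestore(). Below is a minimal userspace sketch of that
pattern using pthread mutexes; it is an illustration, not the kernel code,
and all names in it (elem, bucket, lru_push_free) are hypothetical stand-ins.

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct elem {
        int key;
        struct elem *next;        /* bucket chain */
        struct elem *batch_flink; /* deferred-free link, like htab_elem */
};

struct bucket {
        pthread_mutex_t lock;
        struct elem *head;
};

static pthread_mutex_t lru_lock = PTHREAD_MUTEX_INITIALIZER;

/* Stand-in for bpf_lru_push_free(): takes lru_lock, so it must never
 * be called while a bucket lock is held.
 */
static void lru_push_free(struct elem *e)
{
        pthread_mutex_lock(&lru_lock);
        printf("returning key %d to the free list\n", e->key);
        pthread_mutex_unlock(&lru_lock);
        free(e);
}

/* Delete every element of one bucket, deferring all lru_push_free()
 * calls until the bucket lock has been dropped.
 */
static void delete_batch(struct bucket *b)
{
        struct elem *node_to_free = NULL;
        struct elem *e;

        pthread_mutex_lock(&b->lock);
        while ((e = b->head) != NULL) {
                b->head = e->next;
                e->batch_flink = node_to_free;  /* collect, do not free yet */
                node_to_free = e;
        }
        pthread_mutex_unlock(&b->lock);

        /* Bucket lock released: taking lru_lock now cannot invert the
         * lru_lock -> bucket_lock order used by the shrink path.
         */
        while (node_to_free) {
                e = node_to_free;
                node_to_free = e->batch_flink;
                lru_push_free(e);
        }
}

int main(void)
{
        struct bucket b = { PTHREAD_MUTEX_INITIALIZER, NULL };
        int i;

        for (i = 0; i < 3; i++) {
                struct elem *e = calloc(1, sizeof(*e));

                if (!e)
                        return 1;
                e->key = i;
                e->next = b.head;
                b.head = e;
        }
        delete_batch(&b);
        return 0;
}

The cost of the pattern is one pointer per element; the kernel patch hides
that pointer in an existing union inside struct htab_elem, so no extra memory
is consumed, and the bucket-lock critical section stays short.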