diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2015-06-22 18:57:44 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2015-06-22 18:57:44 -0700 |
commit | 43224b96af3154cedd7220f7b90094905f07ac78 (patch) | |
tree | 44279acc4613b314ff031620fd62641db3c85b71 | |
parent | d70b3ef54ceaf1c7c92209f5a662a670d04cbed9 (diff) | |
parent | 1cb6c2151850584ee805fdcf088af0bb81f4b086 (diff) | |
download | linux-43224b96af3154cedd7220f7b90094905f07ac78.tar.bz2 |
Merge branch 'timers-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull timer updates from Thomas Gleixner:
"A rather largish update for everything time and timer related:
- Cache footprint optimizations for both hrtimers and timer wheel
- Lower the NOHZ impact on systems which have NOHZ or timer migration
disabled at runtime.
- Optimize run time overhead of hrtimer interrupt by making the clock
offset updates smarter
- hrtimer cleanups and removal of restrictions to tackle some
problems in sched/perf
- Some more leap second tweaks
- Another round of changes addressing the 2038 problem
- First step to change the internals of clock event devices by
introducing the necessary infrastructure
- Allow constant folding for usecs/msecs_to_jiffies()
- The usual pile of clockevent/clocksource driver updates
The hrtimer changes contain updates to sched, perf and x86 as they
depend on them plus changes all over the tree to cleanup API changes
and redundant code, which got copied all over the place. The y2038
changes touch s390 to remove the last non 2038 safe code related to
boot/persistant clock"
* 'timers-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (114 commits)
clocksource: Increase dependencies of timer-stm32 to limit build wreckage
timer: Minimize nohz off overhead
timer: Reduce timer migration overhead if disabled
timer: Stats: Simplify the flags handling
timer: Replace timer base by a cpu index
timer: Use hlist for the timer wheel hash buckets
timer: Remove FIFO "guarantee"
timers: Sanitize catchup_timer_jiffies() usage
hrtimer: Allow hrtimer::function() to free the timer
seqcount: Introduce raw_write_seqcount_barrier()
seqcount: Rename write_seqcount_barrier()
hrtimer: Fix hrtimer_is_queued() hole
hrtimer: Remove HRTIMER_STATE_MIGRATE
selftest: Timers: Avoid signal deadlock in leap-a-day
timekeeping: Copy the shadow-timekeeper over the real timekeeper last
clockevents: Check state instead of mode in suspend/resume path
selftests: timers: Add leap-second timer edge testing to leap-a-day.c
ntp: Do leapsecond adjustment in adjtimex read path
time: Prevent early expiry of hrtimers[CLOCK_REALTIME] at the leap second edge
ntp: Introduce and use SECS_PER_DAY macro instead of 86400
...
78 files changed, 2292 insertions, 1563 deletions
diff --git a/Documentation/devicetree/bindings/arm/armv7m_systick.txt b/Documentation/devicetree/bindings/arm/armv7m_systick.txt new file mode 100644 index 000000000000..7cf4a24601eb --- /dev/null +++ b/Documentation/devicetree/bindings/arm/armv7m_systick.txt @@ -0,0 +1,26 @@ +* ARMv7M System Timer + +ARMv7-M includes a system timer, known as SysTick. Current driver only +implements the clocksource feature. + +Required properties: +- compatible : Should be "arm,armv7m-systick" +- reg : The address range of the timer + +Required clocking property, have to be one of: +- clocks : The input clock of the timer +- clock-frequency : The rate in HZ in input of the ARM SysTick + +Examples: + +systick: timer@e000e010 { + compatible = "arm,armv7m-systick"; + reg = <0xe000e010 0x10>; + clocks = <&clk_systick>; +}; + +systick: timer@e000e010 { + compatible = "arm,armv7m-systick"; + reg = <0xe000e010 0x10>; + clock-frequency = <90000000>; +}; diff --git a/Documentation/devicetree/bindings/timer/nxp,lpc3220-timer.txt b/Documentation/devicetree/bindings/timer/nxp,lpc3220-timer.txt new file mode 100644 index 000000000000..51b05a0e70d1 --- /dev/null +++ b/Documentation/devicetree/bindings/timer/nxp,lpc3220-timer.txt @@ -0,0 +1,26 @@ +* NXP LPC3220 timer + +The NXP LPC3220 timer is used on a wide range of NXP SoCs. This +includes LPC32xx, LPC178x, LPC18xx and LPC43xx parts. + +Required properties: +- compatible: + Should be "nxp,lpc3220-timer". +- reg: + Address and length of the register set. +- interrupts: + Reference to the timer interrupt +- clocks: + Should contain a reference to timer clock. +- clock-names: + Should contain "timerclk". + +Example: + +timer1: timer@40085000 { + compatible = "nxp,lpc3220-timer"; + reg = <0x40085000 0x1000>; + interrupts = <13>; + clocks = <&ccu1 CLK_CPU_TIMER1>; + clock-names = "timerclk"; +}; diff --git a/Documentation/devicetree/bindings/timer/st,stm32-timer.txt b/Documentation/devicetree/bindings/timer/st,stm32-timer.txt new file mode 100644 index 000000000000..8ef28e70d6e8 --- /dev/null +++ b/Documentation/devicetree/bindings/timer/st,stm32-timer.txt @@ -0,0 +1,22 @@ +. STMicroelectronics STM32 timer + +The STM32 MCUs family has several general-purpose 16 and 32 bits timers. + +Required properties: +- compatible : Should be "st,stm32-timer" +- reg : Address and length of the register set +- clocks : Reference on the timer input clock +- interrupts : Reference to the timer interrupt + +Optional properties: +- resets: Reference to a reset controller asserting the timer + +Example: + +timer5: timer@40000c00 { + compatible = "st,stm32-timer"; + reg = <0x40000c00 0x400>; + interrupts = <50>; + resets = <&rrc 259>; + clocks = <&clk_pmtr1>; +}; @@ -2,8 +2,9 @@ # Kbuild for top-level directory of the kernel # This file takes care of the following: # 1) Generate bounds.h -# 2) Generate asm-offsets.h (may need bounds.h) -# 3) Check for missing system calls +# 2) Generate timeconst.h +# 3) Generate asm-offsets.h (may need bounds.h and timeconst.h) +# 4) Check for missing system calls # Default sed regexp - multiline due to syntax constraints define sed-y @@ -47,7 +48,26 @@ $(obj)/$(bounds-file): kernel/bounds.s FORCE $(call filechk,offsets,__LINUX_BOUNDS_H__) ##### -# 2) Generate asm-offsets.h +# 2) Generate timeconst.h + +timeconst-file := include/generated/timeconst.h + +#always += $(timeconst-file) +targets += $(timeconst-file) + +quiet_cmd_gentimeconst = GEN $@ +define cmd_gentimeconst + (echo $(CONFIG_HZ) | bc -q $< ) > $@ +endef +define filechk_gentimeconst + (echo $(CONFIG_HZ) | bc -q $< ) +endef + +$(obj)/$(timeconst-file): kernel/time/timeconst.bc FORCE + $(call filechk,gentimeconst) + +##### +# 3) Generate asm-offsets.h # offsets-file := include/generated/asm-offsets.h @@ -57,7 +77,7 @@ targets += arch/$(SRCARCH)/kernel/asm-offsets.s # We use internal kbuild rules to avoid the "is up to date" message from make arch/$(SRCARCH)/kernel/asm-offsets.s: arch/$(SRCARCH)/kernel/asm-offsets.c \ - $(obj)/$(bounds-file) FORCE + $(obj)/$(timeconst-file) $(obj)/$(bounds-file) FORCE $(Q)mkdir -p $(dir $@) $(call if_changed_dep,cc_s_c) @@ -65,7 +85,7 @@ $(obj)/$(offsets-file): arch/$(SRCARCH)/kernel/asm-offsets.s FORCE $(call filechk,offsets,__ASM_OFFSETS_H__) ##### -# 3) Check for missing system calls +# 4) Check for missing system calls # always += missing-syscalls @@ -77,5 +97,5 @@ quiet_cmd_syscalls = CALL $< missing-syscalls: scripts/checksyscalls.sh $(offsets-file) FORCE $(call cmd,syscalls) -# Keep these two files during make clean -no-clean-files := $(bounds-file) $(offsets-file) +# Keep these three files during make clean +no-clean-files := $(bounds-file) $(offsets-file) $(timeconst-file) diff --git a/arch/s390/include/asm/timex.h b/arch/s390/include/asm/timex.h index 98eb2a579223..dcb6312a0b91 100644 --- a/arch/s390/include/asm/timex.h +++ b/arch/s390/include/asm/timex.h @@ -10,6 +10,7 @@ #define _ASM_S390_TIMEX_H #include <asm/lowcore.h> +#include <linux/time64.h> /* The value of the TOD clock for 1.1.1970. */ #define TOD_UNIX_EPOCH 0x7d91048bca000000ULL @@ -108,10 +109,10 @@ int get_sync_clock(unsigned long long *clock); void init_cpu_timer(void); unsigned long long monotonic_clock(void); -void tod_to_timeval(__u64, struct timespec *); +void tod_to_timeval(__u64 todval, struct timespec64 *xt); static inline -void stck_to_timespec(unsigned long long stck, struct timespec *ts) +void stck_to_timespec64(unsigned long long stck, struct timespec64 *ts) { tod_to_timeval(stck - TOD_UNIX_EPOCH, ts); } diff --git a/arch/s390/kernel/debug.c b/arch/s390/kernel/debug.c index c1f21aca76e7..6fca0e46464e 100644 --- a/arch/s390/kernel/debug.c +++ b/arch/s390/kernel/debug.c @@ -1457,23 +1457,24 @@ int debug_dflt_header_fn(debug_info_t * id, struct debug_view *view, int area, debug_entry_t * entry, char *out_buf) { - struct timespec time_spec; + struct timespec64 time_spec; char *except_str; unsigned long caller; int rc = 0; unsigned int level; level = entry->id.fields.level; - stck_to_timespec(entry->id.stck, &time_spec); + stck_to_timespec64(entry->id.stck, &time_spec); if (entry->id.fields.exception) except_str = "*"; else except_str = "-"; caller = ((unsigned long) entry->caller) & PSW_ADDR_INSN; - rc += sprintf(out_buf, "%02i %011lu:%06lu %1u %1s %02i %p ", - area, time_spec.tv_sec, time_spec.tv_nsec / 1000, level, - except_str, entry->id.fields.cpuid, (void *) caller); + rc += sprintf(out_buf, "%02i %011lld:%06lu %1u %1s %02i %p ", + area, (long long)time_spec.tv_sec, + time_spec.tv_nsec / 1000, level, except_str, + entry->id.fields.cpuid, (void *)caller); return rc; } EXPORT_SYMBOL(debug_dflt_header_fn); diff --git a/arch/s390/kernel/time.c b/arch/s390/kernel/time.c index 170ddd2018b3..9e733d965e08 100644 --- a/arch/s390/kernel/time.c +++ b/arch/s390/kernel/time.c @@ -76,7 +76,7 @@ unsigned long long monotonic_clock(void) } EXPORT_SYMBOL(monotonic_clock); -void tod_to_timeval(__u64 todval, struct timespec *xt) +void tod_to_timeval(__u64 todval, struct timespec64 *xt) { unsigned long long sec; @@ -181,12 +181,12 @@ static void timing_alert_interrupt(struct ext_code ext_code, static void etr_reset(void); static void stp_reset(void); -void read_persistent_clock(struct timespec *ts) +void read_persistent_clock64(struct timespec64 *ts) { tod_to_timeval(get_tod_clock() - TOD_UNIX_EPOCH, ts); } -void read_boot_clock(struct timespec *ts) +void read_boot_clock64(struct timespec64 *ts) { tod_to_timeval(sched_clock_base_cc - TOD_UNIX_EPOCH, ts); } diff --git a/arch/x86/kernel/cpu/perf_event_intel_rapl.c b/arch/x86/kernel/cpu/perf_event_intel_rapl.c index 358c54ad20d4..5cbd4e64feb5 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_rapl.c +++ b/arch/x86/kernel/cpu/perf_event_intel_rapl.c @@ -204,9 +204,8 @@ again: static void rapl_start_hrtimer(struct rapl_pmu *pmu) { - __hrtimer_start_range_ns(&pmu->hrtimer, - pmu->timer_interval, 0, - HRTIMER_MODE_REL_PINNED, 0); + hrtimer_start(&pmu->hrtimer, pmu->timer_interval, + HRTIMER_MODE_REL_PINNED); } static void rapl_stop_hrtimer(struct rapl_pmu *pmu) diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.c b/arch/x86/kernel/cpu/perf_event_intel_uncore.c index 7c1de1610178..21b5e38c921b 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore.c +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.c @@ -233,9 +233,8 @@ static enum hrtimer_restart uncore_pmu_hrtimer(struct hrtimer *hrtimer) void uncore_pmu_start_hrtimer(struct intel_uncore_box *box) { - __hrtimer_start_range_ns(&box->hrtimer, - ns_to_ktime(box->hrtimer_duration), 0, - HRTIMER_MODE_REL_PINNED, 0); + hrtimer_start(&box->hrtimer, ns_to_ktime(box->hrtimer_duration), + HRTIMER_MODE_REL_PINNED); } void uncore_pmu_cancel_hrtimer(struct intel_uncore_box *box) diff --git a/drivers/clocksource/Kconfig b/drivers/clocksource/Kconfig index 51d7865fdddb..32164ba3d36a 100644 --- a/drivers/clocksource/Kconfig +++ b/drivers/clocksource/Kconfig @@ -106,6 +106,16 @@ config CLKSRC_EFM32 Support to use the timers of EFM32 SoCs as clock source and clock event device. +config CLKSRC_LPC32XX + bool + select CLKSRC_MMIO + select CLKSRC_OF + +config CLKSRC_STM32 + bool "Clocksource for STM32 SoCs" if !ARCH_STM32 + depends on OF && ARM && (ARCH_STM32 || COMPILE_TEST) + select CLKSRC_MMIO + config ARM_ARCH_TIMER bool select CLKSRC_OF if OF @@ -139,6 +149,13 @@ config CLKSRC_ARM_GLOBAL_TIMER_SCHED_CLOCK help Use ARM global timer clock source as sched_clock +config ARMV7M_SYSTICK + bool + select CLKSRC_OF if OF + select CLKSRC_MMIO + help + This options enables support for the ARMv7M system timer unit + config ATMEL_PIT select CLKSRC_OF if OF def_bool SOC_AT91SAM9 || SOC_SAMA5 diff --git a/drivers/clocksource/Makefile b/drivers/clocksource/Makefile index 5b85f6adb258..1831a588b988 100644 --- a/drivers/clocksource/Makefile +++ b/drivers/clocksource/Makefile @@ -36,7 +36,9 @@ obj-$(CONFIG_ARCH_NSPIRE) += zevio-timer.o obj-$(CONFIG_ARCH_BCM_MOBILE) += bcm_kona_timer.o obj-$(CONFIG_CADENCE_TTC_TIMER) += cadence_ttc_timer.o obj-$(CONFIG_CLKSRC_EFM32) += time-efm32.o +obj-$(CONFIG_CLKSRC_STM32) += timer-stm32.o obj-$(CONFIG_CLKSRC_EXYNOS_MCT) += exynos_mct.o +obj-$(CONFIG_CLKSRC_LPC32XX) += time-lpc32xx.o obj-$(CONFIG_CLKSRC_SAMSUNG_PWM) += samsung_pwm_timer.o obj-$(CONFIG_FSL_FTM_TIMER) += fsl_ftm_timer.o obj-$(CONFIG_VF_PIT_TIMER) += vf_pit_timer.o @@ -45,6 +47,7 @@ obj-$(CONFIG_MTK_TIMER) += mtk_timer.o obj-$(CONFIG_ARM_ARCH_TIMER) += arm_arch_timer.o obj-$(CONFIG_ARM_GLOBAL_TIMER) += arm_global_timer.o +obj-$(CONFIG_ARMV7M_SYSTICK) += armv7m_systick.o obj-$(CONFIG_CLKSRC_METAG_GENERIC) += metag_generic.o obj-$(CONFIG_ARCH_HAS_TICK_BROADCAST) += dummy_timer.o obj-$(CONFIG_ARCH_KEYSTONE) += timer-keystone.o diff --git a/drivers/clocksource/armv7m_systick.c b/drivers/clocksource/armv7m_systick.c new file mode 100644 index 000000000000..addfd2c64f54 --- /dev/null +++ b/drivers/clocksource/armv7m_systick.c @@ -0,0 +1,79 @@ +/* + * Copyright (C) Maxime Coquelin 2015 + * Author: Maxime Coquelin <mcoquelin.stm32@gmail.com> + * License terms: GNU General Public License (GPL), version 2 + */ + +#include <linux/kernel.h> +#include <linux/clocksource.h> +#include <linux/clockchips.h> +#include <linux/of.h> +#include <linux/of_address.h> +#include <linux/clk.h> +#include <linux/bitops.h> + +#define SYST_CSR 0x00 +#define SYST_RVR 0x04 +#define SYST_CVR 0x08 +#define SYST_CALIB 0x0c + +#define SYST_CSR_ENABLE BIT(0) + +#define SYSTICK_LOAD_RELOAD_MASK 0x00FFFFFF + +static void __init system_timer_of_register(struct device_node *np) +{ + struct clk *clk = NULL; + void __iomem *base; + u32 rate; + int ret; + + base = of_iomap(np, 0); + if (!base) { + pr_warn("system-timer: invalid base address\n"); + return; + } + + ret = of_property_read_u32(np, "clock-frequency", &rate); + if (ret) { + clk = of_clk_get(np, 0); + if (IS_ERR(clk)) + goto out_unmap; + + ret = clk_prepare_enable(clk); + if (ret) + goto out_clk_put; + + rate = clk_get_rate(clk); + if (!rate) + goto out_clk_disable; + } + + writel_relaxed(SYSTICK_LOAD_RELOAD_MASK, base + SYST_RVR); + writel_relaxed(SYST_CSR_ENABLE, base + SYST_CSR); + + ret = clocksource_mmio_init(base + SYST_CVR, "arm_system_timer", rate, + 200, 24, clocksource_mmio_readl_down); + if (ret) { + pr_err("failed to init clocksource (%d)\n", ret); + if (clk) + goto out_clk_disable; + else + goto out_unmap; + } + + pr_info("ARM System timer initialized as clocksource\n"); + + return; + +out_clk_disable: + clk_disable_unprepare(clk); +out_clk_put: + clk_put(clk); +out_unmap: + iounmap(base); + pr_warn("ARM System timer register failed (%d)\n", ret); +} + +CLOCKSOURCE_OF_DECLARE(arm_systick, "arm,armv7m-systick", + system_timer_of_register); diff --git a/drivers/clocksource/asm9260_timer.c b/drivers/clocksource/asm9260_timer.c index 2c9c993727c8..4c2ba59897e8 100644 --- a/drivers/clocksource/asm9260_timer.c +++ b/drivers/clocksource/asm9260_timer.c @@ -178,7 +178,7 @@ static void __init asm9260_timer_init(struct device_node *np) unsigned long rate; priv.base = of_io_request_and_map(np, 0, np->name); - if (!priv.base) + if (IS_ERR(priv.base)) panic("%s: unable to map resource", np->name); clk = of_clk_get(np, 0); diff --git a/drivers/clocksource/exynos_mct.c b/drivers/clocksource/exynos_mct.c index 83564c9cfdbe..935b05936dbd 100644 --- a/drivers/clocksource/exynos_mct.c +++ b/drivers/clocksource/exynos_mct.c @@ -209,7 +209,7 @@ static void exynos4_frc_resume(struct clocksource *cs) exynos4_mct_frc_start(); } -struct clocksource mct_frc = { +static struct clocksource mct_frc = { .name = "mct-frc", .rating = 400, .read = exynos4_frc_read, @@ -413,7 +413,7 @@ static inline void exynos4_tick_set_mode(enum clock_event_mode mode, } } -static int exynos4_mct_tick_clear(struct mct_clock_event_device *mevt) +static void exynos4_mct_tick_clear(struct mct_clock_event_device *mevt) { struct clock_event_device *evt = &mevt->evt; @@ -426,12 +426,8 @@ static int exynos4_mct_tick_clear(struct mct_clock_event_device *mevt) exynos4_mct_tick_stop(mevt); /* Clear the MCT tick interrupt */ - if (readl_relaxed(reg_base + mevt->base + MCT_L_INT_CSTAT_OFFSET) & 1) { + if (readl_relaxed(reg_base + mevt->base + MCT_L_INT_CSTAT_OFFSET) & 1) exynos4_mct_write(0x1, mevt->base + MCT_L_INT_CSTAT_OFFSET); - return 1; - } else { - return 0; - } } static irqreturn_t exynos4_mct_tick_isr(int irq, void *dev_id) @@ -564,18 +560,6 @@ out_irq: free_percpu_irq(mct_irqs[MCT_L0_IRQ], &percpu_mct_tick); } -void __init mct_init(void __iomem *base, int irq_g0, int irq_l0, int irq_l1) -{ - mct_irqs[MCT_G0_IRQ] = irq_g0; - mct_irqs[MCT_L0_IRQ] = irq_l0; - mct_irqs[MCT_L1_IRQ] = irq_l1; - mct_int_type = MCT_INT_SPI; - - exynos4_timer_resources(NULL, base); - exynos4_clocksource_init(); - exynos4_clockevent_init(); -} - static void __init mct_init_dt(struct device_node *np, unsigned int int_type) { u32 nr_irqs, i; diff --git a/drivers/clocksource/qcom-timer.c b/drivers/clocksource/qcom-timer.c index 098c542e5c53..cba2d015564c 100644 --- a/drivers/clocksource/qcom-timer.c +++ b/drivers/clocksource/qcom-timer.c @@ -40,8 +40,6 @@ #define GPT_HZ 32768 -#define MSM_DGT_SHIFT 5 - static void __iomem *event_base; static void __iomem *sts_base; @@ -232,7 +230,6 @@ err: register_current_timer_delay(&msm_delay_timer); } -#ifdef CONFIG_ARCH_QCOM static void __init msm_dt_timer_init(struct device_node *np) { u32 freq; @@ -285,59 +282,3 @@ static void __init msm_dt_timer_init(struct device_node *np) } CLOCKSOURCE_OF_DECLARE(kpss_timer, "qcom,kpss-timer", msm_dt_timer_init); CLOCKSOURCE_OF_DECLARE(scss_timer, "qcom,scss-timer", msm_dt_timer_init); -#else - -static int __init msm_timer_map(phys_addr_t addr, u32 event, u32 source, - u32 sts) -{ - void __iomem *base; - - base = ioremap(addr, SZ_256); - if (!base) { - pr_err("Failed to map timer base\n"); - return -ENOMEM; - } - event_base = base + event; - source_base = base + source; - if (sts) - sts_base = base + sts; - - return 0; -} - -static notrace cycle_t msm_read_timer_count_shift(struct clocksource *cs) -{ - /* - * Shift timer count down by a constant due to unreliable lower bits - * on some targets. - */ - return msm_read_timer_count(cs) >> MSM_DGT_SHIFT; -} - -void __init msm7x01_timer_init(void) -{ - struct clocksource *cs = &msm_clocksource; - - if (msm_timer_map(0xc0100000, 0x0, 0x10, 0x0)) - return; - cs->read = msm_read_timer_count_shift; - cs->mask = CLOCKSOURCE_MASK((32 - MSM_DGT_SHIFT)); - /* 600 KHz */ - msm_timer_init(19200000 >> MSM_DGT_SHIFT, 32 - MSM_DGT_SHIFT, 7, - false); -} - -void __init msm7x30_timer_init(void) -{ - if (msm_timer_map(0xc0100000, 0x4, 0x24, 0x80)) - return; - msm_timer_init(24576000 / 4, 32, 1, false); -} - -void __init qsd8x50_timer_init(void) -{ - if (msm_timer_map(0xAC100000, 0x0, 0x10, 0x34)) - return; - msm_timer_init(19200000 / 4, 32, 7, false); -} -#endif diff --git a/drivers/clocksource/time-lpc32xx.c b/drivers/clocksource/time-lpc32xx.c new file mode 100644 index 000000000000..a1c06a2bc77c --- /dev/null +++ b/drivers/clocksource/time-lpc32xx.c @@ -0,0 +1,272 @@ +/* + * Clocksource driver for NXP LPC32xx/18xx/43xx timer + * + * Copyright (C) 2015 Joachim Eastwood <manabian@gmail.com> + * + * Based on: + * time-efm32 Copyright (C) 2013 Pengutronix + * mach-lpc32xx/timer.c Copyright (C) 2009 - 2010 NXP Semiconductors + * + * This file is licensed under the terms of the GNU General Public + * License version 2. This program is licensed "as is" without any + * warranty of any kind, whether express or implied. + * + */ + +#define pr_fmt(fmt) "%s: " fmt, __func__ + +#include <linux/clk.h> +#include <linux/clockchips.h> +#include <linux/clocksource.h> +#include <linux/interrupt.h> +#include <linux/irq.h> +#include <linux/kernel.h> +#include <linux/of.h> +#include <linux/of_address.h> +#include <linux/of_irq.h> +#include <linux/sched_clock.h> + +#define LPC32XX_TIMER_IR 0x000 +#define LPC32XX_TIMER_IR_MR0INT BIT(0) +#define LPC32XX_TIMER_TCR 0x004 +#define LPC32XX_TIMER_TCR_CEN BIT(0) +#define LPC32XX_TIMER_TCR_CRST BIT(1) +#define LPC32XX_TIMER_TC 0x008 +#define LPC32XX_TIMER_PR 0x00c +#define LPC32XX_TIMER_MCR 0x014 +#define LPC32XX_TIMER_MCR_MR0I BIT(0) +#define LPC32XX_TIMER_MCR_MR0R BIT(1) +#define LPC32XX_TIMER_MCR_MR0S BIT(2) +#define LPC32XX_TIMER_MR0 0x018 +#define LPC32XX_TIMER_CTCR 0x070 + +struct lpc32xx_clock_event_ddata { + struct clock_event_device evtdev; + void __iomem *base; +}; + +/* Needed for the sched clock */ +static void __iomem *clocksource_timer_counter; + +static u64 notrace lpc32xx_read_sched_clock(void) +{ + return readl(clocksource_timer_counter); +} + +static int lpc32xx_clkevt_next_event(unsigned long delta, + struct clock_event_device *evtdev) +{ + struct lpc32xx_clock_event_ddata *ddata = + container_of(evtdev, struct lpc32xx_clock_event_ddata, evtdev); + + /* + * Place timer in reset and program the delta in the prescale + * register (PR). When the prescale counter matches the value + * in PR the counter register is incremented and the compare + * match will trigger. After setup the timer is released from + * reset and enabled. + */ + writel_relaxed(LPC32XX_TIMER_TCR_CRST, ddata->base + LPC32XX_TIMER_TCR); + writel_relaxed(delta, ddata->base + LPC32XX_TIMER_PR); + writel_relaxed(LPC32XX_TIMER_TCR_CEN, ddata->base + LPC32XX_TIMER_TCR); + + return 0; +} + +static int lpc32xx_clkevt_shutdown(struct clock_event_device *evtdev) +{ + struct lpc32xx_clock_event_ddata *ddata = + container_of(evtdev, struct lpc32xx_clock_event_ddata, evtdev); + + /* Disable the timer */ + writel_relaxed(0, ddata->base + LPC32XX_TIMER_TCR); + + return 0; +} + +static int lpc32xx_clkevt_oneshot(struct clock_event_device *evtdev) +{ + /* + * When using oneshot, we must also disable the timer + * to wait for the first call to set_next_event(). + */ + return lpc32xx_clkevt_shutdown(evtdev); +} + +static irqreturn_t lpc32xx_clock_event_handler(int irq, void *dev_id) +{ + struct lpc32xx_clock_event_ddata *ddata = dev_id; + + /* Clear match on channel 0 */ + writel_relaxed(LPC32XX_TIMER_IR_MR0INT, ddata->base + LPC32XX_TIMER_IR); + + ddata->evtdev.event_handler(&ddata->evtdev); + + return IRQ_HANDLED; +} + +static struct lpc32xx_clock_event_ddata lpc32xx_clk_event_ddata = { + .evtdev = { + .name = "lpc3220 clockevent", + .features = CLOCK_EVT_FEAT_ONESHOT, + .rating = 300, + .set_next_event = lpc32xx_clkevt_next_event, + .set_state_shutdown = lpc32xx_clkevt_shutdown, + .set_state_oneshot = lpc32xx_clkevt_oneshot, + }, +}; + +static int __init lpc32xx_clocksource_init(struct device_node *np) +{ + void __iomem *base; + unsigned long rate; + struct clk *clk; + int ret; + + clk = of_clk_get_by_name(np, "timerclk"); + if (IS_ERR(clk)) { + pr_err("clock get failed (%lu)\n", PTR_ERR(clk)); + return PTR_ERR(clk); + } + + ret = clk_prepare_enable(clk); + if (ret) { + pr_err("clock enable failed (%d)\n", ret); + goto err_clk_enable; + } + + base = of_iomap(np, 0); + if (!base) { + pr_err("unable to map registers\n"); + ret = -EADDRNOTAVAIL; + goto err_iomap; + } + + /* + * Disable and reset timer then set it to free running timer + * mode (CTCR) with no prescaler (PR) or match operations (MCR). + * After setup the timer is released from reset and enabled. + */ + writel_relaxed(LPC32XX_TIMER_TCR_CRST, base + LPC32XX_TIMER_TCR); + writel_relaxed(0, base + LPC32XX_TIMER_PR); + writel_relaxed(0, base + LPC32XX_TIMER_MCR); + writel_relaxed(0, base + LPC32XX_TIMER_CTCR); + writel_relaxed(LPC32XX_TIMER_TCR_CEN, base + LPC32XX_TIMER_TCR); + + rate = clk_get_rate(clk); + ret = clocksource_mmio_init(base + LPC32XX_TIMER_TC, "lpc3220 timer", + rate, 300, 32, clocksource_mmio_readl_up); + if (ret) { + pr_err("failed to init clocksource (%d)\n", ret); + goto err_clocksource_init; + } + + clocksource_timer_counter = base + LPC32XX_TIMER_TC; + sched_clock_register(lpc32xx_read_sched_clock, 32, rate); + + return 0; + +err_clocksource_init: + iounmap(base); +err_iomap: + clk_disable_unprepare(clk); +err_clk_enable: + clk_put(clk); + return ret; +} + +static int __init lpc32xx_clockevent_init(struct device_node *np) +{ + void __iomem *base; + unsigned long rate; + struct clk *clk; + int ret, irq; + + clk = of_clk_get_by_name(np, "timerclk"); + if (IS_ERR(clk)) { + pr_err("clock get failed (%lu)\n", PTR_ERR(clk)); + return PTR_ERR(clk); + } + + ret = clk_prepare_enable(clk); + if (ret) { + pr_err("clock enable failed (%d)\n", ret); + goto err_clk_enable; + } + + base = of_iomap(np, 0); + if (!base) { + pr_err("unable to map registers\n"); + ret = -EADDRNOTAVAIL; + goto err_iomap; + } + + irq = irq_of_parse_and_map(np, 0); + if (!irq) { + pr_err("get irq failed\n"); + ret = -ENOENT; + goto err_irq; + } + + /* + * Disable timer and clear any pending interrupt (IR) on match + * channel 0 (MR0). Configure a compare match value of 1 on MR0 + * and enable interrupt, reset on match and stop on match (MCR). + */ + writel_relaxed(0, base + LPC32XX_TIMER_TCR); + writel_relaxed(0, base + LPC32XX_TIMER_CTCR); + writel_relaxed(LPC32XX_TIMER_IR_MR0INT, base + LPC32XX_TIMER_IR); + writel_relaxed(1, base + LPC32XX_TIMER_MR0); + writel_relaxed(LPC32XX_TIMER_MCR_MR0I | LPC32XX_TIMER_MCR_MR0R | + LPC32XX_TIMER_MCR_MR0S, base + LPC32XX_TIMER_MCR); + + rate = clk_get_rate(clk); + lpc32xx_clk_event_ddata.base = base; + clockevents_config_and_register(&lpc32xx_clk_event_ddata.evtdev, + rate, 1, -1); + + ret = request_irq(irq, lpc32xx_clock_event_handler, + IRQF_TIMER | IRQF_IRQPOLL, "lpc3220 clockevent", + &lpc32xx_clk_event_ddata); + if (ret) { + pr_err("request irq failed\n"); + goto err_irq; + } + + return 0; + +err_irq: + iounmap(base); +err_iomap: + clk_disable_unprepare(clk); +err_clk_enable: + clk_put(clk); + return ret; +} + +/* + * This function asserts that we have exactly one clocksource and one + * clock_event_device in the end. + */ +static void __init lpc32xx_timer_init(struct device_node *np) +{ + static int has_clocksource, has_clockevent; + int ret; + + if (!has_clocksource) { + ret = lpc32xx_clocksource_init(np); + if (!ret) { + has_clocksource = 1; + return; + } + } + + if (!has_clockevent) { + ret = lpc32xx_clockevent_init(np); + if (!ret) { + has_clockevent = 1; + return; + } + } +} +CLOCKSOURCE_OF_DECLARE(lpc32xx_timer, "nxp,lpc3220-timer", lpc32xx_timer_init); diff --git a/drivers/clocksource/timer-integrator-ap.c b/drivers/clocksource/timer-integrator-ap.c index b9efd30513d5..c97d1980c0f8 100644 --- a/drivers/clocksource/timer-integrator-ap.c +++ b/drivers/clocksource/timer-integrator-ap.c @@ -166,7 +166,7 @@ static void __init integrator_ap_timer_init_of(struct device_node *node) struct device_node *sec_node; base = of_io_request_and_map(node, 0, "integrator-timer"); - if (!base) + if (IS_ERR(base)) return; clk = of_clk_get(node, 0); diff --git a/drivers/clocksource/timer-stm32.c b/drivers/clocksource/timer-stm32.c new file mode 100644 index 000000000000..a97e8b50701c --- /dev/null +++ b/drivers/clocksource/timer-stm32.c @@ -0,0 +1,184 @@ +/* + * Copyright (C) Maxime Coquelin 2015 + * Author: Maxime Coquelin <mcoquelin.stm32@gmail.com> + * License terms: GNU General Public License (GPL), version 2 + * + * Inspired by time-efm32.c from Uwe Kleine-Koenig + */ + +#include <linux/kernel.h> +#include <linux/clocksource.h> +#include <linux/clockchips.h> +#include <linux/irq.h> +#include <linux/interrupt.h> +#include <linux/of.h> +#include <linux/of_address.h> +#include <linux/of_irq.h> +#include <linux/clk.h> +#include <linux/reset.h> + +#define TIM_CR1 0x00 +#define TIM_DIER 0x0c +#define TIM_SR 0x10 +#define TIM_EGR 0x14 +#define TIM_PSC 0x28 +#define TIM_ARR 0x2c + +#define TIM_CR1_CEN BIT(0) +#define TIM_CR1_OPM BIT(3) +#define TIM_CR1_ARPE BIT(7) + +#define TIM_DIER_UIE BIT(0) + +#define TIM_SR_UIF BIT(0) + +#define TIM_EGR_UG BIT(0) + +struct stm32_clock_event_ddata { + struct clock_event_device evtdev; + unsigned periodic_top; + void __iomem *base; +}; + +static void stm32_clock_event_set_mode(enum clock_event_mode mode, + struct clock_event_device *evtdev) +{ + struct stm32_clock_event_ddata *data = + container_of(evtdev, struct stm32_clock_event_ddata, evtdev); + void *base = data->base; + + switch (mode) { + case CLOCK_EVT_MODE_PERIODIC: + writel_relaxed(data->periodic_top, base + TIM_ARR); + writel_relaxed(TIM_CR1_ARPE | TIM_CR1_CEN, base + TIM_CR1); + break; + + case CLOCK_EVT_MODE_ONESHOT: + default: + writel_relaxed(0, base + TIM_CR1); + break; + } +} + +static int stm32_clock_event_set_next_event(unsigned long evt, + struct clock_event_device *evtdev) +{ + struct stm32_clock_event_ddata *data = + container_of(evtdev, struct stm32_clock_event_ddata, evtdev); + + writel_relaxed(evt, data->base + TIM_ARR); + writel_relaxed(TIM_CR1_ARPE | TIM_CR1_OPM | TIM_CR1_CEN, + data->base + TIM_CR1); + + return 0; +} + +static irqreturn_t stm32_clock_event_handler(int irq, void *dev_id) +{ + struct stm32_clock_event_ddata *data = dev_id; + + writel_relaxed(0, data->base + TIM_SR); + + data->evtdev.event_handler(&data->evtdev); + + return IRQ_HANDLED; +} + +static struct stm32_clock_event_ddata clock_event_ddata = { + .evtdev = { + .name = "stm32 clockevent", + .features = CLOCK_EVT_FEAT_ONESHOT | CLOCK_EVT_FEAT_PERIODIC, + .set_mode = stm32_clock_event_set_mode, + .set_next_event = stm32_clock_event_set_next_event, + .rating = 200, + }, +}; + +static void __init stm32_clockevent_init(struct device_node *np) +{ + struct stm32_clock_event_ddata *data = &clock_event_ddata; + struct clk *clk; + struct reset_control *rstc; + unsigned long rate, max_delta; + int irq, ret, bits, prescaler = 1; + + clk = of_clk_get(np, 0); + if (IS_ERR(clk)) { + ret = PTR_ERR(clk); + pr_err("failed to get clock for clockevent (%d)\n", ret); + goto err_clk_get; + } + + ret = clk_prepare_enable(clk); + if (ret) { + pr_err("failed to enable timer clock for clockevent (%d)\n", + ret); + goto err_clk_enable; + } + + rate = clk_get_rate(clk); + + rstc = of_reset_control_get(np, NULL); + if (!IS_ERR(rstc)) { + reset_control_assert(rstc); + reset_control_deassert(rstc); + } + + data->base = of_iomap(np, 0); + if (!data->base) { + pr_err("failed to map registers for clockevent\n"); + goto err_iomap; + } + + irq = irq_of_parse_and_map(np, 0); + if (!irq) { + pr_err("%s: failed to get irq.\n", np->full_name); + goto err_get_irq; + } + + /* Detect whether the timer is 16 or 32 bits */ + writel_relaxed(~0U, data->base + TIM_ARR); + max_delta = readl_relaxed(data->base + TIM_ARR); + if (max_delta == ~0U) { + prescaler = 1; + bits = 32; + } else { + prescaler = 1024; + bits = 16; + } + writel_relaxed(0, data->base + TIM_ARR); + + writel_relaxed(prescaler - 1, data->base + TIM_PSC); + writel_relaxed(TIM_EGR_UG, data->base + TIM_EGR); + writel_relaxed(TIM_DIER_UIE, data->base + TIM_DIER); + writel_relaxed(0, data->base + TIM_SR); + + data->periodic_top = DIV_ROUND_CLOSEST(rate, prescaler * HZ); + + clockevents_config_and_register(&data->evtdev, + DIV_ROUND_CLOSEST(rate, prescaler), + 0x1, max_delta); + + ret = request_irq(irq, stm32_clock_event_handler, IRQF_TIMER, + "stm32 clockevent", data); + if (ret) { + pr_err("%s: failed to request irq.\n", np->full_name); + goto err_get_irq; + } + + pr_info("%s: STM32 clockevent driver initialized (%d bits)\n", + np->full_name, bits); + + return; + +err_get_irq: + iounmap(data->base); +err_iomap: + clk_disable_unprepare(clk); +err_clk_enable: + clk_put(clk); +err_clk_get: + return; +} + +CLOCKSOURCE_OF_DECLARE(stm32, "st,stm32-timer", stm32_clockevent_init); diff --git a/drivers/clocksource/timer-sun5i.c b/drivers/clocksource/timer-sun5i.c index 28aa4b7bb602..0ffb4ea7c925 100644 --- a/drivers/clocksource/timer-sun5i.c +++ b/drivers/clocksource/timer-sun5i.c @@ -324,7 +324,7 @@ static void __init sun5i_timer_init(struct device_node *node) int irq; timer_base = of_io_request_and_map(node, 0, of_node_full_name(node)); - if (!timer_base) + if (IS_ERR(timer_base)) panic("Can't map registers"); irq = irq_of_parse_and_map(node, 0); diff --git a/drivers/power/reset/ltc2952-poweroff.c b/drivers/power/reset/ltc2952-poweroff.c index 1e08195551fe..5f855f99bdfc 100644 --- a/drivers/power/reset/ltc2952-poweroff.c +++ b/drivers/power/reset/ltc2952-poweroff.c @@ -158,7 +158,6 @@ static irqreturn_t ltc2952_poweroff_handler(int irq, void *dev_id) HRTIMER_MODE_REL); } else { hrtimer_cancel(&data->timer_trigger); - /* omitting return value check, timer should have been valid */ } return IRQ_HANDLED; } diff --git a/fs/dcache.c b/fs/dcache.c index 37b5afdaf698..592c4b582495 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -322,17 +322,17 @@ static void dentry_free(struct dentry *dentry) } /** - * dentry_rcuwalk_barrier - invalidate in-progress rcu-walk lookups + * dentry_rcuwalk_invalidate - invalidate in-progress rcu-walk lookups * @dentry: the target dentry * After this call, in-progress rcu-walk path lookup will fail. This * should be called after unhashing, and after changing d_inode (if * the dentry has not already been unhashed). */ -static inline void dentry_rcuwalk_barrier(struct dentry *dentry) +static inline void dentry_rcuwalk_invalidate(struct dentry *dentry) { - assert_spin_locked(&dentry->d_lock); - /* Go through a barrier */ - write_seqcount_barrier(&dentry->d_seq); + lockdep_assert_held(&dentry->d_lock); + /* Go through am invalidation barrier */ + write_seqcount_invalidate(&dentry->d_seq); } /* @@ -372,7 +372,7 @@ static void dentry_unlink_inode(struct dentry * dentry) struct inode *inode = dentry->d_inode; __d_clear_type_and_inode(dentry); hlist_del_init(&dentry->d_u.d_alias); - dentry_rcuwalk_barrier(dentry); + dentry_rcuwalk_invalidate(dentry); spin_unlock(&dentry->d_lock); spin_unlock(&inode->i_lock); if (!inode->i_nlink) @@ -494,7 +494,7 @@ void __d_drop(struct dentry *dentry) __hlist_bl_del(&dentry->d_hash); dentry->d_hash.pprev = NULL; hlist_bl_unlock(b); - dentry_rcuwalk_barrier(dentry); + dentry_rcuwalk_invalidate(dentry); } } EXPORT_SYMBOL(__d_drop); @@ -1752,7 +1752,7 @@ static void __d_instantiate(struct dentry *dentry, struct inode *inode) if (inode) hlist_add_head(&dentry->d_u.d_alias, &inode->i_dentry); __d_set_inode_and_type(dentry, inode, add_flags); - dentry_rcuwalk_barrier(dentry); + dentry_rcuwalk_invalidate(dentry); spin_unlock(&dentry->d_lock); fsnotify_d_instantiate(dentry, inode); } diff --git a/include/linux/alarmtimer.h b/include/linux/alarmtimer.h index a899402a5a0e..52f3b7da4f2d 100644 --- a/include/linux/alarmtimer.h +++ b/include/linux/alarmtimer.h @@ -43,8 +43,8 @@ struct alarm { void alarm_init(struct alarm *alarm, enum alarmtimer_type type, enum alarmtimer_restart (*function)(struct alarm *, ktime_t)); -int alarm_start(struct alarm *alarm, ktime_t start); -int alarm_start_relative(struct alarm *alarm, ktime_t start); +void alarm_start(struct alarm *alarm, ktime_t start); +void alarm_start_relative(struct alarm *alarm, ktime_t start); void alarm_restart(struct alarm *alarm); int alarm_try_to_cancel(struct alarm *alarm); int alarm_cancel(struct alarm *alarm); diff --git a/include/linux/clockchips.h b/include/linux/clockchips.h index 96c280b2c263..597a1e836f22 100644 --- a/include/linux/clockchips.h +++ b/include/linux/clockchips.h @@ -37,12 +37,15 @@ enum clock_event_mode { * reached from DETACHED or SHUTDOWN. * ONESHOT: Device is programmed to generate event only once. Can be reached * from DETACHED or SHUTDOWN. + * ONESHOT_STOPPED: Device was programmed in ONESHOT mode and is temporarily + * stopped. */ enum clock_event_state { CLOCK_EVT_STATE_DETACHED, CLOCK_EVT_STATE_SHUTDOWN, CLOCK_EVT_STATE_PERIODIC, CLOCK_EVT_STATE_ONESHOT, + CLOCK_EVT_STATE_ONESHOT_STOPPED, }; /* @@ -84,12 +87,13 @@ enum clock_event_state { * @mult: nanosecond to cycles multiplier * @shift: nanoseconds to cycles divisor (power of two) * @mode: operating mode, relevant only to ->set_mode(), OBSOLETE - * @state: current state of the device, assigned by the core code + * @state_use_accessors:current state of the device, assigned by the core code * @features: features * @retries: number of forced programming retries * @set_mode: legacy set mode function, only for modes <= CLOCK_EVT_MODE_RESUME. * @set_state_periodic: switch state to periodic, if !set_mode * @set_state_oneshot: switch state to oneshot, if !set_mode + * @set_state_oneshot_stopped: switch state to oneshot_stopped, if !set_mode * @set_state_shutdown: switch state to shutdown, if !set_mode * @tick_resume: resume clkevt device, if !set_mode * @broadcast: function to broadcast events @@ -113,7 +117,7 @@ struct clock_event_device { u32 mult; u32 shift; enum clock_event_mode mode; - enum clock_event_state state; + enum clock_event_state state_use_accessors; unsigned int features; unsigned long retries; @@ -121,11 +125,12 @@ struct clock_event_device { * State transition callback(s): Only one of the two groups should be * defined: * - set_mode(), only for modes <= CLOCK_EVT_MODE_RESUME. - * - set_state_{shutdown|periodic|oneshot}(), tick_resume(). + * - set_state_{shutdown|periodic|oneshot|oneshot_stopped}(), tick_resume(). */ void (*set_mode)(enum clock_event_mode mode, struct clock_event_device *); int (*set_state_periodic)(struct clock_event_device *); int (*set_state_oneshot)(struct clock_event_device *); + int (*set_state_oneshot_stopped)(struct clock_event_device *); int (*set_state_shutdown)(struct clock_event_device *); int (*tick_resume)(struct clock_event_device *); @@ -144,6 +149,32 @@ struct clock_event_device { struct module *owner; } ____cacheline_aligned; +/* Helpers to verify state of a clockevent device */ +static inline bool clockevent_state_detached(struct clock_event_device *dev) +{ + return dev->state_use_accessors == CLOCK_EVT_STATE_DETACHED; +} + +static inline bool clockevent_state_shutdown(struct clock_event_device *dev) +{ + return dev->state_use_accessors == CLOCK_EVT_STATE_SHUTDOWN; +} + +static inline bool clockevent_state_periodic(struct clock_event_device *dev) +{ + return dev->state_use_accessors == CLOCK_EVT_STATE_PERIODIC; +} + +static inline bool clockevent_state_oneshot(struct clock_event_device *dev) +{ + return dev->state_use_accessors == CLOCK_EVT_STATE_ONESHOT; +} + +static inline bool clockevent_state_oneshot_stopped(struct clock_event_device *dev) +{ + return dev->state_use_accessors == CLOCK_EVT_STATE_ONESHOT_STOPPED; +} + /* * Calculate a multiplication factor for scaled math, which is used to convert * nanoseconds based values to clock ticks: diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h index d27d0152271f..278dd279a7a8 100644 --- a/include/linux/clocksource.h +++ b/include/linux/clocksource.h @@ -181,7 +181,6 @@ static inline s64 clocksource_cyc2ns(cycle_t cycles, u32 mult, u32 shift) extern int clocksource_unregister(struct clocksource*); extern void clocksource_touch_watchdog(void); -extern struct clocksource* clocksource_get_next(void); extern void clocksource_change_rating(struct clocksource *cs, int rating); extern void clocksource_suspend(void); extern void clocksource_resume(void); diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index 05f6df1fdf5b..76dd4f0da5ca 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -53,34 +53,25 @@ enum hrtimer_restart { * * 0x00 inactive * 0x01 enqueued into rbtree - * 0x02 callback function running - * 0x04 timer is migrated to another cpu * - * Special cases: - * 0x03 callback function running and enqueued - * (was requeued on another CPU) - * 0x05 timer was migrated on CPU hotunplug + * The callback state is not part of the timer->state because clearing it would + * mean touching the timer after the callback, this makes it impossible to free + * the timer from the callback function. * - * The "callback function running and enqueued" status is only possible on - * SMP. It happens for example when a posix timer expired and the callback + * Therefore we track the callback state in: + * + * timer->base->cpu_base->running == timer + * + * On SMP it is possible to have a "callback function running and enqueued" + * status. It happens for example when a posix timer expired and the callback * queued a signal. Between dropping the lock which protects the posix timer * and reacquiring the base lock of the hrtimer, another CPU can deliver the - * signal and rearm the timer. We have to preserve the callback running state, - * as otherwise the timer could be removed before the softirq code finishes the - * the handling of the timer. - * - * The HRTIMER_STATE_ENQUEUED bit is always or'ed to the current state - * to preserve the HRTIMER_STATE_CALLBACK in the above scenario. This - * also affects HRTIMER_STATE_MIGRATE where the preservation is not - * necessary. HRTIMER_STATE_MIGRATE is cleared after the timer is - * enqueued on the new cpu. + * signal and rearm the timer. * * All state transitions are protected by cpu_base->lock. */ #define HRTIMER_STATE_INACTIVE 0x00 #define HRTIMER_STATE_ENQUEUED 0x01 -#define HRTIMER_STATE_CALLBACK 0x02 -#define HRTIMER_STATE_MIGRATE 0x04 /** * struct hrtimer - the basic hrtimer structure @@ -130,6 +121,12 @@ struct hrtimer_sleeper { struct task_struct *task; }; +#ifdef CONFIG_64BIT +# define HRTIMER_CLOCK_BASE_ALIGN 64 +#else +# define HRTIMER_CLOCK_BASE_ALIGN 32 +#endif + /** * struct hrtimer_clock_base - the timer base for a specific clock * @cpu_base: per cpu clock base @@ -137,9 +134,7 @@ struct hrtimer_sleeper { * timer to a base on another cpu. * @clockid: clock id for per_cpu support * @active: red black tree root node for the active timers - * @resolution: the resolution of the clock, in nanoseconds * @get_time: function to retrieve the current time of the clock - * @softirq_time: the time when running the hrtimer queue in the softirq * @offset: offset of this clock to the monotonic base */ struct hrtimer_clock_base { @@ -147,11 +142,9 @@ struct hrtimer_clock_base { int index; clockid_t clockid; struct timerqueue_head active; - ktime_t resolution; ktime_t (*get_time)(void); - ktime_t softirq_time; ktime_t offset; -}; +} __attribute__((__aligned__(HRTIMER_CLOCK_BASE_ALIGN))); enum hrtimer_base_type { HRTIMER_BASE_MONOTONIC, @@ -165,11 +158,16 @@ enum hrtimer_base_type { * struct hrtimer_cpu_base - the per cpu clock bases * @lock: lock protecting the base and associated clock bases * and timers + * @seq: seqcount around __run_hrtimer + * @running: pointer to the currently running hrtimer * @cpu: cpu number * @active_bases: Bitfield to mark bases with active timers - * @clock_was_set: Indicates that clock was set from irq context. + * @clock_was_set_seq: Sequence counter of clock was set events + * @migration_enabled: The migration of hrtimers to other cpus is enabled + * @nohz_active: The nohz functionality is enabled * @expires_next: absolute time of the next event which was scheduled * via clock_set_next_event() + * @next_timer: Pointer to the first expiring timer * @in_hrtirq: hrtimer_interrupt() is currently executing * @hres_active: State of high resolution mode * @hang_detected: The last hrtimer interrupt detected a hang @@ -178,27 +176,38 @@ enum hrtimer_base_type { * @nr_hangs: Total number of hrtimer interrupt hangs * @max_hang_time: Maximum time spent in hrtimer_interrupt * @clock_base: array of clock bases for this cpu + * + * Note: next_timer is just an optimization for __remove_hrtimer(). + * Do not dereference the pointer because it is not reliable on + * cross cpu removals. */ struct hrtimer_cpu_base { raw_spinlock_t lock; + seqcount_t seq; + struct hrtimer *running; unsigned int cpu; unsigned int active_bases; - unsigned int clock_was_set; + unsigned int clock_was_set_seq; + bool migration_enabled; + bool nohz_active; #ifdef CONFIG_HIGH_RES_TIMERS + unsigned int in_hrtirq : 1, + hres_active : 1, + hang_detected : 1; ktime_t expires_next; - int in_hrtirq; - int hres_active; - int hang_detected; - unsigned long nr_events; - unsigned long nr_retries; - unsigned long nr_hangs; - ktime_t max_hang_time; + struct hrtimer *next_timer; + unsigned int nr_events; + unsigned int nr_retries; + unsigned int nr_hangs; + unsigned int max_hang_time; #endif struct hrtimer_clock_base clock_base[HRTIMER_MAX_CLOCK_BASES]; -}; +} ____cacheline_aligned; static inline void hrtimer_set_expires(struct hrtimer *timer, ktime_t time) { + BUILD_BUG_ON(sizeof(struct hrtimer_clock_base) > HRTIMER_CLOCK_BASE_ALIGN); + timer->node.expires = time; timer->_softexpires = time; } @@ -262,19 +271,16 @@ static inline ktime_t hrtimer_expires_remaining(const struct hrtimer *timer) return ktime_sub(timer->node.expires, timer->base->get_time()); } -#ifdef CONFIG_HIGH_RES_TIMERS -struct clock_event_device; - -extern void hrtimer_interrupt(struct clock_event_device *dev); - -/* - * In high resolution mode the time reference must be read accurate - */ static inline ktime_t hrtimer_cb_get_time(struct hrtimer *timer) { return timer->base->get_time(); } +#ifdef CONFIG_HIGH_RES_TIMERS +struct clock_event_device; + +extern void hrtimer_interrupt(struct clock_event_device *dev); + static inline int hrtimer_is_hres_active(struct hrtimer *timer) { return timer->base->cpu_base->hres_active; @@ -295,21 +301,16 @@ extern void hrtimer_peek_ahead_timers(void); extern void clock_was_set_delayed(void); +extern unsigned int hrtimer_resolution; + #else # define MONOTONIC_RES_NSEC LOW_RES_NSEC # define KTIME_MONOTONIC_RES KTIME_LOW_RES -static inline void hrtimer_peek_ahead_timers(void) { } +#define hrtimer_resolution (unsigned int)LOW_RES_NSEC -/* - * In non high resolution mode the time reference is taken from - * the base softirq time variable. - */ -static inline ktime_t hrtimer_cb_get_time(struct hrtimer *timer) -{ - return timer->base->softirq_time; -} +static inline void hrtimer_peek_ahead_timers(void) { } static inline int hrtimer_is_hres_active(struct hrtimer *timer) { @@ -353,49 +354,47 @@ static inline void destroy_hrtimer_on_stack(struct hrtimer *timer) { } #endif /* Basic timer operations: */ -extern int hrtimer_start(struct hrtimer *timer, ktime_t tim, - const enum hrtimer_mode mode); -extern int hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, +extern void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, unsigned long range_ns, const enum hrtimer_mode mode); -extern int -__hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, - unsigned long delta_ns, - const enum hrtimer_mode mode, int wakeup); + +/** + * hrtimer_start - (re)start an hrtimer on the current CPU + * @timer: the timer to be added + * @tim: expiry time + * @mode: expiry mode: absolute (HRTIMER_MODE_ABS) or + * relative (HRTIMER_MODE_REL) + */ +static inline void hrtimer_start(struct hrtimer *timer, ktime_t tim, + const enum hrtimer_mode mode) +{ + hrtimer_start_range_ns(timer, tim, 0, mode); +} extern int hrtimer_cancel(struct hrtimer *timer); extern int hrtimer_try_to_cancel(struct hrtimer *timer); -static inline int hrtimer_start_expires(struct hrtimer *timer, - enum hrtimer_mode mode) +static inline void hrtimer_start_expires(struct hrtimer *timer, + enum hrtimer_mode mode) { unsigned long delta; ktime_t soft, hard; soft = hrtimer_get_softexpires(timer); hard = hrtimer_get_expires(timer); delta = ktime_to_ns(ktime_sub(hard, soft)); - return hrtimer_start_range_ns(timer, soft, delta, mode); + hrtimer_start_range_ns(timer, soft, delta, mode); } -static inline int hrtimer_restart(struct hrtimer *timer) +static inline void hrtimer_restart(struct hrtimer *timer) { - return hrtimer_start_expires(timer, HRTIMER_MODE_ABS); + hrtimer_start_expires(timer, HRTIMER_MODE_ABS); } /* Query timers: */ extern ktime_t hrtimer_get_remaining(const struct hrtimer *timer); -extern int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp); -extern ktime_t hrtimer_get_next_event(void); +extern u64 hrtimer_get_next_event(void); -/* - * A timer is active, when it is enqueued into the rbtree or the - * callback function is running or it's in the state of being migrated - * to another cpu. - */ -static inline int hrtimer_active(const struct hrtimer *timer) -{ - return timer->state != HRTIMER_STATE_INACTIVE; -} +extern bool hrtimer_active(const struct hrtimer *timer); /* * Helper function to check, whether the timer is on one of the queues @@ -411,14 +410,29 @@ static inline int hrtimer_is_queued(struct hrtimer *timer) */ static inline int hrtimer_callback_running(struct hrtimer *timer) { - return timer->state & HRTIMER_STATE_CALLBACK; + return timer->base->cpu_base->running == timer; } /* Forward a hrtimer so it expires after now: */ extern u64 hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval); -/* Forward a hrtimer so it expires after the hrtimer's current now */ +/** + * hrtimer_forward_now - forward the timer expiry so it expires after now + * @timer: hrtimer to forward + * @interval: the interval to forward + * + * Forward the timer expiry so it will expire after the current time + * of the hrtimer clock base. Returns the number of overruns. + * + * Can be safely called from the callback function of @timer. If + * called from other contexts @timer must neither be enqueued nor + * running the callback and the caller needs to take care of + * serialization. + * + * Note: This only updates the timer expiry value and does not requeue + * the timer. + */ static inline u64 hrtimer_forward_now(struct hrtimer *timer, ktime_t interval) { @@ -443,7 +457,6 @@ extern int schedule_hrtimeout(ktime_t *expires, const enum hrtimer_mode mode); /* Soft interrupt function to run the hrtimer queues: */ extern void hrtimer_run_queues(void); -extern void hrtimer_run_pending(void); /* Bootup initialization: */ extern void __init hrtimers_init(void); diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index 950ae4501826..be7e75c945e9 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -413,7 +413,8 @@ enum BLOCK_IOPOLL_SOFTIRQ, TASKLET_SOFTIRQ, SCHED_SOFTIRQ, - HRTIMER_SOFTIRQ, + HRTIMER_SOFTIRQ, /* Unused, but kept as tools rely on the + numbering. Sigh! */ RCU_SOFTIRQ, /* Preferable RCU should always be the last softirq */ NR_SOFTIRQS @@ -592,10 +593,10 @@ tasklet_hrtimer_init(struct tasklet_hrtimer *ttimer, clockid_t which_clock, enum hrtimer_mode mode); static inline -int tasklet_hrtimer_start(struct tasklet_hrtimer *ttimer, ktime_t time, - const enum hrtimer_mode mode) +void tasklet_hrtimer_start(struct tasklet_hrtimer *ttimer, ktime_t time, + const enum hrtimer_mode mode) { - return hrtimer_start(&ttimer->timer, time, mode); + hrtimer_start(&ttimer->timer, time, mode); } static inline diff --git a/include/linux/jiffies.h b/include/linux/jiffies.h index c367cbdf73ab..535fd3bb1ba8 100644 --- a/include/linux/jiffies.h +++ b/include/linux/jiffies.h @@ -7,6 +7,7 @@ #include <linux/time.h> #include <linux/timex.h> #include <asm/param.h> /* for HZ */ +#include <generated/timeconst.h> /* * The following defines establish the engineering parameters of the PLL @@ -288,8 +289,133 @@ static inline u64 jiffies_to_nsecs(const unsigned long j) return (u64)jiffies_to_usecs(j) * NSEC_PER_USEC; } -extern unsigned long msecs_to_jiffies(const unsigned int m); -extern unsigned long usecs_to_jiffies(const unsigned int u); +extern unsigned long __msecs_to_jiffies(const unsigned int m); +#if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ) +/* + * HZ is equal to or smaller than 1000, and 1000 is a nice round + * multiple of HZ, divide with the factor between them, but round + * upwards: + */ +static inline unsigned long _msecs_to_jiffies(const unsigned int m) +{ + return (m + (MSEC_PER_SEC / HZ) - 1) / (MSEC_PER_SEC / HZ); +} +#elif HZ > MSEC_PER_SEC && !(HZ % MSEC_PER_SEC) +/* + * HZ is larger than 1000, and HZ is a nice round multiple of 1000 - + * simply multiply with the factor between them. + * + * But first make sure the multiplication result cannot overflow: + */ +static inline unsigned long _msecs_to_jiffies(const unsigned int m) +{ + if (m > jiffies_to_msecs(MAX_JIFFY_OFFSET)) + return MAX_JIFFY_OFFSET; + return m * (HZ / MSEC_PER_SEC); +} +#else +/* + * Generic case - multiply, round and divide. But first check that if + * we are doing a net multiplication, that we wouldn't overflow: + */ +static inline unsigned long _msecs_to_jiffies(const unsigned int m) +{ + if (HZ > MSEC_PER_SEC && m > jiffies_to_msecs(MAX_JIFFY_OFFSET)) + return MAX_JIFFY_OFFSET; + + return (MSEC_TO_HZ_MUL32 * m + MSEC_TO_HZ_ADJ32) >> MSEC_TO_HZ_SHR32; +} +#endif +/** + * msecs_to_jiffies: - convert milliseconds to jiffies + * @m: time in milliseconds + * + * conversion is done as follows: + * + * - negative values mean 'infinite timeout' (MAX_JIFFY_OFFSET) + * + * - 'too large' values [that would result in larger than + * MAX_JIFFY_OFFSET values] mean 'infinite timeout' too. + * + * - all other values are converted to jiffies by either multiplying + * the input value by a factor or dividing it with a factor and + * handling any 32-bit overflows. + * for the details see __msecs_to_jiffies() + * + * msecs_to_jiffies() checks for the passed in value being a constant + * via __builtin_constant_p() allowing gcc to eliminate most of the + * code, __msecs_to_jiffies() is called if the value passed does not + * allow constant folding and the actual conversion must be done at + * runtime. + * the HZ range specific helpers _msecs_to_jiffies() are called both + * directly here and from __msecs_to_jiffies() in the case where + * constant folding is not possible. + */ +static inline unsigned long msecs_to_jiffies(const unsigned int m) +{ + if (__builtin_constant_p(m)) { + if ((int)m < 0) + return MAX_JIFFY_OFFSET; + return _msecs_to_jiffies(m); + } else { + return __msecs_to_jiffies(m); + } +} + +extern unsigned long __usecs_to_jiffies(const unsigned int u); +#if HZ <= USEC_PER_SEC && !(USEC_PER_SEC % HZ) +static inline unsigned long _usecs_to_jiffies(const unsigned int u) +{ + return (u + (USEC_PER_SEC / HZ) - 1) / (USEC_PER_SEC / HZ); +} +#elif HZ > USEC_PER_SEC && !(HZ % USEC_PER_SEC) +static inline unsigned long _usecs_to_jiffies(const unsigned int u) +{ + return u * (HZ / USEC_PER_SEC); +} +static inline unsigned long _usecs_to_jiffies(const unsigned int u) +{ +#else +static inline unsigned long _usecs_to_jiffies(const unsigned int u) +{ + return (USEC_TO_HZ_MUL32 * u + USEC_TO_HZ_ADJ32) + >> USEC_TO_HZ_SHR32; +} +#endif + +/** + * usecs_to_jiffies: - convert microseconds to jiffies + * @u: time in microseconds + * + * conversion is done as follows: + * + * - 'too large' values [that would result in larger than + * MAX_JIFFY_OFFSET values] mean 'infinite timeout' too. + * + * - all other values are converted to jiffies by either multiplying + * the input value by a factor or dividing it with a factor and + * handling any 32-bit overflows as for msecs_to_jiffies. + * + * usecs_to_jiffies() checks for the passed in value being a constant + * via __builtin_constant_p() allowing gcc to eliminate most of the + * code, __usecs_to_jiffies() is called if the value passed does not + * allow constant folding and the actual conversion must be done at + * runtime. + * the HZ range specific helpers _usecs_to_jiffies() are called both + * directly here and from __msecs_to_jiffies() in the case where + * constant folding is not possible. + */ +static inline unsigned long usecs_to_jiffies(const unsigned int u) +{ + if (__builtin_constant_p(u)) { + if (u > jiffies_to_usecs(MAX_JIFFY_OFFSET)) + return MAX_JIFFY_OFFSET; + return _usecs_to_jiffies(u); + } else { + return __usecs_to_jiffies(u); + } +} + extern unsigned long timespec_to_jiffies(const struct timespec *value); extern void jiffies_to_timespec(const unsigned long jiffies, struct timespec *value); diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index a204d5266f5f..1b82d44b0a02 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -562,8 +562,12 @@ struct perf_cpu_context { struct perf_event_context *task_ctx; int active_oncpu; int exclusive; + + raw_spinlock_t hrtimer_lock; struct hrtimer hrtimer; ktime_t hrtimer_interval; + unsigned int hrtimer_active; + struct pmu *unique_pmu; struct perf_cgroup *cgrp; }; diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 03a899aabd17..33a056bb886f 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -44,6 +44,8 @@ #include <linux/debugobjects.h> #include <linux/bug.h> #include <linux/compiler.h> +#include <linux/ktime.h> + #include <asm/barrier.h> extern int rcu_expedited; /* for sysctl */ @@ -1100,9 +1102,9 @@ static inline notrace void rcu_read_unlock_sched_notrace(void) __kfree_rcu(&((ptr)->rcu_head), offsetof(typeof(*(ptr)), rcu_head)) #ifdef CONFIG_TINY_RCU -static inline int rcu_needs_cpu(unsigned long *delta_jiffies) +static inline int rcu_needs_cpu(u64 basemono, u64 *nextevt) { - *delta_jiffies = ULONG_MAX; + *nextevt = KTIME_MAX; return 0; } #endif /* #ifdef CONFIG_TINY_RCU */ diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index 3fa4a43ab415..456879143f89 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -31,7 +31,7 @@ #define __LINUX_RCUTREE_H void rcu_note_context_switch(void); -int rcu_needs_cpu(unsigned long *delta_jiffies); +int rcu_needs_cpu(u64 basem, u64 *nextevt); void rcu_cpu_stall_reset(void); /* diff --git a/include/linux/sched.h b/include/linux/sched.h index d4193d5613cf..30364cb58b1f 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -345,14 +345,10 @@ extern int runqueue_is_locked(int cpu); #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) extern void nohz_balance_enter_idle(int cpu); extern void set_cpu_sd_state_idle(void); -extern int get_nohz_timer_target(int pinned); +extern int get_nohz_timer_target(void); #else static inline void nohz_balance_enter_idle(int cpu) { } static inline void set_cpu_sd_state_idle(void) { } -static inline int get_nohz_timer_target(int pinned) -{ - return smp_processor_id(); -} #endif /* diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h index 596a0e007c62..c9e4731cf10b 100644 --- a/include/linux/sched/sysctl.h +++ b/include/linux/sched/sysctl.h @@ -57,24 +57,12 @@ extern unsigned int sysctl_numa_balancing_scan_size; extern unsigned int sysctl_sched_migration_cost; extern unsigned int sysctl_sched_nr_migrate; extern unsigned int sysctl_sched_time_avg; -extern unsigned int sysctl_timer_migration; extern unsigned int sysctl_sched_shares_window; int sched_proc_update_handler(struct ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos); #endif -#ifdef CONFIG_SCHED_DEBUG -static inline unsigned int get_sysctl_timer_migration(void) -{ - return sysctl_timer_migration; -} -#else -static inline unsigned int get_sysctl_timer_migration(void) -{ - return 1; -} -#endif /* * control realtime throttling: diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h index 5f68d0a391ce..486e685a226a 100644 --- a/include/linux/seqlock.h +++ b/include/linux/seqlock.h @@ -233,6 +233,47 @@ static inline void raw_write_seqcount_end(seqcount_t *s) s->sequence++; } +/** + * raw_write_seqcount_barrier - do a seq write barrier + * @s: pointer to seqcount_t + * + * This can be used to provide an ordering guarantee instead of the + * usual consistency guarantee. It is one wmb cheaper, because we can + * collapse the two back-to-back wmb()s. + * + * seqcount_t seq; + * bool X = true, Y = false; + * + * void read(void) + * { + * bool x, y; + * + * do { + * int s = read_seqcount_begin(&seq); + * + * x = X; y = Y; + * + * } while (read_seqcount_retry(&seq, s)); + * + * BUG_ON(!x && !y); + * } + * + * void write(void) + * { + * Y = true; + * + * raw_write_seqcount_barrier(seq); + * + * X = false; + * } + */ +static inline void raw_write_seqcount_barrier(seqcount_t *s) +{ + s->sequence++; + smp_wmb(); + s->sequence++; +} + /* * raw_write_seqcount_latch - redirect readers to even/odd copy * @s: pointer to seqcount_t @@ -266,13 +307,13 @@ static inline void write_seqcount_end(seqcount_t *s) } /** - * write_seqcount_barrier - invalidate in-progress read-side seq operations + * write_seqcount_invalidate - invalidate in-progress read-side seq operations * @s: pointer to seqcount_t * - * After write_seqcount_barrier, no read-side seq operations will complete + * After write_seqcount_invalidate, no read-side seq operations will complete * successfully and see data older than this. */ -static inline void write_seqcount_barrier(seqcount_t *s) +static inline void write_seqcount_invalidate(seqcount_t *s) { smp_wmb(); s->sequence+=2; diff --git a/include/linux/time64.h b/include/linux/time64.h index a3831478d9cf..77b5df2acd2a 100644 --- a/include/linux/time64.h +++ b/include/linux/time64.h @@ -2,6 +2,7 @@ #define _LINUX_TIME64_H #include <uapi/linux/time.h> +#include <linux/math64.h> typedef __s64 time64_t; @@ -28,6 +29,7 @@ struct timespec64 { #define FSEC_PER_SEC 1000000000000000LL /* Located here for timespec[64]_valid_strict */ +#define TIME64_MAX ((s64)~((u64)1 << 63)) #define KTIME_MAX ((s64)~((u64)1 << 63)) #define KTIME_SEC_MAX (KTIME_MAX / NSEC_PER_SEC) diff --git a/include/linux/timekeeper_internal.h b/include/linux/timekeeper_internal.h index fb86963859c7..25247220b4b7 100644 --- a/include/linux/timekeeper_internal.h +++ b/include/linux/timekeeper_internal.h @@ -49,6 +49,8 @@ struct tk_read_base { * @offs_boot: Offset clock monotonic -> clock boottime * @offs_tai: Offset clock monotonic -> clock tai * @tai_offset: The current UTC to TAI offset in seconds + * @clock_was_set_seq: The sequence number of clock was set events + * @next_leap_ktime: CLOCK_MONOTONIC time value of a pending leap-second * @raw_time: Monotonic raw base time in timespec64 format * @cycle_interval: Number of clock cycles in one NTP interval * @xtime_interval: Number of clock shifted nano seconds in one NTP @@ -60,6 +62,9 @@ struct tk_read_base { * shifted nano seconds. * @ntp_error_shift: Shift conversion between clock shifted nano seconds and * ntp shifted nano seconds. + * @last_warning: Warning ratelimiter (DEBUG_TIMEKEEPING) + * @underflow_seen: Underflow warning flag (DEBUG_TIMEKEEPING) + * @overflow_seen: Overflow warning flag (DEBUG_TIMEKEEPING) * * Note: For timespec(64) based interfaces wall_to_monotonic is what * we need to add to xtime (or xtime corrected for sub jiffie times) @@ -85,6 +90,8 @@ struct timekeeper { ktime_t offs_boot; ktime_t offs_tai; s32 tai_offset; + unsigned int clock_was_set_seq; + ktime_t next_leap_ktime; struct timespec64 raw_time; /* The following members are for timekeeping internal use */ @@ -104,6 +111,18 @@ struct timekeeper { s64 ntp_error; u32 ntp_error_shift; u32 ntp_err_mult; +#ifdef CONFIG_DEBUG_TIMEKEEPING + long last_warning; + /* + * These simple flag variables are managed + * without locks, which is racy, but they are + * ok since we don't really care about being + * super precise about how many events were + * seen, just that a problem was observed. + */ + int underflow_seen; + int overflow_seen; +#endif }; #ifdef CONFIG_GENERIC_TIME_VSYSCALL diff --git a/include/linux/timekeeping.h b/include/linux/timekeeping.h index 99176af216af..3aa72e648650 100644 --- a/include/linux/timekeeping.h +++ b/include/linux/timekeeping.h @@ -163,6 +163,7 @@ extern ktime_t ktime_get(void); extern ktime_t ktime_get_with_offset(enum tk_offsets offs); extern ktime_t ktime_mono_to_any(ktime_t tmono, enum tk_offsets offs); extern ktime_t ktime_get_raw(void); +extern u32 ktime_get_resolution_ns(void); /** * ktime_get_real - get the real (wall-) time in ktime_t format @@ -266,7 +267,6 @@ extern int persistent_clock_is_local; extern void read_persistent_clock(struct timespec *ts); extern void read_persistent_clock64(struct timespec64 *ts); -extern void read_boot_clock(struct timespec *ts); extern void read_boot_clock64(struct timespec64 *ts); extern int update_persistent_clock(struct timespec now); extern int update_persistent_clock64(struct timespec64 now); diff --git a/include/linux/timer.h b/include/linux/timer.h index 8c5a197e1587..61aa61dc410c 100644 --- a/include/linux/timer.h +++ b/include/linux/timer.h @@ -14,27 +14,23 @@ struct timer_list { * All fields that change during normal runtime grouped to the * same cacheline */ - struct list_head entry; - unsigned long expires; - struct tvec_base *base; - - void (*function)(unsigned long); - unsigned long data; - - int slack; + struct hlist_node entry; + unsigned long expires; + void (*function)(unsigned long); + unsigned long data; + u32 flags; + int slack; #ifdef CONFIG_TIMER_STATS - int start_pid; - void *start_site; - char start_comm[16]; + int start_pid; + void *start_site; + char start_comm[16]; #endif #ifdef CONFIG_LOCKDEP - struct lockdep_map lockdep_map; + struct lockdep_map lockdep_map; #endif }; -extern struct tvec_base boot_tvec_bases; - #ifdef CONFIG_LOCKDEP /* * NB: because we have to copy the lockdep_map, setting the lockdep_map key @@ -49,9 +45,6 @@ extern struct tvec_base boot_tvec_bases; #endif /* - * Note that all tvec_bases are at least 4 byte aligned and lower two bits - * of base in timer_list is guaranteed to be zero. Use them for flags. - * * A deferrable timer will work normally when the system is busy, but * will not cause a CPU to come out of idle just to service it; instead, * the timer will be serviced when the CPU eventually wakes up with a @@ -65,17 +58,18 @@ extern struct tvec_base boot_tvec_bases; * workqueue locking issues. It's not meant for executing random crap * with interrupts disabled. Abuse is monitored! */ -#define TIMER_DEFERRABLE 0x1LU -#define TIMER_IRQSAFE 0x2LU - -#define TIMER_FLAG_MASK 0x3LU +#define TIMER_CPUMASK 0x0007FFFF +#define TIMER_MIGRATING 0x00080000 +#define TIMER_BASEMASK (TIMER_CPUMASK | TIMER_MIGRATING) +#define TIMER_DEFERRABLE 0x00100000 +#define TIMER_IRQSAFE 0x00200000 #define __TIMER_INITIALIZER(_function, _expires, _data, _flags) { \ - .entry = { .prev = TIMER_ENTRY_STATIC }, \ + .entry = { .next = TIMER_ENTRY_STATIC }, \ .function = (_function), \ .expires = (_expires), \ .data = (_data), \ - .base = (void *)((unsigned long)&boot_tvec_bases + (_flags)), \ + .flags = (_flags), \ .slack = -1, \ __TIMER_LOCKDEP_MAP_INITIALIZER( \ __FILE__ ":" __stringify(__LINE__)) \ @@ -168,7 +162,7 @@ static inline void init_timer_on_stack_key(struct timer_list *timer, */ static inline int timer_pending(const struct timer_list * timer) { - return timer->entry.next != NULL; + return timer->entry.pprev != NULL; } extern void add_timer_on(struct timer_list *timer, int cpu); @@ -188,26 +182,16 @@ extern void set_timer_slack(struct timer_list *time, int slack_hz); #define NEXT_TIMER_MAX_DELTA ((1UL << 30) - 1) /* - * Return when the next timer-wheel timeout occurs (in absolute jiffies), - * locks the timer base and does the comparison against the given - * jiffie. - */ -extern unsigned long get_next_timer_interrupt(unsigned long now); - -/* * Timer-statistics info: */ #ifdef CONFIG_TIMER_STATS extern int timer_stats_active; -#define TIMER_STATS_FLAG_DEFERRABLE 0x1 - extern void init_timer_stats(void); extern void timer_stats_update_stats(void *timer, pid_t pid, void *startf, - void *timerf, char *comm, - unsigned int timer_flag); + void *timerf, char *comm, u32 flags); extern void __timer_stats_timer_set_start_info(struct timer_list *timer, void *addr); @@ -254,6 +238,15 @@ extern void run_local_timers(void); struct hrtimer; extern enum hrtimer_restart it_real_fn(struct hrtimer *); +#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) +#include <linux/sysctl.h> + +extern unsigned int sysctl_timer_migration; +int timer_migration_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos); +#endif + unsigned long __round_jiffies(unsigned long j, int cpu); unsigned long __round_jiffies_relative(unsigned long j, int cpu); unsigned long round_jiffies(unsigned long j); diff --git a/include/linux/timerqueue.h b/include/linux/timerqueue.h index a520fd70a59f..7eec17ad7fa1 100644 --- a/include/linux/timerqueue.h +++ b/include/linux/timerqueue.h @@ -16,10 +16,10 @@ struct timerqueue_head { }; -extern void timerqueue_add(struct timerqueue_head *head, - struct timerqueue_node *node); -extern void timerqueue_del(struct timerqueue_head *head, - struct timerqueue_node *node); +extern bool timerqueue_add(struct timerqueue_head *head, + struct timerqueue_node *node); +extern bool timerqueue_del(struct timerqueue_head *head, + struct timerqueue_node *node); extern struct timerqueue_node *timerqueue_iterate_next( struct timerqueue_node *node); diff --git a/include/trace/events/timer.h b/include/trace/events/timer.h index 68c2c2000f02..073b9ac245ba 100644 --- a/include/trace/events/timer.h +++ b/include/trace/events/timer.h @@ -43,15 +43,18 @@ DEFINE_EVENT(timer_class, timer_init, */ TRACE_EVENT(timer_start, - TP_PROTO(struct timer_list *timer, unsigned long expires), + TP_PROTO(struct timer_list *timer, + unsigned long expires, + unsigned int flags), - TP_ARGS(timer, expires), + TP_ARGS(timer, expires, flags), TP_STRUCT__entry( __field( void *, timer ) __field( void *, function ) __field( unsigned long, expires ) __field( unsigned long, now ) + __field( unsigned int, flags ) ), TP_fast_assign( @@ -59,11 +62,12 @@ TRACE_EVENT(timer_start, __entry->function = timer->function; __entry->expires = expires; __entry->now = jiffies; + __entry->flags = flags; ), - TP_printk("timer=%p function=%pf expires=%lu [timeout=%ld]", + TP_printk("timer=%p function=%pf expires=%lu [timeout=%ld] flags=0x%08x", __entry->timer, __entry->function, __entry->expires, - (long)__entry->expires - __entry->now) + (long)__entry->expires - __entry->now, __entry->flags) ); /** diff --git a/kernel/events/core.c b/kernel/events/core.c index f2003b97ddc9..8e13f3e54ec3 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -51,9 +51,11 @@ static struct workqueue_struct *perf_wq; +typedef int (*remote_function_f)(void *); + struct remote_function_call { struct task_struct *p; - int (*func)(void *info); + remote_function_f func; void *info; int ret; }; @@ -86,7 +88,7 @@ static void remote_function(void *data) * -EAGAIN - when the process moved away */ static int -task_function_call(struct task_struct *p, int (*func) (void *info), void *info) +task_function_call(struct task_struct *p, remote_function_f func, void *info) { struct remote_function_call data = { .p = p, @@ -110,7 +112,7 @@ task_function_call(struct task_struct *p, int (*func) (void *info), void *info) * * returns: @func return value or -ENXIO when the cpu is offline */ -static int cpu_function_call(int cpu, int (*func) (void *info), void *info) +static int cpu_function_call(int cpu, remote_function_f func, void *info) { struct remote_function_call data = { .p = NULL, @@ -747,62 +749,31 @@ perf_cgroup_mark_enabled(struct perf_event *event, /* * function must be called with interrupts disbled */ -static enum hrtimer_restart perf_cpu_hrtimer_handler(struct hrtimer *hr) +static enum hrtimer_restart perf_mux_hrtimer_handler(struct hrtimer *hr) { struct perf_cpu_context *cpuctx; - enum hrtimer_restart ret = HRTIMER_NORESTART; int rotations = 0; WARN_ON(!irqs_disabled()); cpuctx = container_of(hr, struct perf_cpu_context, hrtimer); - rotations = perf_rotate_context(cpuctx); - /* - * arm timer if needed - */ - if (rotations) { + raw_spin_lock(&cpuctx->hrtimer_lock); + if (rotations) hrtimer_forward_now(hr, cpuctx->hrtimer_interval); - ret = HRTIMER_RESTART; - } - - return ret; -} - -/* CPU is going down */ -void perf_cpu_hrtimer_cancel(int cpu) -{ - struct perf_cpu_context *cpuctx; - struct pmu *pmu; - unsigned long flags; - - if (WARN_ON(cpu != smp_processor_id())) - return; - - local_irq_save(flags); - - rcu_read_lock(); - - list_for_each_entry_rcu(pmu, &pmus, entry) { - cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); - - if (pmu->task_ctx_nr == perf_sw_context) - continue; - - hrtimer_cancel(&cpuctx->hrtimer); - } - - rcu_read_unlock(); + else + cpuctx->hrtimer_active = 0; + raw_spin_unlock(&cpuctx->hrtimer_lock); - local_irq_restore(flags); + return rotations ? HRTIMER_RESTART : HRTIMER_NORESTART; } -static void __perf_cpu_hrtimer_init(struct perf_cpu_context *cpuctx, int cpu) +static void __perf_mux_hrtimer_init(struct perf_cpu_context *cpuctx, int cpu) { - struct hrtimer *hr = &cpuctx->hrtimer; + struct hrtimer *timer = &cpuctx->hrtimer; struct pmu *pmu = cpuctx->ctx.pmu; - int timer; + u64 interval; /* no multiplexing needed for SW PMU */ if (pmu->task_ctx_nr == perf_sw_context) @@ -812,31 +783,36 @@ static void __perf_cpu_hrtimer_init(struct perf_cpu_context *cpuctx, int cpu) * check default is sane, if not set then force to * default interval (1/tick) */ - timer = pmu->hrtimer_interval_ms; - if (timer < 1) - timer = pmu->hrtimer_interval_ms = PERF_CPU_HRTIMER; + interval = pmu->hrtimer_interval_ms; + if (interval < 1) + interval = pmu->hrtimer_interval_ms = PERF_CPU_HRTIMER; - cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer); + cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * interval); - hrtimer_init(hr, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED); - hr->function = perf_cpu_hrtimer_handler; + raw_spin_lock_init(&cpuctx->hrtimer_lock); + hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED); + timer->function = perf_mux_hrtimer_handler; } -static void perf_cpu_hrtimer_restart(struct perf_cpu_context *cpuctx) +static int perf_mux_hrtimer_restart(struct perf_cpu_context *cpuctx) { - struct hrtimer *hr = &cpuctx->hrtimer; + struct hrtimer *timer = &cpuctx->hrtimer; struct pmu *pmu = cpuctx->ctx.pmu; + unsigned long flags; /* not for SW PMU */ if (pmu->task_ctx_nr == perf_sw_context) - return; + return 0; - if (hrtimer_active(hr)) - return; + raw_spin_lock_irqsave(&cpuctx->hrtimer_lock, flags); + if (!cpuctx->hrtimer_active) { + cpuctx->hrtimer_active = 1; + hrtimer_forward_now(timer, cpuctx->hrtimer_interval); + hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED); + } + raw_spin_unlock_irqrestore(&cpuctx->hrtimer_lock, flags); - if (!hrtimer_callback_running(hr)) - __hrtimer_start_range_ns(hr, cpuctx->hrtimer_interval, - 0, HRTIMER_MODE_REL_PINNED, 0); + return 0; } void perf_pmu_disable(struct pmu *pmu) @@ -1935,7 +1911,7 @@ group_sched_in(struct perf_event *group_event, if (event_sched_in(group_event, cpuctx, ctx)) { pmu->cancel_txn(pmu); - perf_cpu_hrtimer_restart(cpuctx); + perf_mux_hrtimer_restart(cpuctx); return -EAGAIN; } @@ -1982,7 +1958,7 @@ group_error: pmu->cancel_txn(pmu); - perf_cpu_hrtimer_restart(cpuctx); + perf_mux_hrtimer_restart(cpuctx); return -EAGAIN; } @@ -2255,7 +2231,7 @@ static int __perf_event_enable(void *info) */ if (leader != event) { group_sched_out(leader, cpuctx, ctx); - perf_cpu_hrtimer_restart(cpuctx); + perf_mux_hrtimer_restart(cpuctx); } if (leader->attr.pinned) { update_group_times(leader); @@ -6897,9 +6873,8 @@ static void perf_swevent_start_hrtimer(struct perf_event *event) } else { period = max_t(u64, 10000, hwc->sample_period); } - __hrtimer_start_range_ns(&hwc->hrtimer, - ns_to_ktime(period), 0, - HRTIMER_MODE_REL_PINNED, 0); + hrtimer_start(&hwc->hrtimer, ns_to_ktime(period), + HRTIMER_MODE_REL_PINNED); } static void perf_swevent_cancel_hrtimer(struct perf_event *event) @@ -7200,6 +7175,8 @@ perf_event_mux_interval_ms_show(struct device *dev, return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->hrtimer_interval_ms); } +static DEFINE_MUTEX(mux_interval_mutex); + static ssize_t perf_event_mux_interval_ms_store(struct device *dev, struct device_attribute *attr, @@ -7219,17 +7196,21 @@ perf_event_mux_interval_ms_store(struct device *dev, if (timer == pmu->hrtimer_interval_ms) return count; + mutex_lock(&mux_interval_mutex); pmu->hrtimer_interval_ms = timer; /* update all cpuctx for this PMU */ - for_each_possible_cpu(cpu) { + get_online_cpus(); + for_each_online_cpu(cpu) { struct perf_cpu_context *cpuctx; cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer); - if (hrtimer_active(&cpuctx->hrtimer)) - hrtimer_forward_now(&cpuctx->hrtimer, cpuctx->hrtimer_interval); + cpu_function_call(cpu, + (remote_function_f)perf_mux_hrtimer_restart, cpuctx); } + put_online_cpus(); + mutex_unlock(&mux_interval_mutex); return count; } @@ -7334,7 +7315,7 @@ skip_type: lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock); cpuctx->ctx.pmu = pmu; - __perf_cpu_hrtimer_init(cpuctx, cpu); + __perf_mux_hrtimer_init(cpuctx, cpu); cpuctx->unique_pmu = pmu; } diff --git a/kernel/futex.c b/kernel/futex.c index aacc706f85fc..ea6ca0bca525 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -2064,11 +2064,8 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q, queue_me(q, hb); /* Arm the timer */ - if (timeout) { + if (timeout) hrtimer_start_expires(&timeout->timer, HRTIMER_MODE_ABS); - if (!hrtimer_active(&timeout->timer)) - timeout->task = NULL; - } /* * If we have been removed from the hash list, then another task diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 30ec5b46cd8c..36573e96a477 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -1182,11 +1182,8 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state, set_current_state(state); /* Setup the timer, when timeout != NULL */ - if (unlikely(timeout)) { + if (unlikely(timeout)) hrtimer_start_expires(&timeout->timer, HRTIMER_MODE_ABS); - if (!hrtimer_active(&timeout->timer)) - timeout->task = NULL; - } ret = task_blocks_on_rt_mutex(lock, &waiter, current, chwalk); diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 32664347091a..013485fb2b06 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1375,9 +1375,9 @@ static void rcu_prepare_kthreads(int cpu) * Because we not have RCU_FAST_NO_HZ, just check whether this CPU needs * any flavor of RCU. */ -int rcu_needs_cpu(unsigned long *delta_jiffies) +int rcu_needs_cpu(u64 basemono, u64 *nextevt) { - *delta_jiffies = ULONG_MAX; + *nextevt = KTIME_MAX; return IS_ENABLED(CONFIG_RCU_NOCB_CPU_ALL) ? 0 : rcu_cpu_has_callbacks(NULL); } @@ -1439,8 +1439,6 @@ module_param(rcu_idle_gp_delay, int, 0644); static int rcu_idle_lazy_gp_delay = RCU_IDLE_LAZY_GP_DELAY; module_param(rcu_idle_lazy_gp_delay, int, 0644); -extern int tick_nohz_active; - /* * Try to advance callbacks for all flavors of RCU on the current CPU, but * only if it has been awhile since the last time we did so. Afterwards, @@ -1487,12 +1485,13 @@ static bool __maybe_unused rcu_try_advance_all_cbs(void) * * The caller must have disabled interrupts. */ -int rcu_needs_cpu(unsigned long *dj) +int rcu_needs_cpu(u64 basemono, u64 *nextevt) { struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); + unsigned long dj; if (IS_ENABLED(CONFIG_RCU_NOCB_CPU_ALL)) { - *dj = ULONG_MAX; + *nextevt = KTIME_MAX; return 0; } @@ -1501,7 +1500,7 @@ int rcu_needs_cpu(unsigned long *dj) /* If no callbacks, RCU doesn't need the CPU. */ if (!rcu_cpu_has_callbacks(&rdtp->all_lazy)) { - *dj = ULONG_MAX; + *nextevt = KTIME_MAX; return 0; } @@ -1515,11 +1514,12 @@ int rcu_needs_cpu(unsigned long *dj) /* Request timer delay depending on laziness, and round. */ if (!rdtp->all_lazy) { - *dj = round_up(rcu_idle_gp_delay + jiffies, + dj = round_up(rcu_idle_gp_delay + jiffies, rcu_idle_gp_delay) - jiffies; } else { - *dj = round_jiffies(rcu_idle_lazy_gp_delay + jiffies) - jiffies; + dj = round_jiffies(rcu_idle_lazy_gp_delay + jiffies) - jiffies; } + *nextevt = basemono + dj * TICK_NSEC; return 0; } diff --git a/kernel/sched/core.c b/kernel/sched/core.c index f89ca9bcf42a..c9a707b59331 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -90,26 +90,6 @@ #define CREATE_TRACE_POINTS #include <trace/events/sched.h> -void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period) -{ - unsigned long delta; - ktime_t soft, hard, now; - - for (;;) { - if (hrtimer_active(period_timer)) - break; - - now = hrtimer_cb_get_time(period_timer); - hrtimer_forward(period_timer, now, period); - - soft = hrtimer_get_softexpires(period_timer); - hard = hrtimer_get_expires(period_timer); - delta = ktime_to_ns(ktime_sub(hard, soft)); - __hrtimer_start_range_ns(period_timer, soft, delta, - HRTIMER_MODE_ABS_PINNED, 0); - } -} - DEFINE_MUTEX(sched_domains_mutex); DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); @@ -355,12 +335,11 @@ static enum hrtimer_restart hrtick(struct hrtimer *timer) #ifdef CONFIG_SMP -static int __hrtick_restart(struct rq *rq) +static void __hrtick_restart(struct rq *rq) { struct hrtimer *timer = &rq->hrtick_timer; - ktime_t time = hrtimer_get_softexpires(timer); - return __hrtimer_start_range_ns(timer, time, 0, HRTIMER_MODE_ABS_PINNED, 0); + hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED); } /* @@ -440,8 +419,8 @@ void hrtick_start(struct rq *rq, u64 delay) * doesn't make sense. Rely on vruntime for fairness. */ delay = max_t(u64, delay, 10000LL); - __hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0, - HRTIMER_MODE_REL_PINNED, 0); + hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay), + HRTIMER_MODE_REL_PINNED); } static inline void init_hrtick(void) @@ -639,13 +618,12 @@ void resched_cpu(int cpu) * selecting an idle cpu will add more delays to the timers than intended * (as that cpu's timer base may not be uptodate wrt jiffies etc). */ -int get_nohz_timer_target(int pinned) +int get_nohz_timer_target(void) { - int cpu = smp_processor_id(); - int i; + int i, cpu = smp_processor_id(); struct sched_domain *sd; - if (pinned || !get_sysctl_timer_migration() || !idle_cpu(cpu)) + if (!idle_cpu(cpu)) return cpu; rcu_read_lock(); @@ -7126,8 +7104,6 @@ void __init sched_init_smp(void) } #endif /* CONFIG_SMP */ -const_debug unsigned int sysctl_timer_migration = 1; - int in_sched_functions(unsigned long addr) { return in_lock_functions(addr) || @@ -8163,10 +8139,8 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota) __refill_cfs_bandwidth_runtime(cfs_b); /* restart the period timer (if active) to handle new period expiry */ - if (runtime_enabled && cfs_b->timer_active) { - /* force a reprogram */ - __start_cfs_bandwidth(cfs_b, true); - } + if (runtime_enabled) + start_cfs_bandwidth(cfs_b); raw_spin_unlock_irq(&cfs_b->lock); for_each_online_cpu(i) { diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 392e8fb94db3..eac20c557a55 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -503,8 +503,6 @@ static int start_dl_timer(struct sched_dl_entity *dl_se, bool boosted) struct dl_rq *dl_rq = dl_rq_of_se(dl_se); struct rq *rq = rq_of_dl_rq(dl_rq); ktime_t now, act; - ktime_t soft, hard; - unsigned long range; s64 delta; if (boosted) @@ -527,15 +525,9 @@ static int start_dl_timer(struct sched_dl_entity *dl_se, bool boosted) if (ktime_us_delta(act, now) < 0) return 0; - hrtimer_set_expires(&dl_se->dl_timer, act); + hrtimer_start(&dl_se->dl_timer, act, HRTIMER_MODE_ABS); - soft = hrtimer_get_softexpires(&dl_se->dl_timer); - hard = hrtimer_get_expires(&dl_se->dl_timer); - range = ktime_to_ns(ktime_sub(hard, soft)); - __hrtimer_start_range_ns(&dl_se->dl_timer, soft, - range, HRTIMER_MODE_ABS, 0); - - return hrtimer_active(&dl_se->dl_timer); + return 1; } /* diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 704683cc9042..315c68e015d9 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -232,8 +232,6 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) #endif #endif #ifdef CONFIG_CFS_BANDWIDTH - SEQ_printf(m, " .%-30s: %d\n", "tg->cfs_bandwidth.timer_active", - cfs_rq->tg->cfs_bandwidth.timer_active); SEQ_printf(m, " .%-30s: %d\n", "throttled", cfs_rq->throttled); SEQ_printf(m, " .%-30s: %d\n", "throttle_count", diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 433061d984ea..40a7fcbf491e 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3504,16 +3504,7 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq) if (cfs_b->quota == RUNTIME_INF) amount = min_amount; else { - /* - * If the bandwidth pool has become inactive, then at least one - * period must have elapsed since the last consumption. - * Refresh the global state and ensure bandwidth timer becomes - * active. - */ - if (!cfs_b->timer_active) { - __refill_cfs_bandwidth_runtime(cfs_b); - __start_cfs_bandwidth(cfs_b, false); - } + start_cfs_bandwidth(cfs_b); if (cfs_b->runtime > 0) { amount = min(cfs_b->runtime, min_amount); @@ -3662,6 +3653,7 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq) struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); struct sched_entity *se; long task_delta, dequeue = 1; + bool empty; se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))]; @@ -3691,13 +3683,21 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq) cfs_rq->throttled = 1; cfs_rq->throttled_clock = rq_clock(rq); raw_spin_lock(&cfs_b->lock); + empty = list_empty(&cfs_rq->throttled_list); + /* * Add to the _head_ of the list, so that an already-started * distribute_cfs_runtime will not see us */ list_add_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq); - if (!cfs_b->timer_active) - __start_cfs_bandwidth(cfs_b, false); + + /* + * If we're the first throttled task, make sure the bandwidth + * timer is running. + */ + if (empty) + start_cfs_bandwidth(cfs_b); + raw_spin_unlock(&cfs_b->lock); } @@ -3812,13 +3812,6 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun) if (cfs_b->idle && !throttled) goto out_deactivate; - /* - * if we have relooped after returning idle once, we need to update our - * status as actually running, so that other cpus doing - * __start_cfs_bandwidth will stop trying to cancel us. - */ - cfs_b->timer_active = 1; - __refill_cfs_bandwidth_runtime(cfs_b); if (!throttled) { @@ -3863,7 +3856,6 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun) return 0; out_deactivate: - cfs_b->timer_active = 0; return 1; } @@ -3878,7 +3870,7 @@ static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC; * Are we near the end of the current quota period? * * Requires cfs_b->lock for hrtimer_expires_remaining to be safe against the - * hrtimer base being cleared by __hrtimer_start_range_ns. In the case of + * hrtimer base being cleared by hrtimer_start. In the case of * migrate_hrtimers, base is never cleared, so we are fine. */ static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire) @@ -3906,8 +3898,9 @@ static void start_cfs_slack_bandwidth(struct cfs_bandwidth *cfs_b) if (runtime_refresh_within(cfs_b, min_left)) return; - start_bandwidth_timer(&cfs_b->slack_timer, - ns_to_ktime(cfs_bandwidth_slack_period)); + hrtimer_start(&cfs_b->slack_timer, + ns_to_ktime(cfs_bandwidth_slack_period), + HRTIMER_MODE_REL); } /* we know any runtime found here is valid as update_curr() precedes return */ @@ -4027,6 +4020,7 @@ static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer) { struct cfs_bandwidth *cfs_b = container_of(timer, struct cfs_bandwidth, slack_timer); + do_sched_cfs_slack_timer(cfs_b); return HRTIMER_NORESTART; @@ -4036,20 +4030,19 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer) { struct cfs_bandwidth *cfs_b = container_of(timer, struct cfs_bandwidth, period_timer); - ktime_t now; int overrun; int idle = 0; raw_spin_lock(&cfs_b->lock); for (;;) { - now = hrtimer_cb_get_time(timer); - overrun = hrtimer_forward(timer, now, cfs_b->period); - + overrun = hrtimer_forward_now(timer, cfs_b->period); if (!overrun) break; idle = do_sched_cfs_period_timer(cfs_b, overrun); } + if (idle) + cfs_b->period_active = 0; raw_spin_unlock(&cfs_b->lock); return idle ? HRTIMER_NORESTART : HRTIMER_RESTART; @@ -4063,7 +4056,7 @@ void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) cfs_b->period = ns_to_ktime(default_cfs_period()); INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq); - hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED); cfs_b->period_timer.function = sched_cfs_period_timer; hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); cfs_b->slack_timer.function = sched_cfs_slack_timer; @@ -4075,28 +4068,15 @@ static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) INIT_LIST_HEAD(&cfs_rq->throttled_list); } -/* requires cfs_b->lock, may release to reprogram timer */ -void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b, bool force) +void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b) { - /* - * The timer may be active because we're trying to set a new bandwidth - * period or because we're racing with the tear-down path - * (timer_active==0 becomes visible before the hrtimer call-back - * terminates). In either case we ensure that it's re-programmed - */ - while (unlikely(hrtimer_active(&cfs_b->period_timer)) && - hrtimer_try_to_cancel(&cfs_b->period_timer) < 0) { - /* bounce the lock to allow do_sched_cfs_period_timer to run */ - raw_spin_unlock(&cfs_b->lock); - cpu_relax(); - raw_spin_lock(&cfs_b->lock); - /* if someone else restarted the timer then we're done */ - if (!force && cfs_b->timer_active) - return; - } + lockdep_assert_held(&cfs_b->lock); - cfs_b->timer_active = 1; - start_bandwidth_timer(&cfs_b->period_timer, cfs_b->period); + if (!cfs_b->period_active) { + cfs_b->period_active = 1; + hrtimer_forward_now(&cfs_b->period_timer, cfs_b->period); + hrtimer_start_expires(&cfs_b->period_timer, HRTIMER_MODE_ABS_PINNED); + } } static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 560d2fa623c3..7d7093c51f8d 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -18,19 +18,22 @@ static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer) { struct rt_bandwidth *rt_b = container_of(timer, struct rt_bandwidth, rt_period_timer); - ktime_t now; - int overrun; int idle = 0; + int overrun; + raw_spin_lock(&rt_b->rt_runtime_lock); for (;;) { - now = hrtimer_cb_get_time(timer); - overrun = hrtimer_forward(timer, now, rt_b->rt_period); - + overrun = hrtimer_forward_now(timer, rt_b->rt_period); if (!overrun) break; + raw_spin_unlock(&rt_b->rt_runtime_lock); idle = do_sched_rt_period_timer(rt_b, overrun); + raw_spin_lock(&rt_b->rt_runtime_lock); } + if (idle) + rt_b->rt_period_active = 0; + raw_spin_unlock(&rt_b->rt_runtime_lock); return idle ? HRTIMER_NORESTART : HRTIMER_RESTART; } @@ -52,11 +55,12 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b) if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF) return; - if (hrtimer_active(&rt_b->rt_period_timer)) - return; - raw_spin_lock(&rt_b->rt_runtime_lock); - start_bandwidth_timer(&rt_b->rt_period_timer, rt_b->rt_period); + if (!rt_b->rt_period_active) { + rt_b->rt_period_active = 1; + hrtimer_forward_now(&rt_b->rt_period_timer, rt_b->rt_period); + hrtimer_start_expires(&rt_b->rt_period_timer, HRTIMER_MODE_ABS_PINNED); + } raw_spin_unlock(&rt_b->rt_runtime_lock); } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index d62b2882232b..aea7c1f393cb 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -137,6 +137,7 @@ struct rt_bandwidth { ktime_t rt_period; u64 rt_runtime; struct hrtimer rt_period_timer; + unsigned int rt_period_active; }; void __dl_clear_params(struct task_struct *p); @@ -221,7 +222,7 @@ struct cfs_bandwidth { s64 hierarchical_quota; u64 runtime_expires; - int idle, timer_active; + int idle, period_active; struct hrtimer period_timer, slack_timer; struct list_head throttled_cfs_rq; @@ -312,7 +313,7 @@ extern void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b); extern int sched_group_set_shares(struct task_group *tg, unsigned long shares); extern void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b); -extern void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b, bool force); +extern void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b); extern void unthrottle_cfs_rq(struct cfs_rq *cfs_rq); extern void free_rt_sched_group(struct task_group *tg); @@ -1409,8 +1410,6 @@ static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta) { } static inline void sched_avg_update(struct rq *rq) { } #endif -extern void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period); - /* * __task_rq_lock - lock the rq @p resides on. */ diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 2082b1a88fb9..b13e9d2de302 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -349,15 +349,6 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, - { - .procname = "timer_migration", - .data = &sysctl_timer_migration, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &zero, - .extra2 = &one, - }, #endif /* CONFIG_SMP */ #ifdef CONFIG_NUMA_BALANCING { @@ -1132,6 +1123,15 @@ static struct ctl_table kern_table[] = { .extra1 = &zero, .extra2 = &one, }, +#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) + { + .procname = "timer_migration", + .data = &sysctl_timer_migration, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = timer_migration_handler, + }, +#endif { } }; diff --git a/kernel/time/Makefile b/kernel/time/Makefile index 01f0312419b3..ffc4cc3dcd47 100644 --- a/kernel/time/Makefile +++ b/kernel/time/Makefile @@ -13,19 +13,4 @@ obj-$(CONFIG_TIMER_STATS) += timer_stats.o obj-$(CONFIG_DEBUG_FS) += timekeeping_debug.o obj-$(CONFIG_TEST_UDELAY) += test_udelay.o -$(obj)/time.o: $(obj)/timeconst.h - -quiet_cmd_hzfile = HZFILE $@ - cmd_hzfile = echo "hz=$(CONFIG_HZ)" > $@ - -targets += hz.bc -$(obj)/hz.bc: $(objtree)/include/config/hz.h FORCE - $(call if_changed,hzfile) - -quiet_cmd_bc = BC $@ - cmd_bc = bc -q $(filter-out FORCE,$^) > $@ - -targets += timeconst.h -$(obj)/timeconst.h: $(obj)/hz.bc $(src)/timeconst.bc FORCE - $(call if_changed,bc) - +$(obj)/time.o: $(objtree)/include/config/ diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 1b001ed1edb9..7fbba635a549 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -317,19 +317,16 @@ EXPORT_SYMBOL_GPL(alarm_init); * @alarm: ptr to alarm to set * @start: time to run the alarm */ -int alarm_start(struct alarm *alarm, ktime_t start) +void alarm_start(struct alarm *alarm, ktime_t start) { struct alarm_base *base = &alarm_bases[alarm->type]; unsigned long flags; - int ret; spin_lock_irqsave(&base->lock, flags); alarm->node.expires = start; alarmtimer_enqueue(base, alarm); - ret = hrtimer_start(&alarm->timer, alarm->node.expires, - HRTIMER_MODE_ABS); + hrtimer_start(&alarm->timer, alarm->node.expires, HRTIMER_MODE_ABS); spin_unlock_irqrestore(&base->lock, flags); - return ret; } EXPORT_SYMBOL_GPL(alarm_start); @@ -338,12 +335,12 @@ EXPORT_SYMBOL_GPL(alarm_start); * @alarm: ptr to alarm to set * @start: time relative to now to run the alarm */ -int alarm_start_relative(struct alarm *alarm, ktime_t start) +void alarm_start_relative(struct alarm *alarm, ktime_t start) { struct alarm_base *base = &alarm_bases[alarm->type]; start = ktime_add(start, base->gettime()); - return alarm_start(alarm, start); + alarm_start(alarm, start); } EXPORT_SYMBOL_GPL(alarm_start_relative); @@ -495,12 +492,12 @@ static enum alarmtimer_restart alarm_handle_timer(struct alarm *alarm, */ static int alarm_clock_getres(const clockid_t which_clock, struct timespec *tp) { - clockid_t baseid = alarm_bases[clock2alarm(which_clock)].base_clockid; - if (!alarmtimer_get_rtcdev()) return -EINVAL; - return hrtimer_get_res(baseid, tp); + tp->tv_sec = 0; + tp->tv_nsec = hrtimer_resolution; + return 0; } /** diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 637a09461c1d..08ccc3da3ca0 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -94,8 +94,8 @@ u64 clockevent_delta2ns(unsigned long latch, struct clock_event_device *evt) } EXPORT_SYMBOL_GPL(clockevent_delta2ns); -static int __clockevents_set_state(struct clock_event_device *dev, - enum clock_event_state state) +static int __clockevents_switch_state(struct clock_event_device *dev, + enum clock_event_state state) { /* Transition with legacy set_mode() callback */ if (dev->set_mode) { @@ -134,32 +134,44 @@ static int __clockevents_set_state(struct clock_event_device *dev, return -ENOSYS; return dev->set_state_oneshot(dev); + case CLOCK_EVT_STATE_ONESHOT_STOPPED: + /* Core internal bug */ + if (WARN_ONCE(!clockevent_state_oneshot(dev), + "Current state: %d\n", + clockevent_get_state(dev))) + return -EINVAL; + + if (dev->set_state_oneshot_stopped) + return dev->set_state_oneshot_stopped(dev); + else + return -ENOSYS; + default: return -ENOSYS; } } /** - * clockevents_set_state - set the operating state of a clock event device + * clockevents_switch_state - set the operating state of a clock event device * @dev: device to modify * @state: new state * * Must be called with interrupts disabled ! */ -void clockevents_set_state(struct clock_event_device *dev, - enum clock_event_state state) +void clockevents_switch_state(struct clock_event_device *dev, + enum clock_event_state state) { - if (dev->state != state) { - if (__clockevents_set_state(dev, state)) + if (clockevent_get_state(dev) != state) { + if (__clockevents_switch_state(dev, state)) return; - dev->state = state; + clockevent_set_state(dev, state); /* * A nsec2cyc multiplicator of 0 is invalid and we'd crash * on it, so fix it up and emit a warning: */ - if (state == CLOCK_EVT_STATE_ONESHOT) { + if (clockevent_state_oneshot(dev)) { if (unlikely(!dev->mult)) { dev->mult = 1; WARN_ON(1); @@ -174,7 +186,7 @@ void clockevents_set_state(struct clock_event_device *dev, */ void clockevents_shutdown(struct clock_event_device *dev) { - clockevents_set_state(dev, CLOCK_EVT_STATE_SHUTDOWN); + clockevents_switch_state(dev, CLOCK_EVT_STATE_SHUTDOWN); dev->next_event.tv64 = KTIME_MAX; } @@ -248,7 +260,7 @@ static int clockevents_program_min_delta(struct clock_event_device *dev) delta = dev->min_delta_ns; dev->next_event = ktime_add_ns(ktime_get(), delta); - if (dev->state == CLOCK_EVT_STATE_SHUTDOWN) + if (clockevent_state_shutdown(dev)) return 0; dev->retries++; @@ -285,7 +297,7 @@ static int clockevents_program_min_delta(struct clock_event_device *dev) delta = dev->min_delta_ns; dev->next_event = ktime_add_ns(ktime_get(), delta); - if (dev->state == CLOCK_EVT_STATE_SHUTDOWN) + if (clockevent_state_shutdown(dev)) return 0; dev->retries++; @@ -317,9 +329,13 @@ int clockevents_program_event(struct clock_event_device *dev, ktime_t expires, dev->next_event = expires; - if (dev->state == CLOCK_EVT_STATE_SHUTDOWN) + if (clockevent_state_shutdown(dev)) return 0; + /* We must be in ONESHOT state here */ + WARN_ONCE(!clockevent_state_oneshot(dev), "Current state: %d\n", + clockevent_get_state(dev)); + /* Shortcut for clockevent devices that can deal with ktime. */ if (dev->features & CLOCK_EVT_FEAT_KTIME) return dev->set_next_ktime(expires, dev); @@ -362,7 +378,7 @@ static int clockevents_replace(struct clock_event_device *ced) struct clock_event_device *dev, *newdev = NULL; list_for_each_entry(dev, &clockevent_devices, list) { - if (dev == ced || dev->state != CLOCK_EVT_STATE_DETACHED) + if (dev == ced || !clockevent_state_detached(dev)) continue; if (!tick_check_replacement(newdev, dev)) @@ -388,7 +404,7 @@ static int clockevents_replace(struct clock_event_device *ced) static int __clockevents_try_unbind(struct clock_event_device *ced, int cpu) { /* Fast track. Device is unused */ - if (ced->state == CLOCK_EVT_STATE_DETACHED) { + if (clockevent_state_detached(ced)) { list_del_init(&ced->list); return 0; } @@ -445,7 +461,8 @@ static int clockevents_sanity_check(struct clock_event_device *dev) if (dev->set_mode) { /* We shouldn't be supporting new modes now */ WARN_ON(dev->set_state_periodic || dev->set_state_oneshot || - dev->set_state_shutdown || dev->tick_resume); + dev->set_state_shutdown || dev->tick_resume || + dev->set_state_oneshot_stopped); BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED); return 0; @@ -480,7 +497,7 @@ void clockevents_register_device(struct clock_event_device *dev) BUG_ON(clockevents_sanity_check(dev)); /* Initialize state to DETACHED */ - dev->state = CLOCK_EVT_STATE_DETACHED; + clockevent_set_state(dev, CLOCK_EVT_STATE_DETACHED); if (!dev->cpumask) { WARN_ON(num_possible_cpus() > 1); @@ -545,11 +562,11 @@ int __clockevents_update_freq(struct clock_event_device *dev, u32 freq) { clockevents_config(dev, freq); - if (dev->state == CLOCK_EVT_STATE_ONESHOT) + if (clockevent_state_oneshot(dev)) return clockevents_program_event(dev, dev->next_event, false); - if (dev->state == CLOCK_EVT_STATE_PERIODIC) - return __clockevents_set_state(dev, CLOCK_EVT_STATE_PERIODIC); + if (clockevent_state_periodic(dev)) + return __clockevents_switch_state(dev, CLOCK_EVT_STATE_PERIODIC); return 0; } @@ -603,13 +620,13 @@ void clockevents_exchange_device(struct clock_event_device *old, */ if (old) { module_put(old->owner); - clockevents_set_state(old, CLOCK_EVT_STATE_DETACHED); + clockevents_switch_state(old, CLOCK_EVT_STATE_DETACHED); list_del(&old->list); list_add(&old->list, &clockevents_released); } if (new) { - BUG_ON(new->state != CLOCK_EVT_STATE_DETACHED); + BUG_ON(!clockevent_state_detached(new)); clockevents_shutdown(new); } } @@ -622,7 +639,7 @@ void clockevents_suspend(void) struct clock_event_device *dev; list_for_each_entry_reverse(dev, &clockevent_devices, list) - if (dev->suspend) + if (dev->suspend && !clockevent_state_detached(dev)) dev->suspend(dev); } @@ -634,7 +651,7 @@ void clockevents_resume(void) struct clock_event_device *dev; list_for_each_entry(dev, &clockevent_devices, list) - if (dev->resume) + if (dev->resume && !clockevent_state_detached(dev)) dev->resume(dev); } @@ -665,7 +682,7 @@ void tick_cleanup_dead_cpu(int cpu) if (cpumask_test_cpu(cpu, dev->cpumask) && cpumask_weight(dev->cpumask) == 1 && !tick_is_broadcast_device(dev)) { - BUG_ON(dev->state != CLOCK_EVT_STATE_DETACHED); + BUG_ON(!clockevent_state_detached(dev)); list_del(&dev->list); } } diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 15facb1b9c60..841b72f720e8 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -23,6 +23,8 @@ * o Allow clocksource drivers to be unregistered */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/device.h> #include <linux/clocksource.h> #include <linux/init.h> @@ -216,10 +218,11 @@ static void clocksource_watchdog(unsigned long data) /* Check the deviation from the watchdog clocksource. */ if ((abs(cs_nsec - wd_nsec) > WATCHDOG_THRESHOLD)) { - pr_warn("timekeeping watchdog: Marking clocksource '%s' as unstable, because the skew is too large:\n", cs->name); - pr_warn(" '%s' wd_now: %llx wd_last: %llx mask: %llx\n", + pr_warn("timekeeping watchdog: Marking clocksource '%s' as unstable because the skew is too large:\n", + cs->name); + pr_warn(" '%s' wd_now: %llx wd_last: %llx mask: %llx\n", watchdog->name, wdnow, wdlast, watchdog->mask); - pr_warn(" '%s' cs_now: %llx cs_last: %llx mask: %llx\n", + pr_warn(" '%s' cs_now: %llx cs_last: %llx mask: %llx\n", cs->name, csnow, cslast, cs->mask); __clocksource_unstable(cs); continue; @@ -567,9 +570,8 @@ static void __clocksource_select(bool skipcur) */ if (!(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES) && oneshot) { /* Override clocksource cannot be used. */ - printk(KERN_WARNING "Override clocksource %s is not " - "HRT compatible. Cannot switch while in " - "HRT/NOHZ mode\n", cs->name); + pr_warn("Override clocksource %s is not HRT compatible - cannot switch while in HRT/NOHZ mode\n", + cs->name); override_name[0] = 0; } else /* Override clocksource can be used. */ @@ -708,8 +710,8 @@ void __clocksource_update_freq_scale(struct clocksource *cs, u32 scale, u32 freq clocksource_update_max_deferment(cs); - pr_info("clocksource %s: mask: 0x%llx max_cycles: 0x%llx, max_idle_ns: %lld ns\n", - cs->name, cs->mask, cs->max_cycles, cs->max_idle_ns); + pr_info("%s: mask: 0x%llx max_cycles: 0x%llx, max_idle_ns: %lld ns\n", + cs->name, cs->mask, cs->max_cycles, cs->max_idle_ns); } EXPORT_SYMBOL_GPL(__clocksource_update_freq_scale); @@ -1008,12 +1010,10 @@ __setup("clocksource=", boot_override_clocksource); static int __init boot_override_clock(char* str) { if (!strcmp(str, "pmtmr")) { - printk("Warning: clock=pmtmr is deprecated. " - "Use clocksource=acpi_pm.\n"); + pr_warn("clock=pmtmr is deprecated - use clocksource=acpi_pm\n"); return boot_override_clocksource("acpi_pm"); } - printk("Warning! clock= boot option is deprecated. " - "Use clocksource=xyz\n"); + pr_warn("clock= boot option is deprecated - use clocksource=xyz\n"); return boot_override_clocksource(str); } diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 93ef7190bdea..5c7ae4b641c4 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -66,33 +66,29 @@ */ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) = { - .lock = __RAW_SPIN_LOCK_UNLOCKED(hrtimer_bases.lock), + .seq = SEQCNT_ZERO(hrtimer_bases.seq), .clock_base = { { .index = HRTIMER_BASE_MONOTONIC, .clockid = CLOCK_MONOTONIC, .get_time = &ktime_get, - .resolution = KTIME_LOW_RES, }, { .index = HRTIMER_BASE_REALTIME, .clockid = CLOCK_REALTIME, .get_time = &ktime_get_real, - .resolution = KTIME_LOW_RES, }, { .index = HRTIMER_BASE_BOOTTIME, .clockid = CLOCK_BOOTTIME, .get_time = &ktime_get_boottime, - .resolution = KTIME_LOW_RES, }, { .index = HRTIMER_BASE_TAI, .clockid = CLOCK_TAI, .get_time = &ktime_get_clocktai, - .resolution = KTIME_LOW_RES, }, } }; @@ -109,27 +105,6 @@ static inline int hrtimer_clockid_to_base(clockid_t clock_id) return hrtimer_clock_to_base_table[clock_id]; } - -/* - * Get the coarse grained time at the softirq based on xtime and - * wall_to_monotonic. - */ -static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base) -{ - ktime_t xtim, mono, boot, tai; - ktime_t off_real, off_boot, off_tai; - - mono = ktime_get_update_offsets_tick(&off_real, &off_boot, &off_tai); - boot = ktime_add(mono, off_boot); - xtim = ktime_add(mono, off_real); - tai = ktime_add(mono, off_tai); - - base->clock_base[HRTIMER_BASE_REALTIME].softirq_time = xtim; - base->clock_base[HRTIMER_BASE_MONOTONIC].softirq_time = mono; - base->clock_base[HRTIMER_BASE_BOOTTIME].softirq_time = boot; - base->clock_base[HRTIMER_BASE_TAI].softirq_time = tai; -} - /* * Functions and macros which are different for UP/SMP systems are kept in a * single place @@ -137,6 +112,18 @@ static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base) #ifdef CONFIG_SMP /* + * We require the migration_base for lock_hrtimer_base()/switch_hrtimer_base() + * such that hrtimer_callback_running() can unconditionally dereference + * timer->base->cpu_base + */ +static struct hrtimer_cpu_base migration_cpu_base = { + .seq = SEQCNT_ZERO(migration_cpu_base), + .clock_base = { { .cpu_base = &migration_cpu_base, }, }, +}; + +#define migration_base migration_cpu_base.clock_base[0] + +/* * We are using hashed locking: holding per_cpu(hrtimer_bases)[n].lock * means that all timers which are tied to this base via timer->base are * locked, and the base itself is locked too. @@ -145,8 +132,8 @@ static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base) * be found on the lists/queues. * * When the timer's base is locked, and the timer removed from list, it is - * possible to set timer->base = NULL and drop the lock: the timer remains - * locked. + * possible to set timer->base = &migration_base and drop the lock: the timer + * remains locked. */ static struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer, @@ -156,7 +143,7 @@ struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer, for (;;) { base = timer->base; - if (likely(base != NULL)) { + if (likely(base != &migration_base)) { raw_spin_lock_irqsave(&base->cpu_base->lock, *flags); if (likely(base == timer->base)) return base; @@ -190,6 +177,24 @@ hrtimer_check_target(struct hrtimer *timer, struct hrtimer_clock_base *new_base) #endif } +#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) +static inline +struct hrtimer_cpu_base *get_target_base(struct hrtimer_cpu_base *base, + int pinned) +{ + if (pinned || !base->migration_enabled) + return this_cpu_ptr(&hrtimer_bases); + return &per_cpu(hrtimer_bases, get_nohz_timer_target()); +} +#else +static inline +struct hrtimer_cpu_base *get_target_base(struct hrtimer_cpu_base *base, + int pinned) +{ + return this_cpu_ptr(&hrtimer_bases); +} +#endif + /* * Switch the timer base to the current CPU when possible. */ @@ -197,14 +202,13 @@ static inline struct hrtimer_clock_base * switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base, int pinned) { + struct hrtimer_cpu_base *new_cpu_base, *this_base; struct hrtimer_clock_base *new_base; - struct hrtimer_cpu_base *new_cpu_base; - int this_cpu = smp_processor_id(); - int cpu = get_nohz_timer_target(pinned); int basenum = base->index; + this_base = this_cpu_ptr(&hrtimer_bases); + new_cpu_base = get_target_base(this_base, pinned); again: - new_cpu_base = &per_cpu(hrtimer_bases, cpu); new_base = &new_cpu_base->clock_base[basenum]; if (base != new_base) { @@ -220,22 +224,24 @@ again: if (unlikely(hrtimer_callback_running(timer))) return base; - /* See the comment in lock_timer_base() */ - timer->base = NULL; + /* See the comment in lock_hrtimer_base() */ + timer->base = &migration_base; raw_spin_unlock(&base->cpu_base->lock); raw_spin_lock(&new_base->cpu_base->lock); - if (cpu != this_cpu && hrtimer_check_target(timer, new_base)) { - cpu = this_cpu; + if (new_cpu_base != this_base && + hrtimer_check_target(timer, new_base)) { raw_spin_unlock(&new_base->cpu_base->lock); raw_spin_lock(&base->cpu_base->lock); + new_cpu_base = this_base; timer->base = base; goto again; } timer->base = new_base; } else { - if (cpu != this_cpu && hrtimer_check_target(timer, new_base)) { - cpu = this_cpu; + if (new_cpu_base != this_base && + hrtimer_check_target(timer, new_base)) { + new_cpu_base = this_base; goto again; } } @@ -443,24 +449,35 @@ static inline void debug_deactivate(struct hrtimer *timer) } #if defined(CONFIG_NO_HZ_COMMON) || defined(CONFIG_HIGH_RES_TIMERS) +static inline void hrtimer_update_next_timer(struct hrtimer_cpu_base *cpu_base, + struct hrtimer *timer) +{ +#ifdef CONFIG_HIGH_RES_TIMERS + cpu_base->next_timer = timer; +#endif +} + static ktime_t __hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base) { struct hrtimer_clock_base *base = cpu_base->clock_base; ktime_t expires, expires_next = { .tv64 = KTIME_MAX }; - int i; + unsigned int active = cpu_base->active_bases; - for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) { + hrtimer_update_next_timer(cpu_base, NULL); + for (; active; base++, active >>= 1) { struct timerqueue_node *next; struct hrtimer *timer; - next = timerqueue_getnext(&base->active); - if (!next) + if (!(active & 0x01)) continue; + next = timerqueue_getnext(&base->active); timer = container_of(next, struct hrtimer, node); expires = ktime_sub(hrtimer_get_expires(timer), base->offset); - if (expires.tv64 < expires_next.tv64) + if (expires.tv64 < expires_next.tv64) { expires_next = expires; + hrtimer_update_next_timer(cpu_base, timer); + } } /* * clock_was_set() might have changed base->offset of any of @@ -473,6 +490,16 @@ static ktime_t __hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base) } #endif +static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base) +{ + ktime_t *offs_real = &base->clock_base[HRTIMER_BASE_REALTIME].offset; + ktime_t *offs_boot = &base->clock_base[HRTIMER_BASE_BOOTTIME].offset; + ktime_t *offs_tai = &base->clock_base[HRTIMER_BASE_TAI].offset; + + return ktime_get_update_offsets_now(&base->clock_was_set_seq, + offs_real, offs_boot, offs_tai); +} + /* High resolution timer related functions */ #ifdef CONFIG_HIGH_RES_TIMERS @@ -480,6 +507,8 @@ static ktime_t __hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base) * High resolution timer enabled ? */ static int hrtimer_hres_enabled __read_mostly = 1; +unsigned int hrtimer_resolution __read_mostly = LOW_RES_NSEC; +EXPORT_SYMBOL_GPL(hrtimer_resolution); /* * Enable / Disable high resolution mode @@ -508,9 +537,14 @@ static inline int hrtimer_is_hres_enabled(void) /* * Is the high resolution mode active ? */ +static inline int __hrtimer_hres_active(struct hrtimer_cpu_base *cpu_base) +{ + return cpu_base->hres_active; +} + static inline int hrtimer_hres_active(void) { - return __this_cpu_read(hrtimer_bases.hres_active); + return __hrtimer_hres_active(this_cpu_ptr(&hrtimer_bases)); } /* @@ -521,7 +555,12 @@ static inline int hrtimer_hres_active(void) static void hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal) { - ktime_t expires_next = __hrtimer_get_next_event(cpu_base); + ktime_t expires_next; + + if (!cpu_base->hres_active) + return; + + expires_next = __hrtimer_get_next_event(cpu_base); if (skip_equal && expires_next.tv64 == cpu_base->expires_next.tv64) return; @@ -545,63 +584,53 @@ hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal) if (cpu_base->hang_detected) return; - if (cpu_base->expires_next.tv64 != KTIME_MAX) - tick_program_event(cpu_base->expires_next, 1); + tick_program_event(cpu_base->expires_next, 1); } /* - * Shared reprogramming for clock_realtime and clock_monotonic - * * When a timer is enqueued and expires earlier than the already enqueued * timers, we have to check, whether it expires earlier than the timer for * which the clock event device was armed. * - * Note, that in case the state has HRTIMER_STATE_CALLBACK set, no reprogramming - * and no expiry check happens. The timer gets enqueued into the rbtree. The - * reprogramming and expiry check is done in the hrtimer_interrupt or in the - * softirq. - * * Called with interrupts disabled and base->cpu_base.lock held */ -static int hrtimer_reprogram(struct hrtimer *timer, - struct hrtimer_clock_base *base) +static void hrtimer_reprogram(struct hrtimer *timer, + struct hrtimer_clock_base *base) { struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); ktime_t expires = ktime_sub(hrtimer_get_expires(timer), base->offset); - int res; WARN_ON_ONCE(hrtimer_get_expires_tv64(timer) < 0); /* - * When the callback is running, we do not reprogram the clock event - * device. The timer callback is either running on a different CPU or - * the callback is executed in the hrtimer_interrupt context. The - * reprogramming is handled either by the softirq, which called the - * callback or at the end of the hrtimer_interrupt. + * If the timer is not on the current cpu, we cannot reprogram + * the other cpus clock event device. */ - if (hrtimer_callback_running(timer)) - return 0; + if (base->cpu_base != cpu_base) + return; + + /* + * If the hrtimer interrupt is running, then it will + * reevaluate the clock bases and reprogram the clock event + * device. The callbacks are always executed in hard interrupt + * context so we don't need an extra check for a running + * callback. + */ + if (cpu_base->in_hrtirq) + return; /* * CLOCK_REALTIME timer might be requested with an absolute - * expiry time which is less than base->offset. Nothing wrong - * about that, just avoid to call into the tick code, which - * has now objections against negative expiry values. + * expiry time which is less than base->offset. Set it to 0. */ if (expires.tv64 < 0) - return -ETIME; + expires.tv64 = 0; if (expires.tv64 >= cpu_base->expires_next.tv64) - return 0; + return; - /* - * When the target cpu of the timer is currently executing - * hrtimer_interrupt(), then we do not touch the clock event - * device. hrtimer_interrupt() will reevaluate all clock bases - * before reprogramming the device. - */ - if (cpu_base->in_hrtirq) - return 0; + /* Update the pointer to the next expiring timer */ + cpu_base->next_timer = timer; /* * If a hang was detected in the last timer interrupt then we @@ -610,15 +639,14 @@ static int hrtimer_reprogram(struct hrtimer *timer, * to make progress. */ if (cpu_base->hang_detected) - return 0; + return; /* - * Clockevents returns -ETIME, when the event was in the past. + * Program the timer hardware. We enforce the expiry for + * events which are already in the past. */ - res = tick_program_event(expires, 0); - if (!IS_ERR_VALUE(res)) - cpu_base->expires_next = expires; - return res; + cpu_base->expires_next = expires; + tick_program_event(expires, 1); } /* @@ -630,15 +658,6 @@ static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) base->hres_active = 0; } -static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base) -{ - ktime_t *offs_real = &base->clock_base[HRTIMER_BASE_REALTIME].offset; - ktime_t *offs_boot = &base->clock_base[HRTIMER_BASE_BOOTTIME].offset; - ktime_t *offs_tai = &base->clock_base[HRTIMER_BASE_TAI].offset; - - return ktime_get_update_offsets_now(offs_real, offs_boot, offs_tai); -} - /* * Retrigger next event is called after clock was set * @@ -648,7 +667,7 @@ static void retrigger_next_event(void *arg) { struct hrtimer_cpu_base *base = this_cpu_ptr(&hrtimer_bases); - if (!hrtimer_hres_active()) + if (!base->hres_active) return; raw_spin_lock(&base->lock); @@ -662,29 +681,19 @@ static void retrigger_next_event(void *arg) */ static int hrtimer_switch_to_hres(void) { - int i, cpu = smp_processor_id(); - struct hrtimer_cpu_base *base = &per_cpu(hrtimer_bases, cpu); - unsigned long flags; - - if (base->hres_active) - return 1; - - local_irq_save(flags); + struct hrtimer_cpu_base *base = this_cpu_ptr(&hrtimer_bases); if (tick_init_highres()) { - local_irq_restore(flags); printk(KERN_WARNING "Could not switch to high resolution " - "mode on CPU %d\n", cpu); + "mode on CPU %d\n", base->cpu); return 0; } base->hres_active = 1; - for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) - base->clock_base[i].resolution = KTIME_HIGH_RES; + hrtimer_resolution = HIGH_RES_NSEC; tick_setup_sched_timer(); /* "Retrigger" the interrupt to get things going */ retrigger_next_event(NULL); - local_irq_restore(flags); return 1; } @@ -706,6 +715,7 @@ void clock_was_set_delayed(void) #else +static inline int __hrtimer_hres_active(struct hrtimer_cpu_base *b) { return 0; } static inline int hrtimer_hres_active(void) { return 0; } static inline int hrtimer_is_hres_enabled(void) { return 0; } static inline int hrtimer_switch_to_hres(void) { return 0; } @@ -803,6 +813,14 @@ void unlock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags) * * Forward the timer expiry so it will expire in the future. * Returns the number of overruns. + * + * Can be safely called from the callback function of @timer. If + * called from other contexts @timer must neither be enqueued nor + * running the callback and the caller needs to take care of + * serialization. + * + * Note: This only updates the timer expiry value and does not requeue + * the timer. */ u64 hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval) { @@ -814,8 +832,11 @@ u64 hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval) if (delta.tv64 < 0) return 0; - if (interval.tv64 < timer->base->resolution.tv64) - interval.tv64 = timer->base->resolution.tv64; + if (WARN_ON(timer->state & HRTIMER_STATE_ENQUEUED)) + return 0; + + if (interval.tv64 < hrtimer_resolution) + interval.tv64 = hrtimer_resolution; if (unlikely(delta.tv64 >= interval.tv64)) { s64 incr = ktime_to_ns(interval); @@ -849,16 +870,11 @@ static int enqueue_hrtimer(struct hrtimer *timer, { debug_activate(timer); - timerqueue_add(&base->active, &timer->node); base->cpu_base->active_bases |= 1 << base->index; - /* - * HRTIMER_STATE_ENQUEUED is or'ed to the current state to preserve the - * state of a possibly running callback. - */ - timer->state |= HRTIMER_STATE_ENQUEUED; + timer->state = HRTIMER_STATE_ENQUEUED; - return (&timer->node == base->active.next); + return timerqueue_add(&base->active, &timer->node); } /* @@ -875,39 +891,38 @@ static void __remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, unsigned long newstate, int reprogram) { - struct timerqueue_node *next_timer; - if (!(timer->state & HRTIMER_STATE_ENQUEUED)) - goto out; + struct hrtimer_cpu_base *cpu_base = base->cpu_base; + unsigned int state = timer->state; + + timer->state = newstate; + if (!(state & HRTIMER_STATE_ENQUEUED)) + return; + + if (!timerqueue_del(&base->active, &timer->node)) + cpu_base->active_bases &= ~(1 << base->index); - next_timer = timerqueue_getnext(&base->active); - timerqueue_del(&base->active, &timer->node); - if (&timer->node == next_timer) { #ifdef CONFIG_HIGH_RES_TIMERS - /* Reprogram the clock event device. if enabled */ - if (reprogram && hrtimer_hres_active()) { - ktime_t expires; - - expires = ktime_sub(hrtimer_get_expires(timer), - base->offset); - if (base->cpu_base->expires_next.tv64 == expires.tv64) - hrtimer_force_reprogram(base->cpu_base, 1); - } + /* + * Note: If reprogram is false we do not update + * cpu_base->next_timer. This happens when we remove the first + * timer on a remote cpu. No harm as we never dereference + * cpu_base->next_timer. So the worst thing what can happen is + * an superflous call to hrtimer_force_reprogram() on the + * remote cpu later on if the same timer gets enqueued again. + */ + if (reprogram && timer == cpu_base->next_timer) + hrtimer_force_reprogram(cpu_base, 1); #endif - } - if (!timerqueue_getnext(&base->active)) - base->cpu_base->active_bases &= ~(1 << base->index); -out: - timer->state = newstate; } /* * remove hrtimer, called with base lock held */ static inline int -remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base) +remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, bool restart) { if (hrtimer_is_queued(timer)) { - unsigned long state; + unsigned long state = timer->state; int reprogram; /* @@ -921,30 +936,35 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base) debug_deactivate(timer); timer_stats_hrtimer_clear_start_info(timer); reprogram = base->cpu_base == this_cpu_ptr(&hrtimer_bases); - /* - * We must preserve the CALLBACK state flag here, - * otherwise we could move the timer base in - * switch_hrtimer_base. - */ - state = timer->state & HRTIMER_STATE_CALLBACK; + + if (!restart) + state = HRTIMER_STATE_INACTIVE; + __remove_hrtimer(timer, base, state, reprogram); return 1; } return 0; } -int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, - unsigned long delta_ns, const enum hrtimer_mode mode, - int wakeup) +/** + * hrtimer_start_range_ns - (re)start an hrtimer on the current CPU + * @timer: the timer to be added + * @tim: expiry time + * @delta_ns: "slack" range for the timer + * @mode: expiry mode: absolute (HRTIMER_MODE_ABS) or + * relative (HRTIMER_MODE_REL) + */ +void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, + unsigned long delta_ns, const enum hrtimer_mode mode) { struct hrtimer_clock_base *base, *new_base; unsigned long flags; - int ret, leftmost; + int leftmost; base = lock_hrtimer_base(timer, &flags); /* Remove an active timer from the queue: */ - ret = remove_hrtimer(timer, base); + remove_hrtimer(timer, base, true); if (mode & HRTIMER_MODE_REL) { tim = ktime_add_safe(tim, base->get_time()); @@ -956,7 +976,7 @@ int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, * timeouts. This will go away with the GTOD framework. */ #ifdef CONFIG_TIME_LOW_RES - tim = ktime_add_safe(tim, base->resolution); + tim = ktime_add_safe(tim, ktime_set(0, hrtimer_resolution)); #endif } @@ -968,85 +988,25 @@ int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, timer_stats_hrtimer_set_start_info(timer); leftmost = enqueue_hrtimer(timer, new_base); - - if (!leftmost) { - unlock_hrtimer_base(timer, &flags); - return ret; - } + if (!leftmost) + goto unlock; if (!hrtimer_is_hres_active(timer)) { /* * Kick to reschedule the next tick to handle the new timer * on dynticks target. */ - wake_up_nohz_cpu(new_base->cpu_base->cpu); - } else if (new_base->cpu_base == this_cpu_ptr(&hrtimer_bases) && - hrtimer_reprogram(timer, new_base)) { - /* - * Only allow reprogramming if the new base is on this CPU. - * (it might still be on another CPU if the timer was pending) - * - * XXX send_remote_softirq() ? - */ - if (wakeup) { - /* - * We need to drop cpu_base->lock to avoid a - * lock ordering issue vs. rq->lock. - */ - raw_spin_unlock(&new_base->cpu_base->lock); - raise_softirq_irqoff(HRTIMER_SOFTIRQ); - local_irq_restore(flags); - return ret; - } else { - __raise_softirq_irqoff(HRTIMER_SOFTIRQ); - } + if (new_base->cpu_base->nohz_active) + wake_up_nohz_cpu(new_base->cpu_base->cpu); + } else { + hrtimer_reprogram(timer, new_base); } - +unlock: unlock_hrtimer_base(timer, &flags); - - return ret; -} -EXPORT_SYMBOL_GPL(__hrtimer_start_range_ns); - -/** - * hrtimer_start_range_ns - (re)start an hrtimer on the current CPU - * @timer: the timer to be added - * @tim: expiry time - * @delta_ns: "slack" range for the timer - * @mode: expiry mode: absolute (HRTIMER_MODE_ABS) or - * relative (HRTIMER_MODE_REL) - * - * Returns: - * 0 on success - * 1 when the timer was active - */ -int hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, - unsigned long delta_ns, const enum hrtimer_mode mode) -{ - return __hrtimer_start_range_ns(timer, tim, delta_ns, mode, 1); } EXPORT_SYMBOL_GPL(hrtimer_start_range_ns); /** - * hrtimer_start - (re)start an hrtimer on the current CPU - * @timer: the timer to be added - * @tim: expiry time - * @mode: expiry mode: absolute (HRTIMER_MODE_ABS) or - * relative (HRTIMER_MODE_REL) - * - * Returns: - * 0 on success - * 1 when the timer was active - */ -int -hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode) -{ - return __hrtimer_start_range_ns(timer, tim, 0, mode, 1); -} -EXPORT_SYMBOL_GPL(hrtimer_start); - - -/** * hrtimer_try_to_cancel - try to deactivate a timer * @timer: hrtimer to stop * @@ -1062,10 +1022,19 @@ int hrtimer_try_to_cancel(struct hrtimer *timer) unsigned long flags; int ret = -1; + /* + * Check lockless first. If the timer is not active (neither + * enqueued nor running the callback, nothing to do here. The + * base lock does not serialize against a concurrent enqueue, + * so we can avoid taking it. + */ + if (!hrtimer_active(timer)) + return 0; + base = lock_hrtimer_base(timer, &flags); if (!hrtimer_callback_running(timer)) - ret = remove_hrtimer(timer, base); + ret = remove_hrtimer(timer, base, false); unlock_hrtimer_base(timer, &flags); @@ -1115,26 +1084,22 @@ EXPORT_SYMBOL_GPL(hrtimer_get_remaining); /** * hrtimer_get_next_event - get the time until next expiry event * - * Returns the delta to the next expiry event or KTIME_MAX if no timer - * is pending. + * Returns the next expiry time or KTIME_MAX if no timer is pending. */ -ktime_t hrtimer_get_next_event(void) +u64 hrtimer_get_next_event(void) { struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); - ktime_t mindelta = { .tv64 = KTIME_MAX }; + u64 expires = KTIME_MAX; unsigned long flags; raw_spin_lock_irqsave(&cpu_base->lock, flags); - if (!hrtimer_hres_active()) - mindelta = ktime_sub(__hrtimer_get_next_event(cpu_base), - ktime_get()); + if (!__hrtimer_hres_active(cpu_base)) + expires = __hrtimer_get_next_event(cpu_base).tv64; raw_spin_unlock_irqrestore(&cpu_base->lock, flags); - if (mindelta.tv64 < 0) - mindelta.tv64 = 0; - return mindelta; + return expires; } #endif @@ -1176,37 +1141,73 @@ void hrtimer_init(struct hrtimer *timer, clockid_t clock_id, } EXPORT_SYMBOL_GPL(hrtimer_init); -/** - * hrtimer_get_res - get the timer resolution for a clock - * @which_clock: which clock to query - * @tp: pointer to timespec variable to store the resolution +/* + * A timer is active, when it is enqueued into the rbtree or the + * callback function is running or it's in the state of being migrated + * to another cpu. * - * Store the resolution of the clock selected by @which_clock in the - * variable pointed to by @tp. + * It is important for this function to not return a false negative. */ -int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp) +bool hrtimer_active(const struct hrtimer *timer) { struct hrtimer_cpu_base *cpu_base; - int base = hrtimer_clockid_to_base(which_clock); + unsigned int seq; - cpu_base = raw_cpu_ptr(&hrtimer_bases); - *tp = ktime_to_timespec(cpu_base->clock_base[base].resolution); + do { + cpu_base = READ_ONCE(timer->base->cpu_base); + seq = raw_read_seqcount_begin(&cpu_base->seq); - return 0; + if (timer->state != HRTIMER_STATE_INACTIVE || + cpu_base->running == timer) + return true; + + } while (read_seqcount_retry(&cpu_base->seq, seq) || + cpu_base != READ_ONCE(timer->base->cpu_base)); + + return false; } -EXPORT_SYMBOL_GPL(hrtimer_get_res); +EXPORT_SYMBOL_GPL(hrtimer_active); -static void __run_hrtimer(struct hrtimer *timer, ktime_t *now) +/* + * The write_seqcount_barrier()s in __run_hrtimer() split the thing into 3 + * distinct sections: + * + * - queued: the timer is queued + * - callback: the timer is being ran + * - post: the timer is inactive or (re)queued + * + * On the read side we ensure we observe timer->state and cpu_base->running + * from the same section, if anything changed while we looked at it, we retry. + * This includes timer->base changing because sequence numbers alone are + * insufficient for that. + * + * The sequence numbers are required because otherwise we could still observe + * a false negative if the read side got smeared over multiple consequtive + * __run_hrtimer() invocations. + */ + +static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base, + struct hrtimer_clock_base *base, + struct hrtimer *timer, ktime_t *now) { - struct hrtimer_clock_base *base = timer->base; - struct hrtimer_cpu_base *cpu_base = base->cpu_base; enum hrtimer_restart (*fn)(struct hrtimer *); int restart; - WARN_ON(!irqs_disabled()); + lockdep_assert_held(&cpu_base->lock); debug_deactivate(timer); - __remove_hrtimer(timer, base, HRTIMER_STATE_CALLBACK, 0); + cpu_base->running = timer; + + /* + * Separate the ->running assignment from the ->state assignment. + * + * As with a regular write barrier, this ensures the read side in + * hrtimer_active() cannot observe cpu_base->running == NULL && + * timer->state == INACTIVE. + */ + raw_write_seqcount_barrier(&cpu_base->seq); + + __remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE, 0); timer_stats_account_hrtimer(timer); fn = timer->function; @@ -1222,58 +1223,43 @@ static void __run_hrtimer(struct hrtimer *timer, ktime_t *now) raw_spin_lock(&cpu_base->lock); /* - * Note: We clear the CALLBACK bit after enqueue_hrtimer and + * Note: We clear the running state after enqueue_hrtimer and * we do not reprogramm the event hardware. Happens either in * hrtimer_start_range_ns() or in hrtimer_interrupt() + * + * Note: Because we dropped the cpu_base->lock above, + * hrtimer_start_range_ns() can have popped in and enqueued the timer + * for us already. */ - if (restart != HRTIMER_NORESTART) { - BUG_ON(timer->state != HRTIMER_STATE_CALLBACK); + if (restart != HRTIMER_NORESTART && + !(timer->state & HRTIMER_STATE_ENQUEUED)) enqueue_hrtimer(timer, base); - } - WARN_ON_ONCE(!(timer->state & HRTIMER_STATE_CALLBACK)); + /* + * Separate the ->running assignment from the ->state assignment. + * + * As with a regular write barrier, this ensures the read side in + * hrtimer_active() cannot observe cpu_base->running == NULL && + * timer->state == INACTIVE. + */ + raw_write_seqcount_barrier(&cpu_base->seq); - timer->state &= ~HRTIMER_STATE_CALLBACK; + WARN_ON_ONCE(cpu_base->running != timer); + cpu_base->running = NULL; } -#ifdef CONFIG_HIGH_RES_TIMERS - -/* - * High resolution timer interrupt - * Called with interrupts disabled - */ -void hrtimer_interrupt(struct clock_event_device *dev) +static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now) { - struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); - ktime_t expires_next, now, entry_time, delta; - int i, retries = 0; - - BUG_ON(!cpu_base->hres_active); - cpu_base->nr_events++; - dev->next_event.tv64 = KTIME_MAX; - - raw_spin_lock(&cpu_base->lock); - entry_time = now = hrtimer_update_base(cpu_base); -retry: - cpu_base->in_hrtirq = 1; - /* - * We set expires_next to KTIME_MAX here with cpu_base->lock - * held to prevent that a timer is enqueued in our queue via - * the migration code. This does not affect enqueueing of - * timers which run their callback and need to be requeued on - * this CPU. - */ - cpu_base->expires_next.tv64 = KTIME_MAX; + struct hrtimer_clock_base *base = cpu_base->clock_base; + unsigned int active = cpu_base->active_bases; - for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { - struct hrtimer_clock_base *base; + for (; active; base++, active >>= 1) { struct timerqueue_node *node; ktime_t basenow; - if (!(cpu_base->active_bases & (1 << i))) + if (!(active & 0x01)) continue; - base = cpu_base->clock_base + i; basenow = ktime_add(now, base->offset); while ((node = timerqueue_getnext(&base->active))) { @@ -1296,9 +1282,42 @@ retry: if (basenow.tv64 < hrtimer_get_softexpires_tv64(timer)) break; - __run_hrtimer(timer, &basenow); + __run_hrtimer(cpu_base, base, timer, &basenow); } } +} + +#ifdef CONFIG_HIGH_RES_TIMERS + +/* + * High resolution timer interrupt + * Called with interrupts disabled + */ +void hrtimer_interrupt(struct clock_event_device *dev) +{ + struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); + ktime_t expires_next, now, entry_time, delta; + int retries = 0; + + BUG_ON(!cpu_base->hres_active); + cpu_base->nr_events++; + dev->next_event.tv64 = KTIME_MAX; + + raw_spin_lock(&cpu_base->lock); + entry_time = now = hrtimer_update_base(cpu_base); +retry: + cpu_base->in_hrtirq = 1; + /* + * We set expires_next to KTIME_MAX here with cpu_base->lock + * held to prevent that a timer is enqueued in our queue via + * the migration code. This does not affect enqueueing of + * timers which run their callback and need to be requeued on + * this CPU. + */ + cpu_base->expires_next.tv64 = KTIME_MAX; + + __hrtimer_run_queues(cpu_base, now); + /* Reevaluate the clock bases for the next expiry */ expires_next = __hrtimer_get_next_event(cpu_base); /* @@ -1310,8 +1329,7 @@ retry: raw_spin_unlock(&cpu_base->lock); /* Reprogramming necessary ? */ - if (expires_next.tv64 == KTIME_MAX || - !tick_program_event(expires_next, 0)) { + if (!tick_program_event(expires_next, 0)) { cpu_base->hang_detected = 0; return; } @@ -1344,8 +1362,8 @@ retry: cpu_base->hang_detected = 1; raw_spin_unlock(&cpu_base->lock); delta = ktime_sub(now, entry_time); - if (delta.tv64 > cpu_base->max_hang_time.tv64) - cpu_base->max_hang_time = delta; + if ((unsigned int)delta.tv64 > cpu_base->max_hang_time) + cpu_base->max_hang_time = (unsigned int) delta.tv64; /* * Limit it to a sensible value as we enforce a longer * delay. Give the CPU at least 100ms to catch up. @@ -1363,7 +1381,7 @@ retry: * local version of hrtimer_peek_ahead_timers() called with interrupts * disabled. */ -static void __hrtimer_peek_ahead_timers(void) +static inline void __hrtimer_peek_ahead_timers(void) { struct tick_device *td; @@ -1375,29 +1393,6 @@ static void __hrtimer_peek_ahead_timers(void) hrtimer_interrupt(td->evtdev); } -/** - * hrtimer_peek_ahead_timers -- run soft-expired timers now - * - * hrtimer_peek_ahead_timers will peek at the timer queue of - * the current cpu and check if there are any timers for which - * the soft expires time has passed. If any such timers exist, - * they are run immediately and then removed from the timer queue. - * - */ -void hrtimer_peek_ahead_timers(void) -{ - unsigned long flags; - - local_irq_save(flags); - __hrtimer_peek_ahead_timers(); - local_irq_restore(flags); -} - -static void run_hrtimer_softirq(struct softirq_action *h) -{ - hrtimer_peek_ahead_timers(); -} - #else /* CONFIG_HIGH_RES_TIMERS */ static inline void __hrtimer_peek_ahead_timers(void) { } @@ -1405,66 +1400,32 @@ static inline void __hrtimer_peek_ahead_timers(void) { } #endif /* !CONFIG_HIGH_RES_TIMERS */ /* - * Called from timer softirq every jiffy, expire hrtimers: - * - * For HRT its the fall back code to run the softirq in the timer - * softirq context in case the hrtimer initialization failed or has - * not been done yet. + * Called from run_local_timers in hardirq context every jiffy */ -void hrtimer_run_pending(void) +void hrtimer_run_queues(void) { - if (hrtimer_hres_active()) + struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); + ktime_t now; + + if (__hrtimer_hres_active(cpu_base)) return; /* - * This _is_ ugly: We have to check in the softirq context, - * whether we can switch to highres and / or nohz mode. The - * clocksource switch happens in the timer interrupt with - * xtime_lock held. Notification from there only sets the - * check bit in the tick_oneshot code, otherwise we might - * deadlock vs. xtime_lock. + * This _is_ ugly: We have to check periodically, whether we + * can switch to highres and / or nohz mode. The clocksource + * switch happens with xtime_lock held. Notification from + * there only sets the check bit in the tick_oneshot code, + * otherwise we might deadlock vs. xtime_lock. */ - if (tick_check_oneshot_change(!hrtimer_is_hres_enabled())) + if (tick_check_oneshot_change(!hrtimer_is_hres_enabled())) { hrtimer_switch_to_hres(); -} - -/* - * Called from hardirq context every jiffy - */ -void hrtimer_run_queues(void) -{ - struct timerqueue_node *node; - struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); - struct hrtimer_clock_base *base; - int index, gettime = 1; - - if (hrtimer_hres_active()) return; - - for (index = 0; index < HRTIMER_MAX_CLOCK_BASES; index++) { - base = &cpu_base->clock_base[index]; - if (!timerqueue_getnext(&base->active)) - continue; - - if (gettime) { - hrtimer_get_softirq_time(cpu_base); - gettime = 0; - } - - raw_spin_lock(&cpu_base->lock); - - while ((node = timerqueue_getnext(&base->active))) { - struct hrtimer *timer; - - timer = container_of(node, struct hrtimer, node); - if (base->softirq_time.tv64 <= - hrtimer_get_expires_tv64(timer)) - break; - - __run_hrtimer(timer, &base->softirq_time); - } - raw_spin_unlock(&cpu_base->lock); } + + raw_spin_lock(&cpu_base->lock); + now = hrtimer_update_base(cpu_base); + __hrtimer_run_queues(cpu_base, now); + raw_spin_unlock(&cpu_base->lock); } /* @@ -1497,8 +1458,6 @@ static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mod do { set_current_state(TASK_INTERRUPTIBLE); hrtimer_start_expires(&t->timer, mode); - if (!hrtimer_active(&t->timer)) - t->task = NULL; if (likely(t->task)) freezable_schedule(); @@ -1642,11 +1601,11 @@ static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base, debug_deactivate(timer); /* - * Mark it as STATE_MIGRATE not INACTIVE otherwise the + * Mark it as ENQUEUED not INACTIVE otherwise the * timer could be seen as !active and just vanish away * under us on another CPU */ - __remove_hrtimer(timer, old_base, HRTIMER_STATE_MIGRATE, 0); + __remove_hrtimer(timer, old_base, HRTIMER_STATE_ENQUEUED, 0); timer->base = new_base; /* * Enqueue the timers on the new cpu. This does not @@ -1657,9 +1616,6 @@ static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base, * event device. */ enqueue_hrtimer(timer, new_base); - - /* Clear the migration state bit */ - timer->state &= ~HRTIMER_STATE_MIGRATE; } } @@ -1731,9 +1687,6 @@ void __init hrtimers_init(void) hrtimer_cpu_notify(&hrtimers_nb, (unsigned long)CPU_UP_PREPARE, (void *)(long)smp_processor_id()); register_cpu_notifier(&hrtimers_nb); -#ifdef CONFIG_HIGH_RES_TIMERS - open_softirq(HRTIMER_SOFTIRQ, run_hrtimer_softirq); -#endif } /** @@ -1772,8 +1725,6 @@ schedule_hrtimeout_range_clock(ktime_t *expires, unsigned long delta, hrtimer_init_sleeper(&t, current); hrtimer_start_expires(&t.timer, mode); - if (!hrtimer_active(&t.timer)) - t.task = NULL; if (likely(t.task)) schedule(); diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 7a681003001c..fb4d98c7fd43 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -35,6 +35,7 @@ unsigned long tick_nsec; static u64 tick_length; static u64 tick_length_base; +#define SECS_PER_DAY 86400 #define MAX_TICKADJ 500LL /* usecs */ #define MAX_TICKADJ_SCALED \ (((MAX_TICKADJ * NSEC_PER_USEC) << NTP_SCALE_SHIFT) / NTP_INTERVAL_FREQ) @@ -76,6 +77,9 @@ static long time_adjust; /* constant (boot-param configurable) NTP tick adjustment (upscaled) */ static s64 ntp_tick_adj; +/* second value of the next pending leapsecond, or TIME64_MAX if no leap */ +static time64_t ntp_next_leap_sec = TIME64_MAX; + #ifdef CONFIG_NTP_PPS /* @@ -349,6 +353,7 @@ void ntp_clear(void) tick_length = tick_length_base; time_offset = 0; + ntp_next_leap_sec = TIME64_MAX; /* Clear PPS state variables */ pps_clear(); } @@ -359,6 +364,21 @@ u64 ntp_tick_length(void) return tick_length; } +/** + * ntp_get_next_leap - Returns the next leapsecond in CLOCK_REALTIME ktime_t + * + * Provides the time of the next leapsecond against CLOCK_REALTIME in + * a ktime_t format. Returns KTIME_MAX if no leapsecond is pending. + */ +ktime_t ntp_get_next_leap(void) +{ + ktime_t ret; + + if ((time_state == TIME_INS) && (time_status & STA_INS)) + return ktime_set(ntp_next_leap_sec, 0); + ret.tv64 = KTIME_MAX; + return ret; +} /* * this routine handles the overflow of the microsecond field @@ -382,15 +402,21 @@ int second_overflow(unsigned long secs) */ switch (time_state) { case TIME_OK: - if (time_status & STA_INS) + if (time_status & STA_INS) { time_state = TIME_INS; - else if (time_status & STA_DEL) + ntp_next_leap_sec = secs + SECS_PER_DAY - + (secs % SECS_PER_DAY); + } else if (time_status & STA_DEL) { time_state = TIME_DEL; + ntp_next_leap_sec = secs + SECS_PER_DAY - + ((secs+1) % SECS_PER_DAY); + } break; case TIME_INS: - if (!(time_status & STA_INS)) + if (!(time_status & STA_INS)) { + ntp_next_leap_sec = TIME64_MAX; time_state = TIME_OK; - else if (secs % 86400 == 0) { + } else if (secs % SECS_PER_DAY == 0) { leap = -1; time_state = TIME_OOP; printk(KERN_NOTICE @@ -398,19 +424,21 @@ int second_overflow(unsigned long secs) } break; case TIME_DEL: - if (!(time_status & STA_DEL)) + if (!(time_status & STA_DEL)) { + ntp_next_leap_sec = TIME64_MAX; time_state = TIME_OK; - else if ((secs + 1) % 86400 == 0) { + } else if ((secs + 1) % SECS_PER_DAY == 0) { leap = 1; + ntp_next_leap_sec = TIME64_MAX; time_state = TIME_WAIT; printk(KERN_NOTICE "Clock: deleting leap second 23:59:59 UTC\n"); } break; case TIME_OOP: + ntp_next_leap_sec = TIME64_MAX; time_state = TIME_WAIT; break; - case TIME_WAIT: if (!(time_status & (STA_INS | STA_DEL))) time_state = TIME_OK; @@ -547,6 +575,7 @@ static inline void process_adj_status(struct timex *txc, struct timespec64 *ts) if ((time_status & STA_PLL) && !(txc->status & STA_PLL)) { time_state = TIME_OK; time_status = STA_UNSYNC; + ntp_next_leap_sec = TIME64_MAX; /* restart PPS frequency calibration */ pps_reset_freq_interval(); } @@ -711,6 +740,24 @@ int __do_adjtimex(struct timex *txc, struct timespec64 *ts, s32 *time_tai) if (!(time_status & STA_NANO)) txc->time.tv_usec /= NSEC_PER_USEC; + /* Handle leapsec adjustments */ + if (unlikely(ts->tv_sec >= ntp_next_leap_sec)) { + if ((time_state == TIME_INS) && (time_status & STA_INS)) { + result = TIME_OOP; + txc->tai++; + txc->time.tv_sec--; + } + if ((time_state == TIME_DEL) && (time_status & STA_DEL)) { + result = TIME_WAIT; + txc->tai--; + txc->time.tv_sec++; + } + if ((time_state == TIME_OOP) && + (ts->tv_sec == ntp_next_leap_sec)) { + result = TIME_WAIT; + } + } + return result; } diff --git a/kernel/time/ntp_internal.h b/kernel/time/ntp_internal.h index bbd102ad9df7..65430504ca26 100644 --- a/kernel/time/ntp_internal.h +++ b/kernel/time/ntp_internal.h @@ -5,6 +5,7 @@ extern void ntp_init(void); extern void ntp_clear(void); /* Returns how long ticks are at present, in ns / 2^NTP_SCALE_SHIFT. */ extern u64 ntp_tick_length(void); +extern ktime_t ntp_get_next_leap(void); extern int second_overflow(unsigned long secs); extern int ntp_validate_timex(struct timex *); extern int __do_adjtimex(struct timex *, struct timespec64 *, s32 *); diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 31ea01f42e1f..31d11ac9fa47 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -272,13 +272,20 @@ static int posix_get_tai(clockid_t which_clock, struct timespec *tp) return 0; } +static int posix_get_hrtimer_res(clockid_t which_clock, struct timespec *tp) +{ + tp->tv_sec = 0; + tp->tv_nsec = hrtimer_resolution; + return 0; +} + /* * Initialize everything, well, just everything in Posix clocks/timers ;) */ static __init int init_posix_timers(void) { struct k_clock clock_realtime = { - .clock_getres = hrtimer_get_res, + .clock_getres = posix_get_hrtimer_res, .clock_get = posix_clock_realtime_get, .clock_set = posix_clock_realtime_set, .clock_adj = posix_clock_realtime_adj, @@ -290,7 +297,7 @@ static __init int init_posix_timers(void) .timer_del = common_timer_del, }; struct k_clock clock_monotonic = { - .clock_getres = hrtimer_get_res, + .clock_getres = posix_get_hrtimer_res, .clock_get = posix_ktime_get_ts, .nsleep = common_nsleep, .nsleep_restart = hrtimer_nanosleep_restart, @@ -300,7 +307,7 @@ static __init int init_posix_timers(void) .timer_del = common_timer_del, }; struct k_clock clock_monotonic_raw = { - .clock_getres = hrtimer_get_res, + .clock_getres = posix_get_hrtimer_res, .clock_get = posix_get_monotonic_raw, }; struct k_clock clock_realtime_coarse = { @@ -312,7 +319,7 @@ static __init int init_posix_timers(void) .clock_get = posix_get_monotonic_coarse, }; struct k_clock clock_tai = { - .clock_getres = hrtimer_get_res, + .clock_getres = posix_get_hrtimer_res, .clock_get = posix_get_tai, .nsleep = common_nsleep, .nsleep_restart = hrtimer_nanosleep_restart, @@ -322,7 +329,7 @@ static __init int init_posix_timers(void) .timer_del = common_timer_del, }; struct k_clock clock_boottime = { - .clock_getres = hrtimer_get_res, + .clock_getres = posix_get_hrtimer_res, .clock_get = posix_get_boottime, .nsleep = common_nsleep, .nsleep_restart = hrtimer_nanosleep_restart, diff --git a/kernel/time/tick-broadcast-hrtimer.c b/kernel/time/tick-broadcast-hrtimer.c index 6aac4beedbbe..3e7db49a2381 100644 --- a/kernel/time/tick-broadcast-hrtimer.c +++ b/kernel/time/tick-broadcast-hrtimer.c @@ -22,6 +22,7 @@ static void bc_set_mode(enum clock_event_mode mode, struct clock_event_device *bc) { switch (mode) { + case CLOCK_EVT_MODE_UNUSED: case CLOCK_EVT_MODE_SHUTDOWN: /* * Note, we cannot cancel the timer here as we might @@ -66,9 +67,11 @@ static int bc_set_next(ktime_t expires, struct clock_event_device *bc) * hrtimer_{start/cancel} functions call into tracing, * calls to these functions must be bound within RCU_NONIDLE. */ - RCU_NONIDLE(bc_moved = (hrtimer_try_to_cancel(&bctimer) >= 0) ? - !hrtimer_start(&bctimer, expires, HRTIMER_MODE_ABS_PINNED) : - 0); + RCU_NONIDLE({ + bc_moved = hrtimer_try_to_cancel(&bctimer) >= 0; + if (bc_moved) + hrtimer_start(&bctimer, expires, + HRTIMER_MODE_ABS_PINNED);}); if (bc_moved) { /* Bind the "device" to the cpu */ bc->bound_on = smp_processor_id(); @@ -99,10 +102,13 @@ static enum hrtimer_restart bc_handler(struct hrtimer *t) { ce_broadcast_hrtimer.event_handler(&ce_broadcast_hrtimer); - if (ce_broadcast_hrtimer.next_event.tv64 == KTIME_MAX) + switch (ce_broadcast_hrtimer.mode) { + case CLOCK_EVT_MODE_ONESHOT: + if (ce_broadcast_hrtimer.next_event.tv64 != KTIME_MAX) + return HRTIMER_RESTART; + default: return HRTIMER_NORESTART; - - return HRTIMER_RESTART; + } } void tick_setup_hrtimer_broadcast(void) diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 7e8ca4f448a8..d39f32cdd1b5 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -255,18 +255,18 @@ int tick_receive_broadcast(void) /* * Broadcast the event to the cpus, which are set in the mask (mangled). */ -static void tick_do_broadcast(struct cpumask *mask) +static bool tick_do_broadcast(struct cpumask *mask) { int cpu = smp_processor_id(); struct tick_device *td; + bool local = false; /* * Check, if the current cpu is in the mask */ if (cpumask_test_cpu(cpu, mask)) { cpumask_clear_cpu(cpu, mask); - td = &per_cpu(tick_cpu_device, cpu); - td->evtdev->event_handler(td->evtdev); + local = true; } if (!cpumask_empty(mask)) { @@ -279,16 +279,17 @@ static void tick_do_broadcast(struct cpumask *mask) td = &per_cpu(tick_cpu_device, cpumask_first(mask)); td->evtdev->broadcast(mask); } + return local; } /* * Periodic broadcast: * - invoke the broadcast handlers */ -static void tick_do_periodic_broadcast(void) +static bool tick_do_periodic_broadcast(void) { cpumask_and(tmpmask, cpu_online_mask, tick_broadcast_mask); - tick_do_broadcast(tmpmask); + return tick_do_broadcast(tmpmask); } /* @@ -296,34 +297,26 @@ static void tick_do_periodic_broadcast(void) */ static void tick_handle_periodic_broadcast(struct clock_event_device *dev) { - ktime_t next; + struct tick_device *td = this_cpu_ptr(&tick_cpu_device); + bool bc_local; raw_spin_lock(&tick_broadcast_lock); + bc_local = tick_do_periodic_broadcast(); - tick_do_periodic_broadcast(); + if (clockevent_state_oneshot(dev)) { + ktime_t next = ktime_add(dev->next_event, tick_period); - /* - * The device is in periodic mode. No reprogramming necessary: - */ - if (dev->state == CLOCK_EVT_STATE_PERIODIC) - goto unlock; + clockevents_program_event(dev, next, true); + } + raw_spin_unlock(&tick_broadcast_lock); /* - * Setup the next period for devices, which do not have - * periodic mode. We read dev->next_event first and add to it - * when the event already expired. clockevents_program_event() - * sets dev->next_event only when the event is really - * programmed to the device. + * We run the handler of the local cpu after dropping + * tick_broadcast_lock because the handler might deadlock when + * trying to switch to oneshot mode. */ - for (next = dev->next_event; ;) { - next = ktime_add(next, tick_period); - - if (!clockevents_program_event(dev, next, false)) - goto unlock; - tick_do_periodic_broadcast(); - } -unlock: - raw_spin_unlock(&tick_broadcast_lock); + if (bc_local) + td->evtdev->event_handler(td->evtdev); } /** @@ -532,23 +525,19 @@ static void tick_broadcast_set_affinity(struct clock_event_device *bc, irq_set_affinity(bc->irq, bc->cpumask); } -static int tick_broadcast_set_event(struct clock_event_device *bc, int cpu, - ktime_t expires, int force) +static void tick_broadcast_set_event(struct clock_event_device *bc, int cpu, + ktime_t expires) { - int ret; - - if (bc->state != CLOCK_EVT_STATE_ONESHOT) - clockevents_set_state(bc, CLOCK_EVT_STATE_ONESHOT); + if (!clockevent_state_oneshot(bc)) + clockevents_switch_state(bc, CLOCK_EVT_STATE_ONESHOT); - ret = clockevents_program_event(bc, expires, force); - if (!ret) - tick_broadcast_set_affinity(bc, cpumask_of(cpu)); - return ret; + clockevents_program_event(bc, expires, 1); + tick_broadcast_set_affinity(bc, cpumask_of(cpu)); } static void tick_resume_broadcast_oneshot(struct clock_event_device *bc) { - clockevents_set_state(bc, CLOCK_EVT_STATE_ONESHOT); + clockevents_switch_state(bc, CLOCK_EVT_STATE_ONESHOT); } /* @@ -566,7 +555,7 @@ void tick_check_oneshot_broadcast_this_cpu(void) * switched over, leave the device alone. */ if (td->mode == TICKDEV_MODE_ONESHOT) { - clockevents_set_state(td->evtdev, + clockevents_switch_state(td->evtdev, CLOCK_EVT_STATE_ONESHOT); } } @@ -580,9 +569,9 @@ static void tick_handle_oneshot_broadcast(struct clock_event_device *dev) struct tick_device *td; ktime_t now, next_event; int cpu, next_cpu = 0; + bool bc_local; raw_spin_lock(&tick_broadcast_lock); -again: dev->next_event.tv64 = KTIME_MAX; next_event.tv64 = KTIME_MAX; cpumask_clear(tmpmask); @@ -624,7 +613,7 @@ again: /* * Wakeup the cpus which have an expired event. */ - tick_do_broadcast(tmpmask); + bc_local = tick_do_broadcast(tmpmask); /* * Two reasons for reprogram: @@ -636,15 +625,15 @@ again: * - There are pending events on sleeping CPUs which were not * in the event mask */ - if (next_event.tv64 != KTIME_MAX) { - /* - * Rearm the broadcast device. If event expired, - * repeat the above - */ - if (tick_broadcast_set_event(dev, next_cpu, next_event, 0)) - goto again; - } + if (next_event.tv64 != KTIME_MAX) + tick_broadcast_set_event(dev, next_cpu, next_event); + raw_spin_unlock(&tick_broadcast_lock); + + if (bc_local) { + td = this_cpu_ptr(&tick_cpu_device); + td->evtdev->event_handler(td->evtdev); + } } static int broadcast_needs_cpu(struct clock_event_device *bc, int cpu) @@ -670,7 +659,7 @@ static void broadcast_shutdown_local(struct clock_event_device *bc, if (dev->next_event.tv64 < bc->next_event.tv64) return; } - clockevents_set_state(dev, CLOCK_EVT_STATE_SHUTDOWN); + clockevents_switch_state(dev, CLOCK_EVT_STATE_SHUTDOWN); } /** @@ -726,7 +715,7 @@ int tick_broadcast_oneshot_control(enum tick_broadcast_state state) */ if (!cpumask_test_cpu(cpu, tick_broadcast_force_mask) && dev->next_event.tv64 < bc->next_event.tv64) - tick_broadcast_set_event(bc, cpu, dev->next_event, 1); + tick_broadcast_set_event(bc, cpu, dev->next_event); } /* * If the current CPU owns the hrtimer broadcast @@ -740,7 +729,7 @@ int tick_broadcast_oneshot_control(enum tick_broadcast_state state) cpumask_clear_cpu(cpu, tick_broadcast_oneshot_mask); } else { if (cpumask_test_and_clear_cpu(cpu, tick_broadcast_oneshot_mask)) { - clockevents_set_state(dev, CLOCK_EVT_STATE_ONESHOT); + clockevents_switch_state(dev, CLOCK_EVT_STATE_ONESHOT); /* * The cpu which was handling the broadcast * timer marked this cpu in the broadcast @@ -842,7 +831,7 @@ void tick_broadcast_setup_oneshot(struct clock_event_device *bc) /* Set it up only once ! */ if (bc->event_handler != tick_handle_oneshot_broadcast) { - int was_periodic = bc->state == CLOCK_EVT_STATE_PERIODIC; + int was_periodic = clockevent_state_periodic(bc); bc->event_handler = tick_handle_oneshot_broadcast; @@ -858,10 +847,10 @@ void tick_broadcast_setup_oneshot(struct clock_event_device *bc) tick_broadcast_oneshot_mask, tmpmask); if (was_periodic && !cpumask_empty(tmpmask)) { - clockevents_set_state(bc, CLOCK_EVT_STATE_ONESHOT); + clockevents_switch_state(bc, CLOCK_EVT_STATE_ONESHOT); tick_broadcast_init_next_event(tmpmask, tick_next_period); - tick_broadcast_set_event(bc, cpu, tick_next_period, 1); + tick_broadcast_set_event(bc, cpu, tick_next_period); } else bc->next_event.tv64 = KTIME_MAX; } else { diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index 3ae6afa1eb98..17f144450050 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -102,7 +102,17 @@ void tick_handle_periodic(struct clock_event_device *dev) tick_periodic(cpu); - if (dev->state != CLOCK_EVT_STATE_ONESHOT) +#if defined(CONFIG_HIGH_RES_TIMERS) || defined(CONFIG_NO_HZ_COMMON) + /* + * The cpu might have transitioned to HIGHRES or NOHZ mode via + * update_process_times() -> run_local_timers() -> + * hrtimer_run_queues(). + */ + if (dev->event_handler != tick_handle_periodic) + return; +#endif + + if (!clockevent_state_oneshot(dev)) return; for (;;) { /* @@ -140,7 +150,7 @@ void tick_setup_periodic(struct clock_event_device *dev, int broadcast) if ((dev->features & CLOCK_EVT_FEAT_PERIODIC) && !tick_broadcast_oneshot_active()) { - clockevents_set_state(dev, CLOCK_EVT_STATE_PERIODIC); + clockevents_switch_state(dev, CLOCK_EVT_STATE_PERIODIC); } else { unsigned long seq; ktime_t next; @@ -150,7 +160,7 @@ void tick_setup_periodic(struct clock_event_device *dev, int broadcast) next = tick_next_period; } while (read_seqretry(&jiffies_lock, seq)); - clockevents_set_state(dev, CLOCK_EVT_STATE_ONESHOT); + clockevents_switch_state(dev, CLOCK_EVT_STATE_ONESHOT); for (;;) { if (!clockevents_program_event(dev, next, false)) @@ -367,7 +377,7 @@ void tick_shutdown(unsigned int cpu) * Prevent that the clock events layer tries to call * the set mode function! */ - dev->state = CLOCK_EVT_STATE_DETACHED; + clockevent_set_state(dev, CLOCK_EVT_STATE_DETACHED); dev->mode = CLOCK_EVT_MODE_UNUSED; clockevents_exchange_device(dev, NULL); dev->event_handler = clockevents_handle_noop; diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index b64fdd8054c5..966a5a6fdd0a 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -36,11 +36,22 @@ static inline int tick_device_is_functional(struct clock_event_device *dev) return !(dev->features & CLOCK_EVT_FEAT_DUMMY); } +static inline enum clock_event_state clockevent_get_state(struct clock_event_device *dev) +{ + return dev->state_use_accessors; +} + +static inline void clockevent_set_state(struct clock_event_device *dev, + enum clock_event_state state) +{ + dev->state_use_accessors = state; +} + extern void clockevents_shutdown(struct clock_event_device *dev); extern void clockevents_exchange_device(struct clock_event_device *old, struct clock_event_device *new); -extern void clockevents_set_state(struct clock_event_device *dev, - enum clock_event_state state); +extern void clockevents_switch_state(struct clock_event_device *dev, + enum clock_event_state state); extern int clockevents_program_event(struct clock_event_device *dev, ktime_t expires, bool force); extern void clockevents_handle_noop(struct clock_event_device *dev); @@ -137,3 +148,19 @@ extern void tick_nohz_init(void); # else static inline void tick_nohz_init(void) { } #endif + +#ifdef CONFIG_NO_HZ_COMMON +extern unsigned long tick_nohz_active; +#else +#define tick_nohz_active (0) +#endif + +#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) +extern void timers_update_migration(bool update_nohz); +#else +static inline void timers_update_migration(bool update_nohz) { } +#endif + +DECLARE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases); + +extern u64 get_next_timer_interrupt(unsigned long basej, u64 basem); diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c index 67a64b1670bf..b51344652330 100644 --- a/kernel/time/tick-oneshot.c +++ b/kernel/time/tick-oneshot.c @@ -28,6 +28,22 @@ int tick_program_event(ktime_t expires, int force) { struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev); + if (unlikely(expires.tv64 == KTIME_MAX)) { + /* + * We don't need the clock event device any more, stop it. + */ + clockevents_switch_state(dev, CLOCK_EVT_STATE_ONESHOT_STOPPED); + return 0; + } + + if (unlikely(clockevent_state_oneshot_stopped(dev))) { + /* + * We need the clock event again, configure it in ONESHOT mode + * before using it. + */ + clockevents_switch_state(dev, CLOCK_EVT_STATE_ONESHOT); + } + return clockevents_program_event(dev, expires, force); } @@ -38,7 +54,7 @@ void tick_resume_oneshot(void) { struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev); - clockevents_set_state(dev, CLOCK_EVT_STATE_ONESHOT); + clockevents_switch_state(dev, CLOCK_EVT_STATE_ONESHOT); clockevents_program_event(dev, ktime_get(), true); } @@ -50,7 +66,7 @@ void tick_setup_oneshot(struct clock_event_device *newdev, ktime_t next_event) { newdev->event_handler = handler; - clockevents_set_state(newdev, CLOCK_EVT_STATE_ONESHOT); + clockevents_switch_state(newdev, CLOCK_EVT_STATE_ONESHOT); clockevents_program_event(newdev, next_event, true); } @@ -81,7 +97,7 @@ int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *)) td->mode = TICKDEV_MODE_ONESHOT; dev->event_handler = handler; - clockevents_set_state(dev, CLOCK_EVT_STATE_ONESHOT); + clockevents_switch_state(dev, CLOCK_EVT_STATE_ONESHOT); tick_broadcast_switch_to_oneshot(); return 0; } diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 914259128145..c792429e98c6 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -399,7 +399,7 @@ void __init tick_nohz_init(void) * NO HZ enabled ? */ static int tick_nohz_enabled __read_mostly = 1; -int tick_nohz_active __read_mostly; +unsigned long tick_nohz_active __read_mostly; /* * Enable / Disable tickless mode */ @@ -565,156 +565,144 @@ u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time) } EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us); +static void tick_nohz_restart(struct tick_sched *ts, ktime_t now) +{ + hrtimer_cancel(&ts->sched_timer); + hrtimer_set_expires(&ts->sched_timer, ts->last_tick); + + /* Forward the time to expire in the future */ + hrtimer_forward(&ts->sched_timer, now, tick_period); + + if (ts->nohz_mode == NOHZ_MODE_HIGHRES) + hrtimer_start_expires(&ts->sched_timer, HRTIMER_MODE_ABS_PINNED); + else + tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1); +} + static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, ktime_t now, int cpu) { - unsigned long seq, last_jiffies, next_jiffies, delta_jiffies; - ktime_t last_update, expires, ret = { .tv64 = 0 }; - unsigned long rcu_delta_jiffies; struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev); - u64 time_delta; - - time_delta = timekeeping_max_deferment(); + u64 basemono, next_tick, next_tmr, next_rcu, delta, expires; + unsigned long seq, basejiff; + ktime_t tick; /* Read jiffies and the time when jiffies were updated last */ do { seq = read_seqbegin(&jiffies_lock); - last_update = last_jiffies_update; - last_jiffies = jiffies; + basemono = last_jiffies_update.tv64; + basejiff = jiffies; } while (read_seqretry(&jiffies_lock, seq)); + ts->last_jiffies = basejiff; - if (rcu_needs_cpu(&rcu_delta_jiffies) || + if (rcu_needs_cpu(basemono, &next_rcu) || arch_needs_cpu() || irq_work_needs_cpu()) { - next_jiffies = last_jiffies + 1; - delta_jiffies = 1; + next_tick = basemono + TICK_NSEC; } else { - /* Get the next timer wheel timer */ - next_jiffies = get_next_timer_interrupt(last_jiffies); - delta_jiffies = next_jiffies - last_jiffies; - if (rcu_delta_jiffies < delta_jiffies) { - next_jiffies = last_jiffies + rcu_delta_jiffies; - delta_jiffies = rcu_delta_jiffies; - } + /* + * Get the next pending timer. If high resolution + * timers are enabled this only takes the timer wheel + * timers into account. If high resolution timers are + * disabled this also looks at the next expiring + * hrtimer. + */ + next_tmr = get_next_timer_interrupt(basejiff, basemono); + ts->next_timer = next_tmr; + /* Take the next rcu event into account */ + next_tick = next_rcu < next_tmr ? next_rcu : next_tmr; } /* - * Do not stop the tick, if we are only one off (or less) - * or if the cpu is required for RCU: + * If the tick is due in the next period, keep it ticking or + * restart it proper. */ - if (!ts->tick_stopped && delta_jiffies <= 1) - goto out; - - /* Schedule the tick, if we are at least one jiffie off */ - if ((long)delta_jiffies >= 1) { - - /* - * If this cpu is the one which updates jiffies, then - * give up the assignment and let it be taken by the - * cpu which runs the tick timer next, which might be - * this cpu as well. If we don't drop this here the - * jiffies might be stale and do_timer() never - * invoked. Keep track of the fact that it was the one - * which had the do_timer() duty last. If this cpu is - * the one which had the do_timer() duty last, we - * limit the sleep time to the timekeeping - * max_deferement value which we retrieved - * above. Otherwise we can sleep as long as we want. - */ - if (cpu == tick_do_timer_cpu) { - tick_do_timer_cpu = TICK_DO_TIMER_NONE; - ts->do_timer_last = 1; - } else if (tick_do_timer_cpu != TICK_DO_TIMER_NONE) { - time_delta = KTIME_MAX; - ts->do_timer_last = 0; - } else if (!ts->do_timer_last) { - time_delta = KTIME_MAX; + delta = next_tick - basemono; + if (delta <= (u64)TICK_NSEC) { + tick.tv64 = 0; + if (!ts->tick_stopped) + goto out; + if (delta == 0) { + /* Tick is stopped, but required now. Enforce it */ + tick_nohz_restart(ts, now); + goto out; } + } + + /* + * If this cpu is the one which updates jiffies, then give up + * the assignment and let it be taken by the cpu which runs + * the tick timer next, which might be this cpu as well. If we + * don't drop this here the jiffies might be stale and + * do_timer() never invoked. Keep track of the fact that it + * was the one which had the do_timer() duty last. If this cpu + * is the one which had the do_timer() duty last, we limit the + * sleep time to the timekeeping max_deferement value. + * Otherwise we can sleep as long as we want. + */ + delta = timekeeping_max_deferment(); + if (cpu == tick_do_timer_cpu) { + tick_do_timer_cpu = TICK_DO_TIMER_NONE; + ts->do_timer_last = 1; + } else if (tick_do_timer_cpu != TICK_DO_TIMER_NONE) { + delta = KTIME_MAX; + ts->do_timer_last = 0; + } else if (!ts->do_timer_last) { + delta = KTIME_MAX; + } #ifdef CONFIG_NO_HZ_FULL - if (!ts->inidle) { - time_delta = min(time_delta, - scheduler_tick_max_deferment()); - } + /* Limit the tick delta to the maximum scheduler deferment */ + if (!ts->inidle) + delta = min(delta, scheduler_tick_max_deferment()); #endif - /* - * calculate the expiry time for the next timer wheel - * timer. delta_jiffies >= NEXT_TIMER_MAX_DELTA signals - * that there is no timer pending or at least extremely - * far into the future (12 days for HZ=1000). In this - * case we set the expiry to the end of time. - */ - if (likely(delta_jiffies < NEXT_TIMER_MAX_DELTA)) { - /* - * Calculate the time delta for the next timer event. - * If the time delta exceeds the maximum time delta - * permitted by the current clocksource then adjust - * the time delta accordingly to ensure the - * clocksource does not wrap. - */ - time_delta = min_t(u64, time_delta, - tick_period.tv64 * delta_jiffies); - } - - if (time_delta < KTIME_MAX) - expires = ktime_add_ns(last_update, time_delta); - else - expires.tv64 = KTIME_MAX; - - /* Skip reprogram of event if its not changed */ - if (ts->tick_stopped && ktime_equal(expires, dev->next_event)) - goto out; + /* Calculate the next expiry time */ + if (delta < (KTIME_MAX - basemono)) + expires = basemono + delta; + else + expires = KTIME_MAX; - ret = expires; + expires = min_t(u64, expires, next_tick); + tick.tv64 = expires; - /* - * nohz_stop_sched_tick can be called several times before - * the nohz_restart_sched_tick is called. This happens when - * interrupts arrive which do not cause a reschedule. In the - * first call we save the current tick time, so we can restart - * the scheduler tick in nohz_restart_sched_tick. - */ - if (!ts->tick_stopped) { - nohz_balance_enter_idle(cpu); - calc_load_enter_idle(); + /* Skip reprogram of event if its not changed */ + if (ts->tick_stopped && (expires == dev->next_event.tv64)) + goto out; - ts->last_tick = hrtimer_get_expires(&ts->sched_timer); - ts->tick_stopped = 1; - trace_tick_stop(1, " "); - } + /* + * nohz_stop_sched_tick can be called several times before + * the nohz_restart_sched_tick is called. This happens when + * interrupts arrive which do not cause a reschedule. In the + * first call we save the current tick time, so we can restart + * the scheduler tick in nohz_restart_sched_tick. + */ + if (!ts->tick_stopped) { + nohz_balance_enter_idle(cpu); + calc_load_enter_idle(); - /* - * If the expiration time == KTIME_MAX, then - * in this case we simply stop the tick timer. - */ - if (unlikely(expires.tv64 == KTIME_MAX)) { - if (ts->nohz_mode == NOHZ_MODE_HIGHRES) - hrtimer_cancel(&ts->sched_timer); - goto out; - } + ts->last_tick = hrtimer_get_expires(&ts->sched_timer); + ts->tick_stopped = 1; + trace_tick_stop(1, " "); + } - if (ts->nohz_mode == NOHZ_MODE_HIGHRES) { - hrtimer_start(&ts->sched_timer, expires, - HRTIMER_MODE_ABS_PINNED); - /* Check, if the timer was already in the past */ - if (hrtimer_active(&ts->sched_timer)) - goto out; - } else if (!tick_program_event(expires, 0)) - goto out; - /* - * We are past the event already. So we crossed a - * jiffie boundary. Update jiffies and raise the - * softirq. - */ - tick_do_update_jiffies64(ktime_get()); + /* + * If the expiration time == KTIME_MAX, then we simply stop + * the tick timer. + */ + if (unlikely(expires == KTIME_MAX)) { + if (ts->nohz_mode == NOHZ_MODE_HIGHRES) + hrtimer_cancel(&ts->sched_timer); + goto out; } - raise_softirq_irqoff(TIMER_SOFTIRQ); + + if (ts->nohz_mode == NOHZ_MODE_HIGHRES) + hrtimer_start(&ts->sched_timer, tick, HRTIMER_MODE_ABS_PINNED); + else + tick_program_event(tick, 1); out: - ts->next_jiffies = next_jiffies; - ts->last_jiffies = last_jiffies; + /* Update the estimated sleep length */ ts->sleep_length = ktime_sub(dev->next_event, now); - - return ret; + return tick; } static void tick_nohz_full_stop_tick(struct tick_sched *ts) @@ -876,32 +864,6 @@ ktime_t tick_nohz_get_sleep_length(void) return ts->sleep_length; } -static void tick_nohz_restart(struct tick_sched *ts, ktime_t now) -{ - hrtimer_cancel(&ts->sched_timer); - hrtimer_set_expires(&ts->sched_timer, ts->last_tick); - - while (1) { - /* Forward the time to expire in the future */ - hrtimer_forward(&ts->sched_timer, now, tick_period); - - if (ts->nohz_mode == NOHZ_MODE_HIGHRES) { - hrtimer_start_expires(&ts->sched_timer, - HRTIMER_MODE_ABS_PINNED); - /* Check, if the timer was already in the past */ - if (hrtimer_active(&ts->sched_timer)) - break; - } else { - if (!tick_program_event( - hrtimer_get_expires(&ts->sched_timer), 0)) - break; - } - /* Reread time and update jiffies */ - now = ktime_get(); - tick_do_update_jiffies64(now); - } -} - static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now) { /* Update jiffies first */ @@ -972,12 +934,6 @@ void tick_nohz_idle_exit(void) local_irq_enable(); } -static int tick_nohz_reprogram(struct tick_sched *ts, ktime_t now) -{ - hrtimer_forward(&ts->sched_timer, now, tick_period); - return tick_program_event(hrtimer_get_expires(&ts->sched_timer), 0); -} - /* * The nohz low res interrupt handler */ @@ -996,10 +952,18 @@ static void tick_nohz_handler(struct clock_event_device *dev) if (unlikely(ts->tick_stopped)) return; - while (tick_nohz_reprogram(ts, now)) { - now = ktime_get(); - tick_do_update_jiffies64(now); - } + hrtimer_forward(&ts->sched_timer, now, tick_period); + tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1); +} + +static inline void tick_nohz_activate(struct tick_sched *ts, int mode) +{ + if (!tick_nohz_enabled) + return; + ts->nohz_mode = mode; + /* One update is enough */ + if (!test_and_set_bit(0, &tick_nohz_active)) + timers_update_migration(true); } /** @@ -1013,13 +977,8 @@ static void tick_nohz_switch_to_nohz(void) if (!tick_nohz_enabled) return; - local_irq_disable(); - if (tick_switch_to_oneshot(tick_nohz_handler)) { - local_irq_enable(); + if (tick_switch_to_oneshot(tick_nohz_handler)) return; - } - tick_nohz_active = 1; - ts->nohz_mode = NOHZ_MODE_LOWRES; /* * Recycle the hrtimer in ts, so we can share the @@ -1029,13 +988,10 @@ static void tick_nohz_switch_to_nohz(void) /* Get the next period */ next = tick_init_jiffy_update(); - for (;;) { - hrtimer_set_expires(&ts->sched_timer, next); - if (!tick_program_event(next, 0)) - break; - next = ktime_add(next, tick_period); - } - local_irq_enable(); + hrtimer_forward_now(&ts->sched_timer, tick_period); + hrtimer_set_expires(&ts->sched_timer, next); + tick_program_event(next, 1); + tick_nohz_activate(ts, NOHZ_MODE_LOWRES); } /* @@ -1087,6 +1043,7 @@ static inline void tick_nohz_irq_enter(void) static inline void tick_nohz_switch_to_nohz(void) { } static inline void tick_nohz_irq_enter(void) { } +static inline void tick_nohz_activate(struct tick_sched *ts, int mode) { } #endif /* CONFIG_NO_HZ_COMMON */ @@ -1167,22 +1124,9 @@ void tick_setup_sched_timer(void) hrtimer_add_expires_ns(&ts->sched_timer, offset); } - for (;;) { - hrtimer_forward(&ts->sched_timer, now, tick_period); - hrtimer_start_expires(&ts->sched_timer, - HRTIMER_MODE_ABS_PINNED); - /* Check, if the timer was already in the past */ - if (hrtimer_active(&ts->sched_timer)) - break; - now = ktime_get(); - } - -#ifdef CONFIG_NO_HZ_COMMON - if (tick_nohz_enabled) { - ts->nohz_mode = NOHZ_MODE_HIGHRES; - tick_nohz_active = 1; - } -#endif + hrtimer_forward(&ts->sched_timer, now, tick_period); + hrtimer_start_expires(&ts->sched_timer, HRTIMER_MODE_ABS_PINNED); + tick_nohz_activate(ts, NOHZ_MODE_HIGHRES); } #endif /* HIGH_RES_TIMERS */ @@ -1227,7 +1171,7 @@ void tick_oneshot_notify(void) * Called cyclic from the hrtimer softirq (driven by the timer * softirq) allow_nohz signals, that we can switch into low-res nohz * mode, because high resolution timers are disabled (either compile - * or runtime). + * or runtime). Called with interrupts disabled. */ int tick_check_oneshot_change(int allow_nohz) { diff --git a/kernel/time/tick-sched.h b/kernel/time/tick-sched.h index 28b5da3e1a17..42fdf4958bcc 100644 --- a/kernel/time/tick-sched.h +++ b/kernel/time/tick-sched.h @@ -57,7 +57,7 @@ struct tick_sched { ktime_t iowait_sleeptime; ktime_t sleep_length; unsigned long last_jiffies; - unsigned long next_jiffies; + u64 next_timer; ktime_t idle_expires; int do_timer_last; }; diff --git a/kernel/time/time.c b/kernel/time/time.c index 2c85b7724af4..85d5bb1d67eb 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c @@ -41,7 +41,7 @@ #include <asm/uaccess.h> #include <asm/unistd.h> -#include "timeconst.h" +#include <generated/timeconst.h> #include "timekeeping.h" /* @@ -173,6 +173,10 @@ int do_sys_settimeofday(const struct timespec *tv, const struct timezone *tz) return error; if (tz) { + /* Verify we're witin the +-15 hrs range */ + if (tz->tz_minuteswest > 15*60 || tz->tz_minuteswest < -15*60) + return -EINVAL; + sys_tz = *tz; update_vsyscall_tz(); if (firsttime) { @@ -483,9 +487,11 @@ struct timespec64 ns_to_timespec64(const s64 nsec) } EXPORT_SYMBOL(ns_to_timespec64); #endif -/* - * When we convert to jiffies then we interpret incoming values - * the following way: +/** + * msecs_to_jiffies: - convert milliseconds to jiffies + * @m: time in milliseconds + * + * conversion is done as follows: * * - negative values mean 'infinite timeout' (MAX_JIFFY_OFFSET) * @@ -493,66 +499,36 @@ EXPORT_SYMBOL(ns_to_timespec64); * MAX_JIFFY_OFFSET values] mean 'infinite timeout' too. * * - all other values are converted to jiffies by either multiplying - * the input value by a factor or dividing it with a factor - * - * We must also be careful about 32-bit overflows. + * the input value by a factor or dividing it with a factor and + * handling any 32-bit overflows. + * for the details see __msecs_to_jiffies() + * + * msecs_to_jiffies() checks for the passed in value being a constant + * via __builtin_constant_p() allowing gcc to eliminate most of the + * code, __msecs_to_jiffies() is called if the value passed does not + * allow constant folding and the actual conversion must be done at + * runtime. + * the _msecs_to_jiffies helpers are the HZ dependent conversion + * routines found in include/linux/jiffies.h */ -unsigned long msecs_to_jiffies(const unsigned int m) +unsigned long __msecs_to_jiffies(const unsigned int m) { /* * Negative value, means infinite timeout: */ if ((int)m < 0) return MAX_JIFFY_OFFSET; - -#if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ) - /* - * HZ is equal to or smaller than 1000, and 1000 is a nice - * round multiple of HZ, divide with the factor between them, - * but round upwards: - */ - return (m + (MSEC_PER_SEC / HZ) - 1) / (MSEC_PER_SEC / HZ); -#elif HZ > MSEC_PER_SEC && !(HZ % MSEC_PER_SEC) - /* - * HZ is larger than 1000, and HZ is a nice round multiple of - * 1000 - simply multiply with the factor between them. - * - * But first make sure the multiplication result cannot - * overflow: - */ - if (m > jiffies_to_msecs(MAX_JIFFY_OFFSET)) - return MAX_JIFFY_OFFSET; - - return m * (HZ / MSEC_PER_SEC); -#else - /* - * Generic case - multiply, round and divide. But first - * check that if we are doing a net multiplication, that - * we wouldn't overflow: - */ - if (HZ > MSEC_PER_SEC && m > jiffies_to_msecs(MAX_JIFFY_OFFSET)) - return MAX_JIFFY_OFFSET; - - return (MSEC_TO_HZ_MUL32 * m + MSEC_TO_HZ_ADJ32) - >> MSEC_TO_HZ_SHR32; -#endif + return _msecs_to_jiffies(m); } -EXPORT_SYMBOL(msecs_to_jiffies); +EXPORT_SYMBOL(__msecs_to_jiffies); -unsigned long usecs_to_jiffies(const unsigned int u) +unsigned long __usecs_to_jiffies(const unsigned int u) { if (u > jiffies_to_usecs(MAX_JIFFY_OFFSET)) return MAX_JIFFY_OFFSET; -#if HZ <= USEC_PER_SEC && !(USEC_PER_SEC % HZ) - return (u + (USEC_PER_SEC / HZ) - 1) / (USEC_PER_SEC / HZ); -#elif HZ > USEC_PER_SEC && !(HZ % USEC_PER_SEC) - return u * (HZ / USEC_PER_SEC); -#else - return (USEC_TO_HZ_MUL32 * u + USEC_TO_HZ_ADJ32) - >> USEC_TO_HZ_SHR32; -#endif + return _usecs_to_jiffies(u); } -EXPORT_SYMBOL(usecs_to_jiffies); +EXPORT_SYMBOL(__usecs_to_jiffies); /* * The TICK_NSEC - 1 rounds up the value to the next resolution. Note diff --git a/kernel/time/timeconst.bc b/kernel/time/timeconst.bc index 511bdf2cafda..c7388dee8635 100644 --- a/kernel/time/timeconst.bc +++ b/kernel/time/timeconst.bc @@ -50,7 +50,7 @@ define timeconst(hz) { print "#include <linux/types.h>\n\n" print "#if HZ != ", hz, "\n" - print "#error \qkernel/timeconst.h has the wrong HZ value!\q\n" + print "#error \qinclude/generated/timeconst.h has the wrong HZ value!\q\n" print "#endif\n\n" if (hz < 2) { @@ -105,4 +105,5 @@ define timeconst(hz) { halt } +hz = read(); timeconst(hz) diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 946acb72179f..30b7a409bf1e 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -118,18 +118,6 @@ static inline void tk_update_sleep_time(struct timekeeper *tk, ktime_t delta) #ifdef CONFIG_DEBUG_TIMEKEEPING #define WARNING_FREQ (HZ*300) /* 5 minute rate-limiting */ -/* - * These simple flag variables are managed - * without locks, which is racy, but ok since - * we don't really care about being super - * precise about how many events were seen, - * just that a problem was observed. - */ -static int timekeeping_underflow_seen; -static int timekeeping_overflow_seen; - -/* last_warning is only modified under the timekeeping lock */ -static long timekeeping_last_warning; static void timekeeping_check_update(struct timekeeper *tk, cycle_t offset) { @@ -149,29 +137,30 @@ static void timekeeping_check_update(struct timekeeper *tk, cycle_t offset) } } - if (timekeeping_underflow_seen) { - if (jiffies - timekeeping_last_warning > WARNING_FREQ) { + if (tk->underflow_seen) { + if (jiffies - tk->last_warning > WARNING_FREQ) { printk_deferred("WARNING: Underflow in clocksource '%s' observed, time update ignored.\n", name); printk_deferred(" Please report this, consider using a different clocksource, if possible.\n"); printk_deferred(" Your kernel is probably still fine.\n"); - timekeeping_last_warning = jiffies; + tk->last_warning = jiffies; } - timekeeping_underflow_seen = 0; + tk->underflow_seen = 0; } - if (timekeeping_overflow_seen) { - if (jiffies - timekeeping_last_warning > WARNING_FREQ) { + if (tk->overflow_seen) { + if (jiffies - tk->last_warning > WARNING_FREQ) { printk_deferred("WARNING: Overflow in clocksource '%s' observed, time update capped.\n", name); printk_deferred(" Please report this, consider using a different clocksource, if possible.\n"); printk_deferred(" Your kernel is probably still fine.\n"); - timekeeping_last_warning = jiffies; + tk->last_warning = jiffies; } - timekeeping_overflow_seen = 0; + tk->overflow_seen = 0; } } static inline cycle_t timekeeping_get_delta(struct tk_read_base *tkr) { + struct timekeeper *tk = &tk_core.timekeeper; cycle_t now, last, mask, max, delta; unsigned int seq; @@ -197,13 +186,13 @@ static inline cycle_t timekeeping_get_delta(struct tk_read_base *tkr) * mask-relative negative values. */ if (unlikely((~delta & mask) < (mask >> 3))) { - timekeeping_underflow_seen = 1; + tk->underflow_seen = 1; delta = 0; } /* Cap delta value to the max_cycles values to avoid mult overflows */ if (unlikely(delta > max)) { - timekeeping_overflow_seen = 1; + tk->overflow_seen = 1; delta = tkr->clock->max_cycles; } @@ -551,6 +540,17 @@ int pvclock_gtod_unregister_notifier(struct notifier_block *nb) EXPORT_SYMBOL_GPL(pvclock_gtod_unregister_notifier); /* + * tk_update_leap_state - helper to update the next_leap_ktime + */ +static inline void tk_update_leap_state(struct timekeeper *tk) +{ + tk->next_leap_ktime = ntp_get_next_leap(); + if (tk->next_leap_ktime.tv64 != KTIME_MAX) + /* Convert to monotonic time */ + tk->next_leap_ktime = ktime_sub(tk->next_leap_ktime, tk->offs_real); +} + +/* * Update the ktime_t based scalar nsec members of the timekeeper */ static inline void tk_update_ktime_data(struct timekeeper *tk) @@ -591,17 +591,25 @@ static void timekeeping_update(struct timekeeper *tk, unsigned int action) ntp_clear(); } + tk_update_leap_state(tk); tk_update_ktime_data(tk); update_vsyscall(tk); update_pvclock_gtod(tk, action & TK_CLOCK_WAS_SET); + update_fast_timekeeper(&tk->tkr_mono, &tk_fast_mono); + update_fast_timekeeper(&tk->tkr_raw, &tk_fast_raw); + + if (action & TK_CLOCK_WAS_SET) + tk->clock_was_set_seq++; + /* + * The mirroring of the data to the shadow-timekeeper needs + * to happen last here to ensure we don't over-write the + * timekeeper structure on the next update with stale data + */ if (action & TK_MIRROR) memcpy(&shadow_timekeeper, &tk_core.timekeeper, sizeof(tk_core.timekeeper)); - - update_fast_timekeeper(&tk->tkr_mono, &tk_fast_mono); - update_fast_timekeeper(&tk->tkr_raw, &tk_fast_raw); } /** @@ -699,6 +707,23 @@ ktime_t ktime_get(void) } EXPORT_SYMBOL_GPL(ktime_get); +u32 ktime_get_resolution_ns(void) +{ + struct timekeeper *tk = &tk_core.timekeeper; + unsigned int seq; + u32 nsecs; + + WARN_ON(timekeeping_suspended); + + do { + seq = read_seqcount_begin(&tk_core.seq); + nsecs = tk->tkr_mono.mult >> tk->tkr_mono.shift; + } while (read_seqcount_retry(&tk_core.seq, seq)); + + return nsecs; +} +EXPORT_SYMBOL_GPL(ktime_get_resolution_ns); + static ktime_t *offsets[TK_OFFS_MAX] = { [TK_OFFS_REAL] = &tk_core.timekeeper.offs_real, [TK_OFFS_BOOT] = &tk_core.timekeeper.offs_boot, @@ -1179,28 +1204,20 @@ void __weak read_persistent_clock64(struct timespec64 *ts64) } /** - * read_boot_clock - Return time of the system start. + * read_boot_clock64 - Return time of the system start. * * Weak dummy function for arches that do not yet support it. * Function to read the exact time the system has been started. - * Returns a timespec with tv_sec=0 and tv_nsec=0 if unsupported. + * Returns a timespec64 with tv_sec=0 and tv_nsec=0 if unsupported. * * XXX - Do be sure to remove it once all arches implement it. */ -void __weak read_boot_clock(struct timespec *ts) +void __weak read_boot_clock64(struct timespec64 *ts) { ts->tv_sec = 0; ts->tv_nsec = 0; } -void __weak read_boot_clock64(struct timespec64 *ts64) -{ - struct timespec ts; - - read_boot_clock(&ts); - *ts64 = timespec_to_timespec64(ts); -} - /* Flag for if timekeeping_resume() has injected sleeptime */ static bool sleeptime_injected; @@ -1836,8 +1853,9 @@ void update_wall_time(void) * memcpy under the tk_core.seq against one before we start * updating. */ + timekeeping_update(tk, clock_set); memcpy(real_tk, tk, sizeof(*tk)); - timekeeping_update(real_tk, clock_set); + /* The memcpy must come last. Do not put anything here! */ write_seqcount_end(&tk_core.seq); out: raw_spin_unlock_irqrestore(&timekeeper_lock, flags); @@ -1926,47 +1944,20 @@ void do_timer(unsigned long ticks) } /** - * ktime_get_update_offsets_tick - hrtimer helper - * @offs_real: pointer to storage for monotonic -> realtime offset - * @offs_boot: pointer to storage for monotonic -> boottime offset - * @offs_tai: pointer to storage for monotonic -> clock tai offset - * - * Returns monotonic time at last tick and various offsets - */ -ktime_t ktime_get_update_offsets_tick(ktime_t *offs_real, ktime_t *offs_boot, - ktime_t *offs_tai) -{ - struct timekeeper *tk = &tk_core.timekeeper; - unsigned int seq; - ktime_t base; - u64 nsecs; - - do { - seq = read_seqcount_begin(&tk_core.seq); - - base = tk->tkr_mono.base; - nsecs = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift; - - *offs_real = tk->offs_real; - *offs_boot = tk->offs_boot; - *offs_tai = tk->offs_tai; - } while (read_seqcount_retry(&tk_core.seq, seq)); - - return ktime_add_ns(base, nsecs); -} - -#ifdef CONFIG_HIGH_RES_TIMERS -/** * ktime_get_update_offsets_now - hrtimer helper + * @cwsseq: pointer to check and store the clock was set sequence number * @offs_real: pointer to storage for monotonic -> realtime offset * @offs_boot: pointer to storage for monotonic -> boottime offset * @offs_tai: pointer to storage for monotonic -> clock tai offset * - * Returns current monotonic time and updates the offsets + * Returns current monotonic time and updates the offsets if the + * sequence number in @cwsseq and timekeeper.clock_was_set_seq are + * different. + * * Called from hrtimer_interrupt() or retrigger_next_event() */ -ktime_t ktime_get_update_offsets_now(ktime_t *offs_real, ktime_t *offs_boot, - ktime_t *offs_tai) +ktime_t ktime_get_update_offsets_now(unsigned int *cwsseq, ktime_t *offs_real, + ktime_t *offs_boot, ktime_t *offs_tai) { struct timekeeper *tk = &tk_core.timekeeper; unsigned int seq; @@ -1978,15 +1969,23 @@ ktime_t ktime_get_update_offsets_now(ktime_t *offs_real, ktime_t *offs_boot, base = tk->tkr_mono.base; nsecs = timekeeping_get_ns(&tk->tkr_mono); + base = ktime_add_ns(base, nsecs); + + if (*cwsseq != tk->clock_was_set_seq) { + *cwsseq = tk->clock_was_set_seq; + *offs_real = tk->offs_real; + *offs_boot = tk->offs_boot; + *offs_tai = tk->offs_tai; + } + + /* Handle leapsecond insertion adjustments */ + if (unlikely(base.tv64 >= tk->next_leap_ktime.tv64)) + *offs_real = ktime_sub(tk->offs_real, ktime_set(1, 0)); - *offs_real = tk->offs_real; - *offs_boot = tk->offs_boot; - *offs_tai = tk->offs_tai; } while (read_seqcount_retry(&tk_core.seq, seq)); - return ktime_add_ns(base, nsecs); + return base; } -#endif /** * do_adjtimex() - Accessor function to NTP __do_adjtimex function @@ -2027,6 +2026,8 @@ int do_adjtimex(struct timex *txc) __timekeeping_set_tai_offset(tk, tai); timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET); } + tk_update_leap_state(tk); + write_seqcount_end(&tk_core.seq); raw_spin_unlock_irqrestore(&timekeeper_lock, flags); diff --git a/kernel/time/timekeeping.h b/kernel/time/timekeeping.h index ead8794b9a4e..704f595ce83f 100644 --- a/kernel/time/timekeeping.h +++ b/kernel/time/timekeeping.h @@ -3,19 +3,16 @@ /* * Internal interfaces for kernel/time/ */ -extern ktime_t ktime_get_update_offsets_tick(ktime_t *offs_real, - ktime_t *offs_boot, - ktime_t *offs_tai); -extern ktime_t ktime_get_update_offsets_now(ktime_t *offs_real, - ktime_t *offs_boot, - ktime_t *offs_tai); +extern ktime_t ktime_get_update_offsets_now(unsigned int *cwsseq, + ktime_t *offs_real, + ktime_t *offs_boot, + ktime_t *offs_tai); extern int timekeeping_valid_for_hres(void); extern u64 timekeeping_max_deferment(void); extern int timekeeping_inject_offset(struct timespec *ts); extern s32 timekeeping_get_tai_offset(void); extern void timekeeping_set_tai_offset(s32 tai_offset); -extern void timekeeping_clocktai(struct timespec *ts); extern int timekeeping_suspend(void); extern void timekeeping_resume(void); diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 2ece3aa5069c..520499dd85af 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -49,6 +49,8 @@ #include <asm/timex.h> #include <asm/io.h> +#include "tick-internal.h" + #define CREATE_TRACE_POINTS #include <trace/events/timer.h> @@ -68,11 +70,11 @@ EXPORT_SYMBOL(jiffies_64); #define MAX_TVAL ((unsigned long)((1ULL << (TVR_BITS + 4*TVN_BITS)) - 1)) struct tvec { - struct list_head vec[TVN_SIZE]; + struct hlist_head vec[TVN_SIZE]; }; struct tvec_root { - struct list_head vec[TVR_SIZE]; + struct hlist_head vec[TVR_SIZE]; }; struct tvec_base { @@ -83,6 +85,8 @@ struct tvec_base { unsigned long active_timers; unsigned long all_timers; int cpu; + bool migration_enabled; + bool nohz_active; struct tvec_root tv1; struct tvec tv2; struct tvec tv3; @@ -90,43 +94,60 @@ struct tvec_base { struct tvec tv5; } ____cacheline_aligned; -/* - * __TIMER_INITIALIZER() needs to set ->base to a valid pointer (because we've - * made NULL special, hint: lock_timer_base()) and we cannot get a compile time - * pointer to per-cpu entries because we don't know where we'll map the section, - * even for the boot cpu. - * - * And so we use boot_tvec_bases for boot CPU and per-cpu __tvec_bases for the - * rest of them. - */ -struct tvec_base boot_tvec_bases; -EXPORT_SYMBOL(boot_tvec_bases); -static DEFINE_PER_CPU(struct tvec_base *, tvec_bases) = &boot_tvec_bases; +static DEFINE_PER_CPU(struct tvec_base, tvec_bases); + +#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) +unsigned int sysctl_timer_migration = 1; -/* Functions below help us manage 'deferrable' flag */ -static inline unsigned int tbase_get_deferrable(struct tvec_base *base) +void timers_update_migration(bool update_nohz) { - return ((unsigned int)(unsigned long)base & TIMER_DEFERRABLE); + bool on = sysctl_timer_migration && tick_nohz_active; + unsigned int cpu; + + /* Avoid the loop, if nothing to update */ + if (this_cpu_read(tvec_bases.migration_enabled) == on) + return; + + for_each_possible_cpu(cpu) { + per_cpu(tvec_bases.migration_enabled, cpu) = on; + per_cpu(hrtimer_bases.migration_enabled, cpu) = on; + if (!update_nohz) + continue; + per_cpu(tvec_bases.nohz_active, cpu) = true; + per_cpu(hrtimer_bases.nohz_active, cpu) = true; + } } -static inline unsigned int tbase_get_irqsafe(struct tvec_base *base) +int timer_migration_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) { - return ((unsigned int)(unsigned long)base & TIMER_IRQSAFE); + static DEFINE_MUTEX(mutex); + int ret; + + mutex_lock(&mutex); + ret = proc_dointvec(table, write, buffer, lenp, ppos); + if (!ret && write) + timers_update_migration(false); + mutex_unlock(&mutex); + return ret; } -static inline struct tvec_base *tbase_get_base(struct tvec_base *base) +static inline struct tvec_base *get_target_base(struct tvec_base *base, + int pinned) { - return ((struct tvec_base *)((unsigned long)base & ~TIMER_FLAG_MASK)); + if (pinned || !base->migration_enabled) + return this_cpu_ptr(&tvec_bases); + return per_cpu_ptr(&tvec_bases, get_nohz_timer_target()); } - -static inline void -timer_set_base(struct timer_list *timer, struct tvec_base *new_base) +#else +static inline struct tvec_base *get_target_base(struct tvec_base *base, + int pinned) { - unsigned long flags = (unsigned long)timer->base & TIMER_FLAG_MASK; - - timer->base = (struct tvec_base *)((unsigned long)(new_base) | flags); + return this_cpu_ptr(&tvec_bases); } +#endif static unsigned long round_jiffies_common(unsigned long j, int cpu, bool force_up) @@ -349,26 +370,12 @@ void set_timer_slack(struct timer_list *timer, int slack_hz) } EXPORT_SYMBOL_GPL(set_timer_slack); -/* - * If the list is empty, catch up ->timer_jiffies to the current time. - * The caller must hold the tvec_base lock. Returns true if the list - * was empty and therefore ->timer_jiffies was updated. - */ -static bool catchup_timer_jiffies(struct tvec_base *base) -{ - if (!base->all_timers) { - base->timer_jiffies = jiffies; - return true; - } - return false; -} - static void __internal_add_timer(struct tvec_base *base, struct timer_list *timer) { unsigned long expires = timer->expires; unsigned long idx = expires - base->timer_jiffies; - struct list_head *vec; + struct hlist_head *vec; if (idx < TVR_SIZE) { int i = expires & TVR_MASK; @@ -401,25 +408,25 @@ __internal_add_timer(struct tvec_base *base, struct timer_list *timer) i = (expires >> (TVR_BITS + 3 * TVN_BITS)) & TVN_MASK; vec = base->tv5.vec + i; } - /* - * Timers are FIFO: - */ - list_add_tail(&timer->entry, vec); + + hlist_add_head(&timer->entry, vec); } static void internal_add_timer(struct tvec_base *base, struct timer_list *timer) { - (void)catchup_timer_jiffies(base); + /* Advance base->jiffies, if the base is empty */ + if (!base->all_timers++) + base->timer_jiffies = jiffies; + __internal_add_timer(base, timer); /* * Update base->active_timers and base->next_timer */ - if (!tbase_get_deferrable(timer->base)) { + if (!(timer->flags & TIMER_DEFERRABLE)) { if (!base->active_timers++ || time_before(timer->expires, base->next_timer)) base->next_timer = timer->expires; } - base->all_timers++; /* * Check whether the other CPU is in dynticks mode and needs @@ -434,8 +441,11 @@ static void internal_add_timer(struct tvec_base *base, struct timer_list *timer) * require special care against races with idle_cpu(), lets deal * with that later. */ - if (!tbase_get_deferrable(base) || tick_nohz_full_cpu(base->cpu)) - wake_up_nohz_cpu(base->cpu); + if (base->nohz_active) { + if (!(timer->flags & TIMER_DEFERRABLE) || + tick_nohz_full_cpu(base->cpu)) + wake_up_nohz_cpu(base->cpu); + } } #ifdef CONFIG_TIMER_STATS @@ -451,15 +461,12 @@ void __timer_stats_timer_set_start_info(struct timer_list *timer, void *addr) static void timer_stats_account_timer(struct timer_list *timer) { - unsigned int flag = 0; - if (likely(!timer->start_site)) return; - if (unlikely(tbase_get_deferrable(timer->base))) - flag |= TIMER_STATS_FLAG_DEFERRABLE; timer_stats_update_stats(timer, timer->start_pid, timer->start_site, - timer->function, timer->start_comm, flag); + timer->function, timer->start_comm, + timer->flags); } #else @@ -516,8 +523,8 @@ static int timer_fixup_activate(void *addr, enum debug_obj_state state) * statically initialized. We just make sure that it * is tracked in the object tracker. */ - if (timer->entry.next == NULL && - timer->entry.prev == TIMER_ENTRY_STATIC) { + if (timer->entry.pprev == NULL && + timer->entry.next == TIMER_ENTRY_STATIC) { debug_object_init(timer, &timer_debug_descr); debug_object_activate(timer, &timer_debug_descr); return 0; @@ -563,7 +570,7 @@ static int timer_fixup_assert_init(void *addr, enum debug_obj_state state) switch (state) { case ODEBUG_STATE_NOTAVAILABLE: - if (timer->entry.prev == TIMER_ENTRY_STATIC) { + if (timer->entry.next == TIMER_ENTRY_STATIC) { /* * This is not really a fixup. The timer was * statically initialized. We just make sure that it @@ -648,7 +655,7 @@ static inline void debug_activate(struct timer_list *timer, unsigned long expires) { debug_timer_activate(timer); - trace_timer_start(timer, expires); + trace_timer_start(timer, expires, timer->flags); } static inline void debug_deactivate(struct timer_list *timer) @@ -665,10 +672,8 @@ static inline void debug_assert_init(struct timer_list *timer) static void do_init_timer(struct timer_list *timer, unsigned int flags, const char *name, struct lock_class_key *key) { - struct tvec_base *base = raw_cpu_read(tvec_bases); - - timer->entry.next = NULL; - timer->base = (void *)((unsigned long)base | flags); + timer->entry.pprev = NULL; + timer->flags = flags | raw_smp_processor_id(); timer->slack = -1; #ifdef CONFIG_TIMER_STATS timer->start_site = NULL; @@ -699,24 +704,23 @@ EXPORT_SYMBOL(init_timer_key); static inline void detach_timer(struct timer_list *timer, bool clear_pending) { - struct list_head *entry = &timer->entry; + struct hlist_node *entry = &timer->entry; debug_deactivate(timer); - __list_del(entry->prev, entry->next); + __hlist_del(entry); if (clear_pending) - entry->next = NULL; - entry->prev = LIST_POISON2; + entry->pprev = NULL; + entry->next = LIST_POISON2; } static inline void detach_expired_timer(struct timer_list *timer, struct tvec_base *base) { detach_timer(timer, true); - if (!tbase_get_deferrable(timer->base)) + if (!(timer->flags & TIMER_DEFERRABLE)) base->active_timers--; base->all_timers--; - (void)catchup_timer_jiffies(base); } static int detach_if_pending(struct timer_list *timer, struct tvec_base *base, @@ -726,13 +730,14 @@ static int detach_if_pending(struct timer_list *timer, struct tvec_base *base, return 0; detach_timer(timer, clear_pending); - if (!tbase_get_deferrable(timer->base)) { + if (!(timer->flags & TIMER_DEFERRABLE)) { base->active_timers--; if (timer->expires == base->next_timer) base->next_timer = base->timer_jiffies; } - base->all_timers--; - (void)catchup_timer_jiffies(base); + /* If this was the last timer, advance base->jiffies */ + if (!--base->all_timers) + base->timer_jiffies = jiffies; return 1; } @@ -744,24 +749,22 @@ static int detach_if_pending(struct timer_list *timer, struct tvec_base *base, * So __run_timers/migrate_timers can safely modify all timers which could * be found on ->tvX lists. * - * When the timer's base is locked, and the timer removed from list, it is - * possible to set timer->base = NULL and drop the lock: the timer remains - * locked. + * When the timer's base is locked and removed from the list, the + * TIMER_MIGRATING flag is set, FIXME */ static struct tvec_base *lock_timer_base(struct timer_list *timer, unsigned long *flags) __acquires(timer->base->lock) { - struct tvec_base *base; - for (;;) { - struct tvec_base *prelock_base = timer->base; - base = tbase_get_base(prelock_base); - if (likely(base != NULL)) { + u32 tf = timer->flags; + struct tvec_base *base; + + if (!(tf & TIMER_MIGRATING)) { + base = per_cpu_ptr(&tvec_bases, tf & TIMER_CPUMASK); spin_lock_irqsave(&base->lock, *flags); - if (likely(prelock_base == timer->base)) + if (timer->flags == tf) return base; - /* The timer has migrated to another CPU */ spin_unlock_irqrestore(&base->lock, *flags); } cpu_relax(); @@ -770,11 +773,11 @@ static struct tvec_base *lock_timer_base(struct timer_list *timer, static inline int __mod_timer(struct timer_list *timer, unsigned long expires, - bool pending_only, int pinned) + bool pending_only, int pinned) { struct tvec_base *base, *new_base; unsigned long flags; - int ret = 0 , cpu; + int ret = 0; timer_stats_timer_set_start_info(timer); BUG_ON(!timer->function); @@ -787,8 +790,7 @@ __mod_timer(struct timer_list *timer, unsigned long expires, debug_activate(timer, expires); - cpu = get_nohz_timer_target(pinned); - new_base = per_cpu(tvec_bases, cpu); + new_base = get_target_base(base, pinned); if (base != new_base) { /* @@ -800,11 +802,13 @@ __mod_timer(struct timer_list *timer, unsigned long expires, */ if (likely(base->running_timer != timer)) { /* See the comment in lock_timer_base() */ - timer_set_base(timer, NULL); + timer->flags |= TIMER_MIGRATING; + spin_unlock(&base->lock); base = new_base; spin_lock(&base->lock); - timer_set_base(timer, base); + timer->flags &= ~TIMER_BASEMASK; + timer->flags |= base->cpu; } } @@ -966,13 +970,13 @@ EXPORT_SYMBOL(add_timer); */ void add_timer_on(struct timer_list *timer, int cpu) { - struct tvec_base *base = per_cpu(tvec_bases, cpu); + struct tvec_base *base = per_cpu_ptr(&tvec_bases, cpu); unsigned long flags; timer_stats_timer_set_start_info(timer); BUG_ON(timer_pending(timer) || !timer->function); spin_lock_irqsave(&base->lock, flags); - timer_set_base(timer, base); + timer->flags = (timer->flags & ~TIMER_BASEMASK) | cpu; debug_activate(timer, timer->expires); internal_add_timer(base, timer); spin_unlock_irqrestore(&base->lock, flags); @@ -1037,8 +1041,6 @@ int try_to_del_timer_sync(struct timer_list *timer) EXPORT_SYMBOL(try_to_del_timer_sync); #ifdef CONFIG_SMP -static DEFINE_PER_CPU(struct tvec_base, __tvec_bases); - /** * del_timer_sync - deactivate a timer and wait for the handler to finish. * @timer: the timer to be deactivated @@ -1093,7 +1095,7 @@ int del_timer_sync(struct timer_list *timer) * don't use it in hardirq context, because it * could lead to deadlock. */ - WARN_ON(in_irq() && !tbase_get_irqsafe(timer->base)); + WARN_ON(in_irq() && !(timer->flags & TIMER_IRQSAFE)); for (;;) { int ret = try_to_del_timer_sync(timer); if (ret >= 0) @@ -1107,17 +1109,17 @@ EXPORT_SYMBOL(del_timer_sync); static int cascade(struct tvec_base *base, struct tvec *tv, int index) { /* cascade all the timers from tv up one level */ - struct timer_list *timer, *tmp; - struct list_head tv_list; + struct timer_list *timer; + struct hlist_node *tmp; + struct hlist_head tv_list; - list_replace_init(tv->vec + index, &tv_list); + hlist_move_list(tv->vec + index, &tv_list); /* * We are removing _all_ timers from the list, so we * don't have to detach them individually. */ - list_for_each_entry_safe(timer, tmp, &tv_list, entry) { - BUG_ON(tbase_get_base(timer->base) != base); + hlist_for_each_entry_safe(timer, tmp, &tv_list, entry) { /* No accounting, while moving them */ __internal_add_timer(base, timer); } @@ -1182,14 +1184,18 @@ static inline void __run_timers(struct tvec_base *base) struct timer_list *timer; spin_lock_irq(&base->lock); - if (catchup_timer_jiffies(base)) { - spin_unlock_irq(&base->lock); - return; - } + while (time_after_eq(jiffies, base->timer_jiffies)) { - struct list_head work_list; - struct list_head *head = &work_list; - int index = base->timer_jiffies & TVR_MASK; + struct hlist_head work_list; + struct hlist_head *head = &work_list; + int index; + + if (!base->all_timers) { + base->timer_jiffies = jiffies; + break; + } + + index = base->timer_jiffies & TVR_MASK; /* * Cascade timers: @@ -1200,16 +1206,16 @@ static inline void __run_timers(struct tvec_base *base) !cascade(base, &base->tv4, INDEX(2))) cascade(base, &base->tv5, INDEX(3)); ++base->timer_jiffies; - list_replace_init(base->tv1.vec + index, head); - while (!list_empty(head)) { + hlist_move_list(base->tv1.vec + index, head); + while (!hlist_empty(head)) { void (*fn)(unsigned long); unsigned long data; bool irqsafe; - timer = list_first_entry(head, struct timer_list,entry); + timer = hlist_entry(head->first, struct timer_list, entry); fn = timer->function; data = timer->data; - irqsafe = tbase_get_irqsafe(timer->base); + irqsafe = timer->flags & TIMER_IRQSAFE; timer_stats_account_timer(timer); @@ -1248,8 +1254,8 @@ static unsigned long __next_timer_interrupt(struct tvec_base *base) /* Look for timer events in tv1. */ index = slot = timer_jiffies & TVR_MASK; do { - list_for_each_entry(nte, base->tv1.vec + slot, entry) { - if (tbase_get_deferrable(nte->base)) + hlist_for_each_entry(nte, base->tv1.vec + slot, entry) { + if (nte->flags & TIMER_DEFERRABLE) continue; found = 1; @@ -1279,8 +1285,8 @@ cascade: index = slot = timer_jiffies & TVN_MASK; do { - list_for_each_entry(nte, varp->vec + slot, entry) { - if (tbase_get_deferrable(nte->base)) + hlist_for_each_entry(nte, varp->vec + slot, entry) { + if (nte->flags & TIMER_DEFERRABLE) continue; found = 1; @@ -1311,54 +1317,48 @@ cascade: * Check, if the next hrtimer event is before the next timer wheel * event: */ -static unsigned long cmp_next_hrtimer_event(unsigned long now, - unsigned long expires) +static u64 cmp_next_hrtimer_event(u64 basem, u64 expires) { - ktime_t hr_delta = hrtimer_get_next_event(); - struct timespec tsdelta; - unsigned long delta; - - if (hr_delta.tv64 == KTIME_MAX) - return expires; + u64 nextevt = hrtimer_get_next_event(); /* - * Expired timer available, let it expire in the next tick + * If high resolution timers are enabled + * hrtimer_get_next_event() returns KTIME_MAX. */ - if (hr_delta.tv64 <= 0) - return now + 1; - - tsdelta = ktime_to_timespec(hr_delta); - delta = timespec_to_jiffies(&tsdelta); + if (expires <= nextevt) + return expires; /* - * Limit the delta to the max value, which is checked in - * tick_nohz_stop_sched_tick(): + * If the next timer is already expired, return the tick base + * time so the tick is fired immediately. */ - if (delta > NEXT_TIMER_MAX_DELTA) - delta = NEXT_TIMER_MAX_DELTA; + if (nextevt <= basem) + return basem; /* - * Take rounding errors in to account and make sure, that it - * expires in the next tick. Otherwise we go into an endless - * ping pong due to tick_nohz_stop_sched_tick() retriggering - * the timer softirq + * Round up to the next jiffie. High resolution timers are + * off, so the hrtimers are expired in the tick and we need to + * make sure that this tick really expires the timer to avoid + * a ping pong of the nohz stop code. + * + * Use DIV_ROUND_UP_ULL to prevent gcc calling __divdi3 */ - if (delta < 1) - delta = 1; - now += delta; - if (time_before(now, expires)) - return now; - return expires; + return DIV_ROUND_UP_ULL(nextevt, TICK_NSEC) * TICK_NSEC; } /** - * get_next_timer_interrupt - return the jiffy of the next pending timer - * @now: current time (in jiffies) + * get_next_timer_interrupt - return the time (clock mono) of the next timer + * @basej: base time jiffies + * @basem: base time clock monotonic + * + * Returns the tick aligned clock monotonic time of the next pending + * timer or KTIME_MAX if no timer is pending. */ -unsigned long get_next_timer_interrupt(unsigned long now) +u64 get_next_timer_interrupt(unsigned long basej, u64 basem) { - struct tvec_base *base = __this_cpu_read(tvec_bases); - unsigned long expires = now + NEXT_TIMER_MAX_DELTA; + struct tvec_base *base = this_cpu_ptr(&tvec_bases); + u64 expires = KTIME_MAX; + unsigned long nextevt; /* * Pretend that there is no timer pending if the cpu is offline. @@ -1371,14 +1371,15 @@ unsigned long get_next_timer_interrupt(unsigned long now) if (base->active_timers) { if (time_before_eq(base->next_timer, base->timer_jiffies)) base->next_timer = __next_timer_interrupt(base); - expires = base->next_timer; + nextevt = base->next_timer; + if (time_before_eq(nextevt, basej)) + expires = basem; + else + expires = basem + (nextevt - basej) * TICK_NSEC; } spin_unlock(&base->lock); - if (time_before_eq(expires, now)) - return now; - - return cmp_next_hrtimer_event(now, expires); + return cmp_next_hrtimer_event(basem, expires); } #endif @@ -1407,9 +1408,7 @@ void update_process_times(int user_tick) */ static void run_timer_softirq(struct softirq_action *h) { - struct tvec_base *base = __this_cpu_read(tvec_bases); - - hrtimer_run_pending(); + struct tvec_base *base = this_cpu_ptr(&tvec_bases); if (time_after_eq(jiffies, base->timer_jiffies)) __run_timers(base); @@ -1545,15 +1544,16 @@ signed long __sched schedule_timeout_uninterruptible(signed long timeout) EXPORT_SYMBOL(schedule_timeout_uninterruptible); #ifdef CONFIG_HOTPLUG_CPU -static void migrate_timer_list(struct tvec_base *new_base, struct list_head *head) +static void migrate_timer_list(struct tvec_base *new_base, struct hlist_head *head) { struct timer_list *timer; + int cpu = new_base->cpu; - while (!list_empty(head)) { - timer = list_first_entry(head, struct timer_list, entry); + while (!hlist_empty(head)) { + timer = hlist_entry(head->first, struct timer_list, entry); /* We ignore the accounting on the dying cpu */ detach_timer(timer, false); - timer_set_base(timer, new_base); + timer->flags = (timer->flags & ~TIMER_BASEMASK) | cpu; internal_add_timer(new_base, timer); } } @@ -1565,8 +1565,8 @@ static void migrate_timers(int cpu) int i; BUG_ON(cpu_online(cpu)); - old_base = per_cpu(tvec_bases, cpu); - new_base = get_cpu_var(tvec_bases); + old_base = per_cpu_ptr(&tvec_bases, cpu); + new_base = this_cpu_ptr(&tvec_bases); /* * The caller is globally serialized and nobody else * takes two locks at once, deadlock is not possible. @@ -1590,7 +1590,6 @@ static void migrate_timers(int cpu) spin_unlock(&old_base->lock); spin_unlock_irq(&new_base->lock); - put_cpu_var(tvec_bases); } static int timer_cpu_notify(struct notifier_block *self, @@ -1616,52 +1615,27 @@ static inline void timer_register_cpu_notifier(void) static inline void timer_register_cpu_notifier(void) { } #endif /* CONFIG_HOTPLUG_CPU */ -static void __init init_timer_cpu(struct tvec_base *base, int cpu) +static void __init init_timer_cpu(int cpu) { - int j; - - BUG_ON(base != tbase_get_base(base)); + struct tvec_base *base = per_cpu_ptr(&tvec_bases, cpu); base->cpu = cpu; - per_cpu(tvec_bases, cpu) = base; spin_lock_init(&base->lock); - for (j = 0; j < TVN_SIZE; j++) { - INIT_LIST_HEAD(base->tv5.vec + j); - INIT_LIST_HEAD(base->tv4.vec + j); - INIT_LIST_HEAD(base->tv3.vec + j); - INIT_LIST_HEAD(base->tv2.vec + j); - } - for (j = 0; j < TVR_SIZE; j++) - INIT_LIST_HEAD(base->tv1.vec + j); - base->timer_jiffies = jiffies; base->next_timer = base->timer_jiffies; } static void __init init_timer_cpus(void) { - struct tvec_base *base; - int local_cpu = smp_processor_id(); int cpu; - for_each_possible_cpu(cpu) { - if (cpu == local_cpu) - base = &boot_tvec_bases; -#ifdef CONFIG_SMP - else - base = per_cpu_ptr(&__tvec_bases, cpu); -#endif - - init_timer_cpu(base, cpu); - } + for_each_possible_cpu(cpu) + init_timer_cpu(cpu); } void __init init_timers(void) { - /* ensure there are enough low bits for flags in timer->base pointer */ - BUILD_BUG_ON(__alignof__(struct tvec_base) & TIMER_FLAG_MASK); - init_timer_cpus(); init_timer_stats(); timer_register_cpu_notifier(); @@ -1697,14 +1671,14 @@ unsigned long msleep_interruptible(unsigned int msecs) EXPORT_SYMBOL(msleep_interruptible); -static int __sched do_usleep_range(unsigned long min, unsigned long max) +static void __sched do_usleep_range(unsigned long min, unsigned long max) { ktime_t kmin; unsigned long delta; kmin = ktime_set(0, min * NSEC_PER_USEC); delta = (max - min) * NSEC_PER_USEC; - return schedule_hrtimeout_range(&kmin, delta, HRTIMER_MODE_REL); + schedule_hrtimeout_range(&kmin, delta, HRTIMER_MODE_REL); } /** @@ -1712,7 +1686,7 @@ static int __sched do_usleep_range(unsigned long min, unsigned long max) * @min: Minimum time in usecs to sleep * @max: Maximum time in usecs to sleep */ -void usleep_range(unsigned long min, unsigned long max) +void __sched usleep_range(unsigned long min, unsigned long max) { __set_current_state(TASK_UNINTERRUPTIBLE); do_usleep_range(min, max); diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index e878c2e0ba45..a4536e1e3e2a 100644 --- a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c @@ -29,19 +29,24 @@ struct timer_list_iter { typedef void (*print_fn_t)(struct seq_file *m, unsigned int *classes); -DECLARE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases); - /* * This allows printing both to /proc/timer_list and * to the console (on SysRq-Q): */ -#define SEQ_printf(m, x...) \ - do { \ - if (m) \ - seq_printf(m, x); \ - else \ - printk(x); \ - } while (0) +__printf(2, 3) +static void SEQ_printf(struct seq_file *m, const char *fmt, ...) +{ + va_list args; + + va_start(args, fmt); + + if (m) + seq_vprintf(m, fmt, args); + else + vprintk(fmt, args); + + va_end(args); +} static void print_name_offset(struct seq_file *m, void *sym) { @@ -120,10 +125,10 @@ static void print_base(struct seq_file *m, struct hrtimer_clock_base *base, u64 now) { SEQ_printf(m, " .base: %pK\n", base); - SEQ_printf(m, " .index: %d\n", - base->index); - SEQ_printf(m, " .resolution: %Lu nsecs\n", - (unsigned long long)ktime_to_ns(base->resolution)); + SEQ_printf(m, " .index: %d\n", base->index); + + SEQ_printf(m, " .resolution: %u nsecs\n", (unsigned) hrtimer_resolution); + SEQ_printf(m, " .get_time: "); print_name_offset(m, base->get_time); SEQ_printf(m, "\n"); @@ -158,7 +163,7 @@ static void print_cpu(struct seq_file *m, int cpu, u64 now) P(nr_events); P(nr_retries); P(nr_hangs); - P_ns(max_hang_time); + P(max_hang_time); #endif #undef P #undef P_ns @@ -184,7 +189,7 @@ static void print_cpu(struct seq_file *m, int cpu, u64 now) P_ns(idle_sleeptime); P_ns(iowait_sleeptime); P(last_jiffies); - P(next_jiffies); + P(next_timer); P_ns(idle_expires); SEQ_printf(m, "jiffies: %Lu\n", (unsigned long long)jiffies); @@ -251,6 +256,12 @@ print_tickdevice(struct seq_file *m, struct tick_device *td, int cpu) SEQ_printf(m, "\n"); } + if (dev->set_state_oneshot_stopped) { + SEQ_printf(m, " oneshot stopped: "); + print_name_offset(m, dev->set_state_oneshot_stopped); + SEQ_printf(m, "\n"); + } + if (dev->tick_resume) { SEQ_printf(m, " resume: "); print_name_offset(m, dev->tick_resume); @@ -269,11 +280,11 @@ static void timer_list_show_tickdevices_header(struct seq_file *m) { #ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST print_tickdevice(m, tick_get_broadcast_device(), -1); - SEQ_printf(m, "tick_broadcast_mask: %08lx\n", - cpumask_bits(tick_get_broadcast_mask())[0]); + SEQ_printf(m, "tick_broadcast_mask: %*pb\n", + cpumask_pr_args(tick_get_broadcast_mask())); #ifdef CONFIG_TICK_ONESHOT - SEQ_printf(m, "tick_broadcast_oneshot_mask: %08lx\n", - cpumask_bits(tick_get_broadcast_oneshot_mask())[0]); + SEQ_printf(m, "tick_broadcast_oneshot_mask: %*pb\n", + cpumask_pr_args(tick_get_broadcast_oneshot_mask())); #endif SEQ_printf(m, "\n"); #endif @@ -282,7 +293,7 @@ static void timer_list_show_tickdevices_header(struct seq_file *m) static inline void timer_list_header(struct seq_file *m, u64 now) { - SEQ_printf(m, "Timer List Version: v0.7\n"); + SEQ_printf(m, "Timer List Version: v0.8\n"); SEQ_printf(m, "HRTIMER_MAX_CLOCK_BASES: %d\n", HRTIMER_MAX_CLOCK_BASES); SEQ_printf(m, "now at %Ld nsecs\n", (unsigned long long)now); SEQ_printf(m, "\n"); diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c index 1fb08f21302e..1adecb4b87c8 100644 --- a/kernel/time/timer_stats.c +++ b/kernel/time/timer_stats.c @@ -68,7 +68,7 @@ struct entry { * Number of timeout events: */ unsigned long count; - unsigned int timer_flag; + u32 flags; /* * We save the command-line string to preserve @@ -227,13 +227,13 @@ static struct entry *tstat_lookup(struct entry *entry, char *comm) * @startf: pointer to the function which did the timer setup * @timerf: pointer to the timer callback function of the timer * @comm: name of the process which set up the timer + * @tflags: The flags field of the timer * * When the timer is already registered, then the event counter is * incremented. Otherwise the timer is registered in a free slot. */ void timer_stats_update_stats(void *timer, pid_t pid, void *startf, - void *timerf, char *comm, - unsigned int timer_flag) + void *timerf, char *comm, u32 tflags) { /* * It doesn't matter which lock we take: @@ -251,7 +251,7 @@ void timer_stats_update_stats(void *timer, pid_t pid, void *startf, input.start_func = startf; input.expire_func = timerf; input.pid = pid; - input.timer_flag = timer_flag; + input.flags = tflags; raw_spin_lock_irqsave(lock, flags); if (!timer_stats_active) @@ -306,7 +306,7 @@ static int tstats_show(struct seq_file *m, void *v) for (i = 0; i < nr_entries; i++) { entry = entries + i; - if (entry->timer_flag & TIMER_STATS_FLAG_DEFERRABLE) { + if (entry->flags & TIMER_DEFERRABLE) { seq_printf(m, "%4luD, %5d %-16s ", entry->count, entry->pid, entry->comm); } else { diff --git a/lib/timerqueue.c b/lib/timerqueue.c index a382e4a32609..782ae8ca2c06 100644 --- a/lib/timerqueue.c +++ b/lib/timerqueue.c @@ -36,7 +36,7 @@ * Adds the timer node to the timerqueue, sorted by the * node's expires value. */ -void timerqueue_add(struct timerqueue_head *head, struct timerqueue_node *node) +bool timerqueue_add(struct timerqueue_head *head, struct timerqueue_node *node) { struct rb_node **p = &head->head.rb_node; struct rb_node *parent = NULL; @@ -56,8 +56,11 @@ void timerqueue_add(struct timerqueue_head *head, struct timerqueue_node *node) rb_link_node(&node->node, parent, p); rb_insert_color(&node->node, &head->head); - if (!head->next || node->expires.tv64 < head->next->expires.tv64) + if (!head->next || node->expires.tv64 < head->next->expires.tv64) { head->next = node; + return true; + } + return false; } EXPORT_SYMBOL_GPL(timerqueue_add); @@ -69,7 +72,7 @@ EXPORT_SYMBOL_GPL(timerqueue_add); * * Removes the timer node from the timerqueue. */ -void timerqueue_del(struct timerqueue_head *head, struct timerqueue_node *node) +bool timerqueue_del(struct timerqueue_head *head, struct timerqueue_node *node) { WARN_ON_ONCE(RB_EMPTY_NODE(&node->node)); @@ -82,6 +85,7 @@ void timerqueue_del(struct timerqueue_head *head, struct timerqueue_node *node) } rb_erase(&node->node, &head->head); RB_CLEAR_NODE(&node->node); + return head->next != NULL; } EXPORT_SYMBOL_GPL(timerqueue_del); diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 508155b283dd..54817d365366 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -2212,8 +2212,6 @@ static void spin(struct pktgen_dev *pkt_dev, ktime_t spin_until) do { set_current_state(TASK_INTERRUPTIBLE); hrtimer_start_expires(&t.timer, HRTIMER_MODE_ABS); - if (!hrtimer_active(&t.timer)) - t.task = NULL; if (likely(t.task)) schedule(); diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index 1e1c89e51a11..73a123daa2cc 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -1885,13 +1885,10 @@ EXPORT_SYMBOL(tcf_destroy_chain); #ifdef CONFIG_PROC_FS static int psched_show(struct seq_file *seq, void *v) { - struct timespec ts; - - hrtimer_get_res(CLOCK_MONOTONIC, &ts); seq_printf(seq, "%08x %08x %08x %08x\n", (u32)NSEC_PER_USEC, (u32)PSCHED_TICKS2NS(1), 1000000, - (u32)NSEC_PER_SEC/(u32)ktime_to_ns(timespec_to_ktime(ts))); + (u32)NSEC_PER_SEC / hrtimer_resolution); return 0; } diff --git a/sound/core/hrtimer.c b/sound/core/hrtimer.c index 886be7da989d..f845ecf7e172 100644 --- a/sound/core/hrtimer.c +++ b/sound/core/hrtimer.c @@ -121,16 +121,9 @@ static struct snd_timer *mytimer; static int __init snd_hrtimer_init(void) { struct snd_timer *timer; - struct timespec tp; int err; - hrtimer_get_res(CLOCK_MONOTONIC, &tp); - if (tp.tv_sec > 0 || !tp.tv_nsec) { - pr_err("snd-hrtimer: Invalid resolution %u.%09u", - (unsigned)tp.tv_sec, (unsigned)tp.tv_nsec); - return -EINVAL; - } - resolution = tp.tv_nsec; + resolution = hrtimer_resolution; /* Create a new timer and set up the fields */ err = snd_timer_global_new("hrtimer", SNDRV_TIMER_GLOBAL_HRTIMER, diff --git a/sound/drivers/pcsp/pcsp.c b/sound/drivers/pcsp/pcsp.c index d9647bd84d0f..27e25bb78c97 100644 --- a/sound/drivers/pcsp/pcsp.c +++ b/sound/drivers/pcsp/pcsp.c @@ -42,16 +42,13 @@ struct snd_pcsp pcsp_chip; static int snd_pcsp_create(struct snd_card *card) { static struct snd_device_ops ops = { }; - struct timespec tp; - int err; - int div, min_div, order; - - hrtimer_get_res(CLOCK_MONOTONIC, &tp); + unsigned int resolution = hrtimer_resolution; + int err, div, min_div, order; if (!nopcm) { - if (tp.tv_sec || tp.tv_nsec > PCSP_MAX_PERIOD_NS) { + if (resolution > PCSP_MAX_PERIOD_NS) { printk(KERN_ERR "PCSP: Timer resolution is not sufficient " - "(%linS)\n", tp.tv_nsec); + "(%unS)\n", resolution); printk(KERN_ERR "PCSP: Make sure you have HPET and ACPI " "enabled.\n"); printk(KERN_ERR "PCSP: Turned into nopcm mode.\n"); @@ -59,13 +56,13 @@ static int snd_pcsp_create(struct snd_card *card) } } - if (loops_per_jiffy >= PCSP_MIN_LPJ && tp.tv_nsec <= PCSP_MIN_PERIOD_NS) + if (loops_per_jiffy >= PCSP_MIN_LPJ && resolution <= PCSP_MIN_PERIOD_NS) min_div = MIN_DIV; else min_div = MAX_DIV; #if PCSP_DEBUG - printk(KERN_DEBUG "PCSP: lpj=%li, min_div=%i, res=%li\n", - loops_per_jiffy, min_div, tp.tv_nsec); + printk(KERN_DEBUG "PCSP: lpj=%li, min_div=%i, res=%u\n", + loops_per_jiffy, min_div, resolution); #endif div = MAX_DIV / min_div; diff --git a/tools/testing/selftests/timers/leap-a-day.c b/tools/testing/selftests/timers/leap-a-day.c index b8272e6c4b3b..fb46ad6ac92c 100644 --- a/tools/testing/selftests/timers/leap-a-day.c +++ b/tools/testing/selftests/timers/leap-a-day.c @@ -44,6 +44,7 @@ #include <time.h> #include <sys/time.h> #include <sys/timex.h> +#include <sys/errno.h> #include <string.h> #include <signal.h> #include <unistd.h> @@ -63,6 +64,9 @@ static inline int ksft_exit_fail(void) #define NSEC_PER_SEC 1000000000ULL #define CLOCK_TAI 11 +time_t next_leap; +int error_found; + /* returns 1 if a <= b, 0 otherwise */ static inline int in_order(struct timespec a, struct timespec b) { @@ -134,6 +138,35 @@ void handler(int unused) exit(0); } +void sigalarm(int signo) +{ + struct timex tx; + int ret; + + tx.modes = 0; + ret = adjtimex(&tx); + + if (tx.time.tv_sec < next_leap) { + printf("Error: Early timer expiration! (Should be %ld)\n", next_leap); + error_found = 1; + printf("adjtimex: %10ld sec + %6ld us (%i)\t%s\n", + tx.time.tv_sec, + tx.time.tv_usec, + tx.tai, + time_state_str(ret)); + } + if (ret != TIME_WAIT) { + printf("Error: Timer seeing incorrect NTP state? (Should be TIME_WAIT)\n"); + error_found = 1; + printf("adjtimex: %10ld sec + %6ld us (%i)\t%s\n", + tx.time.tv_sec, + tx.time.tv_usec, + tx.tai, + time_state_str(ret)); + } +} + + /* Test for known hrtimer failure */ void test_hrtimer_failure(void) { @@ -144,12 +177,19 @@ void test_hrtimer_failure(void) clock_nanosleep(CLOCK_REALTIME, TIMER_ABSTIME, &target, NULL); clock_gettime(CLOCK_REALTIME, &now); - if (!in_order(target, now)) + if (!in_order(target, now)) { printf("ERROR: hrtimer early expiration failure observed.\n"); + error_found = 1; + } } int main(int argc, char **argv) { + timer_t tm1; + struct itimerspec its1; + struct sigevent se; + struct sigaction act; + int signum = SIGRTMAX; int settime = 0; int tai_time = 0; int insert = 1; @@ -191,6 +231,12 @@ int main(int argc, char **argv) signal(SIGINT, handler); signal(SIGKILL, handler); + /* Set up timer signal handler: */ + sigfillset(&act.sa_mask); + act.sa_flags = 0; + act.sa_handler = sigalarm; + sigaction(signum, &act, NULL); + if (iterations < 0) printf("This runs continuously. Press ctrl-c to stop\n"); else @@ -201,7 +247,7 @@ int main(int argc, char **argv) int ret; struct timespec ts; struct timex tx; - time_t now, next_leap; + time_t now; /* Get the current time */ clock_gettime(CLOCK_REALTIME, &ts); @@ -251,10 +297,27 @@ int main(int argc, char **argv) printf("Scheduling leap second for %s", ctime(&next_leap)); + /* Set up timer */ + printf("Setting timer for %ld - %s", next_leap, ctime(&next_leap)); + memset(&se, 0, sizeof(se)); + se.sigev_notify = SIGEV_SIGNAL; + se.sigev_signo = signum; + se.sigev_value.sival_int = 0; + if (timer_create(CLOCK_REALTIME, &se, &tm1) == -1) { + printf("Error: timer_create failed\n"); + return ksft_exit_fail(); + } + its1.it_value.tv_sec = next_leap; + its1.it_value.tv_nsec = 0; + its1.it_interval.tv_sec = 0; + its1.it_interval.tv_nsec = 0; + timer_settime(tm1, TIMER_ABSTIME, &its1, NULL); + /* Wake up 3 seconds before leap */ ts.tv_sec = next_leap - 3; ts.tv_nsec = 0; + while (clock_nanosleep(CLOCK_REALTIME, TIMER_ABSTIME, &ts, NULL)) printf("Something woke us up, returning to sleep\n"); @@ -276,6 +339,7 @@ int main(int argc, char **argv) while (now < next_leap + 2) { char buf[26]; struct timespec tai; + int ret; tx.modes = 0; ret = adjtimex(&tx); @@ -308,8 +372,13 @@ int main(int argc, char **argv) /* Note if kernel has known hrtimer failure */ test_hrtimer_failure(); - printf("Leap complete\n\n"); - + printf("Leap complete\n"); + if (error_found) { + printf("Errors observed\n"); + clear_time_state(); + return ksft_exit_fail(); + } + printf("\n"); if ((iterations != -1) && !(--iterations)) break; } |