44 files changed, 827 insertions, 395 deletions
diff --git a/Documentation/scheduler/sched-deadline.txt b/Documentation/scheduler/sched-deadline.txt index 8e37b0ba2c9d..cbc1b46cbf70 100644 --- a/Documentation/scheduler/sched-deadline.txt +++ b/Documentation/scheduler/sched-deadline.txt @@ -408,6 +408,11 @@ CONTENTS * the new scheduling related syscalls that manipulate it, i.e., sched_setattr() and sched_getattr() are implemented. + For debugging purposes, the leftover runtime and absolute deadline of a + SCHED_DEADLINE task can be retrieved through /proc/<pid>/sched (entries + dl.runtime and dl.deadline, both values in ns). A programmatic way to + retrieve these values from production code is under discussion. + 4.3 Default behavior --------------------- @@ -476,6 +481,7 @@ CONTENTS Still missing: + - programmatic way to retrieve current runtime and absolute deadline - refinements to deadline inheritance, especially regarding the possibility of retaining bandwidth isolation among non-interacting tasks. This is being studied from both theoretical and practical points of view, and diff --git a/Documentation/scheduler/sched-rt-group.txt b/Documentation/scheduler/sched-rt-group.txt index a03f0d944fe6..d8fce3e78457 100644 --- a/Documentation/scheduler/sched-rt-group.txt +++ b/Documentation/scheduler/sched-rt-group.txt @@ -158,11 +158,11 @@ as its prone to starvation without deadline scheduling. Consider two sibling groups A and B; both have 50% bandwidth, but A's period is twice the length of B's. -* group A: period=100000us, runtime=10000us - - this runs for 0.01s once every 0.1s +* group A: period=100000us, runtime=50000us + - this runs for 0.05s once every 0.1s -* group B: period= 50000us, runtime=10000us - - this runs for 0.01s twice every 0.1s (or once every 0.05 sec). +* group B: period= 50000us, runtime=25000us + - this runs for 0.025s twice every 0.1s (or once every 0.05 sec). 
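As a side note on the sched-deadline.txt hunk above: the documented /proc/<pid>/sched entries are easy to inspect from user space. Below is a minimal, illustrative reader (not part of this series) that just prints the dl.runtime and dl.deadline lines, both reported in nanoseconds.

/* Illustrative only -- not part of this series.  Prints the dl.runtime
 * and dl.deadline entries of /proc/<pid>/sched described above. */
#include <stdio.h>
#include <string.h>

int main(int argc, char **argv)
{
	char path[64], line[256];
	FILE *f;

	if (argc < 2) {
		fprintf(stderr, "usage: %s <pid>\n", argv[0]);
		return 1;
	}

	snprintf(path, sizeof(path), "/proc/%s/sched", argv[1]);
	f = fopen(path, "r");
	if (!f) {
		perror("fopen");
		return 1;
	}

	while (fgets(line, sizeof(line), f))
		if (strstr(line, "dl.runtime") || strstr(line, "dl.deadline"))
			fputs(line, stdout);

	fclose(f);
	return 0;
}

(The entries are only present for SCHED_DEADLINE tasks; as the hunk notes, a programmatic interface for production use is still under discussion.)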
This means that currently a while (1) loop in A will run for the full period of B and can starve B's tasks (assuming they are of lower priority) for a whole diff --git a/arch/ia64/include/asm/thread_info.h b/arch/ia64/include/asm/thread_info.h index c7026429816b..8742d741d19a 100644 --- a/arch/ia64/include/asm/thread_info.h +++ b/arch/ia64/include/asm/thread_info.h @@ -27,6 +27,12 @@ struct thread_info { mm_segment_t addr_limit; /* user-level address space limit */ int preempt_count; /* 0=premptable, <0=BUG; will also serve as bh-counter */ #ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE + __u64 utime; + __u64 stime; + __u64 gtime; + __u64 hardirq_time; + __u64 softirq_time; + __u64 idle_time; __u64 ac_stamp; __u64 ac_leave; __u64 ac_stime; diff --git a/arch/ia64/kernel/setup.c b/arch/ia64/kernel/setup.c index 7ec7acc844c2..c483ece3eb84 100644 --- a/arch/ia64/kernel/setup.c +++ b/arch/ia64/kernel/setup.c @@ -619,6 +619,8 @@ setup_arch (char **cmdline_p) check_sal_cache_flush(); #endif paging_init(); + + clear_sched_clock_stable(); } /* diff --git a/arch/ia64/kernel/time.c b/arch/ia64/kernel/time.c index 71775b95d6cc..d040f12ea9f9 100644 --- a/arch/ia64/kernel/time.c +++ b/arch/ia64/kernel/time.c @@ -61,16 +61,41 @@ static struct clocksource *itc_clocksource; extern cputime_t cycle_to_cputime(u64 cyc); -void vtime_account_user(struct task_struct *tsk) +void vtime_flush(struct task_struct *tsk) { - cputime_t delta_utime; struct thread_info *ti = task_thread_info(tsk); + cputime_t delta; - if (ti->ac_utime) { - delta_utime = cycle_to_cputime(ti->ac_utime); - account_user_time(tsk, delta_utime); - ti->ac_utime = 0; + if (ti->utime) + account_user_time(tsk, cycle_to_cputime(ti->utime)); + + if (ti->gtime) + account_guest_time(tsk, cycle_to_cputime(ti->gtime)); + + if (ti->idle_time) + account_idle_time(cycle_to_cputime(ti->idle_time)); + + if (ti->stime) { + delta = cycle_to_cputime(ti->stime); + account_system_index_time(tsk, delta, CPUTIME_SYSTEM); + } + + if (ti->hardirq_time) { + delta = cycle_to_cputime(ti->hardirq_time); + account_system_index_time(tsk, delta, CPUTIME_IRQ); + } + + if (ti->softirq_time) { + delta = cycle_to_cputime(ti->softirq_time); + account_system_index_time(tsk, delta, CPUTIME_SOFTIRQ); } + + ti->utime = 0; + ti->gtime = 0; + ti->idle_time = 0; + ti->stime = 0; + ti->hardirq_time = 0; + ti->softirq_time = 0; } /* @@ -83,7 +108,7 @@ void arch_vtime_task_switch(struct task_struct *prev) struct thread_info *pi = task_thread_info(prev); struct thread_info *ni = task_thread_info(current); - pi->ac_stamp = ni->ac_stamp; + ni->ac_stamp = pi->ac_stamp; ni->ac_stime = ni->ac_utime = 0; } @@ -91,18 +116,15 @@ void arch_vtime_task_switch(struct task_struct *prev) * Account time for a transition between system, hard irq or soft irq state. * Note that this function is called with interrupts enabled. 
*/ -static cputime_t vtime_delta(struct task_struct *tsk) +static __u64 vtime_delta(struct task_struct *tsk) { struct thread_info *ti = task_thread_info(tsk); - cputime_t delta_stime; - __u64 now; + __u64 now, delta_stime; WARN_ON_ONCE(!irqs_disabled()); now = ia64_get_itc(); - - delta_stime = cycle_to_cputime(ti->ac_stime + (now - ti->ac_stamp)); - ti->ac_stime = 0; + delta_stime = now - ti->ac_stamp; ti->ac_stamp = now; return delta_stime; @@ -110,15 +132,25 @@ static cputime_t vtime_delta(struct task_struct *tsk) void vtime_account_system(struct task_struct *tsk) { - cputime_t delta = vtime_delta(tsk); - - account_system_time(tsk, 0, delta); + struct thread_info *ti = task_thread_info(tsk); + __u64 stime = vtime_delta(tsk); + + if ((tsk->flags & PF_VCPU) && !irq_count()) + ti->gtime += stime; + else if (hardirq_count()) + ti->hardirq_time += stime; + else if (in_serving_softirq()) + ti->softirq_time += stime; + else + ti->stime += stime; } EXPORT_SYMBOL_GPL(vtime_account_system); void vtime_account_idle(struct task_struct *tsk) { - account_idle_time(vtime_delta(tsk)); + struct thread_info *ti = task_thread_info(tsk); + + ti->idle_time += vtime_delta(tsk); } #endif /* CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */ diff --git a/arch/parisc/kernel/setup.c b/arch/parisc/kernel/setup.c index 2e66a887788e..068ed3607bac 100644 --- a/arch/parisc/kernel/setup.c +++ b/arch/parisc/kernel/setup.c @@ -36,6 +36,7 @@ #undef PCI_DEBUG #include <linux/proc_fs.h> #include <linux/export.h> +#include <linux/sched.h> #include <asm/processor.h> #include <asm/sections.h> @@ -176,6 +177,7 @@ void __init setup_arch(char **cmdline_p) conswitchp = &dummy_con; /* we use do_take_over_console() later ! */ #endif + clear_sched_clock_stable(); } /* diff --git a/arch/powerpc/include/asm/accounting.h b/arch/powerpc/include/asm/accounting.h index c133246df467..3abcf98ed2e0 100644 --- a/arch/powerpc/include/asm/accounting.h +++ b/arch/powerpc/include/asm/accounting.h @@ -12,9 +12,17 @@ /* Stuff for accurate time accounting */ struct cpu_accounting_data { - unsigned long user_time; /* accumulated usermode TB ticks */ - unsigned long system_time; /* accumulated system TB ticks */ - unsigned long user_time_scaled; /* accumulated usermode SPURR ticks */ + /* Accumulated cputime values to flush on ticks*/ + unsigned long utime; + unsigned long stime; + unsigned long utime_scaled; + unsigned long stime_scaled; + unsigned long gtime; + unsigned long hardirq_time; + unsigned long softirq_time; + unsigned long steal_time; + unsigned long idle_time; + /* Internal counters */ unsigned long starttime; /* TB value snapshot */ unsigned long starttime_user; /* TB value on exit to usermode */ unsigned long startspurr; /* SPURR value snapshot */ diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h index 6a6792bb39fb..708c3e592eeb 100644 --- a/arch/powerpc/include/asm/paca.h +++ b/arch/powerpc/include/asm/paca.h @@ -187,7 +187,6 @@ struct paca_struct { /* Stuff for accurate time accounting */ struct cpu_accounting_data accounting; - u64 stolen_time; /* TB ticks taken by hypervisor */ u64 dtl_ridx; /* read index in dispatch log */ struct dtl_entry *dtl_curr; /* pointer corresponding to dtl_ridx */ diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index 0601e6a7297c..e505319574ca 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -252,9 +252,9 @@ int main(void) DEFINE(ACCOUNT_STARTTIME_USER, offsetof(struct paca_struct, accounting.starttime_user)); 
DEFINE(ACCOUNT_USER_TIME, - offsetof(struct paca_struct, accounting.user_time)); + offsetof(struct paca_struct, accounting.utime)); DEFINE(ACCOUNT_SYSTEM_TIME, - offsetof(struct paca_struct, accounting.system_time)); + offsetof(struct paca_struct, accounting.stime)); DEFINE(PACA_TRAP_SAVE, offsetof(struct paca_struct, trap_save)); DEFINE(PACA_NAPSTATELOST, offsetof(struct paca_struct, nap_state_lost)); DEFINE(PACA_SPRG_VDSO, offsetof(struct paca_struct, sprg_vdso)); @@ -265,9 +265,9 @@ int main(void) DEFINE(ACCOUNT_STARTTIME_USER, offsetof(struct thread_info, accounting.starttime_user)); DEFINE(ACCOUNT_USER_TIME, - offsetof(struct thread_info, accounting.user_time)); + offsetof(struct thread_info, accounting.utime)); DEFINE(ACCOUNT_SYSTEM_TIME, - offsetof(struct thread_info, accounting.system_time)); + offsetof(struct thread_info, accounting.stime)); #endif #endif /* CONFIG_PPC64 */ diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c index bc2e08d415fa..02e97305d22b 100644 --- a/arch/powerpc/kernel/time.c +++ b/arch/powerpc/kernel/time.c @@ -271,25 +271,19 @@ void accumulate_stolen_time(void) sst = scan_dispatch_log(acct->starttime_user); ust = scan_dispatch_log(acct->starttime); - acct->system_time -= sst; - acct->user_time -= ust; - local_paca->stolen_time += ust + sst; + acct->stime -= sst; + acct->utime -= ust; + acct->steal_time += ust + sst; local_paca->soft_enabled = save_soft_enabled; } static inline u64 calculate_stolen_time(u64 stop_tb) { - u64 stolen = 0; - - if (get_paca()->dtl_ridx != be64_to_cpu(get_lppaca()->dtl_idx)) { - stolen = scan_dispatch_log(stop_tb); - get_paca()->accounting.system_time -= stolen; - } + if (get_paca()->dtl_ridx != be64_to_cpu(get_lppaca()->dtl_idx)) + return scan_dispatch_log(stop_tb); - stolen += get_paca()->stolen_time; - get_paca()->stolen_time = 0; - return stolen; + return 0; } #else /* CONFIG_PPC_SPLPAR */ @@ -305,28 +299,27 @@ static inline u64 calculate_stolen_time(u64 stop_tb) * or soft irq state. */ static unsigned long vtime_delta(struct task_struct *tsk, - unsigned long *sys_scaled, - unsigned long *stolen) + unsigned long *stime_scaled, + unsigned long *steal_time) { unsigned long now, nowscaled, deltascaled; - unsigned long udelta, delta, user_scaled; + unsigned long stime; + unsigned long utime, utime_scaled; struct cpu_accounting_data *acct = get_accounting(tsk); WARN_ON_ONCE(!irqs_disabled()); now = mftb(); nowscaled = read_spurr(now); - acct->system_time += now - acct->starttime; + stime = now - acct->starttime; acct->starttime = now; deltascaled = nowscaled - acct->startspurr; acct->startspurr = nowscaled; - *stolen = calculate_stolen_time(now); + *steal_time = calculate_stolen_time(now); - delta = acct->system_time; - acct->system_time = 0; - udelta = acct->user_time - acct->utime_sspurr; - acct->utime_sspurr = acct->user_time; + utime = acct->utime - acct->utime_sspurr; + acct->utime_sspurr = acct->utime; /* * Because we don't read the SPURR on every kernel entry/exit, @@ -338,62 +331,104 @@ static unsigned long vtime_delta(struct task_struct *tsk, * the user ticks get saved up in paca->user_time_scaled to be * used by account_process_tick. 
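To make the SPURR-scaling comment above concrete: the SPURR delta for a window is split between system and user time in proportion to their timebase deltas, which is what the vtime_delta() hunk that follows implements. A standalone illustration with invented numbers:

/* Illustrative only: proportional SPURR split mirroring the powerpc
 * vtime_delta() change below.  All numbers are made up. */
#include <stdio.h>

int main(void)
{
	unsigned long stime = 100;		/* timebase ticks in kernel */
	unsigned long utime = 300;		/* timebase ticks in user */
	unsigned long deltascaled = 200;	/* SPURR ticks, same window */
	unsigned long stime_scaled = stime;
	unsigned long utime_scaled = utime;

	if (deltascaled != stime + utime) {
		if (utime) {
			stime_scaled = deltascaled * stime / (stime + utime);
			utime_scaled = deltascaled - stime_scaled;
		} else {
			stime_scaled = deltascaled;
		}
	}

	/* Prints stime_scaled=50 utime_scaled=150 for these inputs. */
	printf("stime_scaled=%lu utime_scaled=%lu\n",
	       stime_scaled, utime_scaled);
	return 0;
}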
*/ - *sys_scaled = delta; - user_scaled = udelta; - if (deltascaled != delta + udelta) { - if (udelta) { - *sys_scaled = deltascaled * delta / (delta + udelta); - user_scaled = deltascaled - *sys_scaled; + *stime_scaled = stime; + utime_scaled = utime; + if (deltascaled != stime + utime) { + if (utime) { + *stime_scaled = deltascaled * stime / (stime + utime); + utime_scaled = deltascaled - *stime_scaled; } else { - *sys_scaled = deltascaled; + *stime_scaled = deltascaled; } } - acct->user_time_scaled += user_scaled; + acct->utime_scaled += utime_scaled; - return delta; + return stime; } void vtime_account_system(struct task_struct *tsk) { - unsigned long delta, sys_scaled, stolen; + unsigned long stime, stime_scaled, steal_time; + struct cpu_accounting_data *acct = get_accounting(tsk); + + stime = vtime_delta(tsk, &stime_scaled, &steal_time); + + stime -= min(stime, steal_time); + acct->steal_time += steal_time; - delta = vtime_delta(tsk, &sys_scaled, &stolen); - account_system_time(tsk, 0, delta); - tsk->stimescaled += sys_scaled; - if (stolen) - account_steal_time(stolen); + if ((tsk->flags & PF_VCPU) && !irq_count()) { + acct->gtime += stime; + acct->utime_scaled += stime_scaled; + } else { + if (hardirq_count()) + acct->hardirq_time += stime; + else if (in_serving_softirq()) + acct->softirq_time += stime; + else + acct->stime += stime; + + acct->stime_scaled += stime_scaled; + } } EXPORT_SYMBOL_GPL(vtime_account_system); void vtime_account_idle(struct task_struct *tsk) { - unsigned long delta, sys_scaled, stolen; + unsigned long stime, stime_scaled, steal_time; + struct cpu_accounting_data *acct = get_accounting(tsk); - delta = vtime_delta(tsk, &sys_scaled, &stolen); - account_idle_time(delta + stolen); + stime = vtime_delta(tsk, &stime_scaled, &steal_time); + acct->idle_time += stime + steal_time; } /* - * Transfer the user time accumulated in the paca - * by the exception entry and exit code to the generic - * process user time records. + * Account the whole cputime accumulated in the paca * Must be called with interrupts disabled. * Assumes that vtime_account_system/idle() has been called * recently (i.e. since the last entry from usermode) so that * get_paca()->user_time_scaled is up to date. 
*/ -void vtime_account_user(struct task_struct *tsk) +void vtime_flush(struct task_struct *tsk) { - cputime_t utime, utimescaled; struct cpu_accounting_data *acct = get_accounting(tsk); - utime = acct->user_time; - utimescaled = acct->user_time_scaled; - acct->user_time = 0; - acct->user_time_scaled = 0; + if (acct->utime) + account_user_time(tsk, acct->utime); + + if (acct->utime_scaled) + tsk->utimescaled += acct->utime_scaled; + + if (acct->gtime) + account_guest_time(tsk, acct->gtime); + + if (acct->steal_time) + account_steal_time(acct->steal_time); + + if (acct->idle_time) + account_idle_time(acct->idle_time); + + if (acct->stime) + account_system_index_time(tsk, acct->stime, CPUTIME_SYSTEM); + + if (acct->stime_scaled) + tsk->stimescaled += acct->stime_scaled; + + if (acct->hardirq_time) + account_system_index_time(tsk, acct->hardirq_time, CPUTIME_IRQ); + + if (acct->softirq_time) + account_system_index_time(tsk, acct->softirq_time, CPUTIME_SOFTIRQ); + + acct->utime = 0; + acct->utime_scaled = 0; acct->utime_sspurr = 0; - account_user_time(tsk, utime); - tsk->utimescaled += utimescaled; + acct->gtime = 0; + acct->steal_time = 0; + acct->idle_time = 0; + acct->stime = 0; + acct->stime_scaled = 0; + acct->hardirq_time = 0; + acct->softirq_time = 0; } #ifdef CONFIG_PPC32 @@ -407,8 +442,7 @@ void arch_vtime_task_switch(struct task_struct *prev) struct cpu_accounting_data *acct = get_accounting(current); acct->starttime = get_accounting(prev)->starttime; - acct->system_time = 0; - acct->user_time = 0; + acct->startspurr = get_accounting(prev)->startspurr; } #endif /* CONFIG_PPC32 */ diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c index 9c0e17cf6886..3f864c36d847 100644 --- a/arch/powerpc/xmon/xmon.c +++ b/arch/powerpc/xmon/xmon.c @@ -2287,14 +2287,14 @@ static void dump_one_paca(int cpu) DUMP(p, subcore_sibling_mask, "x"); #endif - DUMP(p, accounting.user_time, "llx"); - DUMP(p, accounting.system_time, "llx"); - DUMP(p, accounting.user_time_scaled, "llx"); + DUMP(p, accounting.utime, "llx"); + DUMP(p, accounting.stime, "llx"); + DUMP(p, accounting.utime_scaled, "llx"); DUMP(p, accounting.starttime, "llx"); DUMP(p, accounting.starttime_user, "llx"); DUMP(p, accounting.startspurr, "llx"); DUMP(p, accounting.utime_sspurr, "llx"); - DUMP(p, stolen_time, "llx"); + DUMP(p, accounting.steal_time, "llx"); #undef DUMP catch_memory_errors = 0; diff --git a/arch/s390/include/asm/lowcore.h b/arch/s390/include/asm/lowcore.h index 9bfad2ad6312..61261e0e95c0 100644 --- a/arch/s390/include/asm/lowcore.h +++ b/arch/s390/include/asm/lowcore.h @@ -85,53 +85,56 @@ struct lowcore { __u64 mcck_enter_timer; /* 0x02c0 */ __u64 exit_timer; /* 0x02c8 */ __u64 user_timer; /* 0x02d0 */ - __u64 system_timer; /* 0x02d8 */ - __u64 steal_timer; /* 0x02e0 */ - __u64 last_update_timer; /* 0x02e8 */ - __u64 last_update_clock; /* 0x02f0 */ - __u64 int_clock; /* 0x02f8 */ - __u64 mcck_clock; /* 0x0300 */ - __u64 clock_comparator; /* 0x0308 */ + __u64 guest_timer; /* 0x02d8 */ + __u64 system_timer; /* 0x02e0 */ + __u64 hardirq_timer; /* 0x02e8 */ + __u64 softirq_timer; /* 0x02f0 */ + __u64 steal_timer; /* 0x02f8 */ + __u64 last_update_timer; /* 0x0300 */ + __u64 last_update_clock; /* 0x0308 */ + __u64 int_clock; /* 0x0310 */ + __u64 mcck_clock; /* 0x0318 */ + __u64 clock_comparator; /* 0x0320 */ /* Current process. 
*/ - __u64 current_task; /* 0x0310 */ - __u8 pad_0x318[0x320-0x318]; /* 0x0318 */ - __u64 kernel_stack; /* 0x0320 */ + __u64 current_task; /* 0x0328 */ + __u8 pad_0x318[0x320-0x318]; /* 0x0330 */ + __u64 kernel_stack; /* 0x0338 */ /* Interrupt, panic and restart stack. */ - __u64 async_stack; /* 0x0328 */ - __u64 panic_stack; /* 0x0330 */ - __u64 restart_stack; /* 0x0338 */ + __u64 async_stack; /* 0x0340 */ + __u64 panic_stack; /* 0x0348 */ + __u64 restart_stack; /* 0x0350 */ /* Restart function and parameter. */ - __u64 restart_fn; /* 0x0340 */ - __u64 restart_data; /* 0x0348 */ - __u64 restart_source; /* 0x0350 */ + __u64 restart_fn; /* 0x0358 */ + __u64 restart_data; /* 0x0360 */ + __u64 restart_source; /* 0x0368 */ /* Address space pointer. */ - __u64 kernel_asce; /* 0x0358 */ - __u64 user_asce; /* 0x0360 */ + __u64 kernel_asce; /* 0x0370 */ + __u64 user_asce; /* 0x0378 */ /* * The lpp and current_pid fields form a * 64-bit value that is set as program * parameter with the LPP instruction. */ - __u32 lpp; /* 0x0368 */ - __u32 current_pid; /* 0x036c */ + __u32 lpp; /* 0x0380 */ + __u32 current_pid; /* 0x0384 */ /* SMP info area */ - __u32 cpu_nr; /* 0x0370 */ - __u32 softirq_pending; /* 0x0374 */ - __u64 percpu_offset; /* 0x0378 */ - __u64 vdso_per_cpu_data; /* 0x0380 */ - __u64 machine_flags; /* 0x0388 */ - __u32 preempt_count; /* 0x0390 */ - __u8 pad_0x0394[0x0398-0x0394]; /* 0x0394 */ - __u64 gmap; /* 0x0398 */ - __u32 spinlock_lockval; /* 0x03a0 */ - __u32 fpu_flags; /* 0x03a4 */ - __u8 pad_0x03a8[0x0400-0x03a8]; /* 0x03a8 */ + __u32 cpu_nr; /* 0x0388 */ + __u32 softirq_pending; /* 0x038c */ + __u64 percpu_offset; /* 0x0390 */ + __u64 vdso_per_cpu_data; /* 0x0398 */ + __u64 machine_flags; /* 0x03a0 */ + __u32 preempt_count; /* 0x03a8 */ + __u8 pad_0x03ac[0x03b0-0x03ac]; /* 0x03ac */ + __u64 gmap; /* 0x03b0 */ + __u32 spinlock_lockval; /* 0x03b8 */ + __u32 fpu_flags; /* 0x03bc */ + __u8 pad_0x03c0[0x0400-0x03c0]; /* 0x03c0 */ /* Per cpu primary space access list */ __u32 paste[16]; /* 0x0400 */ diff --git a/arch/s390/include/asm/processor.h b/arch/s390/include/asm/processor.h index 6bca916a5ba0..977a5b6501b8 100644 --- a/arch/s390/include/asm/processor.h +++ b/arch/s390/include/asm/processor.h @@ -111,7 +111,10 @@ struct thread_struct { unsigned int acrs[NUM_ACRS]; unsigned long ksp; /* kernel stack pointer */ unsigned long user_timer; /* task cputime in user space */ + unsigned long guest_timer; /* task cputime in kvm guest */ unsigned long system_timer; /* task cputime in kernel space */ + unsigned long hardirq_timer; /* task cputime in hardirq context */ + unsigned long softirq_timer; /* task cputime in softirq context */ unsigned long sys_call_table; /* system call table address */ mm_segment_t mm_segment; unsigned long gmap_addr; /* address of last gmap fault. 
*/ diff --git a/arch/s390/kernel/vtime.c b/arch/s390/kernel/vtime.c index 1b5c5ee9fc1b..0a9e5d67547d 100644 --- a/arch/s390/kernel/vtime.c +++ b/arch/s390/kernel/vtime.c @@ -90,14 +90,41 @@ static void update_mt_scaling(void) __this_cpu_write(mt_scaling_jiffies, jiffies_64); } +static inline u64 update_tsk_timer(unsigned long *tsk_vtime, u64 new) +{ + u64 delta; + + delta = new - *tsk_vtime; + *tsk_vtime = new; + return delta; +} + + +static inline u64 scale_vtime(u64 vtime) +{ + u64 mult = __this_cpu_read(mt_scaling_mult); + u64 div = __this_cpu_read(mt_scaling_div); + + if (smp_cpu_mtid) + return vtime * mult / div; + return vtime; +} + +static void account_system_index_scaled(struct task_struct *p, + cputime_t cputime, cputime_t scaled, + enum cpu_usage_stat index) +{ + p->stimescaled += scaled; + account_system_index_time(p, cputime, index); +} + /* * Update process times based on virtual cpu times stored by entry.S * to the lowcore fields user_timer, system_timer & steal_clock. */ static int do_account_vtime(struct task_struct *tsk) { - u64 timer, clock, user, system, steal; - u64 user_scaled, system_scaled; + u64 timer, clock, user, guest, system, hardirq, softirq, steal; timer = S390_lowcore.last_update_timer; clock = S390_lowcore.last_update_clock; @@ -110,36 +137,53 @@ static int do_account_vtime(struct task_struct *tsk) #endif : "=m" (S390_lowcore.last_update_timer), "=m" (S390_lowcore.last_update_clock)); - S390_lowcore.system_timer += timer - S390_lowcore.last_update_timer; - S390_lowcore.steal_timer += S390_lowcore.last_update_clock - clock; + clock = S390_lowcore.last_update_clock - clock; + timer -= S390_lowcore.last_update_timer; + + if (hardirq_count()) + S390_lowcore.hardirq_timer += timer; + else + S390_lowcore.system_timer += timer; /* Update MT utilization calculation */ if (smp_cpu_mtid && time_after64(jiffies_64, this_cpu_read(mt_scaling_jiffies))) update_mt_scaling(); - user = S390_lowcore.user_timer - tsk->thread.user_timer; - S390_lowcore.steal_timer -= user; - tsk->thread.user_timer = S390_lowcore.user_timer; - - system = S390_lowcore.system_timer - tsk->thread.system_timer; - S390_lowcore.steal_timer -= system; - tsk->thread.system_timer = S390_lowcore.system_timer; - - user_scaled = user; - system_scaled = system; - /* Do MT utilization scaling */ - if (smp_cpu_mtid) { - u64 mult = __this_cpu_read(mt_scaling_mult); - u64 div = __this_cpu_read(mt_scaling_div); + /* Calculate cputime delta */ + user = update_tsk_timer(&tsk->thread.user_timer, + READ_ONCE(S390_lowcore.user_timer)); + guest = update_tsk_timer(&tsk->thread.guest_timer, + READ_ONCE(S390_lowcore.guest_timer)); + system = update_tsk_timer(&tsk->thread.system_timer, + READ_ONCE(S390_lowcore.system_timer)); + hardirq = update_tsk_timer(&tsk->thread.hardirq_timer, + READ_ONCE(S390_lowcore.hardirq_timer)); + softirq = update_tsk_timer(&tsk->thread.softirq_timer, + READ_ONCE(S390_lowcore.softirq_timer)); + S390_lowcore.steal_timer += + clock - user - guest - system - hardirq - softirq; + + /* Push account value */ + if (user) { + account_user_time(tsk, user); + tsk->utimescaled += scale_vtime(user); + } - user_scaled = (user_scaled * mult) / div; - system_scaled = (system_scaled * mult) / div; + if (guest) { + account_guest_time(tsk, guest); + tsk->utimescaled += scale_vtime(guest); } - account_user_time(tsk, user); - tsk->utimescaled += user_scaled; - account_system_time(tsk, 0, system); - tsk->stimescaled += system_scaled; + + if (system) + account_system_index_scaled(tsk, system, scale_vtime(system), + 
CPUTIME_SYSTEM); + if (hardirq) + account_system_index_scaled(tsk, hardirq, scale_vtime(hardirq), + CPUTIME_IRQ); + if (softirq) + account_system_index_scaled(tsk, softirq, scale_vtime(softirq), + CPUTIME_SOFTIRQ); steal = S390_lowcore.steal_timer; if ((s64) steal > 0) { @@ -147,16 +191,22 @@ static int do_account_vtime(struct task_struct *tsk) account_steal_time(steal); } - return virt_timer_forward(user + system); + return virt_timer_forward(user + guest + system + hardirq + softirq); } void vtime_task_switch(struct task_struct *prev) { do_account_vtime(prev); prev->thread.user_timer = S390_lowcore.user_timer; + prev->thread.guest_timer = S390_lowcore.guest_timer; prev->thread.system_timer = S390_lowcore.system_timer; + prev->thread.hardirq_timer = S390_lowcore.hardirq_timer; + prev->thread.softirq_timer = S390_lowcore.softirq_timer; S390_lowcore.user_timer = current->thread.user_timer; + S390_lowcore.guest_timer = current->thread.guest_timer; S390_lowcore.system_timer = current->thread.system_timer; + S390_lowcore.hardirq_timer = current->thread.hardirq_timer; + S390_lowcore.softirq_timer = current->thread.softirq_timer; } /* @@ -164,7 +214,7 @@ void vtime_task_switch(struct task_struct *prev) * accounting system time in order to correctly compute * the stolen time accounting. */ -void vtime_account_user(struct task_struct *tsk) +void vtime_flush(struct task_struct *tsk) { if (do_account_vtime(tsk)) virt_timer_expire(); @@ -176,32 +226,22 @@ void vtime_account_user(struct task_struct *tsk) */ void vtime_account_irq_enter(struct task_struct *tsk) { - u64 timer, system, system_scaled; + u64 timer; timer = S390_lowcore.last_update_timer; S390_lowcore.last_update_timer = get_vtimer(); - S390_lowcore.system_timer += timer - S390_lowcore.last_update_timer; - - /* Update MT utilization calculation */ - if (smp_cpu_mtid && - time_after64(jiffies_64, this_cpu_read(mt_scaling_jiffies))) - update_mt_scaling(); - - system = S390_lowcore.system_timer - tsk->thread.system_timer; - S390_lowcore.steal_timer -= system; - tsk->thread.system_timer = S390_lowcore.system_timer; - system_scaled = system; - /* Do MT utilization scaling */ - if (smp_cpu_mtid) { - u64 mult = __this_cpu_read(mt_scaling_mult); - u64 div = __this_cpu_read(mt_scaling_div); - - system_scaled = (system_scaled * mult) / div; - } - account_system_time(tsk, 0, system); - tsk->stimescaled += system_scaled; - - virt_timer_forward(system); + timer -= S390_lowcore.last_update_timer; + + if ((tsk->flags & PF_VCPU) && (irq_count() == 0)) + S390_lowcore.guest_timer += timer; + else if (hardirq_count()) + S390_lowcore.hardirq_timer += timer; + else if (in_serving_softirq()) + S390_lowcore.softirq_timer += timer; + else + S390_lowcore.system_timer += timer; + + virt_timer_forward(timer); } EXPORT_SYMBOL_GPL(vtime_account_irq_enter); diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 1d3167269a67..80e657e89eed 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -541,8 +541,10 @@ static void early_init_amd(struct cpuinfo_x86 *c) if (c->x86_power & (1 << 8)) { set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC); - if (!check_tsc_unstable()) - set_sched_clock_stable(); + if (check_tsc_unstable()) + clear_sched_clock_stable(); + } else { + clear_sched_clock_stable(); } /* Bit 12 of 8000_0007 edx is accumulated power mechanism. 
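The ia64, powerpc and s390 hunks above all converge on the same shape: the hot paths (context switches, irq entry/exit) only bucket raw deltas into per-context counters, and vtime_flush() pushes every non-zero bucket into the generic cputime code once per tick. A compressed, illustrative sketch of that shape: all names are placeholders, and the printf() calls stand in for account_user_time(), account_guest_time(), account_system_index_time() and friends.

/* Illustrative sketch of the "accumulate per context, flush on tick"
 * pattern; not kernel code. */
#include <stdio.h>

struct vtime_buckets {
	unsigned long long utime, gtime, stime;
	unsigned long long hardirq_time, softirq_time, idle_time;
};

enum ctx { CTX_USER, CTX_GUEST, CTX_SYSTEM, CTX_HARDIRQ, CTX_SOFTIRQ, CTX_IDLE };

/* Called on every context transition: just bucket the delta. */
static void vtime_account(struct vtime_buckets *b, enum ctx c,
			  unsigned long long delta)
{
	switch (c) {
	case CTX_USER:		b->utime += delta; break;
	case CTX_GUEST:		b->gtime += delta; break;
	case CTX_HARDIRQ:	b->hardirq_time += delta; break;
	case CTX_SOFTIRQ:	b->softirq_time += delta; break;
	case CTX_IDLE:		b->idle_time += delta; break;
	default:		b->stime += delta; break;
	}
}

/* Called once per tick: push the non-zero buckets, then clear them. */
static void vtime_flush_sketch(struct vtime_buckets *b)
{
	if (b->utime)
		printf("account_user_time(%llu)\n", b->utime);
	if (b->gtime)
		printf("account_guest_time(%llu)\n", b->gtime);
	if (b->stime)
		printf("account_system_index_time(%llu, CPUTIME_SYSTEM)\n", b->stime);
	if (b->hardirq_time)
		printf("account_system_index_time(%llu, CPUTIME_IRQ)\n", b->hardirq_time);
	if (b->softirq_time)
		printf("account_system_index_time(%llu, CPUTIME_SOFTIRQ)\n", b->softirq_time);
	if (b->idle_time)
		printf("account_idle_time(%llu)\n", b->idle_time);

	*b = (struct vtime_buckets){ 0 };
}

int main(void)
{
	struct vtime_buckets b = { 0 };

	vtime_account(&b, CTX_SYSTEM, 120);
	vtime_account(&b, CTX_HARDIRQ, 30);
	vtime_account(&b, CTX_USER, 500);
	vtime_flush_sketch(&b);
	return 0;
}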
*/ diff --git a/arch/x86/kernel/cpu/centaur.c b/arch/x86/kernel/cpu/centaur.c index 1661d8ec9280..2c234a6d94c4 100644 --- a/arch/x86/kernel/cpu/centaur.c +++ b/arch/x86/kernel/cpu/centaur.c @@ -1,5 +1,5 @@ -#include <linux/bitops.h> -#include <linux/kernel.h> + +#include <linux/sched.h> #include <asm/cpufeature.h> #include <asm/e820.h> @@ -104,6 +104,8 @@ static void early_init_centaur(struct cpuinfo_x86 *c) #ifdef CONFIG_X86_64 set_cpu_cap(c, X86_FEATURE_SYSENTER32); #endif + + clear_sched_clock_stable(); } static void init_centaur(struct cpuinfo_x86 *c) diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 9bab7a8a4293..0bdb1ab7d17c 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -83,6 +83,7 @@ static void default_init(struct cpuinfo_x86 *c) strcpy(c->x86_model_id, "386"); } #endif + clear_sched_clock_stable(); } static const struct cpu_dev default_cpu = { @@ -1055,6 +1056,8 @@ static void identify_cpu(struct cpuinfo_x86 *c) */ if (this_cpu->c_init) this_cpu->c_init(c); + else + clear_sched_clock_stable(); /* Disable the PN if appropriate */ squash_the_stupid_serial_number(c); diff --git a/arch/x86/kernel/cpu/cyrix.c b/arch/x86/kernel/cpu/cyrix.c index bd9dcd6b712d..47416f959a48 100644 --- a/arch/x86/kernel/cpu/cyrix.c +++ b/arch/x86/kernel/cpu/cyrix.c @@ -9,6 +9,7 @@ #include <asm/pci-direct.h> #include <asm/tsc.h> #include <asm/cpufeature.h> +#include <linux/sched.h> #include "cpu.h" @@ -183,6 +184,7 @@ static void early_init_cyrix(struct cpuinfo_x86 *c) set_cpu_cap(c, X86_FEATURE_CYRIX_ARR); break; } + clear_sched_clock_stable(); } static void init_cyrix(struct cpuinfo_x86 *c) diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 203f860d2ab3..026c728d6ba7 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -119,8 +119,10 @@ static void early_init_intel(struct cpuinfo_x86 *c) if (c->x86_power & (1 << 8)) { set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC); - if (!check_tsc_unstable()) - set_sched_clock_stable(); + if (check_tsc_unstable()) + clear_sched_clock_stable(); + } else { + clear_sched_clock_stable(); } /* Penwell and Cloverview have the TSC which doesn't sleep on S3 */ diff --git a/arch/x86/kernel/cpu/transmeta.c b/arch/x86/kernel/cpu/transmeta.c index 34178564be2a..c1ea5b999839 100644 --- a/arch/x86/kernel/cpu/transmeta.c +++ b/arch/x86/kernel/cpu/transmeta.c @@ -1,4 +1,5 @@ #include <linux/kernel.h> +#include <linux/sched.h> #include <linux/mm.h> #include <asm/cpufeature.h> #include <asm/msr.h> @@ -14,6 +15,8 @@ static void early_init_transmeta(struct cpuinfo_x86 *c) if (xlvl >= 0x80860001) c->x86_capability[CPUID_8086_0001_EDX] = cpuid_edx(0x80860001); } + + clear_sched_clock_stable(); } static void init_transmeta(struct cpuinfo_x86 *c) diff --git a/arch/x86/kernel/itmt.c b/arch/x86/kernel/itmt.c index cb9c1ed1d391..f73f475d0573 100644 --- a/arch/x86/kernel/itmt.c +++ b/arch/x86/kernel/itmt.c @@ -132,10 +132,8 @@ int sched_set_itmt_support(void) sysctl_sched_itmt_enabled = 1; - if (sysctl_sched_itmt_enabled) { - x86_topology_update = true; - rebuild_sched_domains(); - } + x86_topology_update = true; + rebuild_sched_domains(); mutex_unlock(&itmt_update_mutex); diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index 2a5cafdf8808..542710b99f52 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -107,12 +107,12 @@ static inline void kvm_sched_clock_init(bool stable) { if (!stable) { 
pv_time_ops.sched_clock = kvm_clock_read; + clear_sched_clock_stable(); return; } kvm_sched_clock_offset = kvm_clock_read(); pv_time_ops.sched_clock = kvm_sched_clock_read; - set_sched_clock_stable(); printk(KERN_INFO "kvm-clock: using sched offset of %llu cycles\n", kvm_sched_clock_offset); diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index e41af597aed8..cb60bbe093b2 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -1107,6 +1107,16 @@ static u64 read_tsc(struct clocksource *cs) return (u64)rdtsc_ordered(); } +static void tsc_cs_mark_unstable(struct clocksource *cs) +{ + if (tsc_unstable) + return; + tsc_unstable = 1; + clear_sched_clock_stable(); + disable_sched_clock_irqtime(); + pr_info("Marking TSC unstable due to clocksource watchdog\n"); +} + /* * .mask MUST be CLOCKSOURCE_MASK(64). See comment above read_tsc() */ @@ -1119,6 +1129,7 @@ static struct clocksource clocksource_tsc = { CLOCK_SOURCE_MUST_VERIFY, .archdata = { .vclock_mode = VCLOCK_TSC }, .resume = tsc_resume, + .mark_unstable = tsc_cs_mark_unstable, }; void mark_tsc_unstable(char *reason) diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 8c514367ba5a..b6b194ec1b4f 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -393,7 +393,7 @@ void jbd2_journal_commit_transaction(journal_t *journal) /* Do we need to erase the effects of a prior jbd2_journal_flush? */ if (journal->j_flags & JBD2_FLUSHED) { jbd_debug(3, "super block updated\n"); - mutex_lock(&journal->j_checkpoint_mutex); + mutex_lock_io(&journal->j_checkpoint_mutex); /* * We hold j_checkpoint_mutex so tail cannot change under us. * We don't need any special data guarantees for writing sb diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index a097048ed1a3..d8a5d0a08f07 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -944,7 +944,7 @@ out: */ void jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block) { - mutex_lock(&journal->j_checkpoint_mutex); + mutex_lock_io(&journal->j_checkpoint_mutex); if (tid_gt(tid, journal->j_tail_sequence)) __jbd2_update_log_tail(journal, tid, block); mutex_unlock(&journal->j_checkpoint_mutex); @@ -1304,7 +1304,7 @@ static int journal_reset(journal_t *journal) journal->j_flags |= JBD2_FLUSHED; } else { /* Lock here to make assertions happy... */ - mutex_lock(&journal->j_checkpoint_mutex); + mutex_lock_io(&journal->j_checkpoint_mutex); /* * Update log tail information. 
We use REQ_FUA since new * transaction will start reusing journal space and so we @@ -1691,7 +1691,7 @@ int jbd2_journal_destroy(journal_t *journal) spin_lock(&journal->j_list_lock); while (journal->j_checkpoint_transactions != NULL) { spin_unlock(&journal->j_list_lock); - mutex_lock(&journal->j_checkpoint_mutex); + mutex_lock_io(&journal->j_checkpoint_mutex); err = jbd2_log_do_checkpoint(journal); mutex_unlock(&journal->j_checkpoint_mutex); /* @@ -1713,7 +1713,7 @@ int jbd2_journal_destroy(journal_t *journal) if (journal->j_sb_buffer) { if (!is_journal_aborted(journal)) { - mutex_lock(&journal->j_checkpoint_mutex); + mutex_lock_io(&journal->j_checkpoint_mutex); write_lock(&journal->j_state_lock); journal->j_tail_sequence = @@ -1955,7 +1955,7 @@ int jbd2_journal_flush(journal_t *journal) spin_lock(&journal->j_list_lock); while (!err && journal->j_checkpoint_transactions != NULL) { spin_unlock(&journal->j_list_lock); - mutex_lock(&journal->j_checkpoint_mutex); + mutex_lock_io(&journal->j_checkpoint_mutex); err = jbd2_log_do_checkpoint(journal); mutex_unlock(&journal->j_checkpoint_mutex); spin_lock(&journal->j_list_lock); @@ -1965,7 +1965,7 @@ int jbd2_journal_flush(journal_t *journal) if (is_journal_aborted(journal)) return -EIO; - mutex_lock(&journal->j_checkpoint_mutex); + mutex_lock_io(&journal->j_checkpoint_mutex); if (!err) { err = jbd2_cleanup_journal_tail(journal); if (err < 0) { diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h index e315d04a2fd9..cfc75848a35d 100644 --- a/include/linux/clocksource.h +++ b/include/linux/clocksource.h @@ -62,6 +62,8 @@ struct module; * @archdata: arch-specific data * @suspend: suspend function for the clocksource, if necessary * @resume: resume function for the clocksource, if necessary + * @mark_unstable: Optional function to inform the clocksource driver that + * the watchdog marked the clocksource unstable * @owner: module reference, must be set by clocksource in modules * * Note: This struct is not used in hotpathes of the timekeeping code @@ -93,6 +95,7 @@ struct clocksource { unsigned long flags; void (*suspend)(struct clocksource *cs); void (*resume)(struct clocksource *cs); + void (*mark_unstable)(struct clocksource *cs); /* private: */ #ifdef CONFIG_CLOCKSOURCE_WATCHDOG diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h index 00f776816aa3..c3e38ded2d73 100644 --- a/include/linux/kernel_stat.h +++ b/include/linux/kernel_stat.h @@ -79,14 +79,17 @@ static inline unsigned int kstat_cpu_irqs_sum(unsigned int cpu) } extern void account_user_time(struct task_struct *, cputime_t); +extern void account_guest_time(struct task_struct *, cputime_t); extern void account_system_time(struct task_struct *, int, cputime_t); +extern void account_system_index_time(struct task_struct *, cputime_t, + enum cpu_usage_stat); extern void account_steal_time(cputime_t); extern void account_idle_time(cputime_t); #ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE static inline void account_process_tick(struct task_struct *tsk, int user) { - vtime_account_user(tsk); + vtime_flush(tsk); } #else extern void account_process_tick(struct task_struct *, int user); diff --git a/include/linux/mutex.h b/include/linux/mutex.h index b97870f2debd..7fffbfcd5430 100644 --- a/include/linux/mutex.h +++ b/include/linux/mutex.h @@ -156,10 +156,12 @@ extern int __must_check mutex_lock_interruptible_nested(struct mutex *lock, unsigned int subclass); extern int __must_check mutex_lock_killable_nested(struct mutex *lock, unsigned int subclass); +extern void 
mutex_lock_io_nested(struct mutex *lock, unsigned int subclass); #define mutex_lock(lock) mutex_lock_nested(lock, 0) #define mutex_lock_interruptible(lock) mutex_lock_interruptible_nested(lock, 0) #define mutex_lock_killable(lock) mutex_lock_killable_nested(lock, 0) +#define mutex_lock_io(lock) mutex_lock_io_nested(lock, 0) #define mutex_lock_nest_lock(lock, nest_lock) \ do { \ @@ -171,11 +173,13 @@ do { \ extern void mutex_lock(struct mutex *lock); extern int __must_check mutex_lock_interruptible(struct mutex *lock); extern int __must_check mutex_lock_killable(struct mutex *lock); +extern void mutex_lock_io(struct mutex *lock); # define mutex_lock_nested(lock, subclass) mutex_lock(lock) # define mutex_lock_interruptible_nested(lock, subclass) mutex_lock_interruptible(lock) # define mutex_lock_killable_nested(lock, subclass) mutex_lock_killable(lock) # define mutex_lock_nest_lock(lock, nest_lock) mutex_lock(lock) +# define mutex_lock_io_nested(lock, subclass) mutex_lock(lock) #endif /* diff --git a/include/linux/sched.h b/include/linux/sched.h index ad3ec9ec61f7..69e6852fede1 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -461,12 +461,10 @@ extern signed long schedule_timeout_idle(signed long timeout); asmlinkage void schedule(void); extern void schedule_preempt_disabled(void); +extern int __must_check io_schedule_prepare(void); +extern void io_schedule_finish(int token); extern long io_schedule_timeout(long timeout); - -static inline void io_schedule(void) -{ - io_schedule_timeout(MAX_SCHEDULE_TIMEOUT); -} +extern void io_schedule(void); void __noreturn do_task_dead(void); @@ -2515,6 +2513,10 @@ extern u64 sched_clock_cpu(int cpu); extern void sched_clock_init(void); #ifndef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK +static inline void sched_clock_init_late(void) +{ +} + static inline void sched_clock_tick(void) { } @@ -2537,6 +2539,7 @@ static inline u64 local_clock(void) return sched_clock(); } #else +extern void sched_clock_init_late(void); /* * Architectures can set this to 1 if they have specified * CONFIG_HAVE_UNSTABLE_SCHED_CLOCK in their arch Kconfig, @@ -2544,7 +2547,6 @@ static inline u64 local_clock(void) * is reliable after all: */ extern int sched_clock_stable(void); -extern void set_sched_clock_stable(void); extern void clear_sched_clock_stable(void); extern void sched_clock_tick(void); diff --git a/include/linux/vtime.h b/include/linux/vtime.h index aa9bfea8804a..0681fe25abeb 100644 --- a/include/linux/vtime.h +++ b/include/linux/vtime.h @@ -58,27 +58,28 @@ static inline void vtime_task_switch(struct task_struct *prev) extern void vtime_account_system(struct task_struct *tsk); extern void vtime_account_idle(struct task_struct *tsk); -extern void vtime_account_user(struct task_struct *tsk); #else /* !CONFIG_VIRT_CPU_ACCOUNTING */ static inline void vtime_task_switch(struct task_struct *prev) { } static inline void vtime_account_system(struct task_struct *tsk) { } -static inline void vtime_account_user(struct task_struct *tsk) { } #endif /* !CONFIG_VIRT_CPU_ACCOUNTING */ #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN extern void arch_vtime_task_switch(struct task_struct *tsk); +extern void vtime_account_user(struct task_struct *tsk); extern void vtime_user_enter(struct task_struct *tsk); static inline void vtime_user_exit(struct task_struct *tsk) { vtime_account_user(tsk); } + extern void vtime_guest_enter(struct task_struct *tsk); extern void vtime_guest_exit(struct task_struct *tsk); extern void vtime_init_idle(struct task_struct *tsk, int cpu); #else /* 
!CONFIG_VIRT_CPU_ACCOUNTING_GEN */ +static inline void vtime_account_user(struct task_struct *tsk) { } static inline void vtime_user_enter(struct task_struct *tsk) { } static inline void vtime_user_exit(struct task_struct *tsk) { } static inline void vtime_guest_enter(struct task_struct *tsk) { } @@ -93,9 +94,11 @@ static inline void vtime_account_irq_exit(struct task_struct *tsk) /* On hard|softirq exit we always account to hard|softirq cputime */ vtime_account_system(tsk); } +extern void vtime_flush(struct task_struct *tsk); #else /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */ static inline void vtime_account_irq_enter(struct task_struct *tsk) { } static inline void vtime_account_irq_exit(struct task_struct *tsk) { } +static inline void vtime_flush(struct task_struct *tsk) { } #endif diff --git a/init/main.c b/init/main.c index b0c9d6facef9..19228149386c 100644 --- a/init/main.c +++ b/init/main.c @@ -625,7 +625,6 @@ asmlinkage __visible void __init start_kernel(void) numa_policy_init(); if (late_time_init) late_time_init(); - sched_clock_init(); calibrate_delay(); pidmap_init(); anon_vma_init(); diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index 9b349619f431..8464a5cbab97 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -783,6 +783,20 @@ mutex_lock_interruptible_nested(struct mutex *lock, unsigned int subclass) } EXPORT_SYMBOL_GPL(mutex_lock_interruptible_nested); +void __sched +mutex_lock_io_nested(struct mutex *lock, unsigned int subclass) +{ + int token; + + might_sleep(); + + token = io_schedule_prepare(); + __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, + subclass, NULL, _RET_IP_, NULL, 0); + io_schedule_finish(token); +} +EXPORT_SYMBOL_GPL(mutex_lock_io_nested); + static inline int ww_mutex_deadlock_injection(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) { @@ -950,6 +964,16 @@ int __sched mutex_lock_killable(struct mutex *lock) } EXPORT_SYMBOL(mutex_lock_killable); +void __sched mutex_lock_io(struct mutex *lock) +{ + int token; + + token = io_schedule_prepare(); + mutex_lock(lock); + io_schedule_finish(token); +} +EXPORT_SYMBOL_GPL(mutex_lock_io); + static noinline void __sched __mutex_lock_slowpath(struct mutex *lock) { diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c index e85a725e5c34..ad64efe41722 100644 --- a/kernel/sched/clock.c +++ b/kernel/sched/clock.c @@ -77,41 +77,88 @@ EXPORT_SYMBOL_GPL(sched_clock); __read_mostly int sched_clock_running; +void sched_clock_init(void) +{ + sched_clock_running = 1; +} + #ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK -static struct static_key __sched_clock_stable = STATIC_KEY_INIT; -static int __sched_clock_stable_early; +/* + * We must start with !__sched_clock_stable because the unstable -> stable + * transition is accurate, while the stable -> unstable transition is not. + * + * Similarly we start with __sched_clock_stable_early, thereby assuming we + * will become stable, such that there's only a single 1 -> 0 transition. 
+ */ +static DEFINE_STATIC_KEY_FALSE(__sched_clock_stable); +static int __sched_clock_stable_early = 1; -int sched_clock_stable(void) +/* + * We want: ktime_get_ns() + gtod_offset == sched_clock() + raw_offset + */ +static __read_mostly u64 raw_offset; +static __read_mostly u64 gtod_offset; + +struct sched_clock_data { + u64 tick_raw; + u64 tick_gtod; + u64 clock; +}; + +static DEFINE_PER_CPU_SHARED_ALIGNED(struct sched_clock_data, sched_clock_data); + +static inline struct sched_clock_data *this_scd(void) { - return static_key_false(&__sched_clock_stable); + return this_cpu_ptr(&sched_clock_data); } -static void __set_sched_clock_stable(void) +static inline struct sched_clock_data *cpu_sdc(int cpu) { - if (!sched_clock_stable()) - static_key_slow_inc(&__sched_clock_stable); + return &per_cpu(sched_clock_data, cpu); +} - tick_dep_clear(TICK_DEP_BIT_CLOCK_UNSTABLE); +int sched_clock_stable(void) +{ + return static_branch_likely(&__sched_clock_stable); } -void set_sched_clock_stable(void) +static void __set_sched_clock_stable(void) { - __sched_clock_stable_early = 1; + struct sched_clock_data *scd = this_scd(); - smp_mb(); /* matches sched_clock_init() */ + /* + * Attempt to make the (initial) unstable->stable transition continuous. + */ + raw_offset = (scd->tick_gtod + gtod_offset) - (scd->tick_raw); - if (!sched_clock_running) - return; + printk(KERN_INFO "sched_clock: Marking stable (%lld, %lld)->(%lld, %lld)\n", + scd->tick_gtod, gtod_offset, + scd->tick_raw, raw_offset); - __set_sched_clock_stable(); + static_branch_enable(&__sched_clock_stable); + tick_dep_clear(TICK_DEP_BIT_CLOCK_UNSTABLE); } static void __clear_sched_clock_stable(struct work_struct *work) { - /* XXX worry about clock continuity */ - if (sched_clock_stable()) - static_key_slow_dec(&__sched_clock_stable); + struct sched_clock_data *scd = this_scd(); + + /* + * Attempt to make the stable->unstable transition continuous. + * + * Trouble is, this is typically called from the TSC watchdog + * timer, which is late per definition. This means the tick + * values can already be screwy. + * + * Still do what we can. + */ + gtod_offset = (scd->tick_raw + raw_offset) - (scd->tick_gtod); + + printk(KERN_INFO "sched_clock: Marking unstable (%lld, %lld)<-(%lld, %lld)\n", + scd->tick_gtod, gtod_offset, + scd->tick_raw, raw_offset); + static_branch_disable(&__sched_clock_stable); tick_dep_set(TICK_DEP_BIT_CLOCK_UNSTABLE); } @@ -121,47 +168,15 @@ void clear_sched_clock_stable(void) { __sched_clock_stable_early = 0; - smp_mb(); /* matches sched_clock_init() */ - - if (!sched_clock_running) - return; + smp_mb(); /* matches sched_clock_init_late() */ - schedule_work(&sched_clock_work); + if (sched_clock_running == 2) + schedule_work(&sched_clock_work); } -struct sched_clock_data { - u64 tick_raw; - u64 tick_gtod; - u64 clock; -}; - -static DEFINE_PER_CPU_SHARED_ALIGNED(struct sched_clock_data, sched_clock_data); - -static inline struct sched_clock_data *this_scd(void) +void sched_clock_init_late(void) { - return this_cpu_ptr(&sched_clock_data); -} - -static inline struct sched_clock_data *cpu_sdc(int cpu) -{ - return &per_cpu(sched_clock_data, cpu); -} - -void sched_clock_init(void) -{ - u64 ktime_now = ktime_to_ns(ktime_get()); - int cpu; - - for_each_possible_cpu(cpu) { - struct sched_clock_data *scd = cpu_sdc(cpu); - - scd->tick_raw = 0; - scd->tick_gtod = ktime_now; - scd->clock = ktime_now; - } - - sched_clock_running = 1; - + sched_clock_running = 2; /* * Ensure that it is impossible to not do a static_key update. 
* @@ -173,8 +188,6 @@ void sched_clock_init(void) if (__sched_clock_stable_early) __set_sched_clock_stable(); - else - __clear_sched_clock_stable(NULL); } /* @@ -216,7 +229,7 @@ again: * scd->tick_gtod + TICK_NSEC); */ - clock = scd->tick_gtod + delta; + clock = scd->tick_gtod + gtod_offset + delta; min_clock = wrap_max(scd->tick_gtod, old_clock); max_clock = wrap_max(old_clock, scd->tick_gtod + TICK_NSEC); @@ -302,7 +315,7 @@ u64 sched_clock_cpu(int cpu) u64 clock; if (sched_clock_stable()) - return sched_clock(); + return sched_clock() + raw_offset; if (unlikely(!sched_clock_running)) return 0ull; @@ -323,23 +336,22 @@ EXPORT_SYMBOL_GPL(sched_clock_cpu); void sched_clock_tick(void) { struct sched_clock_data *scd; - u64 now, now_gtod; - - if (sched_clock_stable()) - return; - - if (unlikely(!sched_clock_running)) - return; WARN_ON_ONCE(!irqs_disabled()); + /* + * Update these values even if sched_clock_stable(), because it can + * become unstable at any point in time at which point we need some + * values to fall back on. + * + * XXX arguably we can skip this if we expose tsc_clocksource_reliable + */ scd = this_scd(); - now_gtod = ktime_to_ns(ktime_get()); - now = sched_clock(); + scd->tick_raw = sched_clock(); + scd->tick_gtod = ktime_get_ns(); - scd->tick_raw = now; - scd->tick_gtod = now_gtod; - sched_clock_local(scd); + if (!sched_clock_stable() && likely(sched_clock_running)) + sched_clock_local(scd); } /* @@ -366,11 +378,6 @@ EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event); #else /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ -void sched_clock_init(void) -{ - sched_clock_running = 1; -} - u64 sched_clock_cpu(int cpu) { if (unlikely(!sched_clock_running)) @@ -378,6 +385,7 @@ u64 sched_clock_cpu(int cpu) return sched_clock(); } + #endif /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ /* diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c index 8d0f35debf35..f063a25d4449 100644 --- a/kernel/sched/completion.c +++ b/kernel/sched/completion.c @@ -31,7 +31,8 @@ void complete(struct completion *x) unsigned long flags; spin_lock_irqsave(&x->wait.lock, flags); - x->done++; + if (x->done != UINT_MAX) + x->done++; __wake_up_locked(&x->wait, TASK_NORMAL, 1); spin_unlock_irqrestore(&x->wait.lock, flags); } @@ -51,7 +52,7 @@ void complete_all(struct completion *x) unsigned long flags; spin_lock_irqsave(&x->wait.lock, flags); - x->done += UINT_MAX/2; + x->done = UINT_MAX; __wake_up_locked(&x->wait, TASK_NORMAL, 0); spin_unlock_irqrestore(&x->wait.lock, flags); } @@ -79,7 +80,8 @@ do_wait_for_common(struct completion *x, if (!x->done) return timeout; } - x->done--; + if (x->done != UINT_MAX) + x->done--; return timeout ?: 1; } @@ -280,7 +282,7 @@ bool try_wait_for_completion(struct completion *x) spin_lock_irqsave(&x->wait.lock, flags); if (!x->done) ret = 0; - else + else if (x->done != UINT_MAX) x->done--; spin_unlock_irqrestore(&x->wait.lock, flags); return ret; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index c56fb57f2991..d01f9d047397 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -102,9 +102,12 @@ void update_rq_clock(struct rq *rq) lockdep_assert_held(&rq->lock); - if (rq->clock_skip_update & RQCF_ACT_SKIP) + if (rq->clock_update_flags & RQCF_ACT_SKIP) return; +#ifdef CONFIG_SCHED_DEBUG + rq->clock_update_flags |= RQCF_UPDATED; +#endif delta = sched_clock_cpu(cpu_of(rq)) - rq->clock; if (delta < 0) return; @@ -185,7 +188,7 @@ struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf) rq = task_rq(p); raw_spin_lock(&rq->lock); if (likely(rq == task_rq(p) 
&& !task_on_rq_migrating(p))) { - rf->cookie = lockdep_pin_lock(&rq->lock); + rq_pin_lock(rq, rf); return rq; } raw_spin_unlock(&rq->lock); @@ -225,7 +228,7 @@ struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf) * pair with the WMB to ensure we must then also see migrating. */ if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) { - rf->cookie = lockdep_pin_lock(&rq->lock); + rq_pin_lock(rq, rf); return rq; } raw_spin_unlock(&rq->lock); @@ -1195,9 +1198,9 @@ static int __set_cpus_allowed_ptr(struct task_struct *p, * OK, since we're going to drop the lock immediately * afterwards anyway. */ - lockdep_unpin_lock(&rq->lock, rf.cookie); + rq_unpin_lock(rq, &rf); rq = move_queued_task(rq, p, dest_cpu); - lockdep_repin_lock(&rq->lock, rf.cookie); + rq_repin_lock(rq, &rf); } out: task_rq_unlock(rq, p, &rf); @@ -1690,7 +1693,7 @@ static inline void ttwu_activate(struct rq *rq, struct task_struct *p, int en_fl * Mark the task runnable and perform wakeup-preemption. */ static void ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags, - struct pin_cookie cookie) + struct rq_flags *rf) { check_preempt_curr(rq, p, wake_flags); p->state = TASK_RUNNING; @@ -1702,9 +1705,9 @@ static void ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags, * Our task @p is fully woken up and running; so its safe to * drop the rq->lock, hereafter rq is only used for statistics. */ - lockdep_unpin_lock(&rq->lock, cookie); + rq_unpin_lock(rq, rf); p->sched_class->task_woken(rq, p); - lockdep_repin_lock(&rq->lock, cookie); + rq_repin_lock(rq, rf); } if (rq->idle_stamp) { @@ -1723,7 +1726,7 @@ static void ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags, static void ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags, - struct pin_cookie cookie) + struct rq_flags *rf) { int en_flags = ENQUEUE_WAKEUP; @@ -1738,7 +1741,7 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags, #endif ttwu_activate(rq, p, en_flags); - ttwu_do_wakeup(rq, p, wake_flags, cookie); + ttwu_do_wakeup(rq, p, wake_flags, rf); } /* @@ -1757,7 +1760,7 @@ static int ttwu_remote(struct task_struct *p, int wake_flags) if (task_on_rq_queued(p)) { /* check_preempt_curr() may use rq clock */ update_rq_clock(rq); - ttwu_do_wakeup(rq, p, wake_flags, rf.cookie); + ttwu_do_wakeup(rq, p, wake_flags, &rf); ret = 1; } __task_rq_unlock(rq, &rf); @@ -1770,15 +1773,15 @@ void sched_ttwu_pending(void) { struct rq *rq = this_rq(); struct llist_node *llist = llist_del_all(&rq->wake_list); - struct pin_cookie cookie; struct task_struct *p; unsigned long flags; + struct rq_flags rf; if (!llist) return; raw_spin_lock_irqsave(&rq->lock, flags); - cookie = lockdep_pin_lock(&rq->lock); + rq_pin_lock(rq, &rf); while (llist) { int wake_flags = 0; @@ -1789,10 +1792,10 @@ void sched_ttwu_pending(void) if (p->sched_remote_wakeup) wake_flags = WF_MIGRATED; - ttwu_do_activate(rq, p, wake_flags, cookie); + ttwu_do_activate(rq, p, wake_flags, &rf); } - lockdep_unpin_lock(&rq->lock, cookie); + rq_unpin_lock(rq, &rf); raw_spin_unlock_irqrestore(&rq->lock, flags); } @@ -1881,7 +1884,7 @@ bool cpus_share_cache(int this_cpu, int that_cpu) static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags) { struct rq *rq = cpu_rq(cpu); - struct pin_cookie cookie; + struct rq_flags rf; #if defined(CONFIG_SMP) if (sched_feat(TTWU_QUEUE) && !cpus_share_cache(smp_processor_id(), cpu)) { @@ -1892,9 +1895,9 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags) #endif 
raw_spin_lock(&rq->lock); - cookie = lockdep_pin_lock(&rq->lock); - ttwu_do_activate(rq, p, wake_flags, cookie); - lockdep_unpin_lock(&rq->lock, cookie); + rq_pin_lock(rq, &rf); + ttwu_do_activate(rq, p, wake_flags, &rf); + rq_unpin_lock(rq, &rf); raw_spin_unlock(&rq->lock); } @@ -2086,11 +2089,24 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) p->sched_contributes_to_load = !!task_contributes_to_load(p); p->state = TASK_WAKING; + if (p->in_iowait) { + delayacct_blkio_end(); + atomic_dec(&task_rq(p)->nr_iowait); + } + cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags); if (task_cpu(p) != cpu) { wake_flags |= WF_MIGRATED; set_task_cpu(p, cpu); } + +#else /* CONFIG_SMP */ + + if (p->in_iowait) { + delayacct_blkio_end(); + atomic_dec(&task_rq(p)->nr_iowait); + } + #endif /* CONFIG_SMP */ ttwu_queue(p, cpu, wake_flags); @@ -2111,7 +2127,7 @@ out: * ensure that this_rq() is locked, @p is bound to this_rq() and not * the current task. */ -static void try_to_wake_up_local(struct task_struct *p, struct pin_cookie cookie) +static void try_to_wake_up_local(struct task_struct *p, struct rq_flags *rf) { struct rq *rq = task_rq(p); @@ -2128,11 +2144,11 @@ static void try_to_wake_up_local(struct task_struct *p, struct pin_cookie cookie * disabled avoiding further scheduler activity on it and we've * not yet picked a replacement task. */ - lockdep_unpin_lock(&rq->lock, cookie); + rq_unpin_lock(rq, rf); raw_spin_unlock(&rq->lock); raw_spin_lock(&p->pi_lock); raw_spin_lock(&rq->lock); - lockdep_repin_lock(&rq->lock, cookie); + rq_repin_lock(rq, rf); } if (!(p->state & TASK_NORMAL)) @@ -2140,10 +2156,15 @@ static void try_to_wake_up_local(struct task_struct *p, struct pin_cookie cookie trace_sched_waking(p); - if (!task_on_rq_queued(p)) + if (!task_on_rq_queued(p)) { + if (p->in_iowait) { + delayacct_blkio_end(); + atomic_dec(&rq->nr_iowait); + } ttwu_activate(rq, p, ENQUEUE_WAKEUP); + } - ttwu_do_wakeup(rq, p, 0, cookie); + ttwu_do_wakeup(rq, p, 0, rf); ttwu_stat(p, smp_processor_id(), 0); out: raw_spin_unlock(&p->pi_lock); @@ -2578,6 +2599,7 @@ void wake_up_new_task(struct task_struct *p) __set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0)); #endif rq = __task_rq_lock(p, &rf); + update_rq_clock(rq); post_init_entity_util_avg(&p->se); activate_task(rq, p, 0); @@ -2590,9 +2612,9 @@ void wake_up_new_task(struct task_struct *p) * Nothing relies on rq->lock after this, so its fine to * drop it. 
*/ - lockdep_unpin_lock(&rq->lock, rf.cookie); + rq_unpin_lock(rq, &rf); p->sched_class->task_woken(rq, p); - lockdep_repin_lock(&rq->lock, rf.cookie); + rq_repin_lock(rq, &rf); } #endif task_rq_unlock(rq, p, &rf); @@ -2861,7 +2883,7 @@ asmlinkage __visible void schedule_tail(struct task_struct *prev) */ static __always_inline struct rq * context_switch(struct rq *rq, struct task_struct *prev, - struct task_struct *next, struct pin_cookie cookie) + struct task_struct *next, struct rq_flags *rf) { struct mm_struct *mm, *oldmm; @@ -2887,13 +2909,16 @@ context_switch(struct rq *rq, struct task_struct *prev, prev->active_mm = NULL; rq->prev_mm = oldmm; } + + rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP); + /* * Since the runqueue lock will be released by the next * task (which is an invalid locking op but in the case * of the scheduler it's an obvious special-case), so we * do an early lockdep release here: */ - lockdep_unpin_lock(&rq->lock, cookie); + rq_unpin_lock(rq, rf); spin_release(&rq->lock.dep_map, 1, _THIS_IP_); /* Here we just switch the register state and the stack. */ @@ -2949,6 +2974,36 @@ unsigned long long nr_context_switches(void) return sum; } +/* + * IO-wait accounting, and how its mostly bollocks (on SMP). + * + * The idea behind IO-wait account is to account the idle time that we could + * have spend running if it were not for IO. That is, if we were to improve the + * storage performance, we'd have a proportional reduction in IO-wait time. + * + * This all works nicely on UP, where, when a task blocks on IO, we account + * idle time as IO-wait, because if the storage were faster, it could've been + * running and we'd not be idle. + * + * This has been extended to SMP, by doing the same for each CPU. This however + * is broken. + * + * Imagine for instance the case where two tasks block on one CPU, only the one + * CPU will have IO-wait accounted, while the other has regular idle. Even + * though, if the storage were faster, both could've ran at the same time, + * utilising both CPUs. + * + * This means, that when looking globally, the current IO-wait accounting on + * SMP is a lower bound, by reason of under accounting. + * + * Worse, since the numbers are provided per CPU, they are sometimes + * interpreted per CPU, and that is nonsensical. A blocked task isn't strictly + * associated with any one particular CPU, it can wake to another CPU than it + * blocked on. This means the per CPU IO-wait number is meaningless. + * + * Task CPU affinities can make all that even more 'interesting'. + */ + unsigned long nr_iowait(void) { unsigned long i, sum = 0; @@ -2959,6 +3014,13 @@ unsigned long nr_iowait(void) return sum; } +/* + * Consumers of these two interfaces, like for example the cpufreq menu + * governor are using nonsensical data. Boosting frequency for a CPU that has + * IO-wait which might not even end up running the task when it does become + * runnable. 
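The in_iowait bookkeeping discussed above is driven by the new io_schedule_prepare()/io_schedule_finish() pair (declared in the <linux/sched.h> hunk earlier and implemented in kernel/sched/core.c below); io_schedule(), io_schedule_timeout() and mutex_lock_io() are all built on it. A hypothetical kernel-context sketch, not from this patch, of wrapping some other blocking primitive the same way:

#include <linux/sched.h>
#include <linux/wait.h>

/* Hypothetical caller: account an arbitrary wait as IO-wait using the
 * helpers this series introduces.  wq and io_done are made-up names. */
static void my_wait_for_io(wait_queue_head_t *wq, int *io_done)
{
	int token;

	/* Saves current->in_iowait, sets it and flushes the block plug. */
	token = io_schedule_prepare();

	wait_event(*wq, *io_done);

	/* Restores the saved in_iowait value so nested users compose. */
	io_schedule_finish(token);
}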
+ */ + unsigned long nr_iowait_cpu(int cpu) { struct rq *this = cpu_rq(cpu); @@ -3257,31 +3319,30 @@ static inline void schedule_debug(struct task_struct *prev) * Pick up the highest-prio task: */ static inline struct task_struct * -pick_next_task(struct rq *rq, struct task_struct *prev, struct pin_cookie cookie) +pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) { - const struct sched_class *class = &fair_sched_class; + const struct sched_class *class; struct task_struct *p; /* * Optimization: we know that if all tasks are in * the fair class we can call that function directly: */ - if (likely(prev->sched_class == class && - rq->nr_running == rq->cfs.h_nr_running)) { - p = fair_sched_class.pick_next_task(rq, prev, cookie); + if (likely(rq->nr_running == rq->cfs.h_nr_running)) { + p = fair_sched_class.pick_next_task(rq, prev, rf); if (unlikely(p == RETRY_TASK)) goto again; /* assumes fair_sched_class->next == idle_sched_class */ if (unlikely(!p)) - p = idle_sched_class.pick_next_task(rq, prev, cookie); + p = idle_sched_class.pick_next_task(rq, prev, rf); return p; } again: for_each_class(class) { - p = class->pick_next_task(rq, prev, cookie); + p = class->pick_next_task(rq, prev, rf); if (p) { if (unlikely(p == RETRY_TASK)) goto again; @@ -3335,7 +3396,7 @@ static void __sched notrace __schedule(bool preempt) { struct task_struct *prev, *next; unsigned long *switch_count; - struct pin_cookie cookie; + struct rq_flags rf; struct rq *rq; int cpu; @@ -3358,9 +3419,9 @@ static void __sched notrace __schedule(bool preempt) */ smp_mb__before_spinlock(); raw_spin_lock(&rq->lock); - cookie = lockdep_pin_lock(&rq->lock); + rq_pin_lock(rq, &rf); - rq->clock_skip_update <<= 1; /* promote REQ to ACT */ + rq->clock_update_flags <<= 1; /* promote REQ to ACT */ switch_count = &prev->nivcsw; if (!preempt && prev->state) { @@ -3370,6 +3431,11 @@ static void __sched notrace __schedule(bool preempt) deactivate_task(rq, prev, DEQUEUE_SLEEP); prev->on_rq = 0; + if (prev->in_iowait) { + atomic_inc(&rq->nr_iowait); + delayacct_blkio_start(); + } + /* * If a worker went to sleep, notify and ask workqueue * whether it wants to wake up a task to maintain @@ -3380,7 +3446,7 @@ static void __sched notrace __schedule(bool preempt) to_wakeup = wq_worker_sleeping(prev); if (to_wakeup) - try_to_wake_up_local(to_wakeup, cookie); + try_to_wake_up_local(to_wakeup, &rf); } } switch_count = &prev->nvcsw; @@ -3389,10 +3455,9 @@ static void __sched notrace __schedule(bool preempt) if (task_on_rq_queued(prev)) update_rq_clock(rq); - next = pick_next_task(rq, prev, cookie); + next = pick_next_task(rq, prev, &rf); clear_tsk_need_resched(prev); clear_preempt_need_resched(); - rq->clock_skip_update = 0; if (likely(prev != next)) { rq->nr_switches++; @@ -3400,9 +3465,10 @@ static void __sched notrace __schedule(bool preempt) ++*switch_count; trace_sched_switch(preempt, prev, next); - rq = context_switch(rq, prev, next, cookie); /* unlocks the rq */ + rq = context_switch(rq, prev, next, &rf); /* unlocks the rq */ } else { - lockdep_unpin_lock(&rq->lock, cookie); + rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP); + rq_unpin_lock(rq, &rf); raw_spin_unlock_irq(&rq->lock); } @@ -3651,6 +3717,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio) BUG_ON(prio > MAX_PRIO); rq = __task_rq_lock(p, &rf); + update_rq_clock(rq); /* * Idle task boosting is a nono in general. 
There is one @@ -3747,6 +3814,8 @@ void set_user_nice(struct task_struct *p, long nice) * the task might be in the middle of scheduling on another CPU. */ rq = task_rq_lock(p, &rf); + update_rq_clock(rq); + /* * The RT priorities are set via sched_setscheduler(), but we still * allow the 'normal' nice value to be set - but as expected @@ -4179,6 +4248,7 @@ recheck: * runqueue lock must be held. */ rq = task_rq_lock(p, &rf); + update_rq_clock(rq); /* * Changing the policy of the stop threads its a very bad idea @@ -5057,31 +5127,48 @@ out_irq: } EXPORT_SYMBOL_GPL(yield_to); +int io_schedule_prepare(void) +{ + int old_iowait = current->in_iowait; + + current->in_iowait = 1; + blk_schedule_flush_plug(current); + + return old_iowait; +} + +void io_schedule_finish(int token) +{ + current->in_iowait = token; +} + /* * This task is about to go to sleep on IO. Increment rq->nr_iowait so * that process accounting knows that this is a task in IO wait state. */ long __sched io_schedule_timeout(long timeout) { - int old_iowait = current->in_iowait; - struct rq *rq; + int token; long ret; - current->in_iowait = 1; - blk_schedule_flush_plug(current); - - delayacct_blkio_start(); - rq = raw_rq(); - atomic_inc(&rq->nr_iowait); + token = io_schedule_prepare(); ret = schedule_timeout(timeout); - current->in_iowait = old_iowait; - atomic_dec(&rq->nr_iowait); - delayacct_blkio_end(); + io_schedule_finish(token); return ret; } EXPORT_SYMBOL(io_schedule_timeout); +void io_schedule(void) +{ + int token; + + token = io_schedule_prepare(); + schedule(); + io_schedule_finish(token); +} +EXPORT_SYMBOL(io_schedule); + /** * sys_sched_get_priority_max - return maximum RT priority. * @policy: scheduling class. @@ -5521,7 +5608,7 @@ static void migrate_tasks(struct rq *dead_rq) { struct rq *rq = dead_rq; struct task_struct *next, *stop = rq->stop; - struct pin_cookie cookie; + struct rq_flags rf, old_rf; int dest_cpu; /* @@ -5553,8 +5640,8 @@ static void migrate_tasks(struct rq *dead_rq) /* * pick_next_task assumes pinned rq->lock. */ - cookie = lockdep_pin_lock(&rq->lock); - next = pick_next_task(rq, &fake_task, cookie); + rq_pin_lock(rq, &rf); + next = pick_next_task(rq, &fake_task, &rf); BUG_ON(!next); next->sched_class->put_prev_task(rq, next); @@ -5567,7 +5654,7 @@ static void migrate_tasks(struct rq *dead_rq) * because !cpu_active at this point, which means load-balance * will not interfere. Also, stop-machine. */ - lockdep_unpin_lock(&rq->lock, cookie); + rq_unpin_lock(rq, &rf); raw_spin_unlock(&rq->lock); raw_spin_lock(&next->pi_lock); raw_spin_lock(&rq->lock); @@ -5582,6 +5669,13 @@ static void migrate_tasks(struct rq *dead_rq) continue; } + /* + * __migrate_task() may return with a different + * rq->lock held and a new cookie in 'rf', but we need + * to preserve rf::clock_update_flags for 'dead_rq'. + */ + old_rf = rf; + /* Find suitable destination for @next, with force if needed. 
*/ dest_cpu = select_fallback_rq(dead_rq->cpu, next); @@ -5590,6 +5684,7 @@ static void migrate_tasks(struct rq *dead_rq) raw_spin_unlock(&rq->lock); rq = dead_rq; raw_spin_lock(&rq->lock); + rf = old_rf; } raw_spin_unlock(&next->pi_lock); } @@ -5881,12 +5976,14 @@ static int init_rootdomain(struct root_domain *rd) init_dl_bw(&rd->dl_bw); if (cpudl_init(&rd->cpudl) != 0) - goto free_dlo_mask; + goto free_rto_mask; if (cpupri_init(&rd->cpupri) != 0) - goto free_rto_mask; + goto free_cpudl; return 0; +free_cpudl: + cpudl_cleanup(&rd->cpudl); free_rto_mask: free_cpumask_var(rd->rto_mask); free_dlo_mask: @@ -7487,6 +7584,7 @@ void __init sched_init_smp(void) init_sched_dl_class(); sched_init_smt(); + sched_clock_init_late(); sched_smp_initialized = true; } @@ -7502,6 +7600,7 @@ early_initcall(migration_init); void __init sched_init_smp(void) { sched_init_granularity(); + sched_clock_init_late(); } #endif /* CONFIG_SMP */ @@ -7545,6 +7644,8 @@ void __init sched_init(void) int i, j; unsigned long alloc_size = 0, ptr; + sched_clock_init(); + for (i = 0; i < WAIT_TABLE_SIZE; i++) init_waitqueue_head(bit_wait_table + i); @@ -7983,6 +8084,7 @@ void sched_move_task(struct task_struct *tsk) struct rq *rq; rq = task_rq_lock(tsk, &rf); + update_rq_clock(rq); running = task_current(rq, tsk); queued = task_on_rq_queued(tsk); @@ -8431,6 +8533,7 @@ static void cpu_cgroup_fork(struct task_struct *task) rq = task_rq_lock(task, &rf); + update_rq_clock(rq); sched_change_group(task, TASK_SET_GROUP); task_rq_unlock(rq, task, &rf); diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 7700a9cba335..f7c14cc71d06 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -151,7 +151,7 @@ void account_user_time(struct task_struct *p, cputime_t cputime) * @p: the process that the cpu time gets accounted to * @cputime: the cpu time spent in virtual machine since the last update */ -static void account_guest_time(struct task_struct *p, cputime_t cputime) +void account_guest_time(struct task_struct *p, cputime_t cputime) { u64 *cpustat = kcpustat_this_cpu->cpustat; @@ -176,8 +176,8 @@ static void account_guest_time(struct task_struct *p, cputime_t cputime) * @cputime: the cpu time spent in kernel space since the last update * @index: pointer to cpustat field that has to be updated */ -static inline -void __account_system_time(struct task_struct *p, cputime_t cputime, int index) +void account_system_index_time(struct task_struct *p, + cputime_t cputime, enum cpu_usage_stat index) { /* Add system time to process. */ p->stime += cputime; @@ -213,7 +213,7 @@ void account_system_time(struct task_struct *p, int hardirq_offset, else index = CPUTIME_SYSTEM; - __account_system_time(p, cputime, index); + account_system_index_time(p, cputime, index); } /* @@ -400,7 +400,7 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick, * So, we have to handle it separately here. * Also, p->stime needs to be updated for ksoftirqd. 
*/ - __account_system_time(p, cputime, CPUTIME_SOFTIRQ); + account_system_index_time(p, cputime, CPUTIME_SOFTIRQ); } else if (user_tick) { account_user_time(p, cputime); } else if (p == rq->idle) { @@ -408,7 +408,7 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick, } else if (p->flags & PF_VCPU) { /* System time or guest time */ account_guest_time(p, cputime); } else { - __account_system_time(p, cputime, CPUTIME_SYSTEM); + account_system_index_time(p, cputime, CPUTIME_SYSTEM); } } @@ -437,9 +437,7 @@ void vtime_common_task_switch(struct task_struct *prev) else vtime_account_system(prev); -#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE - vtime_account_user(prev); -#endif + vtime_flush(prev); arch_vtime_task_switch(prev); } #endif diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 70ef2b1901e4..27737f34757d 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -663,9 +663,9 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) * Nothing relies on rq->lock after this, so its safe to drop * rq->lock. */ - lockdep_unpin_lock(&rq->lock, rf.cookie); + rq_unpin_lock(rq, &rf); push_dl_task(rq); - lockdep_repin_lock(&rq->lock, rf.cookie); + rq_repin_lock(rq, &rf); } #endif @@ -1118,7 +1118,7 @@ static struct sched_dl_entity *pick_next_dl_entity(struct rq *rq, } struct task_struct * -pick_next_task_dl(struct rq *rq, struct task_struct *prev, struct pin_cookie cookie) +pick_next_task_dl(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) { struct sched_dl_entity *dl_se; struct task_struct *p; @@ -1133,9 +1133,9 @@ pick_next_task_dl(struct rq *rq, struct task_struct *prev, struct pin_cookie coo * disabled avoiding further scheduler activity on it and we're * being very careful to re-start the picking loop. */ - lockdep_unpin_lock(&rq->lock, cookie); + rq_unpin_lock(rq, rf); pull_dl_task(rq); - lockdep_repin_lock(&rq->lock, cookie); + rq_repin_lock(rq, rf); /* * pull_dl_task() can drop (and re-acquire) rq->lock; this * means a stop task can slip in, in which case we need to @@ -1729,12 +1729,11 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p) #ifdef CONFIG_SMP if (tsk_nr_cpus_allowed(p) > 1 && rq->dl.overloaded) queue_push_tasks(rq); -#else +#endif if (dl_task(rq->curr)) check_preempt_curr_dl(rq, p, 0); else resched_curr(rq); -#endif } } diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index fa178b62ea79..109adc0e9cb9 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -953,6 +953,10 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) #endif P(policy); P(prio); + if (p->policy == SCHED_DEADLINE) { + P(dl.runtime); + P(dl.deadline); + } #undef PN_SCHEDSTAT #undef PN #undef __PN diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 6559d197e08a..274c747a01ce 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2657,6 +2657,18 @@ static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg) if (tg_weight) shares /= tg_weight; + /* + * MIN_SHARES has to be unscaled here to support per-CPU partitioning + * of a group with small tg->shares value. It is a floor value which is + * assigned as a minimum load.weight to the sched_entity representing + * the group on a CPU. + * + * E.g. on 64-bit for a group with tg->shares of scale_load(15)=15*1024 + * on an 8-core system with 8 tasks each runnable on one CPU shares has + * to be 15*1024*1/8=1920 instead of scale_load(MIN_SHARES)=2*1024. 
In + * case no task is runnable on a CPU MIN_SHARES=2 should be returned + * instead of 0. + */ if (shares < MIN_SHARES) shares = MIN_SHARES; if (shares > tg->shares) @@ -2689,16 +2701,20 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, static inline int throttled_hierarchy(struct cfs_rq *cfs_rq); -static void update_cfs_shares(struct cfs_rq *cfs_rq) +static void update_cfs_shares(struct sched_entity *se) { + struct cfs_rq *cfs_rq = group_cfs_rq(se); struct task_group *tg; - struct sched_entity *se; long shares; - tg = cfs_rq->tg; - se = tg->se[cpu_of(rq_of(cfs_rq))]; - if (!se || throttled_hierarchy(cfs_rq)) + if (!cfs_rq) + return; + + if (throttled_hierarchy(cfs_rq)) return; + + tg = cfs_rq->tg; + #ifndef CONFIG_SMP if (likely(se->load.weight == tg->shares)) return; @@ -2707,8 +2723,9 @@ static void update_cfs_shares(struct cfs_rq *cfs_rq) reweight_entity(cfs_rq_of(se), se, shares); } + #else /* CONFIG_FAIR_GROUP_SCHED */ -static inline void update_cfs_shares(struct cfs_rq *cfs_rq) +static inline void update_cfs_shares(struct sched_entity *se) { } #endif /* CONFIG_FAIR_GROUP_SCHED */ @@ -3424,7 +3441,7 @@ static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq) return cfs_rq->avg.load_avg; } -static int idle_balance(struct rq *this_rq); +static int idle_balance(struct rq *this_rq, struct rq_flags *rf); #else /* CONFIG_SMP */ @@ -3453,7 +3470,7 @@ attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {} static inline void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {} -static inline int idle_balance(struct rq *rq) +static inline int idle_balance(struct rq *rq, struct rq_flags *rf) { return 0; } @@ -3582,10 +3599,18 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) if (renorm && !curr) se->vruntime += cfs_rq->min_vruntime; + /* + * When enqueuing a sched_entity, we must: + * - Update loads to have both entity and cfs_rq synced with now. + * - Add its load to cfs_rq->runnable_avg + * - For group_entity, update its weight to reflect the new share of + * its group cfs_rq + * - Add its new weight to cfs_rq->load.weight + */ update_load_avg(se, UPDATE_TG); enqueue_entity_load_avg(cfs_rq, se); + update_cfs_shares(se); account_entity_enqueue(cfs_rq, se); - update_cfs_shares(cfs_rq); if (flags & ENQUEUE_WAKEUP) place_entity(cfs_rq, se, 0); @@ -3657,6 +3682,15 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) * Update run-time statistics of the 'current'. */ update_curr(cfs_rq); + + /* + * When dequeuing a sched_entity, we must: + * - Update loads to have both entity and cfs_rq synced with now. + * - Substract its load from the cfs_rq->runnable_avg. + * - Substract its previous weight from cfs_rq->load.weight. + * - For group entity, update its weight to reflect the new share + * of its group cfs_rq. + */ update_load_avg(se, UPDATE_TG); dequeue_entity_load_avg(cfs_rq, se); @@ -3681,7 +3715,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) /* return excess runtime on last dequeue */ return_cfs_rq_runtime(cfs_rq); - update_cfs_shares(cfs_rq); + update_cfs_shares(se); /* * Now advance min_vruntime if @se was the entity holding it back, @@ -3864,7 +3898,7 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) * Ensure that runnable average is periodically updated. 
*/ update_load_avg(curr, UPDATE_TG); - update_cfs_shares(cfs_rq); + update_cfs_shares(curr); #ifdef CONFIG_SCHED_HRTICK /* @@ -4761,7 +4795,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) break; update_load_avg(se, UPDATE_TG); - update_cfs_shares(cfs_rq); + update_cfs_shares(se); } if (!se) @@ -4820,7 +4854,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) break; update_load_avg(se, UPDATE_TG); - update_cfs_shares(cfs_rq); + update_cfs_shares(se); } if (!se) @@ -6213,7 +6247,7 @@ preempt: } static struct task_struct * -pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct pin_cookie cookie) +pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) { struct cfs_rq *cfs_rq = &rq->cfs; struct sched_entity *se; @@ -6320,15 +6354,8 @@ simple: return p; idle: - /* - * This is OK, because current is on_cpu, which avoids it being picked - * for load-balance and preemption/IRQs are still disabled avoiding - * further scheduler activity on it and we're being very careful to - * re-start the picking loop. - */ - lockdep_unpin_lock(&rq->lock, cookie); - new_tasks = idle_balance(rq); - lockdep_repin_lock(&rq->lock, cookie); + new_tasks = idle_balance(rq, rf); + /* * Because idle_balance() releases (and re-acquires) rq->lock, it is * possible for any higher priority task to appear. In that case we @@ -8077,6 +8104,7 @@ redo: more_balance: raw_spin_lock_irqsave(&busiest->lock, flags); + update_rq_clock(busiest); /* * cur_ld_moved - load moved in current iteration @@ -8297,7 +8325,7 @@ update_next_balance(struct sched_domain *sd, unsigned long *next_balance) * idle_balance is called by schedule() if this_cpu is about to become * idle. Attempts to pull tasks from other CPUs. */ -static int idle_balance(struct rq *this_rq) +static int idle_balance(struct rq *this_rq, struct rq_flags *rf) { unsigned long next_balance = jiffies + HZ; int this_cpu = this_rq->cpu; @@ -8311,6 +8339,14 @@ static int idle_balance(struct rq *this_rq) */ this_rq->idle_stamp = rq_clock(this_rq); + /* + * This is OK, because current is on_cpu, which avoids it being picked + * for load-balance and preemption/IRQs are still disabled avoiding + * further scheduler activity on it and we're being very careful to + * re-start the picking loop. 
+ */ + rq_unpin_lock(this_rq, rf); + if (this_rq->avg_idle < sysctl_sched_migration_cost || !this_rq->rd->overload) { rcu_read_lock(); @@ -8388,6 +8424,8 @@ out: if (pulled_task) this_rq->idle_stamp = 0; + rq_repin_lock(this_rq, rf); + return pulled_task; } @@ -8443,6 +8481,7 @@ static int active_load_balance_cpu_stop(void *data) }; schedstat_inc(sd->alb_count); + update_rq_clock(busiest_rq); p = detach_one_task(&env); if (p) { @@ -9264,6 +9303,7 @@ void online_fair_sched_group(struct task_group *tg) se = tg->se[i]; raw_spin_lock_irq(&rq->lock); + update_rq_clock(rq); attach_entity_cfs_rq(se); sync_throttle(tg, i); raw_spin_unlock_irq(&rq->lock); @@ -9356,8 +9396,10 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares) /* Possible calls to update_curr() need rq clock */ update_rq_clock(rq); - for_each_sched_entity(se) - update_cfs_shares(group_cfs_rq(se)); + for_each_sched_entity(se) { + update_load_avg(se, UPDATE_TG); + update_cfs_shares(se); + } raw_spin_unlock_irqrestore(&rq->lock, flags); } diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c index 5405d3feb112..0c00172db63e 100644 --- a/kernel/sched/idle_task.c +++ b/kernel/sched/idle_task.c @@ -24,7 +24,7 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int fl } static struct task_struct * -pick_next_task_idle(struct rq *rq, struct task_struct *prev, struct pin_cookie cookie) +pick_next_task_idle(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) { put_prev_task(rq, prev); update_idle_core(rq); diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 2516b8df6dbb..704f2b89abf1 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -1523,7 +1523,7 @@ static struct task_struct *_pick_next_task_rt(struct rq *rq) } static struct task_struct * -pick_next_task_rt(struct rq *rq, struct task_struct *prev, struct pin_cookie cookie) +pick_next_task_rt(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) { struct task_struct *p; struct rt_rq *rt_rq = &rq->rt; @@ -1535,9 +1535,9 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev, struct pin_cookie coo * disabled avoiding further scheduler activity on it and we're * being very careful to re-start the picking loop. */ - lockdep_unpin_lock(&rq->lock, cookie); + rq_unpin_lock(rq, rf); pull_rt_task(rq); - lockdep_repin_lock(&rq->lock, cookie); + rq_repin_lock(rq, rf); /* * pull_rt_task() can drop (and re-acquire) rq->lock; this * means a dl or stop task can slip in, in which case we need @@ -2198,10 +2198,9 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p) #ifdef CONFIG_SMP if (tsk_nr_cpus_allowed(p) > 1 && rq->rt.overloaded) queue_push_tasks(rq); -#else +#endif /* CONFIG_SMP */ if (p->prio < rq->curr->prio) resched_curr(rq); -#endif /* CONFIG_SMP */ } } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 7b34c7826ca5..6eeae7ebd99b 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -644,7 +644,7 @@ struct rq { unsigned long next_balance; struct mm_struct *prev_mm; - unsigned int clock_skip_update; + unsigned int clock_update_flags; u64 clock; u64 clock_task; @@ -768,28 +768,110 @@ static inline u64 __rq_clock_broken(struct rq *rq) return READ_ONCE(rq->clock); } +/* + * rq::clock_update_flags bits + * + * %RQCF_REQ_SKIP - will request skipping of clock update on the next + * call to __schedule(). This is an optimisation to avoid + * neighbouring rq clock updates. 
+ * + * %RQCF_ACT_SKIP - is set from inside of __schedule() when skipping is + * in effect and calls to update_rq_clock() are being ignored. + * + * %RQCF_UPDATED - is a debug flag that indicates whether a call has been + * made to update_rq_clock() since the last time rq::lock was pinned. + * + * If inside of __schedule(), clock_update_flags will have been + * shifted left (a left shift is a cheap operation for the fast path + * to promote %RQCF_REQ_SKIP to %RQCF_ACT_SKIP), so you must use, + * + * if (rq->clock_update_flags >= RQCF_UPDATED) + * + * to check if %RQCF_UPDATED is set. It'll never be shifted more than + * one position though, because the next rq_unpin_lock() will shift it + * back. + */ +#define RQCF_REQ_SKIP 0x01 +#define RQCF_ACT_SKIP 0x02 +#define RQCF_UPDATED 0x04 + +static inline void assert_clock_updated(struct rq *rq) +{ + /* + * The only reason for not seeing a clock update since the + * last rq_pin_lock() is if we're currently skipping updates. + */ + SCHED_WARN_ON(rq->clock_update_flags < RQCF_ACT_SKIP); +} + static inline u64 rq_clock(struct rq *rq) { lockdep_assert_held(&rq->lock); + assert_clock_updated(rq); + return rq->clock; } static inline u64 rq_clock_task(struct rq *rq) { lockdep_assert_held(&rq->lock); + assert_clock_updated(rq); + return rq->clock_task; } -#define RQCF_REQ_SKIP 0x01 -#define RQCF_ACT_SKIP 0x02 - static inline void rq_clock_skip_update(struct rq *rq, bool skip) { lockdep_assert_held(&rq->lock); if (skip) - rq->clock_skip_update |= RQCF_REQ_SKIP; + rq->clock_update_flags |= RQCF_REQ_SKIP; else - rq->clock_skip_update &= ~RQCF_REQ_SKIP; + rq->clock_update_flags &= ~RQCF_REQ_SKIP; +} + +struct rq_flags { + unsigned long flags; + struct pin_cookie cookie; +#ifdef CONFIG_SCHED_DEBUG + /* + * A copy of (rq::clock_update_flags & RQCF_UPDATED) for the + * current pin context is stashed here in case it needs to be + * restored in rq_repin_lock(). + */ + unsigned int clock_update_flags; +#endif +}; + +static inline void rq_pin_lock(struct rq *rq, struct rq_flags *rf) +{ + rf->cookie = lockdep_pin_lock(&rq->lock); + +#ifdef CONFIG_SCHED_DEBUG + rq->clock_update_flags &= (RQCF_REQ_SKIP|RQCF_ACT_SKIP); + rf->clock_update_flags = 0; +#endif +} + +static inline void rq_unpin_lock(struct rq *rq, struct rq_flags *rf) +{ +#ifdef CONFIG_SCHED_DEBUG + if (rq->clock_update_flags > RQCF_ACT_SKIP) + rf->clock_update_flags = RQCF_UPDATED; +#endif + + lockdep_unpin_lock(&rq->lock, rf->cookie); +} + +static inline void rq_repin_lock(struct rq *rq, struct rq_flags *rf) +{ + lockdep_repin_lock(&rq->lock, rf->cookie); + +#ifdef CONFIG_SCHED_DEBUG + /* + * Restore the value we stashed in @rf for this pin context.
+ */ + rq->clock_update_flags |= rf->clock_update_flags; +#endif } #ifdef CONFIG_NUMA @@ -1245,7 +1327,7 @@ struct sched_class { */ struct task_struct * (*pick_next_task) (struct rq *rq, struct task_struct *prev, - struct pin_cookie cookie); + struct rq_flags *rf); void (*put_prev_task) (struct rq *rq, struct task_struct *p); #ifdef CONFIG_SMP @@ -1501,11 +1583,6 @@ static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta) { } static inline void sched_avg_update(struct rq *rq) { } #endif -struct rq_flags { - unsigned long flags; - struct pin_cookie cookie; -}; - struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf) __acquires(rq->lock); struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf) @@ -1515,7 +1592,7 @@ struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf) static inline void __task_rq_unlock(struct rq *rq, struct rq_flags *rf) __releases(rq->lock) { - lockdep_unpin_lock(&rq->lock, rf->cookie); + rq_unpin_lock(rq, rf); raw_spin_unlock(&rq->lock); } @@ -1524,7 +1601,7 @@ task_rq_unlock(struct rq *rq, struct task_struct *p, struct rq_flags *rf) __releases(rq->lock) __releases(p->pi_lock) { - lockdep_unpin_lock(&rq->lock, rf->cookie); + rq_unpin_lock(rq, rf); raw_spin_unlock(&rq->lock); raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags); } diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c index 604297a08b3a..9f69fb630853 100644 --- a/kernel/sched/stop_task.c +++ b/kernel/sched/stop_task.c @@ -24,7 +24,7 @@ check_preempt_curr_stop(struct rq *rq, struct task_struct *p, int flags) } static struct task_struct * -pick_next_task_stop(struct rq *rq, struct task_struct *prev, struct pin_cookie cookie) +pick_next_task_stop(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) { struct task_struct *stop = rq->stop; diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 665985b0a89a..93621ae718d3 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -141,6 +141,10 @@ static void __clocksource_unstable(struct clocksource *cs) { cs->flags &= ~(CLOCK_SOURCE_VALID_FOR_HRES | CLOCK_SOURCE_WATCHDOG); cs->flags |= CLOCK_SOURCE_UNSTABLE; + + if (cs->mark_unstable) + cs->mark_unstable(cs); + if (finished_booting) schedule_work(&watchdog_work); } |
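
The io_schedule_prepare()/io_schedule_finish() pair factored out of io_schedule_timeout() above lets a caller bracket an arbitrary blocking primitive with IO-wait accounting, instead of being limited to the plain schedule()/schedule_timeout() wrappers. The fragment below is only an illustrative sketch of such a caller, not code from this patch; the helper name my_lock_for_io() is invented for the example.

#include <linux/sched.h>
#include <linux/mutex.h>

/* Sketch: take a mutex that is typically held across IO, with the sleep accounted as IO-wait. */
static void my_lock_for_io(struct mutex *lock)
{
	int token;

	token = io_schedule_prepare();  /* set current->in_iowait and flush the block plug */
	mutex_lock(lock);               /* any blocking primitive could sit here */
	io_schedule_finish(token);      /* restore the previous in_iowait value */
}

As in io_schedule_timeout(), the token returned by io_schedule_prepare() preserves any outer in_iowait section, so such sections nest correctly.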
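Under CONFIG_SCHED_DEBUG, the rq_pin_lock()/rq_unpin_lock() helpers added to kernel/sched/sched.h also record whether update_rq_clock() has been called since rq->lock was pinned, which is what lets rq_clock() and rq_clock_task() warn about reading a stale clock. The snippet below is a sketch of the expected calling pattern inside the scheduler, not code from this patch; inspect_rq_clock() is an invented name and the snippet only builds in the context of kernel/sched/.

/* Sketch: the pin/update/read pattern that the new assertions enforce. */
static void inspect_rq_clock(struct rq *rq)
{
	struct rq_flags rf;

	raw_spin_lock_irq(&rq->lock);
	rq_pin_lock(rq, &rf);           /* clears RQCF_UPDATED for this pin context */

	update_rq_clock(rq);            /* without this, assert_clock_updated() in rq_clock() would warn */
	pr_info("rq clock: %llu\n", (unsigned long long)rq_clock(rq));

	rq_unpin_lock(rq, &rf);         /* stashes the RQCF_UPDATED state in rf for a later repin */
	raw_spin_unlock_irq(&rq->lock);
}

This mirrors what the patch does in core.c, where update_rq_clock() is now called right after __task_rq_lock()/task_rq_lock() in rt_mutex_setprio(), set_user_nice(), sched_move_task() and similar paths.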