From 6892b75e60557a48c01d57ba320419a9e2ce9846 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 13 Feb 2008 14:02:36 +0100 Subject: sched: make early bootup sched_clock() use safer do not call sched_clock() too early. Not only might rq->idle not be set up - but pure per-cpu data might not be accessible either. this solves an ia64 early bootup hang with CONFIG_PRINTK_TIME=y. Tested-by: Tony Luck Acked-by: Tony Luck Acked-by: David S. Miller Signed-off-by: Ingo Molnar --- kernel/sched.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index b387a8de26a5..7286ccb01082 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -668,6 +668,8 @@ const_debug unsigned int sysctl_sched_nr_migrate = 32; */ unsigned int sysctl_sched_rt_period = 1000000; +static __read_mostly int scheduler_running; + /* * part of the period that we allow rt tasks to run in us. * default: 0.95s @@ -689,14 +691,16 @@ unsigned long long cpu_clock(int cpu) unsigned long flags; struct rq *rq; - local_irq_save(flags); - rq = cpu_rq(cpu); /* * Only call sched_clock() if the scheduler has already been * initialized (some code might call cpu_clock() very early): */ - if (rq->idle) - update_rq_clock(rq); + if (unlikely(!scheduler_running)) + return 0; + + local_irq_save(flags); + rq = cpu_rq(cpu); + update_rq_clock(rq); now = rq->clock; local_irq_restore(flags); @@ -7284,6 +7288,8 @@ void __init sched_init(void) * During early bootup we pretend to be a normal task: */ current->sched_class = &fair_sched_class; + + scheduler_running = 1; } #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP -- cgit v1.2.3 From 70eee74b70c1a8485ec5f2bafa13dbc66fab6e02 Mon Sep 17 00:00:00 2001 From: Balbir Singh Date: Fri, 22 Feb 2008 13:25:53 +0530 Subject: sched: remove duplicate code from sched_fair.c pick_task_entity() duplicates existing code. This functionality can be easily obtained using rb_last(). Avoid code duplication by using rb_last(). 
Signed-off-by: Balbir Singh Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 6c091d6e159d..7abad50d935f 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -202,16 +202,13 @@ static struct sched_entity *__pick_next_entity(struct cfs_rq *cfs_rq) static inline struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) { - struct rb_node **link = &cfs_rq->tasks_timeline.rb_node; - struct sched_entity *se = NULL; - struct rb_node *parent; - - while (*link) { - parent = *link; - se = rb_entry(parent, struct sched_entity, run_node); - link = &parent->rb_right; - } + struct rb_node *last; + struct sched_entity *se; + last = rb_last(&cfs_rq->tasks_timeline); + if (!last) + return NULL; + se = rb_entry(last, struct sched_entity, run_node); return se; } -- cgit v1.2.3 From 7eee3e677d6e2e9007afcd7d79b0715525aa552e Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 22 Feb 2008 10:32:21 +0100 Subject: sched: clean up __pick_last_entity() a bit Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 7abad50d935f..c8e6492c5925 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -202,14 +202,12 @@ static struct sched_entity *__pick_next_entity(struct cfs_rq *cfs_rq) static inline struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) { - struct rb_node *last; - struct sched_entity *se; + struct rb_node *last = rb_last(&cfs_rq->tasks_timeline); - last = rb_last(&cfs_rq->tasks_timeline); if (!last) return NULL; - se = rb_entry(last, struct sched_entity, run_node); - return se; + + return rb_entry(last, struct sched_entity, run_node); } /************************************************************** -- cgit v1.2.3 From 67ca7bde2e9d3516b5ae0188330ad1059ac03f38 Mon Sep 17 00:00:00 2001 From: Harvey Harrison Date: Fri, 15 Feb 2008 09:56:36 -0800 Subject: sched: fix signedness warnings in sched.c Unsigned long values are always assigned to switch_count, make it unsigned long. kernel/sched.c:3897:15: warning: incorrect type in assignment (different signedness) kernel/sched.c:3897:15: expected long *switch_count kernel/sched.c:3897:15: got unsigned long * kernel/sched.c:3921:16: warning: incorrect type in assignment (different signedness) kernel/sched.c:3921:16: expected long *switch_count kernel/sched.c:3921:16: got unsigned long * Signed-off-by: Harvey Harrison Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 7286ccb01082..f06950c8a6ce 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3889,7 +3889,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev) asmlinkage void __sched schedule(void) { struct task_struct *prev, *next; - long *switch_count; + unsigned long *switch_count; struct rq *rq; int cpu; -- cgit v1.2.3 From 1481197b50114d7212d659d41cb97f31a8934883 Mon Sep 17 00:00:00 2001 From: Dale Farnsworth Date: Mon, 25 Feb 2008 23:03:02 +0100 Subject: Subject: lockdep: include all lock classes in all_lock_classes Add each lock class to the all_lock_classes list when it is first registered. Previously, lock classes were added to all_lock_classes when the lock class was first used. Since one of the uses of the list is to find unused locks, this didn't work well. 
Signed-off-by: Dale Farnsworth Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/lockdep.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 3574379f4d62..81a4e4a3f087 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -779,6 +779,10 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) * parallel walking of the hash-list safe: */ list_add_tail_rcu(&class->hash_entry, hash_head); + /* + * Add it to the global list of classes: + */ + list_add_tail_rcu(&class->lock_entry, &all_lock_classes); if (verbose(class)) { graph_unlock(); @@ -2282,10 +2286,6 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this, return 0; break; case LOCK_USED: - /* - * Add it to the global list of classes: - */ - list_add_tail_rcu(&this->class->lock_entry, &all_lock_classes); debug_atomic_dec(&nr_unused_locks); break; default: -- cgit v1.2.3 From cf3680b90c7842cf91ed857ac4528f4e057da366 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 14 Feb 2008 10:32:07 +0900 Subject: printk: fix possible printk overrun printk recursion detection prepends message to printk_buf and offsets printk_buf when actual message is printed but it forgets to trim buffer length accordingly. This can result in overrun in extreme cases. Fix it. [ mingo@elte.hu: bug was introduced by me via: commit 32a76006683f7b28ae3cc491da37716e002f198e Author: Ingo Molnar Date: Fri Jan 25 21:07:58 2008 +0100 printk: make printk more robust by not allowing recursion ] Signed-off-by: Tejun Heo Signed-off-by: Ingo Molnar Signed-off-by: Linus Torvalds --- kernel/printk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index bee36100f110..9adc2a473e6e 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -666,7 +666,7 @@ asmlinkage int vprintk(const char *fmt, va_list args) } /* Emit the output into the temporary buffer */ printed_len += vscnprintf(printk_buf + printed_len, - sizeof(printk_buf), fmt, args); + sizeof(printk_buf) - printed_len, fmt, args); /* * Copy the output into log_buf. If the caller didn't provide -- cgit v1.2.3 From 2232c2d8e0a6a31061dec311f3d1cf7624bc14f1 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 29 Feb 2008 18:46:50 +0100 Subject: rcu: add support for dynamic ticks and preempt rcu The PREEMPT-RCU can get stuck if a CPU goes idle and NO_HZ is set. The idle CPU will not progress the RCU through its grace period and a synchronize_rcu my get stuck. Without this patch I have a box that will not boot when PREEMPT_RCU and NO_HZ are set. That same box boots fine with this patch. This patch comes from the -rt kernel where it has been tested for several months. Signed-off-by: Steven Rostedt Signed-off-by: Paul E. 
McKenney Signed-off-by: Ingo Molnar --- include/linux/hardirq.h | 10 ++ include/linux/rcuclassic.h | 3 + include/linux/rcupreempt.h | 22 +++++ kernel/rcupreempt.c | 224 ++++++++++++++++++++++++++++++++++++++++++++- kernel/softirq.c | 1 + kernel/time/tick-sched.c | 3 + 6 files changed, 259 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h index 2961ec788046..49829988bfa0 100644 --- a/include/linux/hardirq.h +++ b/include/linux/hardirq.h @@ -109,6 +109,14 @@ static inline void account_system_vtime(struct task_struct *tsk) } #endif +#if defined(CONFIG_PREEMPT_RCU) && defined(CONFIG_NO_HZ) +extern void rcu_irq_enter(void); +extern void rcu_irq_exit(void); +#else +# define rcu_irq_enter() do { } while (0) +# define rcu_irq_exit() do { } while (0) +#endif /* CONFIG_PREEMPT_RCU */ + /* * It is safe to do non-atomic ops on ->hardirq_context, * because NMI handlers may not preempt and the ops are @@ -117,6 +125,7 @@ static inline void account_system_vtime(struct task_struct *tsk) */ #define __irq_enter() \ do { \ + rcu_irq_enter(); \ account_system_vtime(current); \ add_preempt_count(HARDIRQ_OFFSET); \ trace_hardirq_enter(); \ @@ -135,6 +144,7 @@ extern void irq_enter(void); trace_hardirq_exit(); \ account_system_vtime(current); \ sub_preempt_count(HARDIRQ_OFFSET); \ + rcu_irq_exit(); \ } while (0) /* diff --git a/include/linux/rcuclassic.h b/include/linux/rcuclassic.h index 4d6624260b4c..b3dccd68629e 100644 --- a/include/linux/rcuclassic.h +++ b/include/linux/rcuclassic.h @@ -160,5 +160,8 @@ extern void rcu_restart_cpu(int cpu); extern long rcu_batches_completed(void); extern long rcu_batches_completed_bh(void); +#define rcu_enter_nohz() do { } while (0) +#define rcu_exit_nohz() do { } while (0) + #endif /* __KERNEL__ */ #endif /* __LINUX_RCUCLASSIC_H */ diff --git a/include/linux/rcupreempt.h b/include/linux/rcupreempt.h index 60c2a033b19e..01152ed532c8 100644 --- a/include/linux/rcupreempt.h +++ b/include/linux/rcupreempt.h @@ -82,5 +82,27 @@ extern struct rcupreempt_trace *rcupreempt_trace_cpu(int cpu); struct softirq_action; +#ifdef CONFIG_NO_HZ +DECLARE_PER_CPU(long, dynticks_progress_counter); + +static inline void rcu_enter_nohz(void) +{ + __get_cpu_var(dynticks_progress_counter)++; + WARN_ON(__get_cpu_var(dynticks_progress_counter) & 0x1); + mb(); +} + +static inline void rcu_exit_nohz(void) +{ + mb(); + __get_cpu_var(dynticks_progress_counter)++; + WARN_ON(!(__get_cpu_var(dynticks_progress_counter) & 0x1)); +} + +#else /* CONFIG_NO_HZ */ +#define rcu_enter_nohz() do { } while (0) +#define rcu_exit_nohz() do { } while (0) +#endif /* CONFIG_NO_HZ */ + #endif /* __KERNEL__ */ #endif /* __LINUX_RCUPREEMPT_H */ diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c index 987cfb7ade89..c7c52096df48 100644 --- a/kernel/rcupreempt.c +++ b/kernel/rcupreempt.c @@ -23,6 +23,10 @@ * to Suparna Bhattacharya for pushing me completely away * from atomic instructions on the read side. * + * - Added handling of Dynamic Ticks + * Copyright 2007 - Paul E. Mckenney + * - Steven Rostedt + * * Papers: http://www.rdrop.com/users/paulmck/RCU * * Design Document: http://lwn.net/Articles/253651/ @@ -409,6 +413,212 @@ static void __rcu_advance_callbacks(struct rcu_data *rdp) } } +#ifdef CONFIG_NO_HZ + +DEFINE_PER_CPU(long, dynticks_progress_counter) = 1; +static DEFINE_PER_CPU(long, rcu_dyntick_snapshot); +static DEFINE_PER_CPU(int, rcu_update_flag); + +/** + * rcu_irq_enter - Called from Hard irq handlers and NMI/SMI. 
+ * + * If the CPU was idle with dynamic ticks active, this updates the + * dynticks_progress_counter to let the RCU handling know that the + * CPU is active. + */ +void rcu_irq_enter(void) +{ + int cpu = smp_processor_id(); + + if (per_cpu(rcu_update_flag, cpu)) + per_cpu(rcu_update_flag, cpu)++; + + /* + * Only update if we are coming from a stopped ticks mode + * (dynticks_progress_counter is even). + */ + if (!in_interrupt() && + (per_cpu(dynticks_progress_counter, cpu) & 0x1) == 0) { + /* + * The following might seem like we could have a race + * with NMI/SMIs. But this really isn't a problem. + * Here we do a read/modify/write, and the race happens + * when an NMI/SMI comes in after the read and before + * the write. But NMI/SMIs will increment this counter + * twice before returning, so the zero bit will not + * be corrupted by the NMI/SMI which is the most important + * part. + * + * The only thing is that we would bring back the counter + * to a postion that it was in during the NMI/SMI. + * But the zero bit would be set, so the rest of the + * counter would again be ignored. + * + * On return from the IRQ, the counter may have the zero + * bit be 0 and the counter the same as the return from + * the NMI/SMI. If the state machine was so unlucky to + * see that, it still doesn't matter, since all + * RCU read-side critical sections on this CPU would + * have already completed. + */ + per_cpu(dynticks_progress_counter, cpu)++; + /* + * The following memory barrier ensures that any + * rcu_read_lock() primitives in the irq handler + * are seen by other CPUs to follow the above + * increment to dynticks_progress_counter. This is + * required in order for other CPUs to correctly + * determine when it is safe to advance the RCU + * grace-period state machine. + */ + smp_mb(); /* see above block comment. */ + /* + * Since we can't determine the dynamic tick mode from + * the dynticks_progress_counter after this routine, + * we use a second flag to acknowledge that we came + * from an idle state with ticks stopped. + */ + per_cpu(rcu_update_flag, cpu)++; + /* + * If we take an NMI/SMI now, they will also increment + * the rcu_update_flag, and will not update the + * dynticks_progress_counter on exit. That is for + * this IRQ to do. + */ + } +} + +/** + * rcu_irq_exit - Called from exiting Hard irq context. + * + * If the CPU was idle with dynamic ticks active, update the + * dynticks_progress_counter to put let the RCU handling be + * aware that the CPU is going back to idle with no ticks. + */ +void rcu_irq_exit(void) +{ + int cpu = smp_processor_id(); + + /* + * rcu_update_flag is set if we interrupted the CPU + * when it was idle with ticks stopped. + * Once this occurs, we keep track of interrupt nesting + * because a NMI/SMI could also come in, and we still + * only want the IRQ that started the increment of the + * dynticks_progress_counter to be the one that modifies + * it on exit. + */ + if (per_cpu(rcu_update_flag, cpu)) { + if (--per_cpu(rcu_update_flag, cpu)) + return; + + /* This must match the interrupt nesting */ + WARN_ON(in_interrupt()); + + /* + * If an NMI/SMI happens now we are still + * protected by the dynticks_progress_counter being odd. + */ + + /* + * The following memory barrier ensures that any + * rcu_read_unlock() primitives in the irq handler + * are seen by other CPUs to preceed the following + * increment to dynticks_progress_counter. 
This + * is required in order for other CPUs to determine + * when it is safe to advance the RCU grace-period + * state machine. + */ + smp_mb(); /* see above block comment. */ + per_cpu(dynticks_progress_counter, cpu)++; + WARN_ON(per_cpu(dynticks_progress_counter, cpu) & 0x1); + } +} + +static void dyntick_save_progress_counter(int cpu) +{ + per_cpu(rcu_dyntick_snapshot, cpu) = + per_cpu(dynticks_progress_counter, cpu); +} + +static inline int +rcu_try_flip_waitack_needed(int cpu) +{ + long curr; + long snap; + + curr = per_cpu(dynticks_progress_counter, cpu); + snap = per_cpu(rcu_dyntick_snapshot, cpu); + smp_mb(); /* force ordering with cpu entering/leaving dynticks. */ + + /* + * If the CPU remained in dynticks mode for the entire time + * and didn't take any interrupts, NMIs, SMIs, or whatever, + * then it cannot be in the middle of an rcu_read_lock(), so + * the next rcu_read_lock() it executes must use the new value + * of the counter. So we can safely pretend that this CPU + * already acknowledged the counter. + */ + + if ((curr == snap) && ((curr & 0x1) == 0)) + return 0; + + /* + * If the CPU passed through or entered a dynticks idle phase with + * no active irq handlers, then, as above, we can safely pretend + * that this CPU already acknowledged the counter. + */ + + if ((curr - snap) > 2 || (snap & 0x1) == 0) + return 0; + + /* We need this CPU to explicitly acknowledge the counter flip. */ + + return 1; +} + +static inline int +rcu_try_flip_waitmb_needed(int cpu) +{ + long curr; + long snap; + + curr = per_cpu(dynticks_progress_counter, cpu); + snap = per_cpu(rcu_dyntick_snapshot, cpu); + smp_mb(); /* force ordering with cpu entering/leaving dynticks. */ + + /* + * If the CPU remained in dynticks mode for the entire time + * and didn't take any interrupts, NMIs, SMIs, or whatever, + * then it cannot have executed an RCU read-side critical section + * during that time, so there is no need for it to execute a + * memory barrier. + */ + + if ((curr == snap) && ((curr & 0x1) == 0)) + return 0; + + /* + * If the CPU either entered or exited an outermost interrupt, + * SMI, NMI, or whatever handler, then we know that it executed + * a memory barrier when doing so. So we don't need another one. + */ + if (curr != snap) + return 0; + + /* We need the CPU to execute a memory barrier. */ + + return 1; +} + +#else /* !CONFIG_NO_HZ */ + +# define dyntick_save_progress_counter(cpu) do { } while (0) +# define rcu_try_flip_waitack_needed(cpu) (1) +# define rcu_try_flip_waitmb_needed(cpu) (1) + +#endif /* CONFIG_NO_HZ */ + /* * Get here when RCU is idle. Decide whether we need to * move out of idle state, and return non-zero if so. @@ -447,8 +657,10 @@ rcu_try_flip_idle(void) /* Now ask each CPU for acknowledgement of the flip. */ - for_each_cpu_mask(cpu, rcu_cpu_online_map) + for_each_cpu_mask(cpu, rcu_cpu_online_map) { per_cpu(rcu_flip_flag, cpu) = rcu_flipped; + dyntick_save_progress_counter(cpu); + } return 1; } @@ -464,7 +676,8 @@ rcu_try_flip_waitack(void) RCU_TRACE_ME(rcupreempt_trace_try_flip_a1); for_each_cpu_mask(cpu, rcu_cpu_online_map) - if (per_cpu(rcu_flip_flag, cpu) != rcu_flip_seen) { + if (rcu_try_flip_waitack_needed(cpu) && + per_cpu(rcu_flip_flag, cpu) != rcu_flip_seen) { RCU_TRACE_ME(rcupreempt_trace_try_flip_ae1); return 0; } @@ -509,8 +722,10 @@ rcu_try_flip_waitzero(void) smp_mb(); /* ^^^^^^^^^^^^ */ /* Call for a memory barrier from each CPU. 
*/ - for_each_cpu_mask(cpu, rcu_cpu_online_map) + for_each_cpu_mask(cpu, rcu_cpu_online_map) { per_cpu(rcu_mb_flag, cpu) = rcu_mb_needed; + dyntick_save_progress_counter(cpu); + } RCU_TRACE_ME(rcupreempt_trace_try_flip_z2); return 1; @@ -528,7 +743,8 @@ rcu_try_flip_waitmb(void) RCU_TRACE_ME(rcupreempt_trace_try_flip_m1); for_each_cpu_mask(cpu, rcu_cpu_online_map) - if (per_cpu(rcu_mb_flag, cpu) != rcu_mb_done) { + if (rcu_try_flip_waitmb_needed(cpu) && + per_cpu(rcu_mb_flag, cpu) != rcu_mb_done) { RCU_TRACE_ME(rcupreempt_trace_try_flip_me1); return 0; } diff --git a/kernel/softirq.c b/kernel/softirq.c index 5b3aea5f471e..31e9f2a47928 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -313,6 +313,7 @@ void irq_exit(void) /* Make sure that timer wheel updates are propagated */ if (!in_interrupt() && idle_cpu(smp_processor_id()) && !need_resched()) tick_nohz_stop_sched_tick(); + rcu_irq_exit(); #endif preempt_enable_no_resched(); } diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index fa9bb73dbdb4..2968298f8f36 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -282,6 +282,7 @@ void tick_nohz_stop_sched_tick(void) ts->idle_tick = ts->sched_timer.expires; ts->tick_stopped = 1; ts->idle_jiffies = last_jiffies; + rcu_enter_nohz(); } /* @@ -375,6 +376,8 @@ void tick_nohz_restart_sched_tick(void) return; } + rcu_exit_nohz(); + /* Update jiffies first */ select_nohz_load_balancer(0); now = ktime_get(); -- cgit v1.2.3 From 7be2a03e3174cee3a3cdcdf17db357470f51caff Mon Sep 17 00:00:00 2001 From: Dmitry Adamushko Date: Fri, 8 Feb 2008 15:41:13 +0100 Subject: softlockup: fix task state setting kthread_stop() can be called when a 'watchdog' thread is executing after kthread_should_stop() but before set_task_state(TASK_INTERRUPTIBLE). Signed-off-by: Dmitry Adamushko Signed-off-by: Ingo Molnar --- kernel/softlockup.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/softlockup.c b/kernel/softlockup.c index 7c2da88db4ed..01b6522fd92b 100644 --- a/kernel/softlockup.c +++ b/kernel/softlockup.c @@ -216,26 +216,27 @@ static int watchdog(void *__bind_cpu) /* initialize timestamp */ touch_softlockup_watchdog(); + set_current_state(TASK_INTERRUPTIBLE); /* * Run briefly once per second to reset the softlockup timestamp. * If this gets delayed for more than 60 seconds then the * debug-printout triggers in softlockup_tick(). */ while (!kthread_should_stop()) { - set_current_state(TASK_INTERRUPTIBLE); touch_softlockup_watchdog(); schedule(); if (kthread_should_stop()) break; - if (this_cpu != check_cpu) - continue; - - if (sysctl_hung_task_timeout_secs) - check_hung_uninterruptible_tasks(this_cpu); + if (this_cpu == check_cpu) { + if (sysctl_hung_task_timeout_secs) + check_hung_uninterruptible_tasks(this_cpu); + } + set_current_state(TASK_INTERRUPTIBLE); } + __set_current_state(TASK_RUNNING); return 0; } -- cgit v1.2.3 From ae778869ae4549628b9e83efe958c3aaa63ed1b9 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 27 Feb 2008 16:21:10 -0800 Subject: rcupreempt: fix hibernate/resume in presence of PREEMPT_RCU and hotplug This fixes a oops encountered when doing hibernate/resume in presence of PREEMPT_RCU. The problem was that the code failed to disable preemption when accessing a per-CPU variable. This is OK when called from code that already has preemption disabled, but such is not the case from the suspend/resume code path. Reported-by: Dave Young Tested-by: Dave Young Signed-off-by: Paul E. 
McKenney Signed-off-by: Ingo Molnar --- kernel/rcupreempt.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c index c7c52096df48..845abcd472b0 100644 --- a/kernel/rcupreempt.c +++ b/kernel/rcupreempt.c @@ -918,8 +918,9 @@ void rcu_offline_cpu(int cpu) * fix. */ + local_irq_save(flags); rdp = RCU_DATA_ME(); - spin_lock_irqsave(&rdp->lock, flags); + spin_lock(&rdp->lock); *rdp->nexttail = list; if (list) rdp->nexttail = tail; -- cgit v1.2.3 From c9e71002aacc9821e99531dcc130db88bbc8ad05 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 28 Feb 2008 11:51:07 -0800 Subject: rcupreempt: remove never-migrates assumption from rcu_process_callbacks() This patch fixes a potentially invalid access to a per-CPU variable in rcu_process_callbacks(). This per-CPU access needs to be done in such a way as to guarantee that the code using it cannot move to some other CPU before all uses of the value accessed have completed. Even though this code is currently only invoked from softirq context, which currrently cannot migrate to some other CPU, life would be better if this code did not silently make such an assumption. Signed-off-by: Paul E. McKenney Signed-off-by: Ingo Molnar --- kernel/rcupreempt.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c index 845abcd472b0..e9517014b57c 100644 --- a/kernel/rcupreempt.c +++ b/kernel/rcupreempt.c @@ -952,9 +952,11 @@ static void rcu_process_callbacks(struct softirq_action *unused) { unsigned long flags; struct rcu_head *next, *list; - struct rcu_data *rdp = RCU_DATA_ME(); + struct rcu_data *rdp; - spin_lock_irqsave(&rdp->lock, flags); + local_irq_save(flags); + rdp = RCU_DATA_ME(); + spin_lock(&rdp->lock); list = rdp->donelist; if (list == NULL) { spin_unlock_irqrestore(&rdp->lock, flags); -- cgit v1.2.3 From 422b03cf75e11dfdfb29b0f19709bac585335f86 Mon Sep 17 00:00:00 2001 From: Paul Moore Date: Wed, 27 Feb 2008 10:39:22 -0500 Subject: [PATCH] Audit: Fix the format type for size_t variables Fix the following compiler warning by using "%zu" as defined in C99. CC kernel/auditsc.o kernel/auditsc.c: In function 'audit_log_single_execve_arg': kernel/auditsc.c:1074: warning: format '%ld' expects type 'long int', but argument 4 has type 'size_t' Signed-off-by: Paul Moore Signed-off-by: Al Viro --- kernel/auditsc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 2087d6de67ea..782262e4107d 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1070,7 +1070,7 @@ static int audit_log_single_execve_arg(struct audit_context *context, * so we can be sure nothing was lost. */ if ((i == 0) && (too_long)) - audit_log_format(*ab, "a%d_len=%ld ", arg_num, + audit_log_format(*ab, "a%d_len=%zu ", arg_num, has_cntl ? 2*len : len); /* -- cgit v1.2.3 From b29ee87e9b441e72454efd1be56aa1a05ffb2f58 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Thu, 21 Feb 2008 15:53:05 -0500 Subject: [RFC] AUDIT: do not panic when printk loses messages On the latest kernels if one was to load about 15 rules, set the failure state to panic, and then run service auditd stop the kernel will panic. This is because auditd stops, then the script deletes all of the rules. These deletions are sent as audit messages out of the printk kernel interface which is already known to be lossy. 
These will overrun the default kernel rate limiting (10 really fast messages) and will call audit_panic(). The same effect can happen if a slew of avc's come through while auditd is stopped. This can be fixed a number of ways but this patch fixes the problem by just not panicking if auditd is not running. We know printk is lossy and if the user chooses to set the failure mode to panic and tries to use printk we can't make any promises no matter how hard we try, so why try? At least in this way we continue to get lost message accounting and will eventually know that things went bad. The other change is to add a new call to audit_log_lost() if auditd disappears. We already pulled the skb off the queue and couldn't send it so that message is lost. At least this way we will account for the last message and panic if the machine is configured to panic. This code path should only be run if auditd dies for unforeseen reasons. If auditd closes correctly audit_pid will get set to 0 and we won't walk this code path. Signed-off-by: Al Viro --- kernel/audit.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 2eeea9a14240..6d7175c1e878 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -170,7 +170,9 @@ void audit_panic(const char *message) printk(KERN_ERR "audit: %s\n", message); break; case AUDIT_FAIL_PANIC: - panic("audit: %s\n", message); + /* test audit_pid since printk is always losey, why bother? */ + if (audit_pid) + panic("audit: %s\n", message); break; } } @@ -352,6 +354,7 @@ static int kauditd_thread(void *dummy) if (err < 0) { BUG_ON(err != -ECONNREFUSED); /* Shoudn't happen */ printk(KERN_ERR "audit: *NO* daemon at audit_pid=%d\n", audit_pid); + audit_log_lost("auditd dissapeared\n"); audit_pid = 0; } } else { -- cgit v1.2.3 From 8d07a67cface19ac07d7324f38bda7bbb06bbdb2 Mon Sep 17 00:00:00 2001 From: Steve Grubb Date: Thu, 21 Feb 2008 16:59:22 -0500 Subject: [PATCH] drop EOE records from printk Hi, While we are looking at the printk issue, I see that it's printk'ing the EOE (end of event) records which is really not something that we need in syslog. It's really intended for the realtime audit event stream handled by the audit daemon. So, let's avoid printk'ing that record type. 
Signed-off-by: Steve Grubb Signed-off-by: Al Viro --- kernel/audit.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 6d7175c1e878..10c4930c2bbf 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -1353,17 +1353,19 @@ void audit_log_end(struct audit_buffer *ab) if (!audit_rate_check()) { audit_log_lost("rate limit exceeded"); } else { + struct nlmsghdr *nlh = nlmsg_hdr(ab->skb); if (audit_pid) { - struct nlmsghdr *nlh = nlmsg_hdr(ab->skb); nlh->nlmsg_len = ab->skb->len - NLMSG_SPACE(0); skb_queue_tail(&audit_skb_queue, ab->skb); ab->skb = NULL; wake_up_interruptible(&kauditd_wait); - } else if (printk_ratelimit()) { - struct nlmsghdr *nlh = nlmsg_hdr(ab->skb); - printk(KERN_NOTICE "type=%d %s\n", nlh->nlmsg_type, ab->skb->data + NLMSG_SPACE(0)); - } else { - audit_log_lost("printk limit exceeded\n"); + } else if (nlh->nlmsg_type != AUDIT_EOE) { + if (printk_ratelimit()) { + printk(KERN_NOTICE "type=%d %s\n", + nlh->nlmsg_type, + ab->skb->data + NLMSG_SPACE(0)); + } else + audit_log_lost("printk limit exceeded\n"); } } audit_buffer_free(ab); -- cgit v1.2.3 From f49ee505b1ecb5960984880740f09aba87f870dc Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 2 Mar 2008 21:44:40 +0300 Subject: introduce kill_orphaned_pgrp() helper Factor out the common code in reparent_thread() and exit_notify(). No functional changes. Signed-off-by: Oleg Nesterov Signed-off-by: Linus Torvalds --- kernel/exit.c | 74 ++++++++++++++++++++++++++++------------------------------- 1 file changed, 35 insertions(+), 39 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 506a957b665a..11fcce760151 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -255,6 +255,37 @@ static int has_stopped_jobs(struct pid *pgrp) return retval; } +/* + * Check to see if any process groups have become orphaned as + * a result of our exiting, and if they have any stopped jobs, + * send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2) + */ +static void +kill_orphaned_pgrp(struct task_struct *tsk, struct task_struct *parent) +{ + struct pid *pgrp = task_pgrp(tsk); + struct task_struct *ignored_task = tsk; + + if (!parent) + /* exit: our father is in a different pgrp than + * we are and we were the only connection outside. + */ + parent = tsk->real_parent; + else + /* reparent: our child is in a different pgrp than + * we are, and it was the only connection outside. + */ + ignored_task = NULL; + + if (task_pgrp(parent) != pgrp && + task_session(parent) == task_session(tsk) && + will_become_orphaned_pgrp(pgrp, ignored_task) && + has_stopped_jobs(pgrp)) { + __kill_pgrp_info(SIGHUP, SEND_SIG_PRIV, pgrp); + __kill_pgrp_info(SIGCONT, SEND_SIG_PRIV, pgrp); + } +} + /** * reparent_to_kthreadd - Reparent the calling kernel thread to kthreadd * @@ -635,22 +666,7 @@ reparent_thread(struct task_struct *p, struct task_struct *father, int traced) p->exit_signal != -1 && thread_group_empty(p)) do_notify_parent(p, p->exit_signal); - /* - * process group orphan check - * Case ii: Our child is in a different pgrp - * than we are, and it was the only connection - * outside, so the child pgrp is now orphaned. 
- */ - if ((task_pgrp(p) != task_pgrp(father)) && - (task_session(p) == task_session(father))) { - struct pid *pgrp = task_pgrp(p); - - if (will_become_orphaned_pgrp(pgrp, NULL) && - has_stopped_jobs(pgrp)) { - __kill_pgrp_info(SIGHUP, SEND_SIG_PRIV, pgrp); - __kill_pgrp_info(SIGCONT, SEND_SIG_PRIV, pgrp); - } - } + kill_orphaned_pgrp(p, father); } /* @@ -738,8 +754,6 @@ static void forget_original_parent(struct task_struct *father) static void exit_notify(struct task_struct *tsk) { int state; - struct task_struct *t; - struct pid *pgrp; /* * This does two things: @@ -753,25 +767,7 @@ static void exit_notify(struct task_struct *tsk) exit_task_namespaces(tsk); write_lock_irq(&tasklist_lock); - /* - * Check to see if any process groups have become orphaned - * as a result of our exiting, and if they have any stopped - * jobs, send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2) - * - * Case i: Our father is in a different pgrp than we are - * and we were the only connection outside, so our pgrp - * is about to become orphaned. - */ - t = tsk->real_parent; - - pgrp = task_pgrp(tsk); - if ((task_pgrp(t) != pgrp) && - (task_session(t) == task_session(tsk)) && - will_become_orphaned_pgrp(pgrp, tsk) && - has_stopped_jobs(pgrp)) { - __kill_pgrp_info(SIGHUP, SEND_SIG_PRIV, pgrp); - __kill_pgrp_info(SIGCONT, SEND_SIG_PRIV, pgrp); - } + kill_orphaned_pgrp(tsk, NULL); /* Let father know we died * @@ -788,8 +784,8 @@ static void exit_notify(struct task_struct *tsk) * the same after a fork. */ if (tsk->exit_signal != SIGCHLD && tsk->exit_signal != -1 && - ( tsk->parent_exec_id != t->self_exec_id || - tsk->self_exec_id != tsk->parent_exec_id) + (tsk->parent_exec_id != tsk->real_parent->self_exec_id || + tsk->self_exec_id != tsk->parent_exec_id) && !capable(CAP_KILL)) tsk->exit_signal = SIGCHLD; -- cgit v1.2.3 From 05e83df624fe682bb8571cdb2c6d5284a99c3066 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 2 Mar 2008 21:44:42 +0300 Subject: will_become_orphaned_pgrp: partially fix insufficient ->exit_state check p->exit_state != 0 doesn't mean this process is dead, it may have sub-threads. Change the code to use "p->exit_state && thread_group_empty(p)" instead. Without this patch, ^Z doesn't deliver SIGTSTP to the foreground process if the main thread has exited. However, the new check is not perfect either. There is a window when exit_notify() drops tasklist and before release_task(). Suppose that the last (non-leader) thread exits. This means that entire group exits, but thread_group_empty() is not true yet. As Eric pointed out, is_global_init() is wrong as well, but I did not dare to do other changes. Just for the record, has_stopped_jobs() is absolutely wrong too. But we can't fix it now, we should first fix SIGNAL_STOP_STOPPED issues. Even with this patch ^Z doesn't play well with the dead main thread. The task is stopped correctly but do_wait(WSTOPPED) won't see it. This is another unrelated issue, will be (hopefully) fixed separately. 
Signed-off-by: Oleg Nesterov Signed-off-by: Linus Torvalds --- kernel/exit.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 11fcce760151..41c1edace97a 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -214,20 +214,19 @@ struct pid *session_of_pgrp(struct pid *pgrp) static int will_become_orphaned_pgrp(struct pid *pgrp, struct task_struct *ignored_task) { struct task_struct *p; - int ret = 1; do_each_pid_task(pgrp, PIDTYPE_PGID, p) { - if (p == ignored_task - || p->exit_state - || is_global_init(p->real_parent)) + if ((p == ignored_task) || + (p->exit_state && thread_group_empty(p)) || + is_global_init(p->real_parent)) continue; + if (task_pgrp(p->real_parent) != pgrp && - task_session(p->real_parent) == task_session(p)) { - ret = 0; - break; - } + task_session(p->real_parent) == task_session(p)) + return 0; } while_each_pid_task(pgrp, PIDTYPE_PGID, p); - return ret; /* (sighing) "Often!" */ + + return 1; } int is_current_pgrp_orphaned(void) -- cgit v1.2.3 From 821c7de7194e77afee1a69d50830a329a6d9af9f Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 2 Mar 2008 21:44:44 +0300 Subject: exit_notify: fix kill_orphaned_pgrp() usage with mt exit 1. exit_notify() always calls kill_orphaned_pgrp(). This is wrong, we should do this only when the whole process exits. 2. exit_notify() uses "current" as "ignored_task", obviously wrong. Use ->group_leader instead. Test case: void hup(int sig) { printf("HUP received\n"); } void *tfunc(void *arg) { sleep(2); printf("sub-thread exited\n"); return NULL; } int main(int argc, char *argv[]) { if (!fork()) { signal(SIGHUP, hup); kill(getpid(), SIGSTOP); exit(0); } pthread_t thr; pthread_create(&thr, NULL, tfunc, NULL); sleep(1); printf("main thread exited\n"); syscall(__NR_exit, 0); return 0; } output: main thread exited HUP received Hangup With this patch the output is: main thread exited sub-thread exited HUP received Signed-off-by: Oleg Nesterov Signed-off-by: Linus Torvalds --- kernel/exit.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 41c1edace97a..cd20bf07e9e3 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -750,7 +750,7 @@ static void forget_original_parent(struct task_struct *father) * Send signals to all our closest relatives so that they know * to properly mourn us.. */ -static void exit_notify(struct task_struct *tsk) +static void exit_notify(struct task_struct *tsk, int group_dead) { int state; @@ -766,7 +766,8 @@ static void exit_notify(struct task_struct *tsk) exit_task_namespaces(tsk); write_lock_irq(&tasklist_lock); - kill_orphaned_pgrp(tsk, NULL); + if (group_dead) + kill_orphaned_pgrp(tsk->group_leader, NULL); /* Let father know we died * @@ -981,7 +982,7 @@ NORET_TYPE void do_exit(long code) module_put(tsk->binfmt->module); proc_exit_connector(tsk); - exit_notify(tsk); + exit_notify(tsk, group_dead); #ifdef CONFIG_NUMA mpol_free(tsk->mempolicy); tsk->mempolicy = NULL; -- cgit v1.2.3 From 13b1c3d4b49bd83d861c775ca2db54e1692a1b07 Mon Sep 17 00:00:00 2001 From: Roland McGrath Date: Mon, 3 Mar 2008 20:22:05 -0800 Subject: freezer vs stopped or traced This changes the "freezer" code used by suspend/hibernate in its treatment of tasks in TASK_STOPPED (job control stop) and TASK_TRACED (ptrace) states. As I understand it, the intent of the "freezer" is to hold all tasks from doing anything significant. 
For this purpose, TASK_STOPPED and TASK_TRACED are "frozen enough". It's possible the tasks might resume from ptrace calls (if the tracer were unfrozen) or from signals (including ones that could come via timer interrupts, etc). But this doesn't matter as long as they quickly block again while "freezing" is in effect. Some minor adjustments to the signal.c code make sure that try_to_freeze() very shortly follows all wakeups from both kinds of stop. This lets the freezer code safely leave stopped tasks unmolested. Changing this fixes the longstanding bug of seeing after resuming from suspend/hibernate your shell report "[1] Stopped" and the like for all your jobs stopped by ^Z et al, as if you had freshly fg'd and ^Z'd them. It also removes from the freezer the arcane special case treatment for ptrace'd tasks, which relied on intimate knowledge of ptrace internals. Signed-off-by: Roland McGrath Signed-off-by: Linus Torvalds --- kernel/power/process.c | 29 ++++++++++++----------------- kernel/signal.c | 16 ++++++++++++++-- 2 files changed, 26 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/power/process.c b/kernel/power/process.c index 7c2118f9597f..f1d0b345c9ba 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -75,22 +75,15 @@ void refrigerator(void) __set_current_state(save); } -static void fake_signal_wake_up(struct task_struct *p, int resume) +static void fake_signal_wake_up(struct task_struct *p) { unsigned long flags; spin_lock_irqsave(&p->sighand->siglock, flags); - signal_wake_up(p, resume); + signal_wake_up(p, 0); spin_unlock_irqrestore(&p->sighand->siglock, flags); } -static void send_fake_signal(struct task_struct *p) -{ - if (task_is_stopped(p)) - force_sig_specific(SIGSTOP, p); - fake_signal_wake_up(p, task_is_stopped(p)); -} - static int has_mm(struct task_struct *p) { return (p->mm && !(p->flags & PF_BORROWED_MM)); @@ -121,7 +114,7 @@ static int freeze_task(struct task_struct *p, int with_mm_only) if (freezing(p)) { if (has_mm(p)) { if (!signal_pending(p)) - fake_signal_wake_up(p, 0); + fake_signal_wake_up(p); } else { if (with_mm_only) ret = 0; @@ -135,7 +128,7 @@ static int freeze_task(struct task_struct *p, int with_mm_only) } else { if (has_mm(p)) { set_freeze_flag(p); - send_fake_signal(p); + fake_signal_wake_up(p); } else { if (with_mm_only) { ret = 0; @@ -182,15 +175,17 @@ static int try_to_freeze_tasks(int freeze_user_space) if (frozen(p) || !freezeable(p)) continue; - if (task_is_traced(p) && frozen(p->parent)) { - cancel_freezing(p); - continue; - } - if (!freeze_task(p, freeze_user_space)) continue; - if (!freezer_should_skip(p)) + /* + * Now that we've done set_freeze_flag, don't + * perturb a task in TASK_STOPPED or TASK_TRACED. + * It is "frozen enough". If the task does wake + * up, it will immediately call try_to_freeze. + */ + if (!task_is_stopped_or_traced(p) && + !freezer_should_skip(p)) todo++; } while_each_thread(g, p); read_unlock(&tasklist_lock); diff --git a/kernel/signal.c b/kernel/signal.c index 84917fe507f7..6af1210092c3 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1623,7 +1623,6 @@ static void ptrace_stop(int exit_code, int clear_code, siginfo_t *info) /* Let the debugger run. 
*/ __set_current_state(TASK_TRACED); spin_unlock_irq(¤t->sighand->siglock); - try_to_freeze(); read_lock(&tasklist_lock); if (!unlikely(killed) && may_ptrace_stop()) { do_notify_parent_cldstop(current, CLD_TRAPPED); @@ -1640,6 +1639,13 @@ static void ptrace_stop(int exit_code, int clear_code, siginfo_t *info) read_unlock(&tasklist_lock); } + /* + * While in TASK_TRACED, we were considered "frozen enough". + * Now that we woke up, it's crucial if we're supposed to be + * frozen that we freeze now before running anything substantial. + */ + try_to_freeze(); + /* * We are back. Now reacquire the siglock before touching * last_siginfo, so that we are sure to have synchronized with @@ -1757,9 +1763,15 @@ int get_signal_to_deliver(siginfo_t *info, struct k_sigaction *return_ka, sigset_t *mask = ¤t->blocked; int signr = 0; +relock: + /* + * We'll jump back here after any time we were stopped in TASK_STOPPED. + * While in TASK_STOPPED, we were considered "frozen enough". + * Now that we woke up, it's crucial if we're supposed to be + * frozen that we freeze now before running anything substantial. + */ try_to_freeze(); -relock: spin_lock_irq(¤t->sighand->siglock); for (;;) { struct k_sigaction *ka; -- cgit v1.2.3 From 62fb185130e4d420f71a30ff59d8b16b74ef5d2b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 25 Feb 2008 17:34:02 +0100 Subject: sched: revert load_balance_monitor() changes The following commits cause a number of regressions: commit 58e2d4ca581167c2a079f4ee02be2f0bc52e8729 Author: Srivatsa Vaddagiri Date: Fri Jan 25 21:08:00 2008 +0100 sched: group scheduling, change how cpu load is calculated commit 6b2d7700266b9402e12824e11e0099ae6a4a6a79 Author: Srivatsa Vaddagiri Date: Fri Jan 25 21:08:00 2008 +0100 sched: group scheduler, fix fairness of cpu bandwidth allocation for task groups Namely: - very frequent wakeups on SMP, reported by PowerTop users. - cacheline trashing on (large) SMP - some latencies larger than 500ms While there is a mergeable patch to fix the latter, the former issues are not fixable in a manner suitable for .25 (we're at -rc3 now). Hence we revert them and try again in v2.6.26. 
Signed-off-by: Peter Zijlstra CC: Srivatsa Vaddagiri Tested-by: Alexey Zaytsev Signed-off-by: Ingo Molnar --- include/linux/sched.h | 4 - kernel/sched.c | 283 +++++++------------------------------------------- kernel/sched_fair.c | 115 +++++++------------- kernel/sched_rt.c | 4 - kernel/sysctl.c | 18 ---- 5 files changed, 70 insertions(+), 354 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index 2c9621f8bf87..9ae4030067a9 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1542,10 +1542,6 @@ extern unsigned int sysctl_sched_child_runs_first; extern unsigned int sysctl_sched_features; extern unsigned int sysctl_sched_migration_cost; extern unsigned int sysctl_sched_nr_migrate; -#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP) -extern unsigned int sysctl_sched_min_bal_int_shares; -extern unsigned int sysctl_sched_max_bal_int_shares; -#endif int sched_nr_latency_handler(struct ctl_table *table, int write, struct file *file, void __user *buffer, size_t *length, diff --git a/kernel/sched.c b/kernel/sched.c index f06950c8a6ce..dcd553cc4ee8 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -174,41 +174,6 @@ struct task_group { struct sched_entity **se; /* runqueue "owned" by this group on each cpu */ struct cfs_rq **cfs_rq; - - /* - * shares assigned to a task group governs how much of cpu bandwidth - * is allocated to the group. The more shares a group has, the more is - * the cpu bandwidth allocated to it. - * - * For ex, lets say that there are three task groups, A, B and C which - * have been assigned shares 1000, 2000 and 3000 respectively. Then, - * cpu bandwidth allocated by the scheduler to task groups A, B and C - * should be: - * - * Bw(A) = 1000/(1000+2000+3000) * 100 = 16.66% - * Bw(B) = 2000/(1000+2000+3000) * 100 = 33.33% - * Bw(C) = 3000/(1000+2000+3000) * 100 = 50% - * - * The weight assigned to a task group's schedulable entities on every - * cpu (task_group.se[a_cpu]->load.weight) is derived from the task - * group's shares. For ex: lets say that task group A has been - * assigned shares of 1000 and there are two CPUs in a system. Then, - * - * tg_A->se[0]->load.weight = tg_A->se[1]->load.weight = 1000; - * - * Note: It's not necessary that each of a task's group schedulable - * entity have the same weight on all CPUs. If the group - * has 2 of its tasks on CPU0 and 1 task on CPU1, then a - * better distribution of weight could be: - * - * tg_A->se[0]->load.weight = 2/3 * 2000 = 1333 - * tg_A->se[1]->load.weight = 1/2 * 2000 = 667 - * - * rebalance_shares() is responsible for distributing the shares of a - * task groups like this among the group's schedulable entities across - * cpus. 
- * - */ unsigned long shares; #endif @@ -250,22 +215,12 @@ static DEFINE_SPINLOCK(task_group_lock); static DEFINE_MUTEX(doms_cur_mutex); #ifdef CONFIG_FAIR_GROUP_SCHED -#ifdef CONFIG_SMP -/* kernel thread that runs rebalance_shares() periodically */ -static struct task_struct *lb_monitor_task; -static int load_balance_monitor(void *unused); -#endif - -static void set_se_shares(struct sched_entity *se, unsigned long shares); - #ifdef CONFIG_USER_SCHED # define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD) #else # define INIT_TASK_GROUP_LOAD NICE_0_LOAD #endif -#define MIN_GROUP_SHARES 2 - static int init_task_group_load = INIT_TASK_GROUP_LOAD; #endif @@ -1245,16 +1200,6 @@ static void cpuacct_charge(struct task_struct *tsk, u64 cputime); static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {} #endif -static inline void inc_cpu_load(struct rq *rq, unsigned long load) -{ - update_load_add(&rq->load, load); -} - -static inline void dec_cpu_load(struct rq *rq, unsigned long load) -{ - update_load_sub(&rq->load, load); -} - #ifdef CONFIG_SMP static unsigned long source_load(int cpu, int type); static unsigned long target_load(int cpu, int type); @@ -1272,14 +1217,26 @@ static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); #define sched_class_highest (&rt_sched_class) -static void inc_nr_running(struct rq *rq) +static inline void inc_load(struct rq *rq, const struct task_struct *p) +{ + update_load_add(&rq->load, p->se.load.weight); +} + +static inline void dec_load(struct rq *rq, const struct task_struct *p) +{ + update_load_sub(&rq->load, p->se.load.weight); +} + +static void inc_nr_running(struct task_struct *p, struct rq *rq) { rq->nr_running++; + inc_load(rq, p); } -static void dec_nr_running(struct rq *rq) +static void dec_nr_running(struct task_struct *p, struct rq *rq) { rq->nr_running--; + dec_load(rq, p); } static void set_load_weight(struct task_struct *p) @@ -1371,7 +1328,7 @@ static void activate_task(struct rq *rq, struct task_struct *p, int wakeup) rq->nr_uninterruptible--; enqueue_task(rq, p, wakeup); - inc_nr_running(rq); + inc_nr_running(p, rq); } /* @@ -1383,7 +1340,7 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep) rq->nr_uninterruptible++; dequeue_task(rq, p, sleep); - dec_nr_running(rq); + dec_nr_running(p, rq); } /** @@ -2023,7 +1980,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) * management (if any): */ p->sched_class->task_new(rq, p); - inc_nr_running(rq); + inc_nr_running(p, rq); } check_preempt_curr(rq, p); #ifdef CONFIG_SMP @@ -4362,8 +4319,10 @@ void set_user_nice(struct task_struct *p, long nice) goto out_unlock; } on_rq = p->se.on_rq; - if (on_rq) + if (on_rq) { dequeue_task(rq, p, 0); + dec_load(rq, p); + } p->static_prio = NICE_TO_PRIO(nice); set_load_weight(p); @@ -4373,6 +4332,7 @@ void set_user_nice(struct task_struct *p, long nice) if (on_rq) { enqueue_task(rq, p, 0); + inc_load(rq, p); /* * If the task increased its priority or is running and * lowered its priority, then reschedule its CPU: @@ -7087,21 +7047,6 @@ void __init sched_init_smp(void) if (set_cpus_allowed(current, non_isolated_cpus) < 0) BUG(); sched_init_granularity(); - -#ifdef CONFIG_FAIR_GROUP_SCHED - if (nr_cpu_ids == 1) - return; - - lb_monitor_task = kthread_create(load_balance_monitor, NULL, - "group_balance"); - if (!IS_ERR(lb_monitor_task)) { - lb_monitor_task->flags |= PF_NOFREEZE; - wake_up_process(lb_monitor_task); - } else { - printk(KERN_ERR "Could not create load balance monitor thread" 
- "(error = %ld) \n", PTR_ERR(lb_monitor_task)); - } -#endif } #else void __init sched_init_smp(void) @@ -7424,157 +7369,6 @@ void set_curr_task(int cpu, struct task_struct *p) #ifdef CONFIG_GROUP_SCHED -#if defined CONFIG_FAIR_GROUP_SCHED && defined CONFIG_SMP -/* - * distribute shares of all task groups among their schedulable entities, - * to reflect load distribution across cpus. - */ -static int rebalance_shares(struct sched_domain *sd, int this_cpu) -{ - struct cfs_rq *cfs_rq; - struct rq *rq = cpu_rq(this_cpu); - cpumask_t sdspan = sd->span; - int balanced = 1; - - /* Walk thr' all the task groups that we have */ - for_each_leaf_cfs_rq(rq, cfs_rq) { - int i; - unsigned long total_load = 0, total_shares; - struct task_group *tg = cfs_rq->tg; - - /* Gather total task load of this group across cpus */ - for_each_cpu_mask(i, sdspan) - total_load += tg->cfs_rq[i]->load.weight; - - /* Nothing to do if this group has no load */ - if (!total_load) - continue; - - /* - * tg->shares represents the number of cpu shares the task group - * is eligible to hold on a single cpu. On N cpus, it is - * eligible to hold (N * tg->shares) number of cpu shares. - */ - total_shares = tg->shares * cpus_weight(sdspan); - - /* - * redistribute total_shares across cpus as per the task load - * distribution. - */ - for_each_cpu_mask(i, sdspan) { - unsigned long local_load, local_shares; - - local_load = tg->cfs_rq[i]->load.weight; - local_shares = (local_load * total_shares) / total_load; - if (!local_shares) - local_shares = MIN_GROUP_SHARES; - if (local_shares == tg->se[i]->load.weight) - continue; - - spin_lock_irq(&cpu_rq(i)->lock); - set_se_shares(tg->se[i], local_shares); - spin_unlock_irq(&cpu_rq(i)->lock); - balanced = 0; - } - } - - return balanced; -} - -/* - * How frequently should we rebalance_shares() across cpus? - * - * The more frequently we rebalance shares, the more accurate is the fairness - * of cpu bandwidth distribution between task groups. However higher frequency - * also implies increased scheduling overhead. - * - * sysctl_sched_min_bal_int_shares represents the minimum interval between - * consecutive calls to rebalance_shares() in the same sched domain. - * - * sysctl_sched_max_bal_int_shares represents the maximum interval between - * consecutive calls to rebalance_shares() in the same sched domain. - * - * These settings allows for the appropriate trade-off between accuracy of - * fairness and the associated overhead. - * - */ - -/* default: 8ms, units: milliseconds */ -const_debug unsigned int sysctl_sched_min_bal_int_shares = 8; - -/* default: 128ms, units: milliseconds */ -const_debug unsigned int sysctl_sched_max_bal_int_shares = 128; - -/* kernel thread that runs rebalance_shares() periodically */ -static int load_balance_monitor(void *unused) -{ - unsigned int timeout = sysctl_sched_min_bal_int_shares; - struct sched_param schedparm; - int ret; - - /* - * We don't want this thread's execution to be limited by the shares - * assigned to default group (init_task_group). Hence make it run - * as a SCHED_RR RT task at the lowest priority. 
- */ - schedparm.sched_priority = 1; - ret = sched_setscheduler(current, SCHED_RR, &schedparm); - if (ret) - printk(KERN_ERR "Couldn't set SCHED_RR policy for load balance" - " monitor thread (error = %d) \n", ret); - - while (!kthread_should_stop()) { - int i, cpu, balanced = 1; - - /* Prevent cpus going down or coming up */ - get_online_cpus(); - /* lockout changes to doms_cur[] array */ - lock_doms_cur(); - /* - * Enter a rcu read-side critical section to safely walk rq->sd - * chain on various cpus and to walk task group list - * (rq->leaf_cfs_rq_list) in rebalance_shares(). - */ - rcu_read_lock(); - - for (i = 0; i < ndoms_cur; i++) { - cpumask_t cpumap = doms_cur[i]; - struct sched_domain *sd = NULL, *sd_prev = NULL; - - cpu = first_cpu(cpumap); - - /* Find the highest domain at which to balance shares */ - for_each_domain(cpu, sd) { - if (!(sd->flags & SD_LOAD_BALANCE)) - continue; - sd_prev = sd; - } - - sd = sd_prev; - /* sd == NULL? No load balance reqd in this domain */ - if (!sd) - continue; - - balanced &= rebalance_shares(sd, cpu); - } - - rcu_read_unlock(); - - unlock_doms_cur(); - put_online_cpus(); - - if (!balanced) - timeout = sysctl_sched_min_bal_int_shares; - else if (timeout < sysctl_sched_max_bal_int_shares) - timeout *= 2; - - msleep_interruptible(timeout); - } - - return 0; -} -#endif /* CONFIG_SMP */ - #ifdef CONFIG_FAIR_GROUP_SCHED static void free_fair_sched_group(struct task_group *tg) { @@ -7841,29 +7635,25 @@ void sched_move_task(struct task_struct *tsk) } #ifdef CONFIG_FAIR_GROUP_SCHED -/* rq->lock to be locked by caller */ static void set_se_shares(struct sched_entity *se, unsigned long shares) { struct cfs_rq *cfs_rq = se->cfs_rq; struct rq *rq = cfs_rq->rq; int on_rq; - if (!shares) - shares = MIN_GROUP_SHARES; + spin_lock_irq(&rq->lock); on_rq = se->on_rq; - if (on_rq) { + if (on_rq) dequeue_entity(cfs_rq, se, 0); - dec_cpu_load(rq, se->load.weight); - } se->load.weight = shares; se->load.inv_weight = div64_64((1ULL<<32), shares); - if (on_rq) { + if (on_rq) enqueue_entity(cfs_rq, se, 0); - inc_cpu_load(rq, se->load.weight); - } + + spin_unlock_irq(&rq->lock); } static DEFINE_MUTEX(shares_mutex); @@ -7873,18 +7663,18 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares) int i; unsigned long flags; + /* + * A weight of 0 or 1 can cause arithmetics problems. + * (The default weight is 1024 - so there's no practical + * limitation from this.) + */ + if (shares < 2) + shares = 2; + mutex_lock(&shares_mutex); if (tg->shares == shares) goto done; - if (shares < MIN_GROUP_SHARES) - shares = MIN_GROUP_SHARES; - - /* - * Prevent any load balance activity (rebalance_shares, - * load_balance_fair) from referring to this group first, - * by taking it off the rq->leaf_cfs_rq_list on each cpu. - */ spin_lock_irqsave(&task_group_lock, flags); for_each_possible_cpu(i) unregister_fair_sched_group(tg, i); @@ -7898,11 +7688,8 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares) * w/o tripping rebalance_share or load_balance_fair. 
*/ tg->shares = shares; - for_each_possible_cpu(i) { - spin_lock_irq(&cpu_rq(i)->lock); + for_each_possible_cpu(i) set_se_shares(tg->se[i], shares); - spin_unlock_irq(&cpu_rq(i)->lock); - } /* * Enable load balance activity on this group, by inserting it back on diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index c8e6492c5925..3df4d46994ca 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -727,8 +727,6 @@ static inline struct sched_entity *parent_entity(struct sched_entity *se) return se->parent; } -#define GROUP_IMBALANCE_PCT 20 - #else /* CONFIG_FAIR_GROUP_SCHED */ #define for_each_sched_entity(se) \ @@ -819,26 +817,15 @@ hrtick_start_fair(struct rq *rq, struct task_struct *p) static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup) { struct cfs_rq *cfs_rq; - struct sched_entity *se = &p->se, - *topse = NULL; /* Highest schedulable entity */ - int incload = 1; + struct sched_entity *se = &p->se; for_each_sched_entity(se) { - topse = se; - if (se->on_rq) { - incload = 0; + if (se->on_rq) break; - } cfs_rq = cfs_rq_of(se); enqueue_entity(cfs_rq, se, wakeup); wakeup = 1; } - /* Increment cpu load if we just enqueued the first task of a group on - * 'rq->cpu'. 'topse' represents the group to which task 'p' belongs - * at the highest grouping level. - */ - if (incload) - inc_cpu_load(rq, topse->load.weight); hrtick_start_fair(rq, rq->curr); } @@ -851,28 +838,16 @@ static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup) static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int sleep) { struct cfs_rq *cfs_rq; - struct sched_entity *se = &p->se, - *topse = NULL; /* Highest schedulable entity */ - int decload = 1; + struct sched_entity *se = &p->se; for_each_sched_entity(se) { - topse = se; cfs_rq = cfs_rq_of(se); dequeue_entity(cfs_rq, se, sleep); /* Don't dequeue parent if it has other entities besides us */ - if (cfs_rq->load.weight) { - if (parent_entity(se)) - decload = 0; + if (cfs_rq->load.weight) break; - } sleep = 1; } - /* Decrement cpu load if we just dequeued the last task of a group on - * 'rq->cpu'. 'topse' represents the group to which task 'p' belongs - * at the highest grouping level. 
- */ - if (decload) - dec_cpu_load(rq, topse->load.weight); hrtick_start_fair(rq, rq->curr); } @@ -1186,6 +1161,25 @@ static struct task_struct *load_balance_next_fair(void *arg) return __load_balance_iterator(cfs_rq, cfs_rq->rb_load_balance_curr); } +#ifdef CONFIG_FAIR_GROUP_SCHED +static int cfs_rq_best_prio(struct cfs_rq *cfs_rq) +{ + struct sched_entity *curr; + struct task_struct *p; + + if (!cfs_rq->nr_running || !first_fair(cfs_rq)) + return MAX_PRIO; + + curr = cfs_rq->curr; + if (!curr) + curr = __pick_next_entity(cfs_rq); + + p = task_of(curr); + + return p->prio; +} +#endif + static unsigned long load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, unsigned long max_load_move, @@ -1195,45 +1189,28 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, struct cfs_rq *busy_cfs_rq; long rem_load_move = max_load_move; struct rq_iterator cfs_rq_iterator; - unsigned long load_moved; cfs_rq_iterator.start = load_balance_start_fair; cfs_rq_iterator.next = load_balance_next_fair; for_each_leaf_cfs_rq(busiest, busy_cfs_rq) { #ifdef CONFIG_FAIR_GROUP_SCHED - struct cfs_rq *this_cfs_rq = busy_cfs_rq->tg->cfs_rq[this_cpu]; - unsigned long maxload, task_load, group_weight; - unsigned long thisload, per_task_load; - struct sched_entity *se = busy_cfs_rq->tg->se[busiest->cpu]; - - task_load = busy_cfs_rq->load.weight; - group_weight = se->load.weight; + struct cfs_rq *this_cfs_rq; + long imbalance; + unsigned long maxload; - /* - * 'group_weight' is contributed by tasks of total weight - * 'task_load'. To move 'rem_load_move' worth of weight only, - * we need to move a maximum task load of: - * - * maxload = (remload / group_weight) * task_load; - */ - maxload = (rem_load_move * task_load) / group_weight; + this_cfs_rq = cpu_cfs_rq(busy_cfs_rq, this_cpu); - if (!maxload || !task_load) + imbalance = busy_cfs_rq->load.weight - this_cfs_rq->load.weight; + /* Don't pull if this_cfs_rq has more load than busy_cfs_rq */ + if (imbalance <= 0) continue; - per_task_load = task_load / busy_cfs_rq->nr_running; - /* - * balance_tasks will try to forcibly move atleast one task if - * possible (because of SCHED_LOAD_SCALE_FUZZ). Avoid that if - * maxload is less than GROUP_IMBALANCE_FUZZ% the per_task_load. - */ - if (100 * maxload < GROUP_IMBALANCE_PCT * per_task_load) - continue; + /* Don't pull more than imbalance/2 */ + imbalance /= 2; + maxload = min(rem_load_move, imbalance); - /* Disable priority-based load balance */ - *this_best_prio = 0; - thisload = this_cfs_rq->load.weight; + *this_best_prio = cfs_rq_best_prio(this_cfs_rq); #else # define maxload rem_load_move #endif @@ -1242,33 +1219,11 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, * load_balance_[start|next]_fair iterators */ cfs_rq_iterator.arg = busy_cfs_rq; - load_moved = balance_tasks(this_rq, this_cpu, busiest, + rem_load_move -= balance_tasks(this_rq, this_cpu, busiest, maxload, sd, idle, all_pinned, this_best_prio, &cfs_rq_iterator); -#ifdef CONFIG_FAIR_GROUP_SCHED - /* - * load_moved holds the task load that was moved. 
The - * effective (group) weight moved would be: - * load_moved_eff = load_moved/task_load * group_weight; - */ - load_moved = (group_weight * load_moved) / task_load; - - /* Adjust shares on both cpus to reflect load_moved */ - group_weight -= load_moved; - set_se_shares(se, group_weight); - - se = busy_cfs_rq->tg->se[this_cpu]; - if (!thisload) - group_weight = load_moved; - else - group_weight = se->load.weight + load_moved; - set_se_shares(se, group_weight); -#endif - - rem_load_move -= load_moved; - if (rem_load_move <= 0) break; } diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index f54792b175b2..76e828517541 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -393,8 +393,6 @@ static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup) */ for_each_sched_rt_entity(rt_se) enqueue_rt_entity(rt_se); - - inc_cpu_load(rq, p->se.load.weight); } static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep) @@ -414,8 +412,6 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep) if (rt_rq && rt_rq->rt_nr_running) enqueue_rt_entity(rt_se); } - - dec_cpu_load(rq, p->se.load.weight); } /* diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 8b7e95411795..b2a2d6889bab 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -311,24 +311,6 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = &proc_dointvec, }, -#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP) - { - .ctl_name = CTL_UNNUMBERED, - .procname = "sched_min_bal_int_shares", - .data = &sysctl_sched_min_bal_int_shares, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, - { - .ctl_name = CTL_UNNUMBERED, - .procname = "sched_max_bal_int_shares", - .data = &sysctl_sched_max_bal_int_shares, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = &proc_dointvec, - }, -#endif #endif { .ctl_name = CTL_UNNUMBERED, -- cgit v1.2.3 From b6abdb0e6ca5c2c0a7caa4131da2af0750927e72 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 4 Mar 2008 14:28:19 -0800 Subject: cgroup: fix default notify_on_release setting The documentation says the default value of notify_on_release of a child cgroup is inherited from its parent, which is reasonable, but the implementation just sets the flag disabled. Signed-off-by: Li Zefan Acked-by: Paul Menage Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index d8abe996e009..e9c2fb01e89b 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2232,7 +2232,6 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, mutex_lock(&cgroup_mutex); - cgrp->flags = 0; INIT_LIST_HEAD(&cgrp->sibling); INIT_LIST_HEAD(&cgrp->children); INIT_LIST_HEAD(&cgrp->css_sets); @@ -2242,6 +2241,9 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, cgrp->root = parent->root; cgrp->top_cgroup = parent->top_cgroup; + if (notify_on_release(parent)) + set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); + for_each_subsys(root, ss) { struct cgroup_subsys_state *css = ss->create(ss, cgrp); if (IS_ERR(css)) { -- cgit v1.2.3 From fb78922ce9c71b24c4af1ffc9c3d60c57ac471fb Mon Sep 17 00:00:00 2001 From: Balbir Singh Date: Tue, 4 Mar 2008 14:28:24 -0800 Subject: Memory Resource Controller use strstrip while parsing arguments The memory controller has a requirement that while writing values, we need to use echo -n. 
This patch fixes the problem and makes the UI more consistent. Signed-off-by: Balbir Singh Cc: Paul Menage Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/controllers/memory.txt | 6 +++--- kernel/res_counter.c | 1 + 2 files changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/Documentation/controllers/memory.txt b/Documentation/controllers/memory.txt index 6015347b41e2..fba6af45225c 100644 --- a/Documentation/controllers/memory.txt +++ b/Documentation/controllers/memory.txt @@ -164,7 +164,7 @@ c. Enable CONFIG_CGROUP_MEM_CONT Since now we're in the 0 cgroup, We can alter the memory limit: -# echo -n 4M > /cgroups/0/memory.limit_in_bytes +# echo 4M > /cgroups/0/memory.limit_in_bytes NOTE: We can use a suffix (k, K, m, M, g or G) to indicate values in kilo, mega or gigabytes. @@ -185,7 +185,7 @@ number of factors, such as rounding up to page boundaries or the total availability of memory on the system. The user is required to re-read this file after a write to guarantee the value committed by the kernel. -# echo -n 1 > memory.limit_in_bytes +# echo 1 > memory.limit_in_bytes # cat memory.limit_in_bytes 4096 @@ -197,7 +197,7 @@ caches, RSS and Active pages/Inactive pages are shown. The memory.force_empty gives an interface to drop *all* charges by force. -# echo -n 1 > memory.force_empty +# echo 1 > memory.force_empty will drop all charges in cgroup. Currently, this is maintained for test. diff --git a/kernel/res_counter.c b/kernel/res_counter.c index 16cbec2d5d60..efbfc0fc232f 100644 --- a/kernel/res_counter.c +++ b/kernel/res_counter.c @@ -113,6 +113,7 @@ ssize_t res_counter_write(struct res_counter *counter, int member, ret = -EINVAL; + strstrip(buf); if (write_strategy) { if (write_strategy(buf, &tmp)) { goto out_free; -- cgit v1.2.3 From 9edddaa200df18e08fe0cf21036e8ae467b1363c Mon Sep 17 00:00:00 2001 From: Ananth N Mavinakayanahalli Date: Tue, 4 Mar 2008 14:28:37 -0800 Subject: Kprobes: indicate kretprobe support in Kconfig Add CONFIG_HAVE_KRETPROBES to the arch//Kconfig file for relevant architectures with kprobes support. This facilitates easy handling of in-kernel modules (like samples/kprobes/kretprobe_example.c) that depend on kretprobes being present in the kernel. Thanks to Sam Ravnborg for helping make the patch more lean. Per Mathieu's suggestion, added CONFIG_KRETPROBES and fixed up dependencies. Signed-off-by: Ananth N Mavinakayanahalli Acked-by: Mathieu Desnoyers Acked-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/Kconfig | 7 +++++++ arch/arm/Kconfig | 1 + arch/ia64/Kconfig | 1 + arch/powerpc/Kconfig | 1 + arch/s390/Kconfig | 1 + arch/sparc64/Kconfig | 1 + arch/x86/Kconfig | 1 + include/asm-arm/kprobes.h | 1 - include/asm-ia64/kprobes.h | 1 - include/asm-powerpc/kprobes.h | 1 - include/asm-s390/kprobes.h | 1 - include/asm-sparc64/kprobes.h | 2 -- include/asm-x86/kprobes.h | 1 - include/linux/kprobes.h | 6 +++--- kernel/kprobes.c | 9 +++------ 15 files changed, 19 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/arch/Kconfig b/arch/Kconfig index 3d72dc3fc8f5..694c9af520bb 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -27,5 +27,12 @@ config KPROBES for kernel debugging, non-intrusive instrumentation and testing. If in doubt, say "N". 
+config KRETPROBES + def_bool y + depends on KPROBES && HAVE_KRETPROBES + config HAVE_KPROBES def_bool n + +config HAVE_KRETPROBES + def_bool n diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index 16b82e1272b0..955fc53c1c01 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -12,6 +12,7 @@ config ARM select SYS_SUPPORTS_APM_EMULATION select HAVE_OPROFILE select HAVE_KPROBES if (!XIP_KERNEL) + select HAVE_KRETPROBES if (HAVE_KPROBES) help The ARM series is a line of low-power-consumption RISC chip designs licensed by ARM Ltd and targeted at embedded applications and diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig index dff9edfc7465..56762d3c2a6a 100644 --- a/arch/ia64/Kconfig +++ b/arch/ia64/Kconfig @@ -18,6 +18,7 @@ config IA64 select HAVE_IDE select HAVE_OPROFILE select HAVE_KPROBES + select HAVE_KRETPROBES default y help The Itanium Processor Family is Intel's 64-bit successor to diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 5b8d8382b762..1189d8d6170d 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -90,6 +90,7 @@ config PPC select HAVE_IDE select HAVE_OPROFILE select HAVE_KPROBES + select HAVE_KRETPROBES config EARLY_PRINTK bool diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index b21444b681b6..9892827b6176 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -61,6 +61,7 @@ config S390 def_bool y select HAVE_OPROFILE select HAVE_KPROBES + select HAVE_KRETPROBES source "init/Kconfig" diff --git a/arch/sparc64/Kconfig b/arch/sparc64/Kconfig index 3af378ddb6ae..463d1be32c98 100644 --- a/arch/sparc64/Kconfig +++ b/arch/sparc64/Kconfig @@ -10,6 +10,7 @@ config SPARC default y select HAVE_OPROFILE select HAVE_KPROBES + select HAVE_KRETPROBES config SPARC64 bool diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 53800b80a204..f41c9538ca30 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -21,6 +21,7 @@ config X86 select HAVE_IDE select HAVE_OPROFILE select HAVE_KPROBES + select HAVE_KRETPROBES select HAVE_KVM if ((X86_32 && !X86_VOYAGER && !X86_VISWS && !X86_NUMAQ) || X86_64) diff --git a/include/asm-arm/kprobes.h b/include/asm-arm/kprobes.h index 4e7bd32288ae..c042194d3ab5 100644 --- a/include/asm-arm/kprobes.h +++ b/include/asm-arm/kprobes.h @@ -20,7 +20,6 @@ #include #include -#define ARCH_SUPPORTS_KRETPROBES #define __ARCH_WANT_KPROBES_INSN_SLOT #define MAX_INSN_SIZE 2 #define MAX_STACK_SIZE 64 /* 32 would probably be OK */ diff --git a/include/asm-ia64/kprobes.h b/include/asm-ia64/kprobes.h index a93ce9ef07ff..adbaba14eb0a 100644 --- a/include/asm-ia64/kprobes.h +++ b/include/asm-ia64/kprobes.h @@ -82,7 +82,6 @@ struct kprobe_ctlblk { struct prev_kprobe prev_kprobe[ARCH_PREV_KPROBE_SZ]; }; -#define ARCH_SUPPORTS_KRETPROBES #define kretprobe_blacklist_size 0 #define SLOT0_OPCODE_SHIFT (37) diff --git a/include/asm-powerpc/kprobes.h b/include/asm-powerpc/kprobes.h index afabad230dbb..d0e7701fa1f6 100644 --- a/include/asm-powerpc/kprobes.h +++ b/include/asm-powerpc/kprobes.h @@ -80,7 +80,6 @@ typedef unsigned int kprobe_opcode_t; #define is_trap(instr) (IS_TW(instr) || IS_TWI(instr)) #endif -#define ARCH_SUPPORTS_KRETPROBES #define flush_insn_slot(p) do { } while (0) #define kretprobe_blacklist_size 0 diff --git a/include/asm-s390/kprobes.h b/include/asm-s390/kprobes.h index 948db3d0d05c..330f68caffe4 100644 --- a/include/asm-s390/kprobes.h +++ b/include/asm-s390/kprobes.h @@ -46,7 +46,6 @@ typedef u16 kprobe_opcode_t; ? 
(MAX_STACK_SIZE) \ : (((unsigned long)current_thread_info()) + THREAD_SIZE - (ADDR))) -#define ARCH_SUPPORTS_KRETPROBES #define kretprobe_blacklist_size 0 #define KPROBE_SWAP_INST 0x10 diff --git a/include/asm-sparc64/kprobes.h b/include/asm-sparc64/kprobes.h index 7237dd87663e..5879d71afdaa 100644 --- a/include/asm-sparc64/kprobes.h +++ b/include/asm-sparc64/kprobes.h @@ -14,8 +14,6 @@ typedef u32 kprobe_opcode_t; #define arch_remove_kprobe(p) do {} while (0) -#define ARCH_SUPPORTS_KRETPROBES - #define flush_insn_slot(p) \ do { flushi(&(p)->ainsn.insn[0]); \ flushi(&(p)->ainsn.insn[1]); \ diff --git a/include/asm-x86/kprobes.h b/include/asm-x86/kprobes.h index 143476a3cb52..61ad7b5d142e 100644 --- a/include/asm-x86/kprobes.h +++ b/include/asm-x86/kprobes.h @@ -42,7 +42,6 @@ typedef u8 kprobe_opcode_t; : (((unsigned long)current_thread_info()) + THREAD_SIZE \ - (unsigned long)(ADDR))) -#define ARCH_SUPPORTS_KRETPROBES #define flush_insn_slot(p) do { } while (0) extern const int kretprobe_blacklist_size; diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h index 4a6ce82ba039..0f28486f6360 100644 --- a/include/linux/kprobes.h +++ b/include/linux/kprobes.h @@ -125,11 +125,11 @@ struct jprobe { DECLARE_PER_CPU(struct kprobe *, current_kprobe); DECLARE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk); -#ifdef ARCH_SUPPORTS_KRETPROBES +#ifdef CONFIG_KRETPROBES extern void arch_prepare_kretprobe(struct kretprobe_instance *ri, struct pt_regs *regs); extern int arch_trampoline_kprobe(struct kprobe *p); -#else /* ARCH_SUPPORTS_KRETPROBES */ +#else /* CONFIG_KRETPROBES */ static inline void arch_prepare_kretprobe(struct kretprobe *rp, struct pt_regs *regs) { @@ -138,7 +138,7 @@ static inline int arch_trampoline_kprobe(struct kprobe *p) { return 0; } -#endif /* ARCH_SUPPORTS_KRETPROBES */ +#endif /* CONFIG_KRETPROBES */ /* * Function-return probe - * Note: diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 7a86e6432338..e6a61dcbc578 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -678,8 +678,7 @@ void __kprobes unregister_jprobe(struct jprobe *jp) unregister_kprobe(&jp->kp); } -#ifdef ARCH_SUPPORTS_KRETPROBES - +#ifdef CONFIG_KRETPROBES /* * This kprobe pre_handler is registered with every kretprobe. When probe * hits it will set up the return probe. @@ -769,8 +768,7 @@ int __kprobes register_kretprobe(struct kretprobe *rp) return ret; } -#else /* ARCH_SUPPORTS_KRETPROBES */ - +#else /* CONFIG_KRETPROBES */ int __kprobes register_kretprobe(struct kretprobe *rp) { return -ENOSYS; @@ -781,8 +779,7 @@ static int __kprobes pre_handler_kretprobe(struct kprobe *p, { return 0; } - -#endif /* ARCH_SUPPORTS_KRETPROBES */ +#endif /* CONFIG_KRETPROBES */ void __kprobes unregister_kretprobe(struct kretprobe *rp) { -- cgit v1.2.3 From 544adb41077a10d299a1094f12ec55a5843a9bdb Mon Sep 17 00:00:00 2001 From: Jesper Juhl Date: Tue, 4 Mar 2008 14:29:00 -0800 Subject: markers: don't risk NULL deref in marker get_marker() may return NULL, so test for it. 
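
The defensive pattern the patch below applies can be sketched in a few
lines (illustrative only: lookup_entry() and entry_lock stand in for the
real marker-entry lookup and markers_mutex, they are not kernel API) - a
lookup that may return NULL has to be re-checked every time it is
repeated, in particular after the protecting lock was dropped and
re-taken:

  #include <linux/mutex.h>
  #include <linux/errno.h>

  struct entry;                                          /* opaque for the sketch */
  extern struct entry *lookup_entry(const char *name);   /* may return NULL       */
  static DEFINE_MUTEX(entry_lock);

  static int remove_entry(const char *name)
  {
          struct entry *e;
          int ret = -ENOENT;                /* assume "not found" up front */

          mutex_lock(&entry_lock);
          e = lookup_entry(name);
          if (!e)
                  goto out;                 /* first lookup may fail       */

          mutex_unlock(&entry_lock);
          /* ... work that must run without the lock held ... */
          mutex_lock(&entry_lock);

          e = lookup_entry(name);           /* repeat the lookup ...       */
          if (!e)
                  goto out;                 /* ... and repeat the check    */

          /* only now is it safe to dereference e */
          ret = 0;
  out:
          mutex_unlock(&entry_lock);
          return ret;
  }
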
[akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Jesper Juhl Acked-by: Mathieu Desnoyers Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/marker.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/marker.c b/kernel/marker.c index 50effc01d9a2..48a4ea5afffd 100644 --- a/kernel/marker.c +++ b/kernel/marker.c @@ -698,14 +698,12 @@ int marker_probe_unregister(const char *name, { struct marker_entry *entry; struct marker_probe_closure *old; - int ret = 0; + int ret = -ENOENT; mutex_lock(&markers_mutex); entry = get_marker(name); - if (!entry) { - ret = -ENOENT; + if (!entry) goto end; - } if (entry->rcu_pending) rcu_barrier(); old = marker_entry_remove_probe(entry, probe, probe_private); @@ -713,12 +711,15 @@ int marker_probe_unregister(const char *name, marker_update_probes(); /* may update entry */ mutex_lock(&markers_mutex); entry = get_marker(name); + if (!entry) + goto end; entry->oldptr = old; entry->rcu_pending = 1; /* write rcu_pending before calling the RCU callback */ smp_wmb(); call_rcu(&entry->rcu, free_old_closure); remove_marker(name); /* Ignore busy error message */ + ret = 0; end: mutex_unlock(&markers_mutex); return ret; -- cgit v1.2.3 From b2a5cd6938879b5bcfef0a73c28fea84c49519c2 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 4 Mar 2008 14:29:44 -0800 Subject: kprobes: fix a null pointer bug in register_kretprobe() Fix a bug in regiseter_kretprobe() which does not check rp->kp.symbol_name == NULL before calling kprobe_lookup_name. For maintainability, this introduces kprobe_addr helper function which resolves addr field. It is used by register_kprobe and register_kretprobe. Signed-off-by: Masami Hiramatsu Cc: Ananth N Mavinakayanahalli Cc: Jim Keniston Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kprobes.c | 43 ++++++++++++++++++++++++++----------------- 1 file changed, 26 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index e6a61dcbc578..fcfb580c3afc 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -498,27 +498,36 @@ static int __kprobes in_kprobes_functions(unsigned long addr) return 0; } +/* + * If we have a symbol_name argument, look it up and add the offset field + * to it. This way, we can specify a relative address to a symbol. + */ +static kprobe_opcode_t __kprobes *kprobe_addr(struct kprobe *p) +{ + kprobe_opcode_t *addr = p->addr; + if (p->symbol_name) { + if (addr) + return NULL; + kprobe_lookup_name(p->symbol_name, addr); + } + + if (!addr) + return NULL; + return (kprobe_opcode_t *)(((char *)addr) + p->offset); +} + static int __kprobes __register_kprobe(struct kprobe *p, unsigned long called_from) { int ret = 0; struct kprobe *old_p; struct module *probed_mod; + kprobe_opcode_t *addr; - /* - * If we have a symbol_name argument look it up, - * and add it to the address. That way the addr - * field can either be global or relative to a symbol. 
- */ - if (p->symbol_name) { - if (p->addr) - return -EINVAL; - kprobe_lookup_name(p->symbol_name, p->addr); - } - - if (!p->addr) + addr = kprobe_addr(p); + if (!addr) return -EINVAL; - p->addr = (kprobe_opcode_t *)(((char *)p->addr)+ p->offset); + p->addr = addr; if (!kernel_text_address((unsigned long) p->addr) || in_kprobes_functions((unsigned long) p->addr)) @@ -721,12 +730,12 @@ int __kprobes register_kretprobe(struct kretprobe *rp) int ret = 0; struct kretprobe_instance *inst; int i; - void *addr = rp->kp.addr; + void *addr; if (kretprobe_blacklist_size) { - if (addr == NULL) - kprobe_lookup_name(rp->kp.symbol_name, addr); - addr += rp->kp.offset; + addr = kprobe_addr(&rp->kp); + if (!addr) + return -EINVAL; for (i = 0; kretprobe_blacklist[i].name != NULL; i++) { if (kretprobe_blacklist[i].addr == addr) -- cgit v1.2.3 From 9b37ccfc637be27d9a652fcedc35e6e782c3aa78 Mon Sep 17 00:00:00 2001 From: Pavel Roskin Date: Thu, 28 Feb 2008 17:11:02 -0500 Subject: module: allow ndiswrapper to use GPL-only symbols A change after 2.6.24 broke ndiswrapper by accidentally removing its access to GPL-only symbols. Revert that change and add comments about the reasons why ndiswrapper and driverloader are treated in a special way. Signed-off-by: Pavel Roskin Acked-by: Greg KH Acked-by: Ingo Molnar Cc: Rusty Russell Cc: Jon Masters Signed-off-by: Linus Torvalds --- kernel/module.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 901cd6ac2f11..be4807fb90e4 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1933,8 +1933,15 @@ static struct module *load_module(void __user *umod, /* Set up license info based on the info section */ set_license(mod, get_modinfo(sechdrs, infoindex, "license")); + /* + * ndiswrapper is under GPL by itself, but loads proprietary modules. + * Don't use add_taint_module(), as it would prevent ndiswrapper from + * using GPL-only symbols it needs. + */ if (strcmp(mod->name, "ndiswrapper") == 0) - add_taint_module(mod, TAINT_PROPRIETARY_MODULE); + add_taint(TAINT_PROPRIETARY_MODULE); + + /* driverloader was caught wrongly pretending to be under GPL */ if (strcmp(mod->name, "driverloader") == 0) add_taint_module(mod, TAINT_PROPRIETARY_MODULE); -- cgit v1.2.3 From 41f7f60d31e5e1dfc9a92957b3e14e08a2f04964 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Tue, 4 Mar 2008 23:32:38 -0800 Subject: cpusets: fix obsolete comment mm migration is no longer done in cpuset_update_task_memory_state() so it can no longer take current->mm->mmap_sem, so fix the obsolete comment. [ This changed in commit 04c19fa6f16047abff2288ddbc1f0798ede5a849 ("cpuset: migrate all tasks in cpuset at once") when the mm migration was moved from cpuset_update_task_memory_state() to update_nodemask() ] Signed-off-by: David Rientjes Cc: Paul Jackson Signed-off-by: Linus Torvalds --- kernel/cpuset.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 3e296ed81d4d..a1b61f414228 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -322,8 +322,8 @@ static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask) * Call without callback_mutex or task_lock() held. May be * called with or without cgroup_mutex held. Thanks in part to * 'the_top_cpuset_hack', the task's cpuset pointer will never - * be NULL. This routine also might acquire callback_mutex and - * current->mm->mmap_sem during call. + * be NULL. 
This routine also might acquire callback_mutex during + * call. * * Reading current->cpuset->mems_generation doesn't need task_lock * to guard the current->cpuset derefence, because it is guarded -- cgit v1.2.3 From 810b38179e9e4d4f57b4b733767bb08f8291a965 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 29 Feb 2008 15:21:01 -0500 Subject: sched: retain vruntime Kei Tokunaga reported an interactivity problem when moving tasks between control groups. Tasks would retain their old vruntime when moved between groups, this can cause funny lags. Re-set the vruntime on group move to fit within the new tree. Reported-by: Kei Tokunaga Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- include/linux/sched.h | 4 ++++ kernel/sched.c | 5 +++++ kernel/sched_fair.c | 14 ++++++++++++++ 3 files changed, 23 insertions(+) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index 9ae4030067a9..11d8e9a74eff 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -899,6 +899,10 @@ struct sched_class { int running); void (*prio_changed) (struct rq *this_rq, struct task_struct *task, int oldprio, int running); + +#ifdef CONFIG_FAIR_GROUP_SCHED + void (*moved_group) (struct task_struct *p); +#endif }; struct load_weight { diff --git a/kernel/sched.c b/kernel/sched.c index dcd553cc4ee8..0b949c4e73ad 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -7625,6 +7625,11 @@ void sched_move_task(struct task_struct *tsk) set_task_rq(tsk, task_cpu(tsk)); +#ifdef CONFIG_FAIR_GROUP_SCHED + if (tsk->sched_class->moved_group) + tsk->sched_class->moved_group(tsk); +#endif + if (on_rq) { if (unlikely(running)) tsk->sched_class->set_curr_task(rq); diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 3df4d46994ca..e2a530515619 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1353,6 +1353,16 @@ static void set_curr_task_fair(struct rq *rq) set_next_entity(cfs_rq_of(se), se); } +#ifdef CONFIG_FAIR_GROUP_SCHED +static void moved_group_fair(struct task_struct *p) +{ + struct cfs_rq *cfs_rq = task_cfs_rq(p); + + update_curr(cfs_rq); + place_entity(cfs_rq, &p->se, 1); +} +#endif + /* * All the scheduling class methods: */ @@ -1381,6 +1391,10 @@ static const struct sched_class fair_sched_class = { .prio_changed = prio_changed_fair, .switched_to = switched_to_fair, + +#ifdef CONFIG_FAIR_GROUP_SCHED + .moved_group = moved_group_fair, +#endif }; #ifdef CONFIG_SCHED_DEBUG -- cgit v1.2.3 From 6fa46fa526f2cab9ce21fa5e39501553a40d196d Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 5 Mar 2008 10:00:12 -0500 Subject: sched: balance RT task resched only on runqueue Sripathi Kodi reported a crash in the -rt kernel: https://bugzilla.redhat.com/show_bug.cgi?id=435674 this is due to a place that can reschedule a task without holding the tasks runqueue lock. This was caused by the RT balancing code that pulls RT tasks to the current run queue and will reschedule the current task. There's a slight chance that the pulling of the RT tasks will release the current runqueue's lock and retake it (in the double_lock_balance). During this time that the runqueue is released, the current task can migrate to another runqueue. In the prio_changed_rt code, after the pull, if the current task is of lesser priority than one of the RT tasks pulled, resched_task is called on the current task. If the current task had migrated in that small window, resched_task will be called without holding the runqueue lock for the runqueue that the task is on. 
This race condition also exists in the mainline kernel and this patch adds a check to make sure the task hasn't migrated before calling resched_task. Signed-off-by: Steven Rostedt Tested-by: Sripathi Kodi Acked-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched_rt.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 76e828517541..0a6d2e516420 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -1107,9 +1107,11 @@ static void prio_changed_rt(struct rq *rq, struct task_struct *p, pull_rt_task(rq); /* * If there's a higher priority task waiting to run - * then reschedule. + * then reschedule. Note, the above pull_rt_task + * can release the rq lock and p could migrate. + * Only reschedule if p is still on the same runqueue. */ - if (p->prio > rq->rt.highest_prio) + if (p->prio > rq->rt.highest_prio && rq->curr == p) resched_task(p); #else /* For UP simply resched on drop of prio */ -- cgit v1.2.3 From 150d8bede7f85eb00d8f4d628e6b0bae68739e3b Mon Sep 17 00:00:00 2001 From: Pavel Roskin Date: Wed, 5 Mar 2008 16:56:37 -0500 Subject: sched: export task_nice The API is trivial, and so is the implementation. Signed-off-by: Pavel Roskin Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 0b949c4e73ad..63a469f8853d 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4422,7 +4422,7 @@ int task_nice(const struct task_struct *p) { return TASK_NICE(p); } -EXPORT_SYMBOL_GPL(task_nice); +EXPORT_SYMBOL(task_nice); /** * idle_cpu - is a given cpu idle currently? -- cgit v1.2.3 From 1868f958eb56fc41c5985c8732e564a400c5fdf5 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Fri, 7 Mar 2008 09:35:06 +0800 Subject: sched: fix the wrong time slice value for SCHED_FIFO tasks Function sys_sched_rr_get_interval returns wrong time slice value for SCHED_FIFO tasks. The time slice for SCHED_FIFO tasks should be 0. Signed-off-by: Miao Xie Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 63a469f8853d..5b13e4b0e009 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5100,7 +5100,7 @@ long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval) time_slice = 0; if (p->policy == SCHED_RR) { time_slice = DEF_TIMESLICE; - } else { + } else if (p->policy != SCHED_FIFO) { struct sched_entity *se = &p->se; unsigned long flags; struct rq *rq; -- cgit v1.2.3 From 2692a2406b9262bbb101708815be99ec2988e48b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 27 Feb 2008 12:00:46 +0100 Subject: sched: rt-group: fixup schedulability constraints calculation it was only possible to configure the rt-group scheduling parameters beyond the default value in a very small range. that's because div64_64() has a different calling convention than do_div() :/ fix a few untidies while we are here; sysctl_sched_rt_period may overflow due to that multiplication, so cast to u64 first. Also that RUNTIME_INF juggling makes little sense although its an effective NOP. 
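
For illustration, the two calling conventions the changelog contrasts
can be sketched as follows (sketch only; the *_buggy/*_fixed helper
names are invented for the example, and the header that declared
div64_64() varied between trees, hence the explicit prototype):

  #include <linux/types.h>
  #include <asm/div64.h>                              /* do_div()             */
  extern u64 div64_64(u64 dividend, u64 divisor);     /* returns the quotient */

  /*
   * do_div(n, base): macro that divides the 64-bit lvalue n in place by a
   * 32-bit base and evaluates to the remainder.  div64_64() is a plain
   * function: it returns dividend/divisor and leaves its arguments alone.
   */

  static unsigned long to_ratio_buggy(u64 period, u64 runtime)
  {
          runtime *= (1ULL << 16);
          div64_64(runtime, period);     /* quotient silently discarded;
                                            runtime is NOT updated in place  */
          return runtime;                /* still runtime << 16, not a ratio */
  }

  static unsigned long to_ratio_fixed(u64 period, u64 runtime)
  {
          return div64_64(runtime << 16, period);   /* use the return value  */
  }
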
Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 5b13e4b0e009..b8ee864c7481 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -7726,9 +7726,7 @@ static unsigned long to_ratio(u64 period, u64 runtime) if (runtime == RUNTIME_INF) return 1ULL << 16; - runtime *= (1ULL << 16); - div64_64(runtime, period); - return runtime; + return div64_64(runtime << 16, period); } static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) @@ -7757,18 +7755,16 @@ int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us) u64 rt_runtime, rt_period; int err = 0; - rt_period = sysctl_sched_rt_period * NSEC_PER_USEC; + rt_period = (u64)sysctl_sched_rt_period * NSEC_PER_USEC; rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC; if (rt_runtime_us == -1) - rt_runtime = rt_period; + rt_runtime = RUNTIME_INF; mutex_lock(&rt_constraints_mutex); if (!__rt_schedulable(tg, rt_period, rt_runtime)) { err = -EINVAL; goto unlock; } - if (rt_runtime_us == -1) - rt_runtime = RUNTIME_INF; tg->rt_runtime = rt_runtime; unlock: mutex_unlock(&rt_constraints_mutex); -- cgit v1.2.3 From 521f1a2489c41f8b1181b0a8eb52e1c34284d50b Mon Sep 17 00:00:00 2001 From: Dhaval Giani Date: Thu, 28 Feb 2008 15:21:56 +0530 Subject: sched: don't allow rt_runtime_us to be zero for groups having rt tasks This patch checks if we can set the rt_runtime_us to 0. If there is a realtime task in the group, we don't want to set the rt_runtime_us as 0 or bad things will happen. (that task wont get any CPU time despite being TASK_RUNNNG) Signed-off-by: Dhaval Giani Signed-off-by: Ingo Molnar --- kernel/sched.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index b8ee864c7481..52b98675acb2 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -7750,6 +7750,17 @@ static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) return total + to_ratio(period, runtime) < global_ratio; } +/* Must be called with tasklist_lock held */ +static inline int tg_has_rt_tasks(struct task_group *tg) +{ + struct task_struct *g, *p; + do_each_thread(g, p) { + if (rt_task(p) && rt_rq_of_se(&p->rt)->tg == tg) + return 1; + } while_each_thread(g, p); + return 0; +} + int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us) { u64 rt_runtime, rt_period; @@ -7761,12 +7772,18 @@ int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us) rt_runtime = RUNTIME_INF; mutex_lock(&rt_constraints_mutex); + read_lock(&tasklist_lock); + if (rt_runtime_us == 0 && tg_has_rt_tasks(tg)) { + err = -EBUSY; + goto unlock; + } if (!__rt_schedulable(tg, rt_period, rt_runtime)) { err = -EINVAL; goto unlock; } tg->rt_runtime = rt_runtime; unlock: + read_unlock(&tasklist_lock); mutex_unlock(&rt_constraints_mutex); return err; -- cgit v1.2.3 From 6efcae460186c0c1c94afff58a92784e1fc0d10b Mon Sep 17 00:00:00 2001 From: Roland McGrath Date: Sat, 8 Mar 2008 11:41:22 -0800 Subject: Fix waitid si_code regression In commit ee7c82da830ea860b1f9274f1f0cdf99f206e7c2 ("wait_task_stopped: simplify and fix races with SIGCONT/SIGKILL/untrace"), the magic (short) cast when storing si_code was lost in wait_task_stopped. This leaks the in-kernel CLD_* values that do not match what userland expects. 
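
Why the truncation matters can be sketched with the constants of that
era's in-kernel asm-generic/siginfo.h, where the kernel-internal class
bits live above bit 15 (the *_KERNEL/*_USER names below are invented for
the illustration; userspace headers define plain CLD_STOPPED as 5):

  #define __SI_CHLD_KERNEL   (4 << 16)                /* in-kernel class bits  */
  #define CLD_STOPPED_KERNEL (__SI_CHLD_KERNEL | 5)   /* value seen in-kernel  */
  #define CLD_STOPPED_USER   5                        /* what userland expects */

  int leaked   = CLD_STOPPED_KERNEL;        /* 0x40005: wrong for userland     */
  int expected = (short)CLD_STOPPED_KERNEL; /* 5: the cast strips the class
                                               bits, matching CLD_STOPPED_USER */
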
Signed-off-by: Roland McGrath Cc: Oleg Nesterov Signed-off-by: Linus Torvalds --- kernel/exit.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index cd20bf07e9e3..53872bf993fa 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1378,7 +1378,7 @@ unlock_sig: if (!retval && infop) retval = put_user(0, &infop->si_errno); if (!retval && infop) - retval = put_user(why, &infop->si_code); + retval = put_user((short)why, &infop->si_code); if (!retval && infop) retval = put_user(exit_code, &infop->si_status); if (!retval && infop) -- cgit v1.2.3 From e48af19f56eb47a1f908ee8f16df9d246f955b21 Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 25 Feb 2008 18:31:57 +0100 Subject: ntp: use unsigned input for do_div() The kernel NTP code shouldn't hand 64-bit *signed* values to do_div(). Make it instead hand 64-bit unsigned values. This gets rid of a couple of warnings. Signed-off-by: David Howells Cc: Roman Zippel Cc: Ingo Molnar Cc: john stultz Signed-off-by: Andrew Morton Signed-off-by: Thomas Gleixner --- kernel/time/ntp.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index c88b5910e7ab..d4bca927f715 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -342,14 +342,16 @@ int do_adjtimex(struct timex *txc) freq_adj = shift_right(freq_adj, time_constant * 2 + (SHIFT_PLL + 2) * 2 - SHIFT_NSEC); if (mtemp >= MINSEC && (time_status & STA_FLL || mtemp > MAXSEC)) { + u64 utemp64; temp64 = time_offset << (SHIFT_NSEC - SHIFT_FLL); if (time_offset < 0) { - temp64 = -temp64; - do_div(temp64, mtemp); - freq_adj -= temp64; + utemp64 = -temp64; + do_div(utemp64, mtemp); + freq_adj -= utemp64; } else { - do_div(temp64, mtemp); - freq_adj += temp64; + utemp64 = temp64; + do_div(utemp64, mtemp); + freq_adj += utemp64; } } freq_adj += time_freq; -- cgit v1.2.3 From a79017660ea4597ec489fab3b5aaf71dd776dfc7 Mon Sep 17 00:00:00 2001 From: Karsten Wiese Date: Tue, 4 Mar 2008 14:59:55 -0800 Subject: time: don't touch an offlined CPU's ts->tick_stopped in tick_cancel_sched_timer() Silences WARN_ONs in rcu_enter_nohz() and rcu_exit_nohz(), which appeared before caused by (repeated) calls to: $ echo 0 > /sys/devices/system/cpu/cpu1/online $ echo 1 > /sys/devices/system/cpu/cpu1/online Signed-off-by: Karsten Wiese Cc: johnstul@us.ibm.com Cc: Rafael Wysocki Cc: Steven Rostedt Cc: Ingo Molnar Acked-by: Paul E. 
McKenney Signed-off-by: Andrew Morton Signed-off-by: Thomas Gleixner --- kernel/time/tick-sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 2968298f8f36..686da821d376 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -640,7 +640,7 @@ void tick_cancel_sched_timer(int cpu) if (ts->sched_timer.base) hrtimer_cancel(&ts->sched_timer); - ts->tick_stopped = 0; + ts->nohz_mode = NOHZ_MODE_INACTIVE; } #endif /* HIGH_RES_TIMERS */ -- cgit v1.2.3 From 10a398d04c4a1fc395840f4d040493375f562302 Mon Sep 17 00:00:00 2001 From: Roman Zippel Date: Tue, 4 Mar 2008 15:14:26 -0800 Subject: time: remove obsolete CLOCK_TICK_ADJUST The first version of the ntp_interval/tick_length inconsistent usage patch was recently merged as bbe4d18ac2e058c56adb0cd71f49d9ed3216a405 http://git.kernel.org/gitweb.cgi?p=linux/kernel/git/torvalds/linux-2.6.git;a=commit;h=bbe4d18ac2e058c56adb0cd71f49d9ed3216a405 While the fix did greatly improve the situation, it was correctly pointed out by Roman that it does have a small bug: If the users change clocksources after the system has been running and NTP has made corrections, the correctoins made against the old clocksource will be applied against the new clocksource, causing error. The second attempt, which corrects the issue in the NTP_INTERVAL_LENGTH definition has also made it up-stream as commit e13a2e61dd5152f5499d2003470acf9c838eab84 http://git.kernel.org/gitweb.cgi?p=linux/kernel/git/torvalds/linux-2.6.git;a=commit;h=e13a2e61dd5152f5499d2003470acf9c838eab84 Roman has correctly pointed out that CLOCK_TICK_ADJUST is calculated based on the PIT's frequency, and isn't really relevant to non-PIT driven clocksources (that is, clocksources other then jiffies and pit). This patch reverts both of those changes, and simply removes CLOCK_TICK_ADJUST. This does remove the granularity error correction for users of PIT and Jiffies clocksource users, but the granularity error but for the majority of users, it should be within the 500ppm range NTP can accommodate for. For systems that have granularity errors greater then 500ppm, the "ntp_tick_adj=" boot option can be used to compensate. [johnstul@us.ibm.com: provided changelog] [mattilinnanvuori@yahoo.com: maek ntp_tick_adj static] Signed-off-by: Roman Zippel Acked-by: john stultz Signed-off-by: Matti Linnanvuori Signed-off-by: Andrew Morton Cc: mingo@elte.hu Signed-off-by: Thomas Gleixner --- include/linux/timex.h | 9 +-------- kernel/time/ntp.c | 11 ++++++++++- kernel/time/timekeeping.c | 6 ++---- 3 files changed, 13 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/include/linux/timex.h b/include/linux/timex.h index c3f374786a43..8ea3e71ba7fa 100644 --- a/include/linux/timex.h +++ b/include/linux/timex.h @@ -232,14 +232,7 @@ static inline int ntp_synced(void) #else #define NTP_INTERVAL_FREQ (HZ) #endif - -#define CLOCK_TICK_OVERFLOW (LATCH * HZ - CLOCK_TICK_RATE) -#define CLOCK_TICK_ADJUST (((s64)CLOCK_TICK_OVERFLOW * NSEC_PER_SEC) / \ - (s64)CLOCK_TICK_RATE) - -/* Because using NSEC_PER_SEC would be too easy */ -#define NTP_INTERVAL_LENGTH ((((s64)TICK_USEC * NSEC_PER_USEC * USER_HZ) + \ - CLOCK_TICK_ADJUST) / NTP_INTERVAL_FREQ) +#define NTP_INTERVAL_LENGTH (NSEC_PER_SEC/NTP_INTERVAL_FREQ) /* Returns how long ticks are at present, in ns / 2^(SHIFT_SCALE-10). 
*/ extern u64 current_tick_length(void); diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index d4bca927f715..5fd9b9469770 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -42,12 +42,13 @@ long time_esterror = NTP_PHASE_LIMIT; /* estimated error (us) */ long time_freq; /* frequency offset (scaled ppm)*/ static long time_reftime; /* time at last adjustment (s) */ long time_adjust; +static long ntp_tick_adj; static void ntp_update_frequency(void) { u64 second_length = (u64)(tick_usec * NSEC_PER_USEC * USER_HZ) << TICK_LENGTH_SHIFT; - second_length += (s64)CLOCK_TICK_ADJUST << TICK_LENGTH_SHIFT; + second_length += (s64)ntp_tick_adj << TICK_LENGTH_SHIFT; second_length += (s64)time_freq << (TICK_LENGTH_SHIFT - SHIFT_NSEC); tick_length_base = second_length; @@ -402,3 +403,11 @@ leave: if ((time_status & (STA_UNSYNC|STA_CLOCKERR)) != 0) notify_cmos_timer(); return(result); } + +static int __init ntp_tick_adj_setup(char *str) +{ + ntp_tick_adj = simple_strtol(str, NULL, 0); + return 1; +} + +__setup("ntp_tick_adj=", ntp_tick_adj_setup); diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 1af9fb050fe2..671af612b768 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -187,8 +187,7 @@ static void change_clocksource(void) clock->error = 0; clock->xtime_nsec = 0; - clocksource_calculate_interval(clock, - (unsigned long)(current_tick_length()>>TICK_LENGTH_SHIFT)); + clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH); tick_clock_notify(); @@ -245,8 +244,7 @@ void __init timekeeping_init(void) ntp_clear(); clock = clocksource_get_next(); - clocksource_calculate_interval(clock, - (unsigned long)(current_tick_length()>>TICK_LENGTH_SHIFT)); + clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH); clock->cycle_last = clocksource_read(clock); xtime.tv_sec = sec; -- cgit v1.2.3 From 393d94d98b19089ec172566e23557997931b137e Mon Sep 17 00:00:00 2001 From: Gregory Haskins Date: Sat, 8 Mar 2008 00:10:15 -0500 Subject: cpu hotplug: adjust root-domain->online span in response to hotplug event We currently set the root-domain online span automatically when the domain is added to the cpu if the cpu is already a member of cpu_online_map. This was done as a hack/bug-fix for s2ram, but it also causes a problem with hotplug CPU_DOWN transitioning. The right way to fix the original problem is to actually respond to CPU_UP events, instead of CPU_ONLINE, which is already too late. This solves the hung reboot regression reported by Andrew Morton and others. Signed-off-by: Gregory Haskins Acked-by: Peter Zijlstra Signed-off-by: Ingo Molnar Signed-off-by: Linus Torvalds --- kernel/sched.c | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 52b98675acb2..b02e4fc25645 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5813,6 +5813,13 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) /* Must be high prio: stop_machine expects to yield to it. */ rq = task_rq_lock(p, &flags); __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1); + + /* Update our root-domain */ + if (rq->rd) { + BUG_ON(!cpu_isset(cpu, rq->rd->span)); + cpu_set(cpu, rq->rd->online); + } + task_rq_unlock(rq, &flags); cpu_rq(cpu)->migration_thread = p; break; @@ -5821,15 +5828,6 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) case CPU_ONLINE_FROZEN: /* Strictly unnecessary, as first user will wake it. 
*/ wake_up_process(cpu_rq(cpu)->migration_thread); - - /* Update our root-domain */ - rq = cpu_rq(cpu); - spin_lock_irqsave(&rq->lock, flags); - if (rq->rd) { - BUG_ON(!cpu_isset(cpu, rq->rd->span)); - cpu_set(cpu, rq->rd->online); - } - spin_unlock_irqrestore(&rq->lock, flags); break; #ifdef CONFIG_HOTPLUG_CPU @@ -6105,8 +6103,6 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd) rq->rd = rd; cpu_set(rq->cpu, rd->span); - if (cpu_isset(rq->cpu, cpu_online_map)) - cpu_set(rq->cpu, rd->online); for (class = sched_class_highest; class; class = class->next) { if (class->join_domain) -- cgit v1.2.3 From 6c5db22d280302c33dafb309c25bf2841fb99c37 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Mon, 10 Mar 2008 11:43:52 -0700 Subject: modules: fix module waiting for dependent modules' init Commit c9a3ba55 (module: wait for dependent modules doing init.) didn't quite work because the waiter holds the module lock, meaning that the state of the module it's waiting for cannot change. Fortunately, it's fairly simple to update the state outside the lock and do the wakeup. Thanks to Jan Glauber for tracking this down and testing (qdio and qeth). Signed-off-by: Rusty Russell Cc: Jan Glauber Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/module.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index be4807fb90e4..68d05d2f4d8a 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2179,9 +2179,11 @@ sys_init_module(void __user *umod, return ret; } - /* Now it's a first class citizen! */ - mutex_lock(&module_mutex); + /* Now it's a first class citizen! Wake up anyone waiting for it. */ mod->state = MODULE_STATE_LIVE; + wake_up(&module_wq); + + mutex_lock(&module_mutex); /* Drop initial reference. */ module_put(mod); unwind_remove_table(mod->unwind_info, 1); @@ -2190,7 +2192,6 @@ sys_init_module(void __user *umod, mod->init_size = 0; mod->init_text_size = 0; mutex_unlock(&module_mutex); - wake_up(&module_wq); return 0; } -- cgit v1.2.3 From e24e2e64c468c8060bb7173abecdf11d00ed5751 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Mon, 10 Mar 2008 11:43:53 -0700 Subject: modules: warn about suspicious return values from module's ->init() hook Return value convention of module's init functions is 0/-E. Sometimes, e.g. during forward-porting mistakes happen and buggy module created, where result of comparison "workqueue != NULL" is propagated all the way up to sys_init_module. What happens is that some other module created workqueue in question, our module created it again and module was successfully loaded. Or it could be some other bug. Let's make such mistakes much more visible. In retrospective, such messages would noticeably shorten some of my head-scratching sessions. Note, that dump_stack() is just a way to get attention from user. Sample message: sys_init_module: 'foo'->init suspiciously returned 1, it should follow 0/-E convention sys_init_module: loading module anyway... 
Pid: 4223, comm: modprobe Not tainted 2.6.24-25f666300625d894ebe04bac2b4b3aadb907c861 #5 Call Trace: [] sys_init_module+0xe5/0x1d0 [] system_call_after_swapgs+0x7b/0x80 Signed-off-by: Alexey Dobriyan Cc: Rusty Russell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/module.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 68d05d2f4d8a..5d437bffd8dc 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2178,6 +2178,14 @@ sys_init_module(void __user *umod, wake_up(&module_wq); return ret; } + if (ret > 0) { + printk(KERN_WARNING "%s: '%s'->init suspiciously returned %d, " + "it should follow 0/-E convention\n" + KERN_WARNING "%s: loading module anyway...\n", + __func__, mod->name, ret, + __func__); + dump_stack(); + } /* Now it's a first class citizen! Wake up anyone waiting for it. */ mod->state = MODULE_STATE_LIVE; -- cgit v1.2.3 From 21bbb39c376ce6beeeb549d155f0d53dc76ed000 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 10 Mar 2008 11:43:57 -0700 Subject: rcu: move PREEMPT_RCU config option back under PREEMPT The original preemptible-RCU patch put the choice between classic and preemptible RCU into kernel/Kconfig.preempt, which resulted in build failures on machines not supporting CONFIG_PREEMPT. This choice was therefore moved to init/Kconfig, which worked, but placed the choice between classic and preemptible RCU at the top level, a very obtuse choice indeed. This patch changes from the Kconfig "choice" mechanism to a pair of booleans, only one of which (CONFIG_PREEMPT_RCU) is user-visible, and is located in kernel/Kconfig.preempt, where one would expect it to be. The other (CONFIG_CLASSIC_RCU) is in init/Kconfig so that it is available to all architectures, hopefully avoiding build breakage. Thanks to Roman Zippel for suggesting this approach. Signed-off-by: Paul E. McKenney Cc: Ingo Molnar Acked-by: Steven Rostedt Cc: Dipankar Sarma Cc: Josh Triplett Cc: Thomas Gleixner Cc: Peter Zijlstra Cc: Roman Zippel Cc: Sam Ravnborg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- init/Kconfig | 34 +++------------------------------- kernel/Kconfig.preempt | 15 +++++++++++++++ 2 files changed, 18 insertions(+), 31 deletions(-) (limited to 'kernel') diff --git a/init/Kconfig b/init/Kconfig index 074ac97f55e3..a97924bc5b8d 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -865,38 +865,10 @@ source "block/Kconfig" config PREEMPT_NOTIFIERS bool -choice - prompt "RCU implementation type:" - default CLASSIC_RCU - help - This allows you to choose either the classic RCU implementation - that is designed for best read-side performance on non-realtime - systems, or the preemptible RCU implementation for best latency - on realtime systems. Note that some kernel preemption modes - will restrict your choice. - - Select the default if you are unsure. - config CLASSIC_RCU - bool "Classic RCU" + def_bool !PREEMPT_RCU help This option selects the classic RCU implementation that is designed for best read-side performance on non-realtime - systems. - - Say Y if you are unsure. - -config PREEMPT_RCU - bool "Preemptible RCU" - depends on PREEMPT - help - This option reduces the latency of the kernel by making certain - RCU sections preemptible. Normally RCU code is non-preemptible, if - this option is selected then read-only RCU sections become - preemptible. 
This helps latency, but may expose bugs due to - now-naive assumptions about each RCU read-side critical section - remaining on a given CPU through its execution. - - Say N if you are unsure. - -endchoice + systems. Classic RCU is the default. Note that the + PREEMPT_RCU symbol is used to select/deselect this option. diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt index 0669b70fa6a3..9fdba03dc1fc 100644 --- a/kernel/Kconfig.preempt +++ b/kernel/Kconfig.preempt @@ -52,8 +52,23 @@ config PREEMPT endchoice +config PREEMPT_RCU + bool "Preemptible RCU" + depends on PREEMPT + default n + help + This option reduces the latency of the kernel by making certain + RCU sections preemptible. Normally RCU code is non-preemptible, if + this option is selected then read-only RCU sections become + preemptible. This helps latency, but may expose bugs due to + now-naive assumptions about each RCU read-side critical section + remaining on a given CPU through its execution. + + Say N if you are unsure. + config RCU_TRACE bool "Enable tracing for RCU - currently stats in debugfs" + depends on PREEMPT_RCU select DEBUG_FS default y help -- cgit v1.2.3 From 1f94ef598e8d29b92b9fc85d43c832e03721d3cb Mon Sep 17 00:00:00 2001 From: Gregory Haskins Date: Mon, 10 Mar 2008 16:52:41 -0400 Subject: Revert "cpu hotplug: adjust root-domain->online span in response to hotplug event" This reverts commit 393d94d98b19089ec172566e23557997931b137e. Lets fix this right. Signed-off-by: Gregory Haskins Cc: Gautham R Shenoy Cc: "Siddha, Suresh B" Cc: "Rafael J. Wysocki" Cc: Andrew Morton Signed-off-by: Ingo Molnar --- kernel/sched.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index b02e4fc25645..52b98675acb2 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5813,13 +5813,6 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) /* Must be high prio: stop_machine expects to yield to it. */ rq = task_rq_lock(p, &flags); __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1); - - /* Update our root-domain */ - if (rq->rd) { - BUG_ON(!cpu_isset(cpu, rq->rd->span)); - cpu_set(cpu, rq->rd->online); - } - task_rq_unlock(rq, &flags); cpu_rq(cpu)->migration_thread = p; break; @@ -5828,6 +5821,15 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) case CPU_ONLINE_FROZEN: /* Strictly unnecessary, as first user will wake it. */ wake_up_process(cpu_rq(cpu)->migration_thread); + + /* Update our root-domain */ + rq = cpu_rq(cpu); + spin_lock_irqsave(&rq->lock, flags); + if (rq->rd) { + BUG_ON(!cpu_isset(cpu, rq->rd->span)); + cpu_set(cpu, rq->rd->online); + } + spin_unlock_irqrestore(&rq->lock, flags); break; #ifdef CONFIG_HOTPLUG_CPU @@ -6103,6 +6105,8 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd) rq->rd = rd; cpu_set(rq->cpu, rd->span); + if (cpu_isset(rq->cpu, cpu_online_map)) + cpu_set(rq->cpu, rd->online); for (class = sched_class_highest; class; class = class->next) { if (class->join_domain) -- cgit v1.2.3 From 08f503b0c089968b2542659a89dfd50c5c59bb0b Mon Sep 17 00:00:00 2001 From: Gregory Haskins Date: Mon, 10 Mar 2008 17:59:11 -0400 Subject: keep rd->online and cpu_online_map in sync It is possible to allow the root-domain cache of online cpus to become out of sync with the global cpu_online_map. This is because we currently trigger removal of cpus too early in the notifier chain. 
Other DOWN_PREPARE handlers may in fact run and reconfigure the root-domain topology, thereby stomping on our own offline handling. The end result is that rd->online may become out of sync with cpu_online_map, which results in potential task misrouting. So change the offline handling to be more tightly coupled with the global offline process by triggering on CPU_DYING intead of CPU_DOWN_PREPARE. Signed-off-by: Gregory Haskins Cc: Gautham R Shenoy Cc: "Siddha, Suresh B" Cc: "Rafael J. Wysocki" Cc: Andrew Morton Signed-off-by: Ingo Molnar --- kernel/sched.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 52b98675acb2..1cb53fb1fe3d 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5881,7 +5881,8 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) spin_unlock_irq(&rq->lock); break; - case CPU_DOWN_PREPARE: + case CPU_DYING: + case CPU_DYING_FROZEN: /* Update our root-domain */ rq = cpu_rq(cpu); spin_lock_irqsave(&rq->lock, flags); -- cgit v1.2.3 From a82f7119fd940c1505fc9fdf93d835fa52bc075d Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 12 Mar 2008 00:34:57 +0100 Subject: Hibernation: Fix mark_nosave_pages() There is a problem in the hibernation code that triggers on some NUMA systems on which pfn_valid() returns 'true' for some PFNs that don't belong to any zone. Namely, there is a BUG_ON() in memory_bm_find_bit() that triggers for PFNs not belonging to any zone and passing the pfn_valid() test. On the affected systems it triggers when we mark PFNs reported by the platform as not saveable, because the PFNs in question belong to a region mapped directly using iorepam() (i.e. the ACPI data area) and they pass the pfn_valid() test. Modify memory_bm_find_bit() so that it returns an error if given PFN doesn't belong to any zone instead of crashing the kernel and ignore the result returned by it in mark_nosave_pages(), while marking the "nosave" memory regions. This doesn't affect the hibernation functionality, as we won't touch the PFNs in question anyway. http://bugzilla.kernel.org/show_bug.cgi?id=9966 . Signed-off-by: Rafael J. Wysocki Signed-off-by: Len Brown --- kernel/power/snapshot.c | 41 ++++++++++++++++++++++++++++++++++------- 1 file changed, 34 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 72a020cabb4c..5f91a07c4eac 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -447,7 +447,7 @@ static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free) * of @bm->cur_zone_bm are updated. 
*/ -static void memory_bm_find_bit(struct memory_bitmap *bm, unsigned long pfn, +static int memory_bm_find_bit(struct memory_bitmap *bm, unsigned long pfn, void **addr, unsigned int *bit_nr) { struct zone_bitmap *zone_bm; @@ -461,7 +461,8 @@ static void memory_bm_find_bit(struct memory_bitmap *bm, unsigned long pfn, while (pfn < zone_bm->start_pfn || pfn >= zone_bm->end_pfn) { zone_bm = zone_bm->next; - BUG_ON(!zone_bm); + if (!zone_bm) + return -EFAULT; } bm->cur.zone_bm = zone_bm; } @@ -479,23 +480,40 @@ static void memory_bm_find_bit(struct memory_bitmap *bm, unsigned long pfn, pfn -= bb->start_pfn; *bit_nr = pfn % BM_BITS_PER_CHUNK; *addr = bb->data + pfn / BM_BITS_PER_CHUNK; + return 0; } static void memory_bm_set_bit(struct memory_bitmap *bm, unsigned long pfn) { void *addr; unsigned int bit; + int error; - memory_bm_find_bit(bm, pfn, &addr, &bit); + error = memory_bm_find_bit(bm, pfn, &addr, &bit); + BUG_ON(error); set_bit(bit, addr); } +static int mem_bm_set_bit_check(struct memory_bitmap *bm, unsigned long pfn) +{ + void *addr; + unsigned int bit; + int error; + + error = memory_bm_find_bit(bm, pfn, &addr, &bit); + if (!error) + set_bit(bit, addr); + return error; +} + static void memory_bm_clear_bit(struct memory_bitmap *bm, unsigned long pfn) { void *addr; unsigned int bit; + int error; - memory_bm_find_bit(bm, pfn, &addr, &bit); + error = memory_bm_find_bit(bm, pfn, &addr, &bit); + BUG_ON(error); clear_bit(bit, addr); } @@ -503,8 +521,10 @@ static int memory_bm_test_bit(struct memory_bitmap *bm, unsigned long pfn) { void *addr; unsigned int bit; + int error; - memory_bm_find_bit(bm, pfn, &addr, &bit); + error = memory_bm_find_bit(bm, pfn, &addr, &bit); + BUG_ON(error); return test_bit(bit, addr); } @@ -709,8 +729,15 @@ static void mark_nosave_pages(struct memory_bitmap *bm) region->end_pfn << PAGE_SHIFT); for (pfn = region->start_pfn; pfn < region->end_pfn; pfn++) - if (pfn_valid(pfn)) - memory_bm_set_bit(bm, pfn); + if (pfn_valid(pfn)) { + /* + * It is safe to ignore the result of + * mem_bm_set_bit_check() here, since we won't + * touch the PFNs for which the error is + * returned anyway. + */ + mem_bm_set_bit_check(bm, pfn); + } } } -- cgit v1.2.3 From 53471121a8aad3f0b590bfce7c95a1f5f52150f3 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Wed, 12 Mar 2008 18:10:51 -0400 Subject: documentation: Move power-related files to Documentation/power/ Move 00-INDEX entries to power/00-INDEX (and add entry for pm_qos_interface.txt). Update references to moved filenames. Fix some trailing whitespace. 
Signed-off-by: Randy Dunlap Signed-off-by: Len Brown --- Documentation/00-INDEX | 4 - Documentation/kernel-parameters.txt | 2 +- Documentation/pm.txt | 257 ----------------------------- Documentation/pm_qos_interface.txt | 59 ------- Documentation/power/00-INDEX | 6 + Documentation/power/pm.txt | 257 +++++++++++++++++++++++++++++ Documentation/power/pm_qos_interface.txt | 59 +++++++ Documentation/power/power_supply_class.txt | 169 +++++++++++++++++++ Documentation/power_supply_class.txt | 169 ------------------- arch/x86/Kconfig | 2 +- kernel/power/Kconfig | 2 +- 11 files changed, 494 insertions(+), 492 deletions(-) delete mode 100644 Documentation/pm.txt delete mode 100644 Documentation/pm_qos_interface.txt create mode 100644 Documentation/power/pm.txt create mode 100644 Documentation/power/pm_qos_interface.txt create mode 100644 Documentation/power/power_supply_class.txt delete mode 100644 Documentation/power_supply_class.txt (limited to 'kernel') diff --git a/Documentation/00-INDEX b/Documentation/00-INDEX index 1d51c0c57811..fc8e7c7d182f 100644 --- a/Documentation/00-INDEX +++ b/Documentation/00-INDEX @@ -299,12 +299,8 @@ pcmcia/ - info on the Linux PCMCIA driver. pi-futex.txt - documentation on lightweight PI-futexes. -pm.txt - - info on Linux power management support. pnp.txt - Linux Plug and Play documentation. -power_supply_class.txt - - Tells userspace about battery, UPS, AC or DC power supply properties power/ - directory with info on Linux PCI power management. powerpc/ diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 533e67febf81..49318b99e581 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -138,7 +138,7 @@ and is between 256 and 4096 characters. It is defined in the file strict -- Be less tolerant of platforms that are not strictly ACPI specification compliant. - See also Documentation/pm.txt, pci=noacpi + See also Documentation/power/pm.txt, pci=noacpi acpi_apic_instance= [ACPI, IOAPIC] Format: diff --git a/Documentation/pm.txt b/Documentation/pm.txt deleted file mode 100644 index da8589a0e07d..000000000000 --- a/Documentation/pm.txt +++ /dev/null @@ -1,257 +0,0 @@ - Linux Power Management Support - -This document briefly describes how to use power management with your -Linux system and how to add power management support to Linux drivers. - -APM or ACPI? ------------- -If you have a relatively recent x86 mobile, desktop, or server system, -odds are it supports either Advanced Power Management (APM) or -Advanced Configuration and Power Interface (ACPI). ACPI is the newer -of the two technologies and puts power management in the hands of the -operating system, allowing for more intelligent power management than -is possible with BIOS controlled APM. - -The best way to determine which, if either, your system supports is to -build a kernel with both ACPI and APM enabled (as of 2.3.x ACPI is -enabled by default). If a working ACPI implementation is found, the -ACPI driver will override and disable APM, otherwise the APM driver -will be used. - -No, sorry, you cannot have both ACPI and APM enabled and running at -once. Some people with broken ACPI or broken APM implementations -would like to use both to get a full set of working features, but you -simply cannot mix and match the two. Only one power management -interface can be in control of the machine at once. Think about it.. 
- -User-space Daemons ------------------- -Both APM and ACPI rely on user-space daemons, apmd and acpid -respectively, to be completely functional. Obtain both of these -daemons from your Linux distribution or from the Internet (see below) -and be sure that they are started sometime in the system boot process. -Go ahead and start both. If ACPI or APM is not available on your -system the associated daemon will exit gracefully. - - apmd: http://worldvisions.ca/~apenwarr/apmd/ - acpid: http://acpid.sf.net/ - -Driver Interface -- OBSOLETE, DO NOT USE! -----------------************************* - -Note: pm_register(), pm_access(), pm_dev_idle() and friends are -obsolete. Please do not use them. Instead you should properly hook -your driver into the driver model, and use its suspend()/resume() -callbacks to do this kind of stuff. - -If you are writing a new driver or maintaining an old driver, it -should include power management support. Without power management -support, a single driver may prevent a system with power management -capabilities from ever being able to suspend (safely). - -Overview: -1) Register each instance of a device with "pm_register" -2) Call "pm_access" before accessing the hardware. - (this will ensure that the hardware is awake and ready) -3) Your "pm_callback" is called before going into a - suspend state (ACPI D1-D3) or after resuming (ACPI D0) - from a suspend. -4) Call "pm_dev_idle" when the device is not being used - (optional but will improve device idle detection) -5) When unloaded, unregister the device with "pm_unregister" - -/* - * Description: Register a device with the power-management subsystem - * - * Parameters: - * type - device type (PCI device, system device, ...) - * id - instance number or unique identifier - * cback - request handler callback (suspend, resume, ...) - * - * Returns: Registered PM device or NULL on error - * - * Examples: - * dev = pm_register(PM_SYS_DEV, PM_SYS_VGA, vga_callback); - * - * struct pci_dev *pci_dev = pci_find_dev(...); - * dev = pm_register(PM_PCI_DEV, PM_PCI_ID(pci_dev), callback); - */ -struct pm_dev *pm_register(pm_dev_t type, unsigned long id, pm_callback cback); - -/* - * Description: Unregister a device with the power management subsystem - * - * Parameters: - * dev - PM device previously returned from pm_register - */ -void pm_unregister(struct pm_dev *dev); - -/* - * Description: Unregister all devices with a matching callback function - * - * Parameters: - * cback - previously registered request callback - * - * Notes: Provided for easier porting from old APM interface - */ -void pm_unregister_all(pm_callback cback); - -/* - * Power management request callback - * - * Parameters: - * dev - PM device previously returned from pm_register - * rqst - request type - * data - data, if any, associated with the request - * - * Returns: 0 if the request is successful - * EINVAL if the request is not supported - * EBUSY if the device is now busy and cannot handle the request - * ENOMEM if the device was unable to handle the request due to memory - * - * Details: The device request callback will be called before the - * device/system enters a suspend state (ACPI D1-D3) or - * or after the device/system resumes from suspend (ACPI D0). - * For PM_SUSPEND, the ACPI D-state being entered is passed - * as the "data" argument to the callback. The device - * driver should save (PM_SUSPEND) or restore (PM_RESUME) - * device context when the request callback is called. 
- * - * Once a driver returns 0 (success) from a suspend - * request, it should not process any further requests or - * access the device hardware until a call to "pm_access" is made. - */ -typedef int (*pm_callback)(struct pm_dev *dev, pm_request_t rqst, void *data); - -Driver Details --------------- -This is just a quick Q&A as a stopgap until a real driver writers' -power management guide is available. - -Q: When is a device suspended? - -Devices can be suspended based on direct user request (eg. laptop lid -closes), system power policy (eg. sleep after 30 minutes of console -inactivity), or device power policy (eg. power down device after 5 -minutes of inactivity) - -Q: Must a driver honor a suspend request? - -No, a driver can return -EBUSY from a suspend request and this -will stop the system from suspending. When a suspend request -fails, all suspended devices are resumed and the system continues -to run. Suspend can be retried at a later time. - -Q: Can the driver block suspend/resume requests? - -Yes, a driver can delay its return from a suspend or resume -request until the device is ready to handle requests. It -is advantageous to return as quickly as possible from a -request as suspend/resume are done serially. - -Q: What context is a suspend/resume initiated from? - -A suspend or resume is initiated from a kernel thread context. -It is safe to block, allocate memory, initiate requests -or anything else you can do within the kernel. - -Q: Will requests continue to arrive after a suspend? - -Possibly. It is the driver's responsibility to queue(*), -fail, or drop any requests that arrive after returning -success to a suspend request. It is important that the -driver not access its device until after it receives -a resume request as the device's bus may no longer -be active. - -(*) If a driver queues requests for processing after - resume be aware that the device, network, etc. - might be in a different state than at suspend time. - It's probably better to drop requests unless - the driver is a storage device. - -Q: Do I have to manage bus-specific power management registers - -No. It is the responsibility of the bus driver to manage -PCI, USB, etc. power management registers. The bus driver -or the power management subsystem will also enable any -wake-on functionality that the device has. - -Q: So, really, what do I need to do to support suspend/resume? - -You need to save any device context that would -be lost if the device was powered off and then restore -it at resume time. When ACPI is active, there are -three levels of device suspend states; D1, D2, and D3. -(The suspend state is passed as the "data" argument -to the device callback.) With D3, the device is powered -off and loses all context, D1 and D2 are shallower power -states and require less device context to be saved. To -play it safe, just save everything at suspend and restore -everything at resume. - -Q: Where do I store device context for suspend? - -Anywhere in memory, kmalloc a buffer or store it -in the device descriptor. You are guaranteed that the -contents of memory will be restored and accessible -before resume, even when the system suspends to disk. - -Q: What do I need to do for ACPI vs. APM vs. etc? - -Drivers need not be aware of the specific power management -technology that is active. They just need to be aware -of when the overlying power management system requests -that they suspend or resume. - -Q: What about device dependencies? 
- -When a driver registers a device, the power management -subsystem uses the information provided to build a -tree of device dependencies (eg. USB device X is on -USB controller Y which is on PCI bus Z) When power -management wants to suspend a device, it first sends -a suspend request to its driver, then the bus driver, -and so on up to the system bus. Device resumes -proceed in the opposite direction. - -Q: Who do I contact for additional information about - enabling power management for my specific driver/device? - -ACPI Development mailing list: linux-acpi@vger.kernel.org - -System Interface -- OBSOLETE, DO NOT USE! -----------------************************* -If you are providing new power management support to Linux (ie. -adding support for something like APM or ACPI), you should -communicate with drivers through the existing generic power -management interface. - -/* - * Send a request to all devices - * - * Parameters: - * rqst - request type - * data - data, if any, associated with the request - * - * Returns: 0 if the request is successful - * See "pm_callback" return for errors - * - * Details: Walk list of registered devices and call pm_send - * for each until complete or an error is encountered. - * If an error is encountered for a suspend request, - * return all devices to the state they were in before - * the suspend request. - */ -int pm_send_all(pm_request_t rqst, void *data); - -/* - * Find a matching device - * - * Parameters: - * type - device type (PCI device, system device, or 0 to match all devices) - * from - previous match or NULL to start from the beginning - * - * Returns: Matching device or NULL if none found - */ -struct pm_dev *pm_find(pm_dev_t type, struct pm_dev *from); diff --git a/Documentation/pm_qos_interface.txt b/Documentation/pm_qos_interface.txt deleted file mode 100644 index 49adb1a33514..000000000000 --- a/Documentation/pm_qos_interface.txt +++ /dev/null @@ -1,59 +0,0 @@ -PM quality of Service interface. - -This interface provides a kernel and user mode interface for registering -performance expectations by drivers, subsystems and user space applications on -one of the parameters. - -Currently we have {cpu_dma_latency, network_latency, network_throughput} as the -initial set of pm_qos parameters. - -The infrastructure exposes multiple misc device nodes one per implemented -parameter. The set of parameters implement is defined by pm_qos_power_init() -and pm_qos_params.h. This is done because having the available parameters -being runtime configurable or changeable from a driver was seen as too easy to -abuse. - -For each parameter a list of performance requirements is maintained along with -an aggregated target value. The aggregated target value is updated with -changes to the requirement list or elements of the list. Typically the -aggregated target value is simply the max or min of the requirement values held -in the parameter list elements. - -From kernel mode the use of this interface is simple: -pm_qos_add_requirement(param_id, name, target_value): -Will insert a named element in the list for that identified PM_QOS parameter -with the target value. Upon change to this list the new target is recomputed -and any registered notifiers are called only if the target value is now -different. - -pm_qos_update_requirement(param_id, name, new_target_value): -Will search the list identified by the param_id for the named list element and -then update its target value, calling the notification tree if the aggregated -target is changed. 
with that name is already registered. - -pm_qos_remove_requirement(param_id, name): -Will search the identified list for the named element and remove it, after -removal it will update the aggregate target and call the notification tree if -the target was changed as a result of removing the named requirement. - - -From user mode: -Only processes can register a pm_qos requirement. To provide for automatic -cleanup for process the interface requires the process to register its -parameter requirements in the following way: - -To register the default pm_qos target for the specific parameter, the process -must open one of /dev/[cpu_dma_latency, network_latency, network_throughput] - -As long as the device node is held open that process has a registered -requirement on the parameter. The name of the requirement is "process_" -derived from the current->pid from within the open system call. - -To change the requested target value the process needs to write a s32 value to -the open device node. This translates to a pm_qos_update_requirement call. - -To remove the user mode request for a target value simply close the device -node. - - - diff --git a/Documentation/power/00-INDEX b/Documentation/power/00-INDEX index 8db4e41a052d..a55d7f1c836d 100644 --- a/Documentation/power/00-INDEX +++ b/Documentation/power/00-INDEX @@ -14,6 +14,12 @@ notifiers.txt - Registering suspend notifiers in device drivers pci.txt - How the PCI Subsystem Does Power Management +pm.txt + - info on Linux power management support. +pm_qos_interface.txt + - info on Linux PM Quality of Service interface +power_supply_class.txt + - Tells userspace about battery, UPS, AC or DC power supply properties s2ram.txt - How to get suspend to ram working (and debug it when it isn't) states.txt diff --git a/Documentation/power/pm.txt b/Documentation/power/pm.txt new file mode 100644 index 000000000000..be841507e43f --- /dev/null +++ b/Documentation/power/pm.txt @@ -0,0 +1,257 @@ + Linux Power Management Support + +This document briefly describes how to use power management with your +Linux system and how to add power management support to Linux drivers. + +APM or ACPI? +------------ +If you have a relatively recent x86 mobile, desktop, or server system, +odds are it supports either Advanced Power Management (APM) or +Advanced Configuration and Power Interface (ACPI). ACPI is the newer +of the two technologies and puts power management in the hands of the +operating system, allowing for more intelligent power management than +is possible with BIOS controlled APM. + +The best way to determine which, if either, your system supports is to +build a kernel with both ACPI and APM enabled (as of 2.3.x ACPI is +enabled by default). If a working ACPI implementation is found, the +ACPI driver will override and disable APM, otherwise the APM driver +will be used. + +No, sorry, you cannot have both ACPI and APM enabled and running at +once. Some people with broken ACPI or broken APM implementations +would like to use both to get a full set of working features, but you +simply cannot mix and match the two. Only one power management +interface can be in control of the machine at once. Think about it.. + +User-space Daemons +------------------ +Both APM and ACPI rely on user-space daemons, apmd and acpid +respectively, to be completely functional. Obtain both of these +daemons from your Linux distribution or from the Internet (see below) +and be sure that they are started sometime in the system boot process. +Go ahead and start both. 
If ACPI or APM is not available on your +system the associated daemon will exit gracefully. + + apmd: http://worldvisions.ca/~apenwarr/apmd/ + acpid: http://acpid.sf.net/ + +Driver Interface -- OBSOLETE, DO NOT USE! +----------------************************* + +Note: pm_register(), pm_access(), pm_dev_idle() and friends are +obsolete. Please do not use them. Instead you should properly hook +your driver into the driver model, and use its suspend()/resume() +callbacks to do this kind of stuff. + +If you are writing a new driver or maintaining an old driver, it +should include power management support. Without power management +support, a single driver may prevent a system with power management +capabilities from ever being able to suspend (safely). + +Overview: +1) Register each instance of a device with "pm_register" +2) Call "pm_access" before accessing the hardware. + (this will ensure that the hardware is awake and ready) +3) Your "pm_callback" is called before going into a + suspend state (ACPI D1-D3) or after resuming (ACPI D0) + from a suspend. +4) Call "pm_dev_idle" when the device is not being used + (optional but will improve device idle detection) +5) When unloaded, unregister the device with "pm_unregister" + +/* + * Description: Register a device with the power-management subsystem + * + * Parameters: + * type - device type (PCI device, system device, ...) + * id - instance number or unique identifier + * cback - request handler callback (suspend, resume, ...) + * + * Returns: Registered PM device or NULL on error + * + * Examples: + * dev = pm_register(PM_SYS_DEV, PM_SYS_VGA, vga_callback); + * + * struct pci_dev *pci_dev = pci_find_dev(...); + * dev = pm_register(PM_PCI_DEV, PM_PCI_ID(pci_dev), callback); + */ +struct pm_dev *pm_register(pm_dev_t type, unsigned long id, pm_callback cback); + +/* + * Description: Unregister a device with the power management subsystem + * + * Parameters: + * dev - PM device previously returned from pm_register + */ +void pm_unregister(struct pm_dev *dev); + +/* + * Description: Unregister all devices with a matching callback function + * + * Parameters: + * cback - previously registered request callback + * + * Notes: Provided for easier porting from old APM interface + */ +void pm_unregister_all(pm_callback cback); + +/* + * Power management request callback + * + * Parameters: + * dev - PM device previously returned from pm_register + * rqst - request type + * data - data, if any, associated with the request + * + * Returns: 0 if the request is successful + * EINVAL if the request is not supported + * EBUSY if the device is now busy and cannot handle the request + * ENOMEM if the device was unable to handle the request due to memory + * + * Details: The device request callback will be called before the + * device/system enters a suspend state (ACPI D1-D3) or + * or after the device/system resumes from suspend (ACPI D0). + * For PM_SUSPEND, the ACPI D-state being entered is passed + * as the "data" argument to the callback. The device + * driver should save (PM_SUSPEND) or restore (PM_RESUME) + * device context when the request callback is called. + * + * Once a driver returns 0 (success) from a suspend + * request, it should not process any further requests or + * access the device hardware until a call to "pm_access" is made. 
+ */ +typedef int (*pm_callback)(struct pm_dev *dev, pm_request_t rqst, void *data); + +Driver Details +-------------- +This is just a quick Q&A as a stopgap until a real driver writers' +power management guide is available. + +Q: When is a device suspended? + +Devices can be suspended based on direct user request (eg. laptop lid +closes), system power policy (eg. sleep after 30 minutes of console +inactivity), or device power policy (eg. power down device after 5 +minutes of inactivity) + +Q: Must a driver honor a suspend request? + +No, a driver can return -EBUSY from a suspend request and this +will stop the system from suspending. When a suspend request +fails, all suspended devices are resumed and the system continues +to run. Suspend can be retried at a later time. + +Q: Can the driver block suspend/resume requests? + +Yes, a driver can delay its return from a suspend or resume +request until the device is ready to handle requests. It +is advantageous to return as quickly as possible from a +request as suspend/resume are done serially. + +Q: What context is a suspend/resume initiated from? + +A suspend or resume is initiated from a kernel thread context. +It is safe to block, allocate memory, initiate requests +or anything else you can do within the kernel. + +Q: Will requests continue to arrive after a suspend? + +Possibly. It is the driver's responsibility to queue(*), +fail, or drop any requests that arrive after returning +success to a suspend request. It is important that the +driver not access its device until after it receives +a resume request as the device's bus may no longer +be active. + +(*) If a driver queues requests for processing after + resume be aware that the device, network, etc. + might be in a different state than at suspend time. + It's probably better to drop requests unless + the driver is a storage device. + +Q: Do I have to manage bus-specific power management registers + +No. It is the responsibility of the bus driver to manage +PCI, USB, etc. power management registers. The bus driver +or the power management subsystem will also enable any +wake-on functionality that the device has. + +Q: So, really, what do I need to do to support suspend/resume? + +You need to save any device context that would +be lost if the device was powered off and then restore +it at resume time. When ACPI is active, there are +three levels of device suspend states; D1, D2, and D3. +(The suspend state is passed as the "data" argument +to the device callback.) With D3, the device is powered +off and loses all context, D1 and D2 are shallower power +states and require less device context to be saved. To +play it safe, just save everything at suspend and restore +everything at resume. + +Q: Where do I store device context for suspend? + +Anywhere in memory, kmalloc a buffer or store it +in the device descriptor. You are guaranteed that the +contents of memory will be restored and accessible +before resume, even when the system suspends to disk. + +Q: What do I need to do for ACPI vs. APM vs. etc? + +Drivers need not be aware of the specific power management +technology that is active. They just need to be aware +of when the overlying power management system requests +that they suspend or resume. + +Q: What about device dependencies? + +When a driver registers a device, the power management +subsystem uses the information provided to build a +tree of device dependencies (eg. 
USB device X is on +USB controller Y which is on PCI bus Z) When power +management wants to suspend a device, it first sends +a suspend request to its driver, then the bus driver, +and so on up to the system bus. Device resumes +proceed in the opposite direction. + +Q: Who do I contact for additional information about + enabling power management for my specific driver/device? + +ACPI Development mailing list: linux-acpi@vger.kernel.org + +System Interface -- OBSOLETE, DO NOT USE! +----------------************************* +If you are providing new power management support to Linux (ie. +adding support for something like APM or ACPI), you should +communicate with drivers through the existing generic power +management interface. + +/* + * Send a request to all devices + * + * Parameters: + * rqst - request type + * data - data, if any, associated with the request + * + * Returns: 0 if the request is successful + * See "pm_callback" return for errors + * + * Details: Walk list of registered devices and call pm_send + * for each until complete or an error is encountered. + * If an error is encountered for a suspend request, + * return all devices to the state they were in before + * the suspend request. + */ +int pm_send_all(pm_request_t rqst, void *data); + +/* + * Find a matching device + * + * Parameters: + * type - device type (PCI device, system device, or 0 to match all devices) + * from - previous match or NULL to start from the beginning + * + * Returns: Matching device or NULL if none found + */ +struct pm_dev *pm_find(pm_dev_t type, struct pm_dev *from); diff --git a/Documentation/power/pm_qos_interface.txt b/Documentation/power/pm_qos_interface.txt new file mode 100644 index 000000000000..49adb1a33514 --- /dev/null +++ b/Documentation/power/pm_qos_interface.txt @@ -0,0 +1,59 @@ +PM quality of Service interface. + +This interface provides a kernel and user mode interface for registering +performance expectations by drivers, subsystems and user space applications on +one of the parameters. + +Currently we have {cpu_dma_latency, network_latency, network_throughput} as the +initial set of pm_qos parameters. + +The infrastructure exposes multiple misc device nodes one per implemented +parameter. The set of parameters implement is defined by pm_qos_power_init() +and pm_qos_params.h. This is done because having the available parameters +being runtime configurable or changeable from a driver was seen as too easy to +abuse. + +For each parameter a list of performance requirements is maintained along with +an aggregated target value. The aggregated target value is updated with +changes to the requirement list or elements of the list. Typically the +aggregated target value is simply the max or min of the requirement values held +in the parameter list elements. + +From kernel mode the use of this interface is simple: +pm_qos_add_requirement(param_id, name, target_value): +Will insert a named element in the list for that identified PM_QOS parameter +with the target value. Upon change to this list the new target is recomputed +and any registered notifiers are called only if the target value is now +different. + +pm_qos_update_requirement(param_id, name, new_target_value): +Will search the list identified by the param_id for the named list element and +then update its target value, calling the notification tree if the aggregated +target is changed. with that name is already registered. 
+ +pm_qos_remove_requirement(param_id, name): +Will search the identified list for the named element and remove it, after +removal it will update the aggregate target and call the notification tree if +the target was changed as a result of removing the named requirement. + + +From user mode: +Only processes can register a pm_qos requirement. To provide for automatic +cleanup for process the interface requires the process to register its +parameter requirements in the following way: + +To register the default pm_qos target for the specific parameter, the process +must open one of /dev/[cpu_dma_latency, network_latency, network_throughput] + +As long as the device node is held open that process has a registered +requirement on the parameter. The name of the requirement is "process_" +derived from the current->pid from within the open system call. + +To change the requested target value the process needs to write a s32 value to +the open device node. This translates to a pm_qos_update_requirement call. + +To remove the user mode request for a target value simply close the device +node. + + + diff --git a/Documentation/power/power_supply_class.txt b/Documentation/power/power_supply_class.txt new file mode 100644 index 000000000000..a8686e5a6857 --- /dev/null +++ b/Documentation/power/power_supply_class.txt @@ -0,0 +1,169 @@ +Linux power supply class +======================== + +Synopsis +~~~~~~~~ +Power supply class used to represent battery, UPS, AC or DC power supply +properties to user-space. + +It defines core set of attributes, which should be applicable to (almost) +every power supply out there. Attributes are available via sysfs and uevent +interfaces. + +Each attribute has well defined meaning, up to unit of measure used. While +the attributes provided are believed to be universally applicable to any +power supply, specific monitoring hardware may not be able to provide them +all, so any of them may be skipped. + +Power supply class is extensible, and allows to define drivers own attributes. +The core attribute set is subject to the standard Linux evolution (i.e. +if it will be found that some attribute is applicable to many power supply +types or their drivers, it can be added to the core set). + +It also integrates with LED framework, for the purpose of providing +typically expected feedback of battery charging/fully charged status and +AC/USB power supply online status. (Note that specific details of the +indication (including whether to use it at all) are fully controllable by +user and/or specific machine defaults, per design principles of LED +framework). + + +Attributes/properties +~~~~~~~~~~~~~~~~~~~~~ +Power supply class has predefined set of attributes, this eliminates code +duplication across drivers. Power supply class insist on reusing its +predefined attributes *and* their units. + +So, userspace gets predictable set of attributes and their units for any +kind of power supply, and can process/present them to a user in consistent +manner. Results for different power supplies and machines are also directly +comparable. + +See drivers/power/ds2760_battery.c and drivers/power/pda_power.c for the +example how to declare and handle attributes. + + +Units +~~~~~ +Quoting include/linux/power_supply.h: + + All voltages, currents, charges, energies, time and temperatures in µV, + µA, µAh, µWh, seconds and tenths of degree Celsius unless otherwise + stated. It's driver's job to convert its raw values to units in which + this class operates. 
+ + +Attributes/properties detailed +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +~ ~ ~ ~ ~ ~ ~ Charge/Energy/Capacity - how to not confuse ~ ~ ~ ~ ~ ~ ~ +~ ~ +~ Because both "charge" (µAh) and "energy" (µWh) represents "capacity" ~ +~ of battery, this class distinguish these terms. Don't mix them! ~ +~ ~ +~ CHARGE_* attributes represents capacity in µAh only. ~ +~ ENERGY_* attributes represents capacity in µWh only. ~ +~ CAPACITY attribute represents capacity in *percents*, from 0 to 100. ~ +~ ~ +~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ + +Postfixes: +_AVG - *hardware* averaged value, use it if your hardware is really able to +report averaged values. +_NOW - momentary/instantaneous values. + +STATUS - this attribute represents operating status (charging, full, +discharging (i.e. powering a load), etc.). This corresponds to +BATTERY_STATUS_* values, as defined in battery.h. + +HEALTH - represents health of the battery, values corresponds to +POWER_SUPPLY_HEALTH_*, defined in battery.h. + +VOLTAGE_MAX_DESIGN, VOLTAGE_MIN_DESIGN - design values for maximal and +minimal power supply voltages. Maximal/minimal means values of voltages +when battery considered "full"/"empty" at normal conditions. Yes, there is +no direct relation between voltage and battery capacity, but some dumb +batteries use voltage for very approximated calculation of capacity. +Battery driver also can use this attribute just to inform userspace +about maximal and minimal voltage thresholds of a given battery. + +VOLTAGE_MAX, VOLTAGE_MIN - same as _DESIGN voltage values except that +these ones should be used if hardware could only guess (measure and +retain) the thresholds of a given power supply. + +CHARGE_FULL_DESIGN, CHARGE_EMPTY_DESIGN - design charge values, when +battery considered full/empty. + +ENERGY_FULL_DESIGN, ENERGY_EMPTY_DESIGN - same as above but for energy. + +CHARGE_FULL, CHARGE_EMPTY - These attributes means "last remembered value +of charge when battery became full/empty". It also could mean "value of +charge when battery considered full/empty at given conditions (temperature, +age)". I.e. these attributes represents real thresholds, not design values. + +ENERGY_FULL, ENERGY_EMPTY - same as above but for energy. + +CAPACITY - capacity in percents. + +TEMP - temperature of the power supply. +TEMP_AMBIENT - ambient temperature. + +TIME_TO_EMPTY - seconds left for battery to be considered empty (i.e. +while battery powers a load) +TIME_TO_FULL - seconds left for battery to be considered full (i.e. +while battery is charging) + + +Battery <-> external power supply interaction +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Often power supplies are acting as supplies and supplicants at the same +time. Batteries are good example. So, batteries usually care if they're +externally powered or not. + +For that case, power supply class implements notification mechanism for +batteries. + +External power supply (AC) lists supplicants (batteries) names in +"supplied_to" struct member, and each power_supply_changed() call +issued by external power supply will notify supplicants via +external_power_changed callback. + + +QA +~~ +Q: Where is POWER_SUPPLY_PROP_XYZ attribute? +A: If you cannot find attribute suitable for your driver needs, feel free + to add it and send patch along with your driver. + + The attributes available currently are the ones currently provided by the + drivers written. + + Good candidates to add in future: model/part#, cycle_time, manufacturer, + etc. 
+ + +Q: I have some very specific attribute (e.g. battery color), should I add + this attribute to standard ones? +A: Most likely, no. Such attribute can be placed in the driver itself, if + it is useful. Of course, if the attribute in question applicable to + large set of batteries, provided by many drivers, and/or comes from + some general battery specification/standard, it may be a candidate to + be added to the core attribute set. + + +Q: Suppose, my battery monitoring chip/firmware does not provides capacity + in percents, but provides charge_{now,full,empty}. Should I calculate + percentage capacity manually, inside the driver, and register CAPACITY + attribute? The same question about time_to_empty/time_to_full. +A: Most likely, no. This class is designed to export properties which are + directly measurable by the specific hardware available. + + Inferring not available properties using some heuristics or mathematical + model is not subject of work for a battery driver. Such functionality + should be factored out, and in fact, apm_power, the driver to serve + legacy APM API on top of power supply class, uses a simple heuristic of + approximating remaining battery capacity based on its charge, current, + voltage and so on. But full-fledged battery model is likely not subject + for kernel at all, as it would require floating point calculation to deal + with things like differential equations and Kalman filters. This is + better be handled by batteryd/libbattery, yet to be written. diff --git a/Documentation/power_supply_class.txt b/Documentation/power_supply_class.txt deleted file mode 100644 index a8686e5a6857..000000000000 --- a/Documentation/power_supply_class.txt +++ /dev/null @@ -1,169 +0,0 @@ -Linux power supply class -======================== - -Synopsis -~~~~~~~~ -Power supply class used to represent battery, UPS, AC or DC power supply -properties to user-space. - -It defines core set of attributes, which should be applicable to (almost) -every power supply out there. Attributes are available via sysfs and uevent -interfaces. - -Each attribute has well defined meaning, up to unit of measure used. While -the attributes provided are believed to be universally applicable to any -power supply, specific monitoring hardware may not be able to provide them -all, so any of them may be skipped. - -Power supply class is extensible, and allows to define drivers own attributes. -The core attribute set is subject to the standard Linux evolution (i.e. -if it will be found that some attribute is applicable to many power supply -types or their drivers, it can be added to the core set). - -It also integrates with LED framework, for the purpose of providing -typically expected feedback of battery charging/fully charged status and -AC/USB power supply online status. (Note that specific details of the -indication (including whether to use it at all) are fully controllable by -user and/or specific machine defaults, per design principles of LED -framework). - - -Attributes/properties -~~~~~~~~~~~~~~~~~~~~~ -Power supply class has predefined set of attributes, this eliminates code -duplication across drivers. Power supply class insist on reusing its -predefined attributes *and* their units. - -So, userspace gets predictable set of attributes and their units for any -kind of power supply, and can process/present them to a user in consistent -manner. Results for different power supplies and machines are also directly -comparable. 
- -See drivers/power/ds2760_battery.c and drivers/power/pda_power.c for the -example how to declare and handle attributes. - - -Units -~~~~~ -Quoting include/linux/power_supply.h: - - All voltages, currents, charges, energies, time and temperatures in µV, - µA, µAh, µWh, seconds and tenths of degree Celsius unless otherwise - stated. It's driver's job to convert its raw values to units in which - this class operates. - - -Attributes/properties detailed -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -~ ~ ~ ~ ~ ~ ~ Charge/Energy/Capacity - how to not confuse ~ ~ ~ ~ ~ ~ ~ -~ ~ -~ Because both "charge" (µAh) and "energy" (µWh) represents "capacity" ~ -~ of battery, this class distinguish these terms. Don't mix them! ~ -~ ~ -~ CHARGE_* attributes represents capacity in µAh only. ~ -~ ENERGY_* attributes represents capacity in µWh only. ~ -~ CAPACITY attribute represents capacity in *percents*, from 0 to 100. ~ -~ ~ -~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ - -Postfixes: -_AVG - *hardware* averaged value, use it if your hardware is really able to -report averaged values. -_NOW - momentary/instantaneous values. - -STATUS - this attribute represents operating status (charging, full, -discharging (i.e. powering a load), etc.). This corresponds to -BATTERY_STATUS_* values, as defined in battery.h. - -HEALTH - represents health of the battery, values corresponds to -POWER_SUPPLY_HEALTH_*, defined in battery.h. - -VOLTAGE_MAX_DESIGN, VOLTAGE_MIN_DESIGN - design values for maximal and -minimal power supply voltages. Maximal/minimal means values of voltages -when battery considered "full"/"empty" at normal conditions. Yes, there is -no direct relation between voltage and battery capacity, but some dumb -batteries use voltage for very approximated calculation of capacity. -Battery driver also can use this attribute just to inform userspace -about maximal and minimal voltage thresholds of a given battery. - -VOLTAGE_MAX, VOLTAGE_MIN - same as _DESIGN voltage values except that -these ones should be used if hardware could only guess (measure and -retain) the thresholds of a given power supply. - -CHARGE_FULL_DESIGN, CHARGE_EMPTY_DESIGN - design charge values, when -battery considered full/empty. - -ENERGY_FULL_DESIGN, ENERGY_EMPTY_DESIGN - same as above but for energy. - -CHARGE_FULL, CHARGE_EMPTY - These attributes means "last remembered value -of charge when battery became full/empty". It also could mean "value of -charge when battery considered full/empty at given conditions (temperature, -age)". I.e. these attributes represents real thresholds, not design values. - -ENERGY_FULL, ENERGY_EMPTY - same as above but for energy. - -CAPACITY - capacity in percents. - -TEMP - temperature of the power supply. -TEMP_AMBIENT - ambient temperature. - -TIME_TO_EMPTY - seconds left for battery to be considered empty (i.e. -while battery powers a load) -TIME_TO_FULL - seconds left for battery to be considered full (i.e. -while battery is charging) - - -Battery <-> external power supply interaction -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Often power supplies are acting as supplies and supplicants at the same -time. Batteries are good example. So, batteries usually care if they're -externally powered or not. - -For that case, power supply class implements notification mechanism for -batteries. 
- -External power supply (AC) lists supplicants (batteries) names in -"supplied_to" struct member, and each power_supply_changed() call -issued by external power supply will notify supplicants via -external_power_changed callback. - - -QA -~~ -Q: Where is POWER_SUPPLY_PROP_XYZ attribute? -A: If you cannot find attribute suitable for your driver needs, feel free - to add it and send patch along with your driver. - - The attributes available currently are the ones currently provided by the - drivers written. - - Good candidates to add in future: model/part#, cycle_time, manufacturer, - etc. - - -Q: I have some very specific attribute (e.g. battery color), should I add - this attribute to standard ones? -A: Most likely, no. Such attribute can be placed in the driver itself, if - it is useful. Of course, if the attribute in question applicable to - large set of batteries, provided by many drivers, and/or comes from - some general battery specification/standard, it may be a candidate to - be added to the core attribute set. - - -Q: Suppose, my battery monitoring chip/firmware does not provides capacity - in percents, but provides charge_{now,full,empty}. Should I calculate - percentage capacity manually, inside the driver, and register CAPACITY - attribute? The same question about time_to_empty/time_to_full. -A: Most likely, no. This class is designed to export properties which are - directly measurable by the specific hardware available. - - Inferring not available properties using some heuristics or mathematical - model is not subject of work for a battery driver. Such functionality - should be factored out, and in fact, apm_power, the driver to serve - legacy APM API on top of power supply class, uses a simple heuristic of - approximating remaining battery capacity based on its charge, current, - voltage and so on. But full-fledged battery model is likely not subject - for kernel at all, as it would require floating point calculation to deal - with things like differential equations and Kalman filters. This is - better be handled by batteryd/libbattery, yet to be written. diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 237fc128143d..6c70fed0f9a0 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1259,7 +1259,7 @@ menuconfig APM machines with more than one CPU. In order to use APM, you will need supporting software. For location - and more information, read and the + and more information, read and the Battery Powered Linux mini-HOWTO, available from . diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 79833170bb9c..6233f3b4ae66 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -190,7 +190,7 @@ config APM_EMULATION notification of APM "events" (e.g. battery status change). In order to use APM, you will need supporting software. For location - and more information, read and the + and more information, read and the Battery Powered Linux mini-HOWTO, available from . -- cgit v1.2.3 From 0e1f34833bd9170ccc93ab759e48e695917fa48f Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Mon, 10 Mar 2008 11:01:20 -0700 Subject: sched: fix race in schedule() Fix a hard-to-trigger crash seen in the -rt kernel that also affects the vanilla scheduler. There is a race condition between schedule() and some dequeue/enqueue functions: rt_mutex_setprio(), __setscheduler() and sched_move_task(). When scheduling to idle, idle_balance() is called to pull tasks from other busy processors. It might drop the rq lock. It means that those 3 functions encounter on_rq=0 and running=1.
The current task should be put when running. Here is a possible scenario: CPU0 CPU1 | schedule() | ->deactivate_task() | ->idle_balance() | -->load_balance_newidle() rt_mutex_setprio() | | --->double_lock_balance() *get lock *rel lock * on_rq=0, running=1 | * sched_class is changed | *rel lock *get lock : | : ->put_prev_task_rt() ->pick_next_task_fair() => panic The current process of CPU1 (P1) is scheduling. P1 is deactivated, and the scheduler looks for another process on another CPU's runqueue because CPU1 will be idle. idle_balance(), load_balance_newidle() and double_lock_balance() are called and double_lock_balance() could drop the rq lock. On the other hand, CPU0 is trying to boost the priority of P1. As a result of the boost, only P1's prio and sched_class are changed to RT. The sched entities of P1 and P1's group are never put. This leaves the cfs_rq invalid, because it has a curr but no leaf entity; pick_next_task_fair() is then called anyway and the kernel panics. Signed-off-by: Hiroshi Shimamoto Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 38 ++++++++++++++++---------------------- 1 file changed, 16 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 1cb53fb1fe3d..9df9ba73cb7a 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4268,11 +4268,10 @@ void rt_mutex_setprio(struct task_struct *p, int prio) oldprio = p->prio; on_rq = p->se.on_rq; running = task_current(rq, p); - if (on_rq) { + if (on_rq) dequeue_task(rq, p, 0); - if (running) - p->sched_class->put_prev_task(rq, p); - } + if (running) + p->sched_class->put_prev_task(rq, p); if (rt_prio(prio)) p->sched_class = &rt_sched_class; @@ -4281,10 +4280,9 @@ void rt_mutex_setprio(struct task_struct *p, int prio) p->prio = prio; + if (running) + p->sched_class->set_curr_task(rq); if (on_rq) { - if (running) - p->sched_class->set_curr_task(rq); - enqueue_task(rq, p, 0); check_class_changed(rq, p, prev_class, oldprio, running); @@ -4581,19 +4579,17 @@ recheck: update_rq_clock(rq); on_rq = p->se.on_rq; running = task_current(rq, p); - if (on_rq) { + if (on_rq) deactivate_task(rq, p, 0); - if (running) - p->sched_class->put_prev_task(rq, p); - } + if (running) + p->sched_class->put_prev_task(rq, p); oldprio = p->prio; __setscheduler(rq, p, policy, param->sched_priority); + if (running) + p->sched_class->set_curr_task(rq); if (on_rq) { - if (running) - p->sched_class->set_curr_task(rq); - activate_task(rq, p, 0); check_class_changed(rq, p, prev_class, oldprio, running); @@ -7618,11 +7614,10 @@ void sched_move_task(struct task_struct *tsk) running = task_current(rq, tsk); on_rq = tsk->se.on_rq; - if (on_rq) { + if (on_rq) dequeue_task(rq, tsk, 0); - if (unlikely(running)) - tsk->sched_class->put_prev_task(rq, tsk); - } + if (unlikely(running)) + tsk->sched_class->put_prev_task(rq, tsk); set_task_rq(tsk, task_cpu(tsk)); @@ -7631,11 +7626,10 @@ void sched_move_task(struct task_struct *tsk) tsk->sched_class->moved_group(tsk); #endif - if (on_rq) { - if (unlikely(running)) - tsk->sched_class->set_curr_task(rq); + if (unlikely(running)) + tsk->sched_class->set_curr_task(rq); + if (on_rq) enqueue_task(rq, tsk, 0); - } task_rq_unlock(rq, &flags); } -- cgit v1.2.3 From 3fe69747dab906cd6a8523230276a9820d6a514f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 14 Mar 2008 20:55:51 +0100 Subject: sched: min_vruntime fix Current min_vruntime tracking is incorrect and will cause serious problems when we don't run the leftmost task for some reason.
min_vruntime does two things: 1) it's used to determine a forward direction when the u64 vruntime wraps, 2) it's used to track the leftmost vruntime to position newly enqueued tasks from. The current logic advances min_vruntime whenever the current task's vruntime advances. Because the current task may pass the leftmost task still waiting, we're failing the second goal. This causes new tasks to be placed too far ahead and thus penalizes their runtime. Fix this by making min_vruntime the min_vruntime of the waiting tasks by tracking it in enqueue/dequeue, and comparing against current's vruntime to obtain the absolute minimum when placing new tasks. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 46 ++++++++++++++++++++++++++++------------------ 1 file changed, 28 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index e2a530515619..9d003c9d2a48 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -175,8 +175,15 @@ static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) * Maintain a cache of leftmost tree entries (it is frequently * used): */ - if (leftmost) + if (leftmost) { cfs_rq->rb_leftmost = &se->run_node; + /* + * maintain cfs_rq->min_vruntime to be a monotonic increasing + * value tracking the leftmost vruntime in the tree. + */ + cfs_rq->min_vruntime = + max_vruntime(cfs_rq->min_vruntime, se->vruntime); + } rb_link_node(&se->run_node, parent, link); rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline); @@ -184,8 +191,21 @@ static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) { - if (cfs_rq->rb_leftmost == &se->run_node) - cfs_rq->rb_leftmost = rb_next(&se->run_node); + if (cfs_rq->rb_leftmost == &se->run_node) { + struct rb_node *next_node; + struct sched_entity *next; + + next_node = rb_next(&se->run_node); + cfs_rq->rb_leftmost = next_node; + + if (next_node) { + next = rb_entry(next_node, + struct sched_entity, run_node); + cfs_rq->min_vruntime = + max_vruntime(cfs_rq->min_vruntime, + next->vruntime); + } + } rb_erase(&se->run_node, &cfs_rq->tasks_timeline); } @@ -303,7 +323,6 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr, unsigned long delta_exec) { unsigned long delta_exec_weighted; - u64 vruntime; schedstat_set(curr->exec_max, max((u64)delta_exec, curr->exec_max)); @@ -315,19 +334,6 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr, &curr->load); } curr->vruntime += delta_exec_weighted; - - /* - * maintain cfs_rq->min_vruntime to be a monotonic increasing - * value tracking the leftmost vruntime in the tree.
- */ - if (first_fair(cfs_rq)) { - vruntime = min_vruntime(curr->vruntime, - __pick_next_entity(cfs_rq)->vruntime); - } else - vruntime = curr->vruntime; - - cfs_rq->min_vruntime = - max_vruntime(cfs_rq->min_vruntime, vruntime); } static void update_curr(struct cfs_rq *cfs_rq) @@ -493,7 +499,11 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) { u64 vruntime; - vruntime = cfs_rq->min_vruntime; + if (first_fair(cfs_rq)) { + vruntime = min_vruntime(cfs_rq->min_vruntime, + __pick_next_entity(cfs_rq)->vruntime); + } else + vruntime = cfs_rq->min_vruntime; if (sched_feat(TREE_AVG)) { struct sched_entity *last = __pick_last_entity(cfs_rq); -- cgit v1.2.3 From e89996ae3f9e88d4fd75751a15c10b19d197e702 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 14 Mar 2008 23:48:28 +0100 Subject: sched: fix update_load_add()/sub() Clear the cached inverse value when updating load. This is needed for calc_delta_mine() to work correctly when using the rq load. Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra --- kernel/sched.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 9df9ba73cb7a..3a4ba3dc0f49 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1108,11 +1108,13 @@ calc_delta_fair(unsigned long delta_exec, struct load_weight *lw) static inline void update_load_add(struct load_weight *lw, unsigned long inc) { lw->weight += inc; + lw->inv_weight = 0; } static inline void update_load_sub(struct load_weight *lw, unsigned long dec) { lw->weight -= dec; + lw->inv_weight = 0; } /* -- cgit v1.2.3 From 27d117266097101dcf79c4576903cdcdd0eabffc Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 14 Mar 2008 22:20:01 +0100 Subject: sched: fix calc_delta_mine() lw->weight can be 0 for a short time during bootup. Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 3a4ba3dc0f49..6b06f23261c0 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1084,7 +1084,7 @@ calc_delta_mine(unsigned long delta_exec, unsigned long weight, u64 tmp; if (unlikely(!lw->inv_weight)) - lw->inv_weight = (WMULT_CONST - lw->weight/2) / lw->weight + 1; + lw->inv_weight = (WMULT_CONST-lw->weight/2) / (lw->weight+1); tmp = (u64)delta_exec * weight; /* -- cgit v1.2.3 From aa2ac25229cd4d0280f6174c42712744ad61b140 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 14 Mar 2008 21:12:12 +0100 Subject: sched: fix overload performance: buddy wakeups Currently we schedule to the leftmost task in the runqueue. When the runtimes are very short because of some server/client ping-pong, especially in over-saturated workloads, this will cycle through all tasks thrashing the cache. Reduce cache thrashing by keeping dependent tasks together, running newly woken tasks first. However, by not running the leftmost task first we could starve tasks because the wakee can gain unlimited runtime. Therefore we only run the wakee if it's within a small (wakeup_granularity) window of the leftmost task. This preserves fairness, but does alternate server/client task groups.
Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 +- kernel/sched_fair.c | 26 ++++++++++++++++++++++++++ 2 files changed, 27 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 6b06f23261c0..d1ad69b270ca 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -301,7 +301,7 @@ struct cfs_rq { /* 'curr' points to currently running entity on this cfs_rq. * It is set to NULL otherwise (i.e when none are currently running). */ - struct sched_entity *curr; + struct sched_entity *curr, *next; unsigned long nr_spread_over; diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 9d003c9d2a48..31c4a2988b64 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -207,6 +207,9 @@ static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) } } + if (cfs_rq->next == se) + cfs_rq->next = NULL; + rb_erase(&se->run_node, &cfs_rq->tasks_timeline); } @@ -626,12 +629,32 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) se->prev_sum_exec_runtime = se->sum_exec_runtime; } +static struct sched_entity * +pick_next(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + s64 diff, gran; + + if (!cfs_rq->next) + return se; + + diff = cfs_rq->next->vruntime - se->vruntime; + if (diff < 0) + return se; + + gran = calc_delta_fair(sysctl_sched_wakeup_granularity, &cfs_rq->load); + if (diff > gran) + return se; + + return cfs_rq->next; +} + static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq) { struct sched_entity *se = NULL; if (first_fair(cfs_rq)) { se = __pick_next_entity(cfs_rq); + se = pick_next(cfs_rq, se); set_next_entity(cfs_rq, se); } @@ -1070,6 +1093,9 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p) resched_task(curr); return; } + + cfs_rq_of(pse)->next = pse; + /* * Batch tasks do not preempt (their preemption is driven by * the tick): -- cgit v1.2.3 From e22ecef1d2658ba54ed7d3fdb5d60829fb434c23 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 14 Mar 2008 22:16:08 +0100 Subject: sched: fix fair sleepers Fair sleepers need to scale their latency target down by runqueue weight. Otherwise busy systems will gain ever larger sleep bonus. Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra --- kernel/sched_fair.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 31c4a2988b64..31aa1b9fa762 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -528,8 +528,10 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) if (!initial) { /* sleeps upto a single latency don't count. */ - if (sched_feat(NEW_FAIR_SLEEPERS)) - vruntime -= sysctl_sched_latency; + if (sched_feat(NEW_FAIR_SLEEPERS)) { + vruntime -= calc_delta_fair(sysctl_sched_latency, + &cfs_rq->load); + } /* ensure we never gain time by being placed backwards. */ vruntime = max_vruntime(se->vruntime, vruntime); -- cgit v1.2.3 From 6a6029b8cefe0ca7e82f27f3904dbedba3de4e06 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 14 Mar 2008 22:17:08 +0100 Subject: sched: simplify sched_slice() Use the existing calc_delta_mine() calculation for sched_slice(). This saves a divide and simplifies the code because we share it with the other /cfs_rq->load users. 
It also improves code size: text data bss dec hex filename 42659 2740 144 45543 b1e7 sched.o.before 42093 2740 144 44977 afb1 sched.o.after Signed-off-by: Ingo Molnar Signed-off-by: Peter Zijlstra --- kernel/sched_fair.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 31aa1b9fa762..f2cc59080efa 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -283,12 +283,8 @@ static u64 __sched_period(unsigned long nr_running) */ static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se) { - u64 slice = __sched_period(cfs_rq->nr_running); - - slice *= se->load.weight; - do_div(slice, cfs_rq->load.weight); - - return slice; + return calc_delta_mine(__sched_period(cfs_rq->nr_running), + se->load.weight, &cfs_rq->load); } /* -- cgit v1.2.3
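The last three scheduler patches above all touch the same piece of fixed-point arithmetic: calc_delta_mine() scales a runtime delta by weight/lw->weight through a cached inverse weight instead of a divide, update_load_add()/sub() must invalidate that cache whenever the weight changes, and the (lw->weight+1) divisor keeps a transient zero weight during bootup from dividing by zero. The stand-alone sketch below only illustrates that arithmetic; the WMULT constants, the 20 ms period, the example weights, and the omission of the kernel's 64-bit overflow handling are simplified assumptions of the sketch, not the kernel's exact definitions.

#include <stdio.h>
#include <stdint.h>

#define WMULT_CONST (1ULL << 32)  /* assumed fixed-point scale, not the kernel's exact value */
#define WMULT_SHIFT 32

struct load_weight {
	unsigned long weight;
	unsigned long inv_weight;  /* cached WMULT_CONST / weight; 0 means "stale" */
};

/*
 * Approximates delta_exec * weight / lw->weight with one multiply and a
 * shift.  The kernel's overflow handling is omitted to keep the sketch short.
 */
static uint64_t calc_delta_mine(uint64_t delta_exec, unsigned long weight,
				struct load_weight *lw)
{
	if (!lw->inv_weight)
		lw->inv_weight = (WMULT_CONST - lw->weight / 2) / (lw->weight + 1);

	return (delta_exec * weight * lw->inv_weight) >> WMULT_SHIFT;
}

static void update_load_add(struct load_weight *lw, unsigned long inc)
{
	lw->weight += inc;
	lw->inv_weight = 0;	/* invalidate the cache, as the update_load_add() fix does */
}

int main(void)
{
	struct load_weight rq_load = { 0, 0 };
	uint64_t period = 20000000ULL;	/* assumed 20 ms scheduling period, in ns */

	update_load_add(&rq_load, 1024);	/* a nice-0 task is enqueued */
	update_load_add(&rq_load, 335);		/* a lower-weight task is enqueued */

	/* slice of the nice-0 task: roughly period * 1024 / (1024 + 335) */
	printf("slice = %llu ns\n",
	       (unsigned long long)calc_delta_mine(period, 1024, &rq_load));
	return 0;
}

Built with a plain C compiler this prints a slice of roughly 15 ms, i.e. period * 1024 / (1024 + 335), the same result the removed do_div() code in sched_slice() computed with an explicit divide.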