Diffstat (limited to 'kernel')
-rw-r--r-- | kernel/Kconfig.preempt    |  15
-rw-r--r-- | kernel/exit.c             |   2
-rw-r--r-- | kernel/module.c           |  15
-rw-r--r-- | kernel/power/Kconfig      |   2
-rw-r--r-- | kernel/power/snapshot.c   |  41
-rw-r--r-- | kernel/relay.c            |   5
-rw-r--r-- | kernel/sched.c            |  58
-rw-r--r-- | kernel/sched_debug.c      |   1
-rw-r--r-- | kernel/sched_fair.c       | 277
-rw-r--r-- | kernel/time/ntp.c         |  23
-rw-r--r-- | kernel/time/tick-sched.c  |   2
-rw-r--r-- | kernel/time/timekeeping.c |   6
12 files changed, 297 insertions, 150 deletions
diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt
index 0669b70fa6a3..9fdba03dc1fc 100644
--- a/kernel/Kconfig.preempt
+++ b/kernel/Kconfig.preempt
@@ -52,8 +52,23 @@ config PREEMPT

 endchoice

+config PREEMPT_RCU
+	bool "Preemptible RCU"
+	depends on PREEMPT
+	default n
+	help
+	  This option reduces the latency of the kernel by making certain
+	  RCU sections preemptible. Normally RCU code is non-preemptible, if
+	  this option is selected then read-only RCU sections become
+	  preemptible. This helps latency, but may expose bugs due to
+	  now-naive assumptions about each RCU read-side critical section
+	  remaining on a given CPU through its execution.
+
+	  Say N if you are unsure.
+
 config RCU_TRACE
 	bool "Enable tracing for RCU - currently stats in debugfs"
+	depends on PREEMPT_RCU
 	select DEBUG_FS
 	default y
 	help
diff --git a/kernel/exit.c b/kernel/exit.c
index cd20bf07e9e3..53872bf993fa 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1378,7 +1378,7 @@ unlock_sig:
 	if (!retval && infop)
 		retval = put_user(0, &infop->si_errno);
 	if (!retval && infop)
-		retval = put_user(why, &infop->si_code);
+		retval = put_user((short)why, &infop->si_code);
 	if (!retval && infop)
 		retval = put_user(exit_code, &infop->si_status);
 	if (!retval && infop)
diff --git a/kernel/module.c b/kernel/module.c
index be4807fb90e4..5d437bffd8dc 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -2178,10 +2178,20 @@ sys_init_module(void __user *umod,
 		wake_up(&module_wq);
 		return ret;
 	}
+	if (ret > 0) {
+		printk(KERN_WARNING "%s: '%s'->init suspiciously returned %d, "
+				    "it should follow 0/-E convention\n"
+		       KERN_WARNING "%s: loading module anyway...\n",
+		       __func__, mod->name, ret,
+		       __func__);
+		dump_stack();
+	}

-	/* Now it's a first class citizen! */
-	mutex_lock(&module_mutex);
+	/* Now it's a first class citizen! Wake up anyone waiting for it. */
 	mod->state = MODULE_STATE_LIVE;
+	wake_up(&module_wq);
+
+	mutex_lock(&module_mutex);
 	/* Drop initial reference. */
 	module_put(mod);
 	unwind_remove_table(mod->unwind_info, 1);
@@ -2190,7 +2200,6 @@ sys_init_module(void __user *umod,
 	mod->init_size = 0;
 	mod->init_text_size = 0;
 	mutex_unlock(&module_mutex);
-	wake_up(&module_wq);
 	return 0;
 }

diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 79833170bb9c..6233f3b4ae66 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -190,7 +190,7 @@ config APM_EMULATION
 	  notification of APM "events" (e.g. battery status change).

 	  In order to use APM, you will need supporting software. For location
-	  and more information, read <file:Documentation/pm.txt> and the
+	  and more information, read <file:Documentation/power/pm.txt> and the
 	  Battery Powered Linux mini-HOWTO, available from
 	  <http://www.tldp.org/docs.html#howto>.

diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 72a020cabb4c..5f91a07c4eac 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -447,7 +447,7 @@ static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free)
  *	of @bm->cur_zone_bm are updated.
  */
-static void memory_bm_find_bit(struct memory_bitmap *bm, unsigned long pfn,
+static int memory_bm_find_bit(struct memory_bitmap *bm, unsigned long pfn,
 				void **addr, unsigned int *bit_nr)
 {
 	struct zone_bitmap *zone_bm;
@@ -461,7 +461,8 @@ static void memory_bm_find_bit(struct memory_bitmap *bm, unsigned long pfn,
 	while (pfn < zone_bm->start_pfn || pfn >= zone_bm->end_pfn) {
 		zone_bm = zone_bm->next;

-		BUG_ON(!zone_bm);
+		if (!zone_bm)
+			return -EFAULT;
 	}
 	bm->cur.zone_bm = zone_bm;
 }
@@ -479,23 +480,40 @@ static void memory_bm_find_bit(struct memory_bitmap *bm, unsigned long pfn,
 	pfn -= bb->start_pfn;
 	*bit_nr = pfn % BM_BITS_PER_CHUNK;
 	*addr = bb->data + pfn / BM_BITS_PER_CHUNK;
+	return 0;
 }

 static void memory_bm_set_bit(struct memory_bitmap *bm, unsigned long pfn)
 {
 	void *addr;
 	unsigned int bit;
+	int error;

-	memory_bm_find_bit(bm, pfn, &addr, &bit);
+	error = memory_bm_find_bit(bm, pfn, &addr, &bit);
+	BUG_ON(error);
 	set_bit(bit, addr);
 }

+static int mem_bm_set_bit_check(struct memory_bitmap *bm, unsigned long pfn)
+{
+	void *addr;
+	unsigned int bit;
+	int error;
+
+	error = memory_bm_find_bit(bm, pfn, &addr, &bit);
+	if (!error)
+		set_bit(bit, addr);
+	return error;
+}
+
 static void memory_bm_clear_bit(struct memory_bitmap *bm, unsigned long pfn)
 {
 	void *addr;
 	unsigned int bit;
+	int error;

-	memory_bm_find_bit(bm, pfn, &addr, &bit);
+	error = memory_bm_find_bit(bm, pfn, &addr, &bit);
+	BUG_ON(error);
 	clear_bit(bit, addr);
 }

@@ -503,8 +521,10 @@ static int memory_bm_test_bit(struct memory_bitmap *bm, unsigned long pfn)
 {
 	void *addr;
 	unsigned int bit;
+	int error;

-	memory_bm_find_bit(bm, pfn, &addr, &bit);
+	error = memory_bm_find_bit(bm, pfn, &addr, &bit);
+	BUG_ON(error);
 	return test_bit(bit, addr);
 }

@@ -709,8 +729,15 @@ static void mark_nosave_pages(struct memory_bitmap *bm)
 			region->end_pfn << PAGE_SHIFT);

 		for (pfn = region->start_pfn; pfn < region->end_pfn; pfn++)
-			if (pfn_valid(pfn))
-				memory_bm_set_bit(bm, pfn);
+			if (pfn_valid(pfn)) {
+				/*
+				 * It is safe to ignore the result of
+				 * mem_bm_set_bit_check() here, since we won't
+				 * touch the PFNs for which the error is
+				 * returned anyway.
+				 */
+				mem_bm_set_bit_check(bm, pfn);
+			}
 	}
 }

diff --git a/kernel/relay.c b/kernel/relay.c
index d080b9d161a7..4c035a8a248c 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -1066,7 +1066,7 @@ static int subbuf_splice_actor(struct file *in,
 			       unsigned int flags,
 			       int *nonpad_ret)
 {
-	unsigned int pidx, poff, total_len, subbuf_pages, ret;
+	unsigned int pidx, poff, total_len, subbuf_pages, nr_pages, ret;
 	struct rchan_buf *rbuf = in->private_data;
 	unsigned int subbuf_size = rbuf->chan->subbuf_size;
 	uint64_t pos = (uint64_t) *ppos;
@@ -1097,8 +1097,9 @@ static int subbuf_splice_actor(struct file *in,
 	subbuf_pages = rbuf->chan->alloc_size >> PAGE_SHIFT;
 	pidx = (read_start / PAGE_SIZE) % subbuf_pages;
 	poff = read_start & ~PAGE_MASK;
+	nr_pages = min_t(unsigned int, subbuf_pages, PIPE_BUFFERS);

-	for (total_len = 0; spd.nr_pages < subbuf_pages; spd.nr_pages++) {
+	for (total_len = 0; spd.nr_pages < nr_pages; spd.nr_pages++) {
 		unsigned int this_len, this_end, private;
 		unsigned int cur_pos = read_start + total_len;

diff --git a/kernel/sched.c b/kernel/sched.c
index 52b98675acb2..3f7c5eb254e2 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -301,7 +301,7 @@ struct cfs_rq {
 	/* 'curr' points to currently running entity on this cfs_rq.
 	 * It is set to NULL otherwise (i.e when none are currently running).
 	 */
-	struct sched_entity *curr;
+	struct sched_entity *curr, *next;

 	unsigned long nr_spread_over;

@@ -1084,7 +1084,7 @@ calc_delta_mine(unsigned long delta_exec, unsigned long weight,
 	u64 tmp;

 	if (unlikely(!lw->inv_weight))
-		lw->inv_weight = (WMULT_CONST - lw->weight/2) / lw->weight + 1;
+		lw->inv_weight = (WMULT_CONST-lw->weight/2) / (lw->weight+1);

 	tmp = (u64)delta_exec * weight;
 	/*
@@ -1108,11 +1108,13 @@ calc_delta_fair(unsigned long delta_exec, struct load_weight *lw)
 static inline void update_load_add(struct load_weight *lw, unsigned long inc)
 {
 	lw->weight += inc;
+	lw->inv_weight = 0;
 }

 static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
 {
 	lw->weight -= dec;
+	lw->inv_weight = 0;
 }

 /*
@@ -1394,6 +1396,12 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
 {
 	s64 delta;

+	/*
+	 * Buddy candidates are cache hot:
+	 */
+	if (&p->se == cfs_rq_of(&p->se)->next)
+		return 1;
+
 	if (p->sched_class != &fair_sched_class)
 		return 0;

@@ -1853,10 +1861,11 @@ out_activate:
 		schedstat_inc(p, se.nr_wakeups_remote);
 	update_rq_clock(rq);
 	activate_task(rq, p, 1);
-	check_preempt_curr(rq, p);
 	success = 1;

out_running:
+	check_preempt_curr(rq, p);
+
 	p->state = TASK_RUNNING;
 #ifdef CONFIG_SMP
 	if (p->sched_class->task_wake_up)
@@ -1890,6 +1899,8 @@ static void __sched_fork(struct task_struct *p)
 	p->se.exec_start = 0;
 	p->se.sum_exec_runtime = 0;
 	p->se.prev_sum_exec_runtime = 0;
+	p->se.last_wakeup = 0;
+	p->se.avg_overlap = 0;

 #ifdef CONFIG_SCHEDSTATS
 	p->se.wait_start = 0;
@@ -4268,11 +4279,10 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
 	oldprio = p->prio;
 	on_rq = p->se.on_rq;
 	running = task_current(rq, p);
-	if (on_rq) {
+	if (on_rq)
 		dequeue_task(rq, p, 0);
-		if (running)
-			p->sched_class->put_prev_task(rq, p);
-	}
+	if (running)
+		p->sched_class->put_prev_task(rq, p);

 	if (rt_prio(prio))
 		p->sched_class = &rt_sched_class;
@@ -4281,10 +4291,9 @@ void rt_mutex_setprio(struct task_struct *p, int prio)

 	p->prio = prio;

+	if (running)
+		p->sched_class->set_curr_task(rq);
 	if (on_rq) {
-		if (running)
-			p->sched_class->set_curr_task(rq);
-
 		enqueue_task(rq, p, 0);

 		check_class_changed(rq, p, prev_class, oldprio, running);
@@ -4581,19 +4590,17 @@ recheck:
 	update_rq_clock(rq);
 	on_rq = p->se.on_rq;
 	running = task_current(rq, p);
-	if (on_rq) {
+	if (on_rq)
 		deactivate_task(rq, p, 0);
-		if (running)
-			p->sched_class->put_prev_task(rq, p);
-	}
+	if (running)
+		p->sched_class->put_prev_task(rq, p);

 	oldprio = p->prio;
 	__setscheduler(rq, p, policy, param->sched_priority);

+	if (running)
+		p->sched_class->set_curr_task(rq);
 	if (on_rq) {
-		if (running)
-			p->sched_class->set_curr_task(rq);
-
 		activate_task(rq, p, 0);

 		check_class_changed(rq, p, prev_class, oldprio, running);
@@ -5881,7 +5888,8 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
 		spin_unlock_irq(&rq->lock);
 		break;

-	case CPU_DOWN_PREPARE:
+	case CPU_DYING:
+	case CPU_DYING_FROZEN:
 		/* Update our root-domain */
 		rq = cpu_rq(cpu);
 		spin_lock_irqsave(&rq->lock, flags);
@@ -7617,11 +7625,10 @@ void sched_move_task(struct task_struct *tsk)
 	running = task_current(rq, tsk);
 	on_rq = tsk->se.on_rq;

-	if (on_rq) {
+	if (on_rq)
 		dequeue_task(rq, tsk, 0);
-		if (unlikely(running))
-			tsk->sched_class->put_prev_task(rq, tsk);
-	}
+	if (unlikely(running))
+		tsk->sched_class->put_prev_task(rq, tsk);

 	set_task_rq(tsk, task_cpu(tsk));

@@ -7630,11 +7637,10 @@ void sched_move_task(struct task_struct *tsk)
 		tsk->sched_class->moved_group(tsk);
 #endif

-	if (on_rq) {
-		if (unlikely(running))
-			tsk->sched_class->set_curr_task(rq);
+	if (unlikely(running))
+		tsk->sched_class->set_curr_task(rq);
+	if (on_rq)
 		enqueue_task(rq, tsk, 0);
-	}

 	task_rq_unlock(rq, &flags);
 }
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index 4b5e24cf2f4a..ef358ba07683 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -288,6 +288,7 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
 	PN(se.exec_start);
 	PN(se.vruntime);
 	PN(se.sum_exec_runtime);
+	PN(se.avg_overlap);

 	nr_switches = p->nvcsw + p->nivcsw;

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index e2a530515619..b85cac4b5e25 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -73,13 +73,13 @@ unsigned int sysctl_sched_batch_wakeup_granularity = 10000000UL;

 /*
  * SCHED_OTHER wake-up granularity.
- * (default: 10 msec * (1 + ilog(ncpus)), units: nanoseconds)
+ * (default: 5 msec * (1 + ilog(ncpus)), units: nanoseconds)
  *
  * This option delays the preemption effects of decoupled workloads
  * and reduces their over-scheduling. Synchronous workloads will still
  * have immediate wakeup/sleep latencies.
  */
-unsigned int sysctl_sched_wakeup_granularity = 10000000UL;
+unsigned int sysctl_sched_wakeup_granularity = 5000000UL;

 const_debug unsigned int sysctl_sched_migration_cost = 500000UL;

@@ -175,8 +175,15 @@ static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
 	 * Maintain a cache of leftmost tree entries (it is frequently
 	 * used):
 	 */
-	if (leftmost)
+	if (leftmost) {
 		cfs_rq->rb_leftmost = &se->run_node;
+		/*
+		 * maintain cfs_rq->min_vruntime to be a monotonic increasing
+		 * value tracking the leftmost vruntime in the tree.
+		 */
+		cfs_rq->min_vruntime =
+			max_vruntime(cfs_rq->min_vruntime, se->vruntime);
+	}

 	rb_link_node(&se->run_node, parent, link);
 	rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline);
@@ -184,8 +191,24 @@ static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)

 static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
-	if (cfs_rq->rb_leftmost == &se->run_node)
-		cfs_rq->rb_leftmost = rb_next(&se->run_node);
+	if (cfs_rq->rb_leftmost == &se->run_node) {
+		struct rb_node *next_node;
+		struct sched_entity *next;
+
+		next_node = rb_next(&se->run_node);
+		cfs_rq->rb_leftmost = next_node;
+
+		if (next_node) {
+			next = rb_entry(next_node,
+					struct sched_entity, run_node);
+			cfs_rq->min_vruntime =
+				max_vruntime(cfs_rq->min_vruntime,
+					     next->vruntime);
+		}
+	}
+
+	if (cfs_rq->next == se)
+		cfs_rq->next = NULL;

 	rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
 }
@@ -260,12 +283,8 @@ static u64 __sched_period(unsigned long nr_running)
  */
 static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
-	u64 slice = __sched_period(cfs_rq->nr_running);
-
-	slice *= se->load.weight;
-	do_div(slice, cfs_rq->load.weight);
-
-	return slice;
+	return calc_delta_mine(__sched_period(cfs_rq->nr_running),
+			       se->load.weight, &cfs_rq->load);
 }

 /*
@@ -303,7 +322,6 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
 	      unsigned long delta_exec)
 {
 	unsigned long delta_exec_weighted;
-	u64 vruntime;

 	schedstat_set(curr->exec_max, max((u64)delta_exec, curr->exec_max));

@@ -315,19 +333,6 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
 			&curr->load);
 	}
 	curr->vruntime += delta_exec_weighted;
-
-	/*
-	 * maintain cfs_rq->min_vruntime to be a monotonic increasing
-	 * value tracking the leftmost vruntime in the tree.
-	 */
-	if (first_fair(cfs_rq)) {
-		vruntime = min_vruntime(curr->vruntime,
-				__pick_next_entity(cfs_rq)->vruntime);
-	} else
-		vruntime = curr->vruntime;
-
-	cfs_rq->min_vruntime =
-		max_vruntime(cfs_rq->min_vruntime, vruntime);
 }

 static void update_curr(struct cfs_rq *cfs_rq)
@@ -493,7 +498,11 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
 {
 	u64 vruntime;

-	vruntime = cfs_rq->min_vruntime;
+	if (first_fair(cfs_rq)) {
+		vruntime = min_vruntime(cfs_rq->min_vruntime,
+				__pick_next_entity(cfs_rq)->vruntime);
+	} else
+		vruntime = cfs_rq->min_vruntime;

 	if (sched_feat(TREE_AVG)) {
 		struct sched_entity *last = __pick_last_entity(cfs_rq);
@@ -515,8 +524,10 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)

 	if (!initial) {
 		/* sleeps upto a single latency don't count. */
-		if (sched_feat(NEW_FAIR_SLEEPERS))
-			vruntime -= sysctl_sched_latency;
+		if (sched_feat(NEW_FAIR_SLEEPERS)) {
+			vruntime -= calc_delta_fair(sysctl_sched_latency,
+						    &cfs_rq->load);
+		}

 		/* ensure we never gain time by being placed backwards. */
 		vruntime = max_vruntime(se->vruntime, vruntime);
@@ -545,6 +556,21 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup)
 	account_entity_enqueue(cfs_rq, se);
 }

+static void update_avg(u64 *avg, u64 sample)
+{
+	s64 diff = sample - *avg;
+	*avg += diff >> 3;
+}
+
+static void update_avg_stats(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+	if (!se->last_wakeup)
+		return;
+
+	update_avg(&se->avg_overlap, se->sum_exec_runtime - se->last_wakeup);
+	se->last_wakeup = 0;
+}
+
 static void
 dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep)
 {
@@ -555,6 +581,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep)

 	update_stats_dequeue(cfs_rq, se);
 	if (sleep) {
+		update_avg_stats(cfs_rq, se);
 #ifdef CONFIG_SCHEDSTATS
 		if (entity_is_task(se)) {
 			struct task_struct *tsk = task_of(se);
@@ -616,12 +643,32 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
 	se->prev_sum_exec_runtime = se->sum_exec_runtime;
 }

+static struct sched_entity *
+pick_next(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+	s64 diff, gran;
+
+	if (!cfs_rq->next)
+		return se;
+
+	diff = cfs_rq->next->vruntime - se->vruntime;
+	if (diff < 0)
+		return se;
+
+	gran = calc_delta_fair(sysctl_sched_wakeup_granularity, &cfs_rq->load);
+	if (diff > gran)
+		return se;
+
+	return cfs_rq->next;
+}
+
 static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
 {
 	struct sched_entity *se = NULL;

 	if (first_fair(cfs_rq)) {
 		se = __pick_next_entity(cfs_rq);
+		se = pick_next(cfs_rq, se);
 		set_next_entity(cfs_rq, se);
 	}

@@ -949,96 +996,121 @@ static inline int wake_idle(int cpu, struct task_struct *p)
 #endif

 #ifdef CONFIG_SMP
-static int select_task_rq_fair(struct task_struct *p, int sync)
+
+static const struct sched_class fair_sched_class;
+
+static int
+wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq,
+	    struct task_struct *p, int prev_cpu, int this_cpu, int sync,
+	    int idx, unsigned long load, unsigned long this_load,
+	    unsigned int imbalance)
 {
-	int cpu, this_cpu;
-	struct rq *rq;
-	struct sched_domain *sd, *this_sd = NULL;
-	int new_cpu;
+	struct task_struct *curr = this_rq->curr;
+	unsigned long tl = this_load;
+	unsigned long tl_per_task;
+
+	if (!(this_sd->flags & SD_WAKE_AFFINE))
+		return 0;
+
+	/*
+	 * If the currently running task will sleep within
+	 * a reasonable amount of time then attract this newly
+	 * woken task:
+	 */
+	if (sync && curr->sched_class == &fair_sched_class) {
+		if (curr->se.avg_overlap < sysctl_sched_migration_cost &&
+		    p->se.avg_overlap < sysctl_sched_migration_cost)
+			return 1;
+	}
+
+	schedstat_inc(p, se.nr_wakeups_affine_attempts);
+	tl_per_task = cpu_avg_load_per_task(this_cpu);
+
+	/*
+	 * If sync wakeup then subtract the (maximum possible)
+	 * effect of the currently running task from the load
+	 * of the current CPU:
+	 */
+	if (sync)
+		tl -= current->se.load.weight;
+
+	if ((tl <= load && tl + target_load(prev_cpu, idx) <= tl_per_task) ||
+			100*(tl + p->se.load.weight) <= imbalance*load) {
+		/*
+		 * This domain has SD_WAKE_AFFINE and
+		 * p is cache cold in this domain, and
+		 * there is no bad imbalance.
+		 */
+		schedstat_inc(this_sd, ttwu_move_affine);
+		schedstat_inc(p, se.nr_wakeups_affine);

-	cpu = task_cpu(p);
-	rq = task_rq(p);
-	this_cpu = smp_processor_id();
-	new_cpu = cpu;
+		return 1;
+	}
+	return 0;
+}

-	if (cpu == this_cpu)
-		goto out_set_cpu;
+static int select_task_rq_fair(struct task_struct *p, int sync)
+{
+	struct sched_domain *sd, *this_sd = NULL;
+	int prev_cpu, this_cpu, new_cpu;
+	unsigned long load, this_load;
+	struct rq *rq, *this_rq;
+	unsigned int imbalance;
+	int idx;
+
+	prev_cpu = task_cpu(p);
+	rq = task_rq(p);
+	this_cpu = smp_processor_id();
+	this_rq = cpu_rq(this_cpu);
+	new_cpu = prev_cpu;

+	/*
+	 * 'this_sd' is the first domain that both
+	 * this_cpu and prev_cpu are present in:
+	 */
 	for_each_domain(this_cpu, sd) {
-		if (cpu_isset(cpu, sd->span)) {
+		if (cpu_isset(prev_cpu, sd->span)) {
 			this_sd = sd;
 			break;
 		}
 	}

 	if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed)))
-		goto out_set_cpu;
+		goto out;

 	/*
 	 * Check for affine wakeup and passive balancing possibilities.
 	 */
-	if (this_sd) {
-		int idx = this_sd->wake_idx;
-		unsigned int imbalance;
-		unsigned long load, this_load;
-
-		imbalance = 100 + (this_sd->imbalance_pct - 100) / 2;
-
-		load = source_load(cpu, idx);
-		this_load = target_load(this_cpu, idx);
-
-		new_cpu = this_cpu; /* Wake to this CPU if we can */
-
-		if (this_sd->flags & SD_WAKE_AFFINE) {
-			unsigned long tl = this_load;
-			unsigned long tl_per_task;
-
-			/*
-			 * Attract cache-cold tasks on sync wakeups:
-			 */
-			if (sync && !task_hot(p, rq->clock, this_sd))
-				goto out_set_cpu;
-
-			schedstat_inc(p, se.nr_wakeups_affine_attempts);
-			tl_per_task = cpu_avg_load_per_task(this_cpu);
-
-			/*
-			 * If sync wakeup then subtract the (maximum possible)
-			 * effect of the currently running task from the load
-			 * of the current CPU:
-			 */
-			if (sync)
-				tl -= current->se.load.weight;
-
-			if ((tl <= load &&
-				tl + target_load(cpu, idx) <= tl_per_task) ||
-			       100*(tl + p->se.load.weight) <= imbalance*load) {
-				/*
-				 * This domain has SD_WAKE_AFFINE and
-				 * p is cache cold in this domain, and
-				 * there is no bad imbalance.
-				 */
-				schedstat_inc(this_sd, ttwu_move_affine);
-				schedstat_inc(p, se.nr_wakeups_affine);
-				goto out_set_cpu;
-			}
-		}
+	if (!this_sd)
+		goto out;

-		/*
-		 * Start passive balancing when half the imbalance_pct
-		 * limit is reached.
-		 */
-		if (this_sd->flags & SD_WAKE_BALANCE) {
-			if (imbalance*this_load <= 100*load) {
-				schedstat_inc(this_sd, ttwu_move_balance);
-				schedstat_inc(p, se.nr_wakeups_passive);
-				goto out_set_cpu;
-			}
+	idx = this_sd->wake_idx;
+
+	imbalance = 100 + (this_sd->imbalance_pct - 100) / 2;
+
+	load = source_load(prev_cpu, idx);
+	this_load = target_load(this_cpu, idx);
+
+	if (wake_affine(rq, this_sd, this_rq, p, prev_cpu, this_cpu, sync, idx,
+				load, this_load, imbalance))
+		return this_cpu;
+
+	if (prev_cpu == this_cpu)
+		goto out;
+
+	/*
+	 * Start passive balancing when half the imbalance_pct
+	 * limit is reached.
+	 */
+	if (this_sd->flags & SD_WAKE_BALANCE) {
+		if (imbalance*this_load <= 100*load) {
+			schedstat_inc(this_sd, ttwu_move_balance);
+			schedstat_inc(p, se.nr_wakeups_passive);
+			return this_cpu;
 		}
 	}

-	new_cpu = cpu; /* Could not wake to this_cpu. Wake to cpu instead */
-out_set_cpu:
+out:
 	return wake_idle(new_cpu, p);
 }
 #endif /* CONFIG_SMP */
@@ -1060,6 +1132,13 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p)
 			resched_task(curr);
 		return;
 	}
+
+	se->last_wakeup = se->sum_exec_runtime;
+	if (unlikely(se == pse))
+		return;
+
+	cfs_rq_of(pse)->next = pse;
+
 	/*
 	 * Batch tasks do not preempt (their preemption is driven by
 	 * the tick):
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index c88b5910e7ab..5fd9b9469770 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -42,12 +42,13 @@ long time_esterror = NTP_PHASE_LIMIT;	/* estimated error (us) */
 long time_freq;			/* frequency offset (scaled ppm)*/
 static long time_reftime;	/* time at last adjustment (s) */
 long time_adjust;
+static long ntp_tick_adj;

 static void ntp_update_frequency(void)
 {
 	u64 second_length = (u64)(tick_usec * NSEC_PER_USEC * USER_HZ)
 				<< TICK_LENGTH_SHIFT;
-	second_length += (s64)CLOCK_TICK_ADJUST << TICK_LENGTH_SHIFT;
+	second_length += (s64)ntp_tick_adj << TICK_LENGTH_SHIFT;
 	second_length += (s64)time_freq << (TICK_LENGTH_SHIFT - SHIFT_NSEC);

 	tick_length_base = second_length;
@@ -342,14 +343,16 @@ int do_adjtimex(struct timex *txc)
 			freq_adj = shift_right(freq_adj, time_constant * 2 +
 					       (SHIFT_PLL + 2) * 2 - SHIFT_NSEC);
 			if (mtemp >= MINSEC && (time_status & STA_FLL || mtemp > MAXSEC)) {
+				u64 utemp64;
 				temp64 = time_offset << (SHIFT_NSEC - SHIFT_FLL);
 				if (time_offset < 0) {
-					temp64 = -temp64;
-					do_div(temp64, mtemp);
-					freq_adj -= temp64;
+					utemp64 = -temp64;
+					do_div(utemp64, mtemp);
+					freq_adj -= utemp64;
 				} else {
-					do_div(temp64, mtemp);
-					freq_adj += temp64;
+					utemp64 = temp64;
+					do_div(utemp64, mtemp);
+					freq_adj += utemp64;
 				}
 			}
 			freq_adj += time_freq;
@@ -400,3 +403,11 @@ leave:	if ((time_status & (STA_UNSYNC|STA_CLOCKERR)) != 0)
 		notify_cmos_timer();
 	return(result);
 }
+
+static int __init ntp_tick_adj_setup(char *str)
+{
+	ntp_tick_adj = simple_strtol(str, NULL, 0);
+	return 1;
+}
+
+__setup("ntp_tick_adj=", ntp_tick_adj_setup);
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 2968298f8f36..686da821d376 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -640,7 +640,7 @@ void tick_cancel_sched_timer(int cpu)
 	if (ts->sched_timer.base)
 		hrtimer_cancel(&ts->sched_timer);

-	ts->tick_stopped = 0;
+	ts->nohz_mode = NOHZ_MODE_INACTIVE;
 }
 #endif /* HIGH_RES_TIMERS */

diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 1af9fb050fe2..671af612b768 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -187,8 +187,7 @@ static void change_clocksource(void)

 	clock->error = 0;
 	clock->xtime_nsec = 0;
-	clocksource_calculate_interval(clock,
-		(unsigned long)(current_tick_length()>>TICK_LENGTH_SHIFT));
+	clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH);

 	tick_clock_notify();

@@ -245,8 +244,7 @@ void __init timekeeping_init(void)
 	ntp_clear();

 	clock = clocksource_get_next();
-	clocksource_calculate_interval(clock,
-		(unsigned long)(current_tick_length()>>TICK_LENGTH_SHIFT));
+	clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH);
 	clock->cycle_last = clocksource_read(clock);

 	xtime.tv_sec = sec;