From 517a92c4e19fcea815332d3155e9fb7723251274 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 14 Feb 2008 09:02:13 +0100 Subject: panic: print more informative messages on stackprotect failure pointed out by pageexec@freemail.hu: we just simply panic() when there's a stackprotector attack - giving the attacked person no information about what kernel code the attack went against. print out the attacked function. Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/panic.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/panic.c b/kernel/panic.c index 425567f45b9f..f236001cc4db 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -327,7 +327,8 @@ EXPORT_SYMBOL(warn_on_slowpath); */ void __stack_chk_fail(void) { - panic("stack-protector: Kernel stack is corrupted"); + panic("stack-protector: Kernel stack is corrupted in: %p\n", + __builtin_return_address(0)); } EXPORT_SYMBOL(__stack_chk_fail); #endif -- cgit v1.2.3 From 5cb273013e182a35e7db614d3e20a144cba71e53 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 14 Feb 2008 09:07:01 +0100 Subject: panic: print out stacktrace if DEBUG_BUGVERBOSE if CONFIG_DEBUG_BUGVERBOSE is set then the user most definitely wanted to see as much information about kernel crashes as possible - so give them at least a stack dump. this is particularly useful for stackprotector related panics, where the stacktrace can give us the exact location of the (attempted) attack. Pointed out by pageexec@freemail.hu in the stackprotector breakage threads. Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/panic.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/panic.c b/kernel/panic.c index f236001cc4db..17aad578a2f2 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -80,6 +80,9 @@ NORET_TYPE void panic(const char * fmt, ...) 
vsnprintf(buf, sizeof(buf), fmt, args); va_end(args); printk(KERN_EMERG "Kernel panic - not syncing: %s\n",buf); +#ifdef CONFIG_DEBUG_BUGVERBOSE + dump_stack(); +#endif bust_spinlocks(0); /* -- cgit v1.2.3 From 54371a43a66f4477889769b4fa00df936855dc8f Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Fri, 15 Feb 2008 15:33:12 -0800 Subject: x86: add CONFIG_CC_STACKPROTECTOR self-test This patch adds a simple self-test capability to the stackprotector feature. The test deliberately overflows a stack buffer and then checks if the canary trap function gets called. Signed-off-by: Arjan van de Ven Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/panic.c | 68 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 68 insertions(+) (limited to 'kernel') diff --git a/kernel/panic.c b/kernel/panic.c index 17aad578a2f2..50cf9257b234 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -324,14 +324,82 @@ EXPORT_SYMBOL(warn_on_slowpath); #endif #ifdef CONFIG_CC_STACKPROTECTOR + +static unsigned long __stack_check_testing; +/* + * Self test function for the stack-protector feature. + * This test requires that the local variable absolutely has + * a stack slot, hence the barrier()s. + */ +static noinline void __stack_chk_test_func(void) +{ + unsigned long foo; + barrier(); + /* + * we need to make sure we're not about to clobber the return address, + * while real exploits do this, it's unhealthy on a running system. + * Besides, if we would, the test is already failed anyway so + * time to pull the emergency brake on it. 
+ */ + if ((unsigned long)__builtin_return_address(0) == + *(((unsigned long *)&foo)+1)) { + printk(KERN_ERR "No -fstack-protector-stack-frame!\n"); + return; + } +#ifdef CONFIG_FRAME_POINTER + /* We also don't want to clobber the frame pointer */ + if ((unsigned long)__builtin_return_address(0) == + *(((unsigned long *)&foo)+2)) { + printk(KERN_ERR "No -fstack-protector-stack-frame!\n"); + return; + } +#endif + barrier(); + if (current->stack_canary == *(((unsigned long *)&foo)+1)) + *(((unsigned long *)&foo)+1) = 0; + else + printk(KERN_ERR "No -fstack-protector canary found\n"); + barrier(); +} + +static int __stack_chk_test(void) +{ + printk(KERN_INFO "Testing -fstack-protector-all feature\n"); + __stack_check_testing = (unsigned long)&__stack_chk_test_func; + __stack_chk_test_func(); + if (__stack_check_testing) { + printk(KERN_ERR "-fstack-protector-all test failed\n"); + WARN_ON(1); + } + return 0; +} /* * Called when gcc's -fstack-protector feature is used, and * gcc detects corruption of the on-stack canary value */ void __stack_chk_fail(void) { + if (__stack_check_testing == (unsigned long)&__stack_chk_test_func) { + long delta; + + delta = (unsigned long)__builtin_return_address(0) - + __stack_check_testing; + /* + * The test needs to happen inside the test function, so + * check if the return address is close to that function. + * The function is only 2 dozen bytes long, but keep a wide + * safety margin to avoid panic()s for normal users regardless + * of the quality of the compiler. 
+ */ + if (delta >= 0 && delta <= 400) { + __stack_check_testing = 0; + return; + } + } panic("stack-protector: Kernel stack is corrupted in: %p\n", __builtin_return_address(0)); } EXPORT_SYMBOL(__stack_chk_fail); + +late_initcall(__stack_chk_test); #endif -- cgit v1.2.3 From b719ac56c0032bc1602914c6ea70b0f1581b08c7 Mon Sep 17 00:00:00 2001 From: Daniel Walker Date: Mon, 14 Apr 2008 10:03:50 -0700 Subject: panic.c: fix whitespace additions trivial: remove white space addition in stack protector Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/panic.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/panic.c b/kernel/panic.c index 50cf9257b234..866be9b72e4f 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -341,14 +341,14 @@ static noinline void __stack_chk_test_func(void) * Besides, if we would, the test is already failed anyway so * time to pull the emergency brake on it. */ - if ((unsigned long)__builtin_return_address(0) == + if ((unsigned long)__builtin_return_address(0) == *(((unsigned long *)&foo)+1)) { printk(KERN_ERR "No -fstack-protector-stack-frame!\n"); return; } #ifdef CONFIG_FRAME_POINTER /* We also don't want to clobber the frame pointer */ - if ((unsigned long)__builtin_return_address(0) == + if ((unsigned long)__builtin_return_address(0) == *(((unsigned long *)&foo)+2)) { printk(KERN_ERR "No -fstack-protector-stack-frame!\n"); return; -- cgit v1.2.3 From b40a4392a3c262e0d1b5379b4e142a8eefa63439 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Fri, 18 Apr 2008 06:16:45 -0700 Subject: stackprotector: turn not having the right gcc into a #warning If the user selects the stack-protector config option, but does not have a gcc that has the right bits enabled (for example because it isn't build with a glibc that supports TLS, as is common for cross-compilers, but also because it may be too old), then the runtime test fails right now. 
This patch adds a warning message for this scenario. This warning accomplishes two goals: 1) the user is informed that the security option he selected isn't available; 2) the user is advised to turn off the CONFIG option that won't work for him, and would make the runtime test fail anyway. Signed-off-by: Arjan van de Ven Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/Makefile | 2 +- kernel/panic.c | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 3cff3c894cf3..c3e0eeeb1dd2 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -73,7 +73,7 @@ else stackp := $(CONFIG_SHELL) $(srctree)/scripts/gcc-x86_64-has-stack-protector.sh stackp-$(CONFIG_CC_STACKPROTECTOR) := $(shell $(stackp) \ - "$(CC)" -fstack-protector ) + "$(CC)" "-fstack-protector -DGCC_HAS_SP" ) stackp-$(CONFIG_CC_STACKPROTECTOR_ALL) += $(shell $(stackp) \ "$(CC)" -fstack-protector-all ) diff --git a/kernel/panic.c b/kernel/panic.c index 866be9b72e4f..6729e3f4ebcb 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -325,6 +325,9 @@ EXPORT_SYMBOL(warn_on_slowpath); #ifdef CONFIG_CC_STACKPROTECTOR +#ifndef GCC_HAS_SP +#warning You have selected the CONFIG_CC_STACKPROTECTOR option, but the gcc used does not support this. +#endif static unsigned long __stack_check_testing; /* * Self test function for the stack-protector feature. -- cgit v1.2.3 From 7c9f8861e6c9c839f913e49b98c3854daca18f27 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Tue, 22 Apr 2008 16:38:23 -0500 Subject: stackprotector: use canary at end of stack to indicate overruns at oops time (Updated with a common max-stack-used checker that knows about the canary, as suggested by Joe Perches) Use a canary at the end of the stack to clearly indicate at oops time whether the stack has ever overflowed.
This is a very simple implementation with a couple of drawbacks: 1) a thread may legitimately use exactly up to the last word on the stack -- but the chances of doing this and then oopsing later seem slim 2) it's possible that the stack usage isn't dense enough that the canary location could get skipped over -- but the worst that happens is that we don't flag the overrun -- though this happens fairly often in my testing :( With the code in place, an intentionally-bloated stack oops might do: BUG: unable to handle kernel paging request at ffff8103f84cc680 IP: [] update_curr+0x9a/0xa8 PGD 8063 PUD 0 Thread overran stack or stack corrupted Oops: 0000 [1] SMP CPU 0 ... ... unless the stack overrun is so bad that it corrupts some other thread. Signed-off-by: Eric Sandeen Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/mm/fault.c | 7 +++++++ include/linux/magic.h | 1 + include/linux/sched.h | 13 +++++++++++++ kernel/exit.c | 5 +---- kernel/fork.c | 5 +++++ kernel/sched.c | 7 +------ 6 files changed, 28 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index fd7e1798c75a..1f524df68b96 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include @@ -581,6 +582,8 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code) unsigned long address; int write, si_code; int fault; + unsigned long *stackend; + #ifdef CONFIG_X86_64 unsigned long flags; #endif @@ -850,6 +853,10 @@ no_context: show_fault_oops(regs, error_code, address); + stackend = end_of_stack(tsk); + if (*stackend != STACK_END_MAGIC) + printk(KERN_ALERT "Thread overran stack, or stack corrupted\n"); + tsk->thread.cr2 = address; tsk->thread.trap_no = 14; tsk->thread.error_code = error_code; diff --git a/include/linux/magic.h b/include/linux/magic.h index 1fa0c2ce4dec..74e68e201166 100644 --- a/include/linux/magic.h +++ b/include/linux/magic.h @@ 
-42,4 +42,5 @@ #define FUTEXFS_SUPER_MAGIC 0xBAD1DEA #define INOTIFYFS_SUPER_MAGIC 0x2BAD1DEA +#define STACK_END_MAGIC 0x57AC6E9D #endif /* __LINUX_MAGIC_H__ */ diff --git a/include/linux/sched.h b/include/linux/sched.h index d6a515158783..c5181e77f305 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1969,6 +1969,19 @@ static inline unsigned long *end_of_stack(struct task_struct *p) extern void thread_info_cache_init(void); +#ifdef CONFIG_DEBUG_STACK_USAGE +static inline unsigned long stack_not_used(struct task_struct *p) +{ + unsigned long *n = end_of_stack(p); + + do { /* Skip over canary */ + n++; + } while (!*n); + + return (unsigned long)n - (unsigned long)end_of_stack(p); +} +#endif + /* set thread flags in other task's structures * - see asm/thread_info.h for TIF_xxxx flags available */ diff --git a/kernel/exit.c b/kernel/exit.c index 8f6185e69b69..fb8de6cbf2c7 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -899,12 +899,9 @@ static void check_stack_usage(void) { static DEFINE_SPINLOCK(low_water_lock); static int lowest_to_date = THREAD_SIZE; - unsigned long *n = end_of_stack(current); unsigned long free; - while (*n == 0) - n++; - free = (unsigned long)n - (unsigned long)end_of_stack(current); + free = stack_not_used(current); if (free >= lowest_to_date) return; diff --git a/kernel/fork.c b/kernel/fork.c index 19908b26cf80..d428336e7aa1 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -54,6 +54,7 @@ #include #include #include +#include #include #include @@ -186,6 +187,8 @@ static struct task_struct *dup_task_struct(struct task_struct *orig) { struct task_struct *tsk; struct thread_info *ti; + unsigned long *stackend; + int err; prepare_to_copy(orig); @@ -211,6 +214,8 @@ static struct task_struct *dup_task_struct(struct task_struct *orig) goto out; setup_thread_stack(tsk, orig); + stackend = end_of_stack(tsk); + *stackend = STACK_END_MAGIC; /* for overflow detection */ #ifdef CONFIG_CC_STACKPROTECTOR tsk->stack_canary = get_random_int(); 
diff --git a/kernel/sched.c b/kernel/sched.c index cfa222a91539..a964ed945094 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5748,12 +5748,7 @@ void sched_show_task(struct task_struct *p) printk(KERN_CONT " %016lx ", thread_saved_pc(p)); #endif #ifdef CONFIG_DEBUG_STACK_USAGE - { - unsigned long *n = end_of_stack(p); - while (!*n) - n++; - free = (unsigned long)n - (unsigned long)end_of_stack(p); - } + free = stack_not_used(p); #endif printk(KERN_CONT "%5lu %5d %6d\n", free, task_pid_nr(p), task_pid_nr(p->real_parent)); -- cgit v1.2.3 From aa92db14270b79f0f91a9060b547a46f9e2639da Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Fri, 11 Jul 2008 05:09:55 -0700 Subject: stackprotector: better self-test check stackprotector functionality by manipulating the canary briefly during bootup. far more robust than trying to overflow the stack. (which is architecture dependent, etc.) Signed-off-by: Ingo Molnar --- kernel/panic.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/panic.c b/kernel/panic.c index 6729e3f4ebcb..28153aec7100 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -347,22 +347,18 @@ static noinline void __stack_chk_test_func(void) if ((unsigned long)__builtin_return_address(0) == *(((unsigned long *)&foo)+1)) { printk(KERN_ERR "No -fstack-protector-stack-frame!\n"); - return; } #ifdef CONFIG_FRAME_POINTER /* We also don't want to clobber the frame pointer */ if ((unsigned long)__builtin_return_address(0) == *(((unsigned long *)&foo)+2)) { printk(KERN_ERR "No -fstack-protector-stack-frame!\n"); - return; } #endif - barrier(); - if (current->stack_canary == *(((unsigned long *)&foo)+1)) - *(((unsigned long *)&foo)+1) = 0; - else + if (current->stack_canary != *(((unsigned long *)&foo)+1)) printk(KERN_ERR "No -fstack-protector canary found\n"); - barrier(); + + current->stack_canary = ~current->stack_canary; } static int __stack_chk_test(void) @@ -373,7 +369,8 @@ static int 
__stack_chk_test(void) if (__stack_check_testing) { printk(KERN_ERR "-fstack-protector-all test failed\n"); WARN_ON(1); - } + }; + current->stack_canary = ~current->stack_canary; return 0; } /* -- cgit v1.2.3 From af9ff7868f0f76d3364351b1641b9dfa99588e77 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Sat, 12 Jul 2008 09:36:38 -0700 Subject: x86: simplify stackprotector self-check Clean up the code by removing no longer needed code; make sure the pda is updated and kept in sync Signed-off-by: Arjan van de Ven Signed-off-by: Ingo Molnar --- include/asm-x86/pda.h | 1 + kernel/panic.c | 29 +++++++---------------------- 2 files changed, 8 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/include/asm-x86/pda.h b/include/asm-x86/pda.h index 62b734986a44..a5ff5bb76299 100644 --- a/include/asm-x86/pda.h +++ b/include/asm-x86/pda.h @@ -131,4 +131,5 @@ do { \ #define PDA_STACKOFFSET (5*8) +#define refresh_stack_canary() write_pda(stack_canary, current->stack_canary) #endif diff --git a/kernel/panic.c b/kernel/panic.c index 28153aec7100..87445a894c3a 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -328,37 +328,21 @@ EXPORT_SYMBOL(warn_on_slowpath); #ifndef GCC_HAS_SP #warning You have selected the CONFIG_CC_STACKPROTECTOR option, but the gcc used does not support this. #endif + static unsigned long __stack_check_testing; + /* * Self test function for the stack-protector feature. * This test requires that the local variable absolutely has - * a stack slot, hence the barrier()s. + * a stack slot. */ static noinline void __stack_chk_test_func(void) { - unsigned long foo; - barrier(); - /* - * we need to make sure we're not about to clobber the return address, - * while real exploits do this, it's unhealthy on a running system. - * Besides, if we would, the test is already failed anyway so - * time to pull the emergency brake on it. 
- */ - if ((unsigned long)__builtin_return_address(0) == - *(((unsigned long *)&foo)+1)) { - printk(KERN_ERR "No -fstack-protector-stack-frame!\n"); - } -#ifdef CONFIG_FRAME_POINTER - /* We also don't want to clobber the frame pointer */ - if ((unsigned long)__builtin_return_address(0) == - *(((unsigned long *)&foo)+2)) { - printk(KERN_ERR "No -fstack-protector-stack-frame!\n"); - } -#endif - if (current->stack_canary != *(((unsigned long *)&foo)+1)) - printk(KERN_ERR "No -fstack-protector canary found\n"); + unsigned long dummy_buffer[64]; /* force gcc to use the canary */ current->stack_canary = ~current->stack_canary; + refresh_stack_canary(); + dummy_buffer[3] = 1; /* fool gcc into keeping the variable */ } static int __stack_chk_test(void) @@ -371,6 +355,7 @@ static int __stack_chk_test(void) WARN_ON(1); }; current->stack_canary = ~current->stack_canary; + refresh_stack_canary(); return 0; } /* -- cgit v1.2.3 From 4f962d4d65923d7b722192e729840cfb79af0a5a Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 13 Jul 2008 21:42:44 +0200 Subject: stackprotector: remove self-test turns out gcc generates such stackprotector-failure sequences in certain circumstances: movq -8(%rbp), %rax # D.16032, xorq %gs:40, %rax #, jne .L17 #, leave ret .L17: call __stack_chk_fail # .size __stack_chk_test_func, .-__stack_chk_test_func .section .init.text,"ax",@progbits .type panic_setup, @function panic_setup: pushq %rbp # note that there's no jump back to the failing context after the call to __stack_chk_fail - i.e. it has a ((noreturn)) attribute. Which is fair enough in the normal case but kills the self-test. 
(as we cannot reliably return in the self-test) Signed-off-by: Ingo Molnar --- kernel/panic.c | 47 ----------------------------------------------- 1 file changed, 47 deletions(-) (limited to 'kernel') diff --git a/kernel/panic.c b/kernel/panic.c index 87445a894c3a..c35c9eca3eb2 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -329,62 +329,15 @@ EXPORT_SYMBOL(warn_on_slowpath); #warning You have selected the CONFIG_CC_STACKPROTECTOR option, but the gcc used does not support this. #endif -static unsigned long __stack_check_testing; - -/* - * Self test function for the stack-protector feature. - * This test requires that the local variable absolutely has - * a stack slot. - */ -static noinline void __stack_chk_test_func(void) -{ - unsigned long dummy_buffer[64]; /* force gcc to use the canary */ - - current->stack_canary = ~current->stack_canary; - refresh_stack_canary(); - dummy_buffer[3] = 1; /* fool gcc into keeping the variable */ -} - -static int __stack_chk_test(void) -{ - printk(KERN_INFO "Testing -fstack-protector-all feature\n"); - __stack_check_testing = (unsigned long)&__stack_chk_test_func; - __stack_chk_test_func(); - if (__stack_check_testing) { - printk(KERN_ERR "-fstack-protector-all test failed\n"); - WARN_ON(1); - }; - current->stack_canary = ~current->stack_canary; - refresh_stack_canary(); - return 0; -} /* * Called when gcc's -fstack-protector feature is used, and * gcc detects corruption of the on-stack canary value */ void __stack_chk_fail(void) { - if (__stack_check_testing == (unsigned long)&__stack_chk_test_func) { - long delta; - - delta = (unsigned long)__builtin_return_address(0) - - __stack_check_testing; - /* - * The test needs to happen inside the test function, so - * check if the return address is close to that function. - * The function is only 2 dozen bytes long, but keep a wide - * safety margin to avoid panic()s for normal users regardless - * of the quality of the compiler. 
- */ - if (delta >= 0 && delta <= 400) { - __stack_check_testing = 0; - return; - } - } panic("stack-protector: Kernel stack is corrupted in: %p\n", __builtin_return_address(0)); } EXPORT_SYMBOL(__stack_chk_fail); -late_initcall(__stack_chk_test); #endif -- cgit v1.2.3 From 7f7ace0cda64c99599c23785f8979a072e118058 Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Sat, 10 Jan 2009 21:58:08 -0800 Subject: cpumask: update irq_desc to use cpumask_var_t Impact: reduce memory usage, use new cpumask API. Replace the affinity and pending_masks with cpumask_var_t's. This adds to the significant size reduction done with the SPARSE_IRQS changes. The added functions (init_alloc_desc_masks & init_copy_desc_masks) are in the include file so they can be inlined (and optimized out for the !CONFIG_CPUMASKS_OFFSTACK case.) [Naming chosen to be consistent with the other init*irq functions, as well as the backwards arg declaration of "from, to" instead of the more common "to, from" standard.] Includes a slight change to the declaration of struct irq_desc to embed the pending_mask within ifdef(CONFIG_SMP) to be consistent with other references, and some small changes to Xen. 
Tested: sparse/non-sparse/cpumask_offstack/non-cpumask_offstack/nonuma/nosmp on x86_64 Signed-off-by: Mike Travis Cc: Chris Wright Cc: Jeremy Fitzhardinge Cc: KOSAKI Motohiro Cc: Venkatesh Pallipadi Cc: virtualization@lists.osdl.org Cc: xen-devel@lists.xensource.com Cc: Yinghai Lu --- arch/x86/kernel/io_apic.c | 20 ++++++------ arch/x86/kernel/irq_32.c | 2 +- arch/x86/kernel/irq_64.c | 2 +- drivers/xen/events.c | 4 +-- include/linux/irq.h | 81 +++++++++++++++++++++++++++++++++++++++++++++-- kernel/irq/chip.c | 5 ++- kernel/irq/handle.c | 26 ++++++++------- kernel/irq/manage.c | 12 +++---- kernel/irq/migration.c | 12 +++---- kernel/irq/numa_migrate.c | 12 ++++++- kernel/irq/proc.c | 4 +-- 11 files changed, 135 insertions(+), 45 deletions(-) (limited to 'kernel') diff --git a/arch/x86/kernel/io_apic.c b/arch/x86/kernel/io_apic.c index 1c4a1302536c..1337eab60ecc 100644 --- a/arch/x86/kernel/io_apic.c +++ b/arch/x86/kernel/io_apic.c @@ -356,7 +356,7 @@ set_extra_move_desc(struct irq_desc *desc, const struct cpumask *mask) if (!cfg->move_in_progress) { /* it means that domain is not changed */ - if (!cpumask_intersects(&desc->affinity, mask)) + if (!cpumask_intersects(desc->affinity, mask)) cfg->move_desc_pending = 1; } } @@ -579,9 +579,9 @@ set_desc_affinity(struct irq_desc *desc, const struct cpumask *mask) if (assign_irq_vector(irq, cfg, mask)) return BAD_APICID; - cpumask_and(&desc->affinity, cfg->domain, mask); + cpumask_and(desc->affinity, cfg->domain, mask); set_extra_move_desc(desc, mask); - return cpu_mask_to_apicid_and(&desc->affinity, cpu_online_mask); + return cpu_mask_to_apicid_and(desc->affinity, cpu_online_mask); } static void @@ -2383,7 +2383,7 @@ migrate_ioapic_irq_desc(struct irq_desc *desc, const struct cpumask *mask) if (cfg->move_in_progress) send_cleanup_vector(cfg); - cpumask_copy(&desc->affinity, mask); + cpumask_copy(desc->affinity, mask); } static int migrate_irq_remapped_level_desc(struct irq_desc *desc) @@ -2405,11 +2405,11 @@ static int 
migrate_irq_remapped_level_desc(struct irq_desc *desc) } /* everthing is clear. we have right of way */ - migrate_ioapic_irq_desc(desc, &desc->pending_mask); + migrate_ioapic_irq_desc(desc, desc->pending_mask); ret = 0; desc->status &= ~IRQ_MOVE_PENDING; - cpumask_clear(&desc->pending_mask); + cpumask_clear(desc->pending_mask); unmask: unmask_IO_APIC_irq_desc(desc); @@ -2434,7 +2434,7 @@ static void ir_irq_migration(struct work_struct *work) continue; } - desc->chip->set_affinity(irq, &desc->pending_mask); + desc->chip->set_affinity(irq, desc->pending_mask); spin_unlock_irqrestore(&desc->lock, flags); } } @@ -2448,7 +2448,7 @@ static void set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc, { if (desc->status & IRQ_LEVEL) { desc->status |= IRQ_MOVE_PENDING; - cpumask_copy(&desc->pending_mask, mask); + cpumask_copy(desc->pending_mask, mask); migrate_irq_remapped_level_desc(desc); return; } @@ -2516,7 +2516,7 @@ static void irq_complete_move(struct irq_desc **descp) /* domain has not changed, but affinity did */ me = smp_processor_id(); - if (cpu_isset(me, desc->affinity)) { + if (cpumask_test_cpu(me, desc->affinity)) { *descp = desc = move_irq_desc(desc, me); /* get the new one */ cfg = desc->chip_data; @@ -4039,7 +4039,7 @@ void __init setup_ioapic_dest(void) */ if (desc->status & (IRQ_NO_BALANCING | IRQ_AFFINITY_SET)) - mask = &desc->affinity; + mask = desc->affinity; else mask = TARGET_CPUS; diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index 74b9ff7341e9..e0f29be8ab0b 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c @@ -248,7 +248,7 @@ void fixup_irqs(void) if (irq == 2) continue; - affinity = &desc->affinity; + affinity = desc->affinity; if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) { printk("Breaking affinity for irq %i\n", irq); affinity = cpu_all_mask; diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c index 63c88e6ec025..0b21cb1ea11f 100644 --- a/arch/x86/kernel/irq_64.c +++ 
b/arch/x86/kernel/irq_64.c @@ -100,7 +100,7 @@ void fixup_irqs(void) /* interrupt's are disabled at this point */ spin_lock(&desc->lock); - affinity = &desc->affinity; + affinity = desc->affinity; if (!irq_has_action(irq) || cpumask_equal(affinity, cpu_online_mask)) { spin_unlock(&desc->lock); diff --git a/drivers/xen/events.c b/drivers/xen/events.c index eb0dfdeaa949..e0767ff35d6c 100644 --- a/drivers/xen/events.c +++ b/drivers/xen/events.c @@ -125,7 +125,7 @@ static void bind_evtchn_to_cpu(unsigned int chn, unsigned int cpu) BUG_ON(irq == -1); #ifdef CONFIG_SMP - irq_to_desc(irq)->affinity = cpumask_of_cpu(cpu); + cpumask_copy(irq_to_desc(irq)->affinity, cpumask_of(cpu)); #endif __clear_bit(chn, cpu_evtchn_mask[cpu_evtchn[chn]]); @@ -142,7 +142,7 @@ static void init_evtchn_cpu_bindings(void) /* By default all event channels notify CPU#0. */ for_each_irq_desc(i, desc) { - desc->affinity = cpumask_of_cpu(0); + cpumask_copy(desc->affinity, cpumask_of(0)); } #endif diff --git a/include/linux/irq.h b/include/linux/irq.h index f899b502f186..fa27210f1dfd 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -182,11 +182,11 @@ struct irq_desc { unsigned int irqs_unhandled; spinlock_t lock; #ifdef CONFIG_SMP - cpumask_t affinity; + cpumask_var_t affinity; unsigned int cpu; -#endif #ifdef CONFIG_GENERIC_PENDING_IRQ - cpumask_t pending_mask; + cpumask_var_t pending_mask; +#endif #endif #ifdef CONFIG_PROC_FS struct proc_dir_entry *dir; @@ -422,4 +422,79 @@ extern int set_irq_msi(unsigned int irq, struct msi_desc *entry); #endif /* !CONFIG_S390 */ +#ifdef CONFIG_SMP +/** + * init_alloc_desc_masks - allocate cpumasks for irq_desc + * @desc: pointer to irq_desc struct + * @boot: true if need bootmem + * + * Allocates affinity and pending_mask cpumask if required. + * Returns true if successful (or not required). + * Side effect: affinity has all bits set, pending_mask has all bits clear. 
+ */ +static inline bool init_alloc_desc_masks(struct irq_desc *desc, int node, + bool boot) +{ + if (boot) { + alloc_bootmem_cpumask_var(&desc->affinity); + cpumask_setall(desc->affinity); + +#ifdef CONFIG_GENERIC_PENDING_IRQ + alloc_bootmem_cpumask_var(&desc->pending_mask); + cpumask_clear(desc->pending_mask); +#endif + return true; + } + + if (!alloc_cpumask_var_node(&desc->affinity, GFP_ATOMIC, node)) + return false; + cpumask_setall(desc->affinity); + +#ifdef CONFIG_GENERIC_PENDING_IRQ + if (!alloc_cpumask_var_node(&desc->pending_mask, GFP_ATOMIC, node)) { + free_cpumask_var(desc->affinity); + return false; + } + cpumask_clear(desc->pending_mask); +#endif + return true; +} + +/** + * init_copy_desc_masks - copy cpumasks for irq_desc + * @old_desc: pointer to old irq_desc struct + * @new_desc: pointer to new irq_desc struct + * + * Insures affinity and pending_masks are copied to new irq_desc. + * If !CONFIG_CPUMASKS_OFFSTACK the cpumasks are embedded in the + * irq_desc struct so the copy is redundant. 
+ */ + +static inline void init_copy_desc_masks(struct irq_desc *old_desc, + struct irq_desc *new_desc) +{ +#ifdef CONFIG_CPUMASKS_OFFSTACK + cpumask_copy(new_desc->affinity, old_desc->affinity); + +#ifdef CONFIG_GENERIC_PENDING_IRQ + cpumask_copy(new_desc->pending_mask, old_desc->pending_mask); +#endif +#endif +} + +#else /* !CONFIG_SMP */ + +static inline bool init_alloc_desc_masks(struct irq_desc *desc, int node, + bool boot) +{ + return true; +} + +static inline void init_copy_desc_masks(struct irq_desc *old_desc, + struct irq_desc *new_desc) +{ +} + +#endif /* CONFIG_SMP */ + #endif /* _LINUX_IRQ_H */ diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index f63c706d25e1..c248eba98b43 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -46,7 +46,10 @@ void dynamic_irq_init(unsigned int irq) desc->irq_count = 0; desc->irqs_unhandled = 0; #ifdef CONFIG_SMP - cpumask_setall(&desc->affinity); + cpumask_setall(desc->affinity); +#ifdef CONFIG_GENERIC_PENDING_IRQ + cpumask_clear(desc->pending_mask); +#endif #endif spin_unlock_irqrestore(&desc->lock, flags); } diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index c20db0be9173..b8fa1354f01c 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -64,9 +64,6 @@ static struct irq_desc irq_desc_init = { .handle_irq = handle_bad_irq, .depth = 1, .lock = __SPIN_LOCK_UNLOCKED(irq_desc_init.lock), -#ifdef CONFIG_SMP - .affinity = CPU_MASK_ALL -#endif }; void init_kstat_irqs(struct irq_desc *desc, int cpu, int nr) @@ -88,6 +85,8 @@ void init_kstat_irqs(struct irq_desc *desc, int cpu, int nr) static void init_one_irq_desc(int irq, struct irq_desc *desc, int cpu) { + int node = cpu_to_node(cpu); + memcpy(desc, &irq_desc_init, sizeof(struct irq_desc)); spin_lock_init(&desc->lock); @@ -101,6 +100,10 @@ static void init_one_irq_desc(int irq, struct irq_desc *desc, int cpu) printk(KERN_ERR "can not alloc kstat_irqs\n"); BUG_ON(1); } + if (!init_alloc_desc_masks(desc, node, false)) { + printk(KERN_ERR "can not alloc 
irq_desc cpumasks\n"); + BUG_ON(1); + } arch_init_chip_data(desc, cpu); } @@ -119,9 +122,6 @@ static struct irq_desc irq_desc_legacy[NR_IRQS_LEGACY] __cacheline_aligned_in_sm .handle_irq = handle_bad_irq, .depth = 1, .lock = __SPIN_LOCK_UNLOCKED(irq_desc_init.lock), -#ifdef CONFIG_SMP - .affinity = CPU_MASK_ALL -#endif } }; @@ -141,7 +141,7 @@ int __init early_irq_init(void) desc[i].irq = i; desc[i].kstat_irqs = kstat_irqs_legacy[i]; lockdep_set_class(&desc[i].lock, &irq_desc_lock_class); - + init_alloc_desc_masks(&desc[i], 0, true); irq_desc_ptrs[i] = desc + i; } @@ -188,6 +188,10 @@ struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu) printk(KERN_ERR "can not alloc irq_desc\n"); BUG_ON(1); } + if (!init_alloc_desc_masks(desc, node, false)) { + printk(KERN_ERR "can not alloc irq_desc cpumasks\n"); + BUG_ON(1); + } init_one_irq_desc(irq, desc, cpu); irq_desc_ptrs[irq] = desc; @@ -207,9 +211,6 @@ struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = { .handle_irq = handle_bad_irq, .depth = 1, .lock = __SPIN_LOCK_UNLOCKED(irq_desc->lock), -#ifdef CONFIG_SMP - .affinity = CPU_MASK_ALL -#endif } }; @@ -222,9 +223,10 @@ int __init early_irq_init(void) desc = irq_desc; count = ARRAY_SIZE(irq_desc); - for (i = 0; i < count; i++) + for (i = 0; i < count; i++) { desc[i].irq = i; - + init_alloc_desc_masks(&desc[i], 0, true); + } return arch_early_irq_init(); } diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index cd0cd8dcb345..b98739af4558 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -98,14 +98,14 @@ int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask) #ifdef CONFIG_GENERIC_PENDING_IRQ if (desc->status & IRQ_MOVE_PCNTXT || desc->status & IRQ_DISABLED) { - cpumask_copy(&desc->affinity, cpumask); + cpumask_copy(desc->affinity, cpumask); desc->chip->set_affinity(irq, cpumask); } else { desc->status |= IRQ_MOVE_PENDING; - cpumask_copy(&desc->pending_mask, cpumask); + cpumask_copy(desc->pending_mask, cpumask); } #else 
- cpumask_copy(&desc->affinity, cpumask); + cpumask_copy(desc->affinity, cpumask); desc->chip->set_affinity(irq, cpumask); #endif desc->status |= IRQ_AFFINITY_SET; @@ -127,16 +127,16 @@ int do_irq_select_affinity(unsigned int irq, struct irq_desc *desc) * one of the targets is online. */ if (desc->status & (IRQ_AFFINITY_SET | IRQ_NO_BALANCING)) { - if (cpumask_any_and(&desc->affinity, cpu_online_mask) + if (cpumask_any_and(desc->affinity, cpu_online_mask) < nr_cpu_ids) goto set_affinity; else desc->status &= ~IRQ_AFFINITY_SET; } - cpumask_and(&desc->affinity, cpu_online_mask, irq_default_affinity); + cpumask_and(desc->affinity, cpu_online_mask, irq_default_affinity); set_affinity: - desc->chip->set_affinity(irq, &desc->affinity); + desc->chip->set_affinity(irq, desc->affinity); return 0; } diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c index bd72329e630c..e05ad9be43b7 100644 --- a/kernel/irq/migration.c +++ b/kernel/irq/migration.c @@ -18,7 +18,7 @@ void move_masked_irq(int irq) desc->status &= ~IRQ_MOVE_PENDING; - if (unlikely(cpumask_empty(&desc->pending_mask))) + if (unlikely(cpumask_empty(desc->pending_mask))) return; if (!desc->chip->set_affinity) @@ -38,13 +38,13 @@ void move_masked_irq(int irq) * For correct operation this depends on the caller * masking the irqs. 
*/ - if (likely(cpumask_any_and(&desc->pending_mask, cpu_online_mask) + if (likely(cpumask_any_and(desc->pending_mask, cpu_online_mask) < nr_cpu_ids)) { - cpumask_and(&desc->affinity, - &desc->pending_mask, cpu_online_mask); - desc->chip->set_affinity(irq, &desc->affinity); + cpumask_and(desc->affinity, + desc->pending_mask, cpu_online_mask); + desc->chip->set_affinity(irq, desc->affinity); } - cpumask_clear(&desc->pending_mask); + cpumask_clear(desc->pending_mask); } void move_native_irq(int irq) diff --git a/kernel/irq/numa_migrate.c b/kernel/irq/numa_migrate.c index ecf765c6a77a..f001a4ea6414 100644 --- a/kernel/irq/numa_migrate.c +++ b/kernel/irq/numa_migrate.c @@ -46,6 +46,7 @@ static void init_copy_one_irq_desc(int irq, struct irq_desc *old_desc, desc->cpu = cpu; lockdep_set_class(&desc->lock, &irq_desc_lock_class); init_copy_kstat_irqs(old_desc, desc, cpu, nr_cpu_ids); + init_copy_desc_masks(old_desc, desc); arch_init_copy_chip_data(old_desc, desc, cpu); } @@ -76,11 +77,20 @@ static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc, node = cpu_to_node(cpu); desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node); if (!desc) { - printk(KERN_ERR "irq %d: can not get new irq_desc for migration.\n", irq); + printk(KERN_ERR "irq %d: can not get new irq_desc " + "for migration.\n", irq); /* still use old one */ desc = old_desc; goto out_unlock; } + if (!init_alloc_desc_masks(desc, node, false)) { + printk(KERN_ERR "irq %d: can not get new irq_desc cpumask " + "for migration.\n", irq); + /* still use old one */ + kfree(desc); + desc = old_desc; + goto out_unlock; + } init_copy_one_irq_desc(irq, old_desc, desc, cpu); irq_desc_ptrs[irq] = desc; diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index aae3f742bcec..692363dd591f 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -20,11 +20,11 @@ static struct proc_dir_entry *root_irq_dir; static int irq_affinity_proc_show(struct seq_file *m, void *v) { struct irq_desc *desc = 
irq_to_desc((long)m->private); - const struct cpumask *mask = &desc->affinity; + const struct cpumask *mask = desc->affinity; #ifdef CONFIG_GENERIC_PENDING_IRQ if (desc->status & IRQ_MOVE_PENDING) - mask = &desc->pending_mask; + mask = desc->pending_mask; #endif seq_cpumask(m, mask); seq_putc(m, '\n'); -- cgit v1.2.3 From 802bf931f2688ad125b73db597ce63cc842fb27a Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Sat, 10 Jan 2009 21:58:09 -0800 Subject: cpumask: fix bug in use cpumask_var_t in irq_desc Impact: fix bug where new irq_desc uses old cpumask pointers which are freed. As Yinghai pointed out, init_copy_one_irq_desc() copies the old desc to the new desc overwriting the cpumask pointers. Since the old_desc and the cpumask pointers are freed, then memory corruption will occur if these old pointers are used. Move the allocation of these pointers to after the copy. Signed-off-by: Mike Travis Cc: Yinghai Lu --- include/linux/irq.h | 9 +++++++-- kernel/irq/handle.c | 8 +------- kernel/irq/numa_migrate.c | 13 ++++++++----- 3 files changed, 16 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/include/linux/irq.h b/include/linux/irq.h index fa27210f1dfd..27a67536511e 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -426,15 +426,18 @@ extern int set_irq_msi(unsigned int irq, struct msi_desc *entry); /** * init_alloc_desc_masks - allocate cpumasks for irq_desc * @desc: pointer to irq_desc struct + * @cpu: cpu which will be handling the cpumasks * @boot: true if need bootmem * * Allocates affinity and pending_mask cpumask if required. * Returns true if successful (or not required). * Side effect: affinity has all bits set, pending_mask has all bits clear. 
*/ -static inline bool init_alloc_desc_masks(struct irq_desc *desc, int node, +static inline bool init_alloc_desc_masks(struct irq_desc *desc, int cpu, bool boot) { + int node; + if (boot) { alloc_bootmem_cpumask_var(&desc->affinity); cpumask_setall(desc->affinity); @@ -446,6 +449,8 @@ static inline bool init_alloc_desc_masks(struct irq_desc *desc, int node, return true; } + node = cpu_to_node(cpu); + if (!alloc_cpumask_var_node(&desc->affinity, GFP_ATOMIC, node)) return false; cpumask_setall(desc->affinity); @@ -484,7 +489,7 @@ static inline void init_copy_desc_masks(struct irq_desc *old_desc, #else /* !CONFIG_SMP */ -static inline bool init_alloc_desc_masks(struct irq_desc *desc, int node, +static inline bool init_alloc_desc_masks(struct irq_desc *desc, int cpu, bool boot) { return true; diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index b8fa1354f01c..f01c0a30cb42 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -85,8 +85,6 @@ void init_kstat_irqs(struct irq_desc *desc, int cpu, int nr) static void init_one_irq_desc(int irq, struct irq_desc *desc, int cpu) { - int node = cpu_to_node(cpu); - memcpy(desc, &irq_desc_init, sizeof(struct irq_desc)); spin_lock_init(&desc->lock); @@ -100,7 +98,7 @@ static void init_one_irq_desc(int irq, struct irq_desc *desc, int cpu) printk(KERN_ERR "can not alloc kstat_irqs\n"); BUG_ON(1); } - if (!init_alloc_desc_masks(desc, node, false)) { + if (!init_alloc_desc_masks(desc, cpu, false)) { printk(KERN_ERR "can not alloc irq_desc cpumasks\n"); BUG_ON(1); } @@ -188,10 +186,6 @@ struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu) printk(KERN_ERR "can not alloc irq_desc\n"); BUG_ON(1); } - if (!init_alloc_desc_masks(desc, node, false)) { - printk(KERN_ERR "can not alloc irq_desc cpumasks\n"); - BUG_ON(1); - } init_one_irq_desc(irq, desc, cpu); irq_desc_ptrs[irq] = desc; diff --git a/kernel/irq/numa_migrate.c b/kernel/irq/numa_migrate.c index f001a4ea6414..666260e4c065 100644 --- 
a/kernel/irq/numa_migrate.c +++ b/kernel/irq/numa_migrate.c @@ -38,16 +38,22 @@ static void free_kstat_irqs(struct irq_desc *old_desc, struct irq_desc *desc) old_desc->kstat_irqs = NULL; } -static void init_copy_one_irq_desc(int irq, struct irq_desc *old_desc, +static bool init_copy_one_irq_desc(int irq, struct irq_desc *old_desc, struct irq_desc *desc, int cpu) { memcpy(desc, old_desc, sizeof(struct irq_desc)); + if (!init_alloc_desc_masks(desc, cpu, false)) { + printk(KERN_ERR "irq %d: can not get new irq_desc cpumask " + "for migration.\n", irq); + return false; + } spin_lock_init(&desc->lock); desc->cpu = cpu; lockdep_set_class(&desc->lock, &irq_desc_lock_class); init_copy_kstat_irqs(old_desc, desc, cpu, nr_cpu_ids); init_copy_desc_masks(old_desc, desc); arch_init_copy_chip_data(old_desc, desc, cpu); + return true; } static void free_one_irq_desc(struct irq_desc *old_desc, struct irq_desc *desc) @@ -83,15 +89,12 @@ static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc, desc = old_desc; goto out_unlock; } - if (!init_alloc_desc_masks(desc, node, false)) { - printk(KERN_ERR "irq %d: can not get new irq_desc cpumask " - "for migration.\n", irq); + if (!init_copy_one_irq_desc(irq, old_desc, desc, cpu)) { /* still use old one */ kfree(desc); desc = old_desc; goto out_unlock; } - init_copy_one_irq_desc(irq, old_desc, desc, cpu); irq_desc_ptrs[irq] = desc; -- cgit v1.2.3 From d38b223c86db3162dc85b5a1997ac8a210e1660b Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Sat, 10 Jan 2009 21:58:11 -0800 Subject: cpumask: reduce stack usage in find_lowest_rq Impact: reduce stack usage, cleanup Use a cpumask_var_t in find_lowest_rq() and clean up other old cpumask_t calls. 
Signed-off-by: Mike Travis --- kernel/sched_rt.c | 36 ++++++++++++++++++++++-------------- 1 file changed, 22 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 954e1a81b796..da932f4c8524 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -960,16 +960,17 @@ static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu) static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask); -static inline int pick_optimal_cpu(int this_cpu, cpumask_t *mask) +static inline int pick_optimal_cpu(int this_cpu, + const struct cpumask *mask) { int first; /* "this_cpu" is cheaper to preempt than a remote processor */ - if ((this_cpu != -1) && cpu_isset(this_cpu, *mask)) + if ((this_cpu != -1) && cpumask_test_cpu(this_cpu, mask)) return this_cpu; - first = first_cpu(*mask); - if (first != NR_CPUS) + first = cpumask_first(mask); + if (first < nr_cpu_ids) return first; return -1; @@ -981,6 +982,7 @@ static int find_lowest_rq(struct task_struct *task) struct cpumask *lowest_mask = __get_cpu_var(local_cpu_mask); int this_cpu = smp_processor_id(); int cpu = task_cpu(task); + cpumask_var_t domain_mask; if (task->rt.nr_cpus_allowed == 1) return -1; /* No other targets possible */ @@ -1013,19 +1015,25 @@ static int find_lowest_rq(struct task_struct *task) if (this_cpu == cpu) this_cpu = -1; /* Skip this_cpu opt if the same */ - for_each_domain(cpu, sd) { - if (sd->flags & SD_WAKE_AFFINE) { - cpumask_t domain_mask; - int best_cpu; + if (alloc_cpumask_var(&domain_mask, GFP_ATOMIC)) { + for_each_domain(cpu, sd) { + if (sd->flags & SD_WAKE_AFFINE) { + int best_cpu; - cpumask_and(&domain_mask, sched_domain_span(sd), - lowest_mask); + cpumask_and(domain_mask, + sched_domain_span(sd), + lowest_mask); - best_cpu = pick_optimal_cpu(this_cpu, - &domain_mask); - if (best_cpu != -1) - return best_cpu; + best_cpu = pick_optimal_cpu(this_cpu, + domain_mask); + + if (best_cpu != -1) { + free_cpumask_var(domain_mask); + return best_cpu; 
+ } + } } + free_cpumask_var(domain_mask); } /* -- cgit v1.2.3 From 9594949b060efe86ecaa1a66839232a3b9800bc9 Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Sat, 10 Jan 2009 22:24:06 -0800 Subject: irq: change references from NR_IRQS to nr_irqs Impact: preparation, cleanup, add KERN_INFO printk Modify references from NR_IRQS to nr_irqs as the latter will become variable-sized based on nr_cpu_ids when CONFIG_SPARSE_IRQ=y. Signed-off-by: Mike Travis --- arch/x86/kernel/io_apic.c | 2 +- kernel/irq/handle.c | 14 +++++++++----- 2 files changed, 10 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/arch/x86/kernel/io_apic.c b/arch/x86/kernel/io_apic.c index 1337eab60ecc..ae80638012de 100644 --- a/arch/x86/kernel/io_apic.c +++ b/arch/x86/kernel/io_apic.c @@ -3183,7 +3183,7 @@ unsigned int create_irq_nr(unsigned int irq_want) irq = 0; spin_lock_irqsave(&vector_lock, flags); - for (new = irq_want; new < NR_IRQS; new++) { + for (new = irq_want; new < nr_irqs; new++) { if (platform_legacy_irq(new)) continue; diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index f01c0a30cb42..790c5fa7ea39 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -132,6 +132,8 @@ int __init early_irq_init(void) int legacy_count; int i; + printk(KERN_INFO "NR_IRQS:%d nr_irqs:%d\n", NR_IRQS, nr_irqs); + desc = irq_desc_legacy; legacy_count = ARRAY_SIZE(irq_desc_legacy); @@ -143,7 +145,7 @@ int __init early_irq_init(void) irq_desc_ptrs[i] = desc + i; } - for (i = legacy_count; i < NR_IRQS; i++) + for (i = legacy_count; i < nr_irqs; i++) irq_desc_ptrs[i] = NULL; return arch_early_irq_init(); @@ -151,7 +153,7 @@ int __init early_irq_init(void) struct irq_desc *irq_to_desc(unsigned int irq) { - return (irq < NR_IRQS) ? irq_desc_ptrs[irq] : NULL; + return (irq < nr_irqs) ? 
irq_desc_ptrs[irq] : NULL; } struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu) @@ -160,9 +162,9 @@ struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu) unsigned long flags; int node; - if (irq >= NR_IRQS) { - printk(KERN_WARNING "irq >= NR_IRQS in irq_to_desc_alloc: %d %d\n", - irq, NR_IRQS); + if (irq >= nr_irqs) { + printk(KERN_WARNING "irq >= nr_irqs in irq_to_desc_alloc: %d %d\n", + irq, nr_irqs); WARN_ON(1); return NULL; } @@ -214,6 +216,8 @@ int __init early_irq_init(void) int count; int i; + printk(KERN_INFO "NR_IRQS:%d\n", NR_IRQS); + desc = irq_desc; count = ARRAY_SIZE(irq_desc); -- cgit v1.2.3 From e2f4d06545ec1f29b0e838ee34cbf3500ea5b9a4 Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Sat, 10 Jan 2009 22:24:06 -0800 Subject: irq: use WARN() instead of WARN_ON(). Impact: cleanup WARN msg. Ingo requested: > While at it, could you please also convert this to a WARN() construct > instead? (in a separate commit) ... and it shall be done. ;-) Signed-off-by: Mike Travis --- kernel/irq/handle.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 790c5fa7ea39..fd1ef16252f4 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -163,9 +163,8 @@ struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu) int node; if (irq >= nr_irqs) { - printk(KERN_WARNING "irq >= nr_irqs in irq_to_desc_alloc: %d %d\n", - irq, nr_irqs); - WARN_ON(1); + WARN(1, "irq (%d) >= nr_irqs (%d) in irq_to_desc_alloc\n", + irq, nr_irqs); return NULL; } -- cgit v1.2.3 From 0fa0ebbf15addc1be8f73325d809c8547a9de304 Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Sat, 10 Jan 2009 22:24:06 -0800 Subject: irq: allocate irq_desc_ptrs array based on nr_irqs Impact: allocate irq_desc_ptrs in preparation for making it variable-sized. 
This addresses this memory usage bump when NR_CPUS bumped from 128 to 4096: 34816 +229376 264192 +658% irq_desc_ptrs(.data.read_mostly) The patch is split into two parts: the first simply allocates the irq_desc_ptrs array; the next will deal with making it variable-sized. This is only when CONFIG_SPARSE_IRQ=y. Signed-off-by: Mike Travis --- kernel/irq/handle.c | 11 +++++++++-- kernel/irq/internals.h | 7 +++++++ 2 files changed, 16 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index fd1ef16252f4..d0b8f7e72790 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -17,6 +17,7 @@ #include #include #include +#include #include "internals.h" @@ -110,7 +111,7 @@ static void init_one_irq_desc(int irq, struct irq_desc *desc, int cpu) */ DEFINE_SPINLOCK(sparse_irq_lock); -struct irq_desc *irq_desc_ptrs[NR_IRQS] __read_mostly; +struct irq_desc **irq_desc_ptrs __read_mostly; static struct irq_desc irq_desc_legacy[NR_IRQS_LEGACY] __cacheline_aligned_in_smp = { [0 ... NR_IRQS_LEGACY-1] = { @@ -137,6 +138,9 @@ int __init early_irq_init(void) desc = irq_desc_legacy; legacy_count = ARRAY_SIZE(irq_desc_legacy); + /* allocate irq_desc_ptrs array based on nr_irqs */ + irq_desc_ptrs = alloc_bootmem(nr_irqs * sizeof(void *)); + for (i = 0; i < legacy_count; i++) { desc[i].irq = i; desc[i].kstat_irqs = kstat_irqs_legacy[i]; @@ -153,7 +157,10 @@ int __init early_irq_init(void) struct irq_desc *irq_to_desc(unsigned int irq) { - return (irq < nr_irqs) ? 
irq_desc_ptrs[irq] : NULL; + if (irq_desc_ptrs && irq < nr_irqs) + return irq_desc_ptrs[irq]; + + return NULL; } struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu) diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index e6d0a43cc125..40416a81a0f5 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -16,7 +16,14 @@ extern int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, extern struct lock_class_key irq_desc_lock_class; extern void init_kstat_irqs(struct irq_desc *desc, int cpu, int nr); extern spinlock_t sparse_irq_lock; + +#ifdef CONFIG_SPARSE_IRQ +/* irq_desc_ptrs allocated at boot time */ +extern struct irq_desc **irq_desc_ptrs; +#else +/* irq_desc_ptrs is a fixed size array */ extern struct irq_desc *irq_desc_ptrs[NR_IRQS]; +#endif #ifdef CONFIG_PROC_FS extern void register_irq_proc(unsigned int irq, struct irq_desc *desc); -- cgit v1.2.3 From 9332fccdedf8e09448f3b69b624211ae879f6c45 Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Sat, 10 Jan 2009 22:24:07 -0800 Subject: irq: initialize nr_irqs based on nr_cpu_ids Impact: Reduce memory usage. This is the second half of the changes to make the irq_desc_ptrs be variable sized based on nr_cpu_ids. This is done by adding a new "max_nr_irqs" macro to irq_vectors.h (and a dummy in irqnr.h) to return a max NR_IRQS value based on NR_CPUS or nr_cpu_ids. This necessitated moving the define of MAX_IO_APICS to a separate file (asm/apicnum.h) so it could be included without the baggage of the other asm/apicdef.h declarations. 
Signed-off-by: Mike Travis --- arch/x86/include/asm/apicdef.h | 8 ++------ arch/x86/include/asm/apicnum.h | 12 ++++++++++++ arch/x86/include/asm/irq_vectors.h | 16 +++++++++++----- include/linux/irqnr.h | 7 +++++++ kernel/irq/handle.c | 3 +++ 5 files changed, 35 insertions(+), 11 deletions(-) create mode 100644 arch/x86/include/asm/apicnum.h (limited to 'kernel') diff --git a/arch/x86/include/asm/apicdef.h b/arch/x86/include/asm/apicdef.h index 63134e31e8b9..1a6454ef7f6c 100644 --- a/arch/x86/include/asm/apicdef.h +++ b/arch/x86/include/asm/apicdef.h @@ -132,12 +132,8 @@ #define APIC_BASE_MSR 0x800 #define X2APIC_ENABLE (1UL << 10) -#ifdef CONFIG_X86_32 -# define MAX_IO_APICS 64 -#else -# define MAX_IO_APICS 128 -# define MAX_LOCAL_APIC 32768 -#endif +/* get MAX_IO_APICS */ +#include <asm/apicnum.h> /* * All x86-64 systems are xAPIC compatible. diff --git a/arch/x86/include/asm/apicnum.h b/arch/x86/include/asm/apicnum.h new file mode 100644 index 000000000000..82f613c607ce --- /dev/null +++ b/arch/x86/include/asm/apicnum.h @@ -0,0 +1,12 @@ +#ifndef _ASM_X86_APICNUM_H +#define _ASM_X86_APICNUM_H + +/* define MAX_IO_APICS */ +#ifdef CONFIG_X86_32 +# define MAX_IO_APICS 64 +#else +# define MAX_IO_APICS 128 +# define MAX_LOCAL_APIC 32768 +#endif + +#endif /* _ASM_X86_APICNUM_H */ diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h index f7ff65032b9d..602361ad0e74 100644 --- a/arch/x86/include/asm/irq_vectors.h +++ b/arch/x86/include/asm/irq_vectors.h @@ -105,6 +105,8 @@ #if defined(CONFIG_X86_IO_APIC) && !defined(CONFIG_X86_VOYAGER) +#include <asm/apicnum.h> /* need MAX_IO_APICS */ + #ifndef CONFIG_SPARSE_IRQ # if NR_CPUS < MAX_IO_APICS # define NR_IRQS (NR_VECTORS + (32 * NR_CPUS)) @@ -112,11 +114,15 @@ # define NR_IRQS (NR_VECTORS + (32 * MAX_IO_APICS)) # endif #else -# if (8 * NR_CPUS) > (32 * MAX_IO_APICS) -# define NR_IRQS (NR_VECTORS + (8 * NR_CPUS)) -# else -# define NR_IRQS (NR_VECTORS + (32 * MAX_IO_APICS)) -# endif + +/* defined as a macro so nr_irqs = 
max_nr_irqs(nr_cpu_ids) can be used */ +# define max_nr_irqs(nr_cpus) \ + ((8 * nr_cpus) > (32 * MAX_IO_APICS) ? \ + (NR_VECTORS + (8 * NR_CPUS)) : \ + (NR_VECTORS + (32 * MAX_IO_APICS))) \ + +# define NR_IRQS max_nr_irqs(NR_CPUS) + #endif #elif defined(CONFIG_X86_VOYAGER) diff --git a/include/linux/irqnr.h b/include/linux/irqnr.h index 86af92e9e84c..de66e4e10406 100644 --- a/include/linux/irqnr.h +++ b/include/linux/irqnr.h @@ -20,11 +20,18 @@ # define for_each_irq_desc_reverse(irq, desc) \ for (irq = nr_irqs - 1; irq >= 0; irq--) + #else /* CONFIG_GENERIC_HARDIRQS */ +#include <asm/irq_vectors.h> /* need possible max_nr_irqs() */ + extern int nr_irqs; extern struct irq_desc *irq_to_desc(unsigned int irq); +# ifndef max_nr_irqs +# define max_nr_irqs(nr_cpus) NR_IRQS +# endif + # define for_each_irq_desc(irq, desc) \ for (irq = 0, desc = irq_to_desc(irq); irq < nr_irqs; \ irq++, desc = irq_to_desc(irq)) \ diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index d0b8f7e72790..ebba7a116f14 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -133,6 +133,9 @@ int __init early_irq_init(void) int legacy_count; int i; + /* initialize nr_irqs based on nr_cpu_ids */ + nr_irqs = max_nr_irqs(nr_cpu_ids); + printk(KERN_INFO "NR_IRQS:%d nr_irqs:%d\n", NR_IRQS, nr_irqs); desc = irq_desc_legacy; -- cgit v1.2.3 From 542d865bbed4ce1f050f586e53cf1cfadda93766 Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Sat, 10 Jan 2009 22:24:07 -0800 Subject: kstat: modify kstat_irqs_legacy to be variable sized Impact: reduce memory usage. Allocate kstat_irqs_legacy based on nr_cpu_ids to deal with this memory usage bump when NR_CPUS bumped from 128 to 4096: 8192 +253952 262144 +3100% kstat_irqs_legacy(.bss) This is only when CONFIG_SPARSE_IRQ=y. 
Signed-off-by: Mike Travis --- kernel/irq/handle.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index ebba7a116f14..b39f32ac8f80 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -124,8 +124,7 @@ static struct irq_desc irq_desc_legacy[NR_IRQS_LEGACY] __cacheline_aligned_in_sm } }; -/* FIXME: use bootmem alloc ...*/ -static unsigned int kstat_irqs_legacy[NR_IRQS_LEGACY][NR_CPUS]; +static unsigned int *kstat_irqs_legacy; int __init early_irq_init(void) { @@ -144,9 +143,14 @@ int __init early_irq_init(void) /* allocate irq_desc_ptrs array based on nr_irqs */ irq_desc_ptrs = alloc_bootmem(nr_irqs * sizeof(void *)); + /* allocate based on nr_cpu_ids */ + /* FIXME: invert kstat_irgs, and it'd be a per_cpu_alloc'd thing */ + kstat_irqs_legacy = alloc_bootmem(NR_IRQS_LEGACY * nr_cpu_ids * + sizeof(int)); + for (i = 0; i < legacy_count; i++) { desc[i].irq = i; - desc[i].kstat_irqs = kstat_irqs_legacy[i]; + desc[i].kstat_irqs = kstat_irqs_legacy + i * nr_cpu_ids; lockdep_set_class(&desc[i].lock, &irq_desc_lock_class); init_alloc_desc_masks(&desc[i], 0, true); irq_desc_ptrs[i] = desc + i; -- cgit v1.2.3 From 92296c6d6e908c35fca287a21af27be814af9c75 Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Sun, 11 Jan 2009 09:22:58 -0800 Subject: cpumask, irq: non-x86 build failures Ingo Molnar wrote: > All non-x86 architectures fail to build: > > In file included from /home/mingo/tip/include/linux/random.h:11, > from /home/mingo/tip/include/linux/stackprotector.h:6, > from /home/mingo/tip/init/main.c:17: > /home/mingo/tip/include/linux/irqnr.h:26:63: error: asm/irq_vectors.h: No such file or directory Do not include asm/irq_vectors.h in generic code - it's not available on all architectures. 
Signed-off-by: Ingo Molnar --- arch/x86/include/asm/apicdef.h | 8 ++++++-- include/linux/irqnr.h | 6 ------ kernel/irq/handle.c | 5 +++++ 3 files changed, 11 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/arch/x86/include/asm/apicdef.h b/arch/x86/include/asm/apicdef.h index 1a6454ef7f6c..63134e31e8b9 100644 --- a/arch/x86/include/asm/apicdef.h +++ b/arch/x86/include/asm/apicdef.h @@ -132,8 +132,12 @@ #define APIC_BASE_MSR 0x800 #define X2APIC_ENABLE (1UL << 10) -/* get MAX_IO_APICS */ -#include <asm/apicnum.h> +#ifdef CONFIG_X86_32 +# define MAX_IO_APICS 64 +#else +# define MAX_IO_APICS 128 +# define MAX_LOCAL_APIC 32768 +#endif /* * All x86-64 systems are xAPIC compatible. diff --git a/include/linux/irqnr.h b/include/linux/irqnr.h index de66e4e10406..887477bc2ab0 100644 --- a/include/linux/irqnr.h +++ b/include/linux/irqnr.h @@ -23,15 +23,9 @@ #else /* CONFIG_GENERIC_HARDIRQS */ -#include <asm/irq_vectors.h> /* need possible max_nr_irqs() */ - extern int nr_irqs; extern struct irq_desc *irq_to_desc(unsigned int irq); -# ifndef max_nr_irqs -# define max_nr_irqs(nr_cpus) NR_IRQS -# endif - # define for_each_irq_desc(irq, desc) \ for (irq = 0, desc = irq_to_desc(irq); irq < nr_irqs; \ irq++, desc = irq_to_desc(irq)) \ diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index b39f32ac8f80..04d3e46031e5 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -58,6 +58,11 @@ int nr_irqs = NR_IRQS; EXPORT_SYMBOL_GPL(nr_irqs); #ifdef CONFIG_SPARSE_IRQ + +#ifndef max_nr_irqs +#define max_nr_irqs(nr_cpus) NR_IRQS +#endif + static struct irq_desc irq_desc_init = { .irq = -1, .status = IRQ_DISABLED, -- cgit v1.2.3 From 4a046d1754ee6ebb6f399696805ed61ea0444d4c Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Mon, 12 Jan 2009 17:39:24 -0800 Subject: x86: arch_probe_nr_irqs Impact: save RAM with large NR_CPUS, get smaller nr_irqs Signed-off-by: Yinghai Lu Signed-off-by: Mike Travis --- arch/x86/include/asm/irq_vectors.h | 7 ++----- arch/x86/kernel/io_apic.c | 16 ++++++++++++++++ 
include/linux/interrupt.h | 1 + kernel/irq/handle.c | 9 ++------- kernel/softirq.c | 5 +++++ 5 files changed, 26 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h index 602361ad0e74..a16a2ab2b429 100644 --- a/arch/x86/include/asm/irq_vectors.h +++ b/arch/x86/include/asm/irq_vectors.h @@ -115,14 +115,11 @@ # endif #else -/* defined as a macro so nr_irqs = max_nr_irqs(nr_cpu_ids) can be used */ -# define max_nr_irqs(nr_cpus) \ - ((8 * nr_cpus) > (32 * MAX_IO_APICS) ? \ +# define NR_IRQS \ + ((8 * NR_CPUS) > (32 * MAX_IO_APICS) ? \ (NR_VECTORS + (8 * NR_CPUS)) : \ (NR_VECTORS + (32 * MAX_IO_APICS))) \ -# define NR_IRQS max_nr_irqs(NR_CPUS) - #endif #elif defined(CONFIG_X86_VOYAGER) diff --git a/arch/x86/kernel/io_apic.c b/arch/x86/kernel/io_apic.c index ae80638012de..157986916cd1 100644 --- a/arch/x86/kernel/io_apic.c +++ b/arch/x86/kernel/io_apic.c @@ -3850,6 +3850,22 @@ void __init probe_nr_irqs_gsi(void) nr_irqs_gsi = nr; } +#ifdef CONFIG_SPARSE_IRQ +int __init arch_probe_nr_irqs(void) +{ + int nr; + + nr = ((8 * nr_cpu_ids) > (32 * nr_ioapics) ? 
+ (NR_VECTORS + (8 * nr_cpu_ids)) : + (NR_VECTORS + (32 * nr_ioapics))); + + if (nr < nr_irqs && nr > nr_irqs_gsi) + nr_irqs = nr; + + return 0; +} +#endif + /* -------------------------------------------------------------------------- ACPI-based IOAPIC Configuration -------------------------------------------------------------------------- */ diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index 9127f6b51a39..472f11765f60 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -467,6 +467,7 @@ int show_interrupts(struct seq_file *p, void *v); struct irq_desc; extern int early_irq_init(void); +extern int arch_probe_nr_irqs(void); extern int arch_early_irq_init(void); extern int arch_init_chip_data(struct irq_desc *desc, int cpu); diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 04d3e46031e5..375d68cd5bf0 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -59,10 +59,6 @@ EXPORT_SYMBOL_GPL(nr_irqs); #ifdef CONFIG_SPARSE_IRQ -#ifndef max_nr_irqs -#define max_nr_irqs(nr_cpus) NR_IRQS -#endif - static struct irq_desc irq_desc_init = { .irq = -1, .status = IRQ_DISABLED, @@ -137,9 +133,8 @@ int __init early_irq_init(void) int legacy_count; int i; - /* initialize nr_irqs based on nr_cpu_ids */ - nr_irqs = max_nr_irqs(nr_cpu_ids); - + /* initialize nr_irqs based on nr_cpu_ids */ + arch_probe_nr_irqs(); printk(KERN_INFO "NR_IRQS:%d nr_irqs:%d\n", NR_IRQS, nr_irqs); desc = irq_desc_legacy; diff --git a/kernel/softirq.c b/kernel/softirq.c index bdbe9de9cd8d..0365b4899a3d 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -795,6 +795,11 @@ int __init __weak early_irq_init(void) return 0; } +int __init __weak arch_probe_nr_irqs(void) +{ + return 0; +} + int __init __weak arch_early_irq_init(void) { return 0; -- cgit v1.2.3 From 68564a46976017496c2227660930d81240f82355 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Fri, 16 Jan 2009 15:31:15 -0800 Subject: work_on_cpu: don't try to get_online_cpus() in 
work_on_cpu. Impact: remove potential circular lock dependency with cpu hotplug lock This has caused more problems than it solved, with a pile of cpu hotplug locking issues. Followup patches will get_online_cpus() in callers that need it, but if they don't do it they're no worse than before when they were using set_cpus_allowed without locking. Signed-off-by: Rusty Russell Signed-off-by: Mike Travis --- kernel/workqueue.c | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 2f445833ae37..a35afdbc0161 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -991,8 +991,8 @@ static void do_work_for_cpu(struct work_struct *w) * @fn: the function to run * @arg: the function arg * - * This will return -EINVAL in the cpu is not online, or the return value - * of @fn otherwise. + * This will return the value @fn returns. + * It is up to the caller to ensure that the cpu doesn't go offline. */ long work_on_cpu(unsigned int cpu, long (*fn)(void *), void *arg) { @@ -1001,14 +1001,8 @@ long work_on_cpu(unsigned int cpu, long (*fn)(void *), void *arg) INIT_WORK(&wfc.work, do_work_for_cpu); wfc.fn = fn; wfc.arg = arg; - get_online_cpus(); - if (unlikely(!cpu_online(cpu))) - wfc.ret = -EINVAL; - else { - schedule_work_on(cpu, &wfc.work); - flush_work(&wfc.work); - } - put_online_cpus(); + schedule_work_on(cpu, &wfc.work); + flush_work(&wfc.work); return wfc.ret; } -- cgit v1.2.3 From e1d9ec6246a2668a5d037f529877efb7cf176af8 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Fri, 16 Jan 2009 15:31:15 -0800 Subject: work_on_cpu: Use our own workqueue. Impact: remove potential clashes with generic kevent workqueue Annoyingly, some places we want to use work_on_cpu are already in workqueues. As per Ingo's suggestion, we create a different workqueue for work_on_cpu. 
Signed-off-by: Rusty Russell Signed-off-by: Mike Travis --- kernel/workqueue.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index a35afdbc0161..1f0c509b40d3 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -971,6 +971,8 @@ undo: } #ifdef CONFIG_SMP +static struct workqueue_struct *work_on_cpu_wq __read_mostly; + struct work_for_cpu { struct work_struct work; long (*fn)(void *); @@ -1001,7 +1003,7 @@ long work_on_cpu(unsigned int cpu, long (*fn)(void *), void *arg) INIT_WORK(&wfc.work, do_work_for_cpu); wfc.fn = fn; wfc.arg = arg; - schedule_work_on(cpu, &wfc.work); + queue_work_on(cpu, work_on_cpu_wq, &wfc.work); flush_work(&wfc.work); return wfc.ret; @@ -1019,4 +1021,8 @@ void __init init_workqueues(void) hotcpu_notifier(workqueue_cpu_callback, 0); keventd_wq = create_workqueue("events"); BUG_ON(!keventd_wq); +#ifdef CONFIG_SMP + work_on_cpu_wq = create_workqueue("work_on_cpu"); + BUG_ON(!work_on_cpu_wq); +#endif } -- cgit v1.2.3 From 6cd61c0baa8bce32271226198b46c67a7a05d108 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 9 Feb 2009 22:17:39 +0900 Subject: elf: add ELF_CORE_COPY_KERNEL_REGS() ELF core dump is used for both user land core dump and kernel crash dump. Depending on architecture, register might need to be accessed differently for userland and kernel. Allow architectures to define ELF_CORE_COPY_KERNEL_REGS() and use different operation for kernel register dump. 
Signed-off-by: Tejun Heo Signed-off-by: Ingo Molnar --- include/linux/elfcore.h | 9 +++++++++ kernel/kexec.c | 2 +- 2 files changed, 10 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/linux/elfcore.h b/include/linux/elfcore.h index 5ca54d77079f..7605c5e9589f 100644 --- a/include/linux/elfcore.h +++ b/include/linux/elfcore.h @@ -111,6 +111,15 @@ static inline void elf_core_copy_regs(elf_gregset_t *elfregs, struct pt_regs *re #endif } +static inline void elf_core_copy_kernel_regs(elf_gregset_t *elfregs, struct pt_regs *regs) +{ +#ifdef ELF_CORE_COPY_KERNEL_REGS + ELF_CORE_COPY_KERNEL_REGS((*elfregs), regs); +#else + elf_core_copy_regs(elfregs, regs); +#endif +} + static inline int elf_core_copy_task_regs(struct task_struct *t, elf_gregset_t* elfregs) { #ifdef ELF_CORE_COPY_TASK_REGS diff --git a/kernel/kexec.c b/kernel/kexec.c index 8a6d7b08864e..795e7b67a228 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -1130,7 +1130,7 @@ void crash_save_cpu(struct pt_regs *regs, int cpu) return; memset(&prstatus, 0, sizeof(prstatus)); prstatus.pr_pid = current->pid; - elf_core_copy_regs(&prstatus.pr_reg, regs); + elf_core_copy_kernel_regs(&prstatus.pr_reg, regs); buf = append_elf_note(buf, KEXEC_CORE_NOTE_NAME, NT_PRSTATUS, &prstatus, sizeof(prstatus)); final_note(buf); -- cgit v1.2.3 From 5d707e9c8ef2a3596ed5c975c6ff05cec890c2b4 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 9 Feb 2009 22:17:39 +0900 Subject: stackprotector: update make rules Impact: no default -fno-stack-protector if stackp is enabled, cleanup Stackprotector make rules had the following problems. * cc support test and warning are scattered across makefile and kernel/panic.c. * -fno-stack-protector was always added regardless of configuration. Update such that cc support test and warning are contained in makefile and -fno-stack-protector is added iff stackp is turned off. While at it, prepare for 32bit support. 
Signed-off-by: Tejun Heo Signed-off-by: Ingo Molnar --- Makefile | 3 ++- arch/x86/Makefile | 17 ++++++++++------- kernel/panic.c | 4 ---- scripts/gcc-x86_64-has-stack-protector.sh | 4 +++- 4 files changed, 15 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/Makefile b/Makefile index 681c1d23b4d4..77a006dae2da 100644 --- a/Makefile +++ b/Makefile @@ -532,8 +532,9 @@ KBUILD_CFLAGS += $(call cc-option,-Wframe-larger-than=${CONFIG_FRAME_WARN}) endif # Force gcc to behave correct even for buggy distributions -# Arch Makefiles may override this setting +ifndef CONFIG_CC_STACKPROTECTOR KBUILD_CFLAGS += $(call cc-option, -fno-stack-protector) +endif ifdef CONFIG_FRAME_POINTER KBUILD_CFLAGS += -fno-omit-frame-pointer -fno-optimize-sibling-calls diff --git a/arch/x86/Makefile b/arch/x86/Makefile index cacee981d166..ab48ab497e5a 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -70,14 +70,17 @@ else # this works around some issues with generating unwind tables in older gccs # newer gccs do it by default KBUILD_CFLAGS += -maccumulate-outgoing-args +endif - stackp := $(CONFIG_SHELL) $(srctree)/scripts/gcc-x86_64-has-stack-protector.sh - stackp-$(CONFIG_CC_STACKPROTECTOR) := $(shell $(stackp) \ - "$(CC)" "-fstack-protector -DGCC_HAS_SP" ) - stackp-$(CONFIG_CC_STACKPROTECTOR_ALL) += $(shell $(stackp) \ - "$(CC)" -fstack-protector-all ) - - KBUILD_CFLAGS += $(stackp-y) +ifdef CONFIG_CC_STACKPROTECTOR + cc_has_sp := $(srctree)/scripts/gcc-x86_$(BITS)-has-stack-protector.sh + ifeq ($(shell $(CONFIG_SHELL) $(cc_has_sp) $(CC)),y) + stackp-y := -fstack-protector + stackp-$(CONFIG_CC_STACKPROTECTOR_ALL) += -fstack-protector-all + KBUILD_CFLAGS += $(stackp-y) + else + $(warning stack protector enabled but no compiler support) + endif endif # Stackpointer is addressed different for 32 bit and 64 bit x86 diff --git a/kernel/panic.c b/kernel/panic.c index 33cab3de1763..32fe4eff1b89 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -359,10 +359,6 @@ 
EXPORT_SYMBOL(warn_slowpath); #ifdef CONFIG_CC_STACKPROTECTOR -#ifndef GCC_HAS_SP -#warning You have selected the CONFIG_CC_STACKPROTECTOR option, but the gcc used does not support this. -#endif - /* * Called when gcc's -fstack-protector feature is used, and * gcc detects corruption of the on-stack canary value diff --git a/scripts/gcc-x86_64-has-stack-protector.sh b/scripts/gcc-x86_64-has-stack-protector.sh index 325c0a1b03b6..2d69fcdc5609 100644 --- a/scripts/gcc-x86_64-has-stack-protector.sh +++ b/scripts/gcc-x86_64-has-stack-protector.sh @@ -2,5 +2,7 @@ echo "int foo(void) { char X[200]; return 3; }" | $1 -S -xc -c -O0 -mcmodel=kernel -fstack-protector - -o - 2> /dev/null | grep -q "%gs" if [ "$?" -eq "0" ] ; then - echo $2 + echo y +else + echo n fi -- cgit v1.2.3 From 6b588c18f8dacfa6d7957c33c5ff832096e752d3 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 20 Feb 2009 16:29:07 +0900 Subject: module: reorder module pcpu related functions Impact: cleanup Move percpu_modinit() upwards. This is to ease further changes. 
Signed-off-by: Tejun Heo --- kernel/module.c | 33 ++++++++++++++++++--------------- 1 file changed, 18 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index ba22484a987e..52b3497b8748 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -480,21 +480,6 @@ static void percpu_modfree(void *freeme) } } -static unsigned int find_pcpusec(Elf_Ehdr *hdr, - Elf_Shdr *sechdrs, - const char *secstrings) -{ - return find_sec(hdr, sechdrs, secstrings, ".data.percpu"); -} - -static void percpu_modcopy(void *pcpudest, const void *from, unsigned long size) -{ - int cpu; - - for_each_possible_cpu(cpu) - memcpy(pcpudest + per_cpu_offset(cpu), from, size); -} - static int percpu_modinit(void) { pcpu_num_used = 2; @@ -513,7 +498,24 @@ static int percpu_modinit(void) return 0; } __initcall(percpu_modinit); + +static unsigned int find_pcpusec(Elf_Ehdr *hdr, + Elf_Shdr *sechdrs, + const char *secstrings) +{ + return find_sec(hdr, sechdrs, secstrings, ".data.percpu"); +} + +static void percpu_modcopy(void *pcpudest, const void *from, unsigned long size) +{ + int cpu; + + for_each_possible_cpu(cpu) + memcpy(pcpudest + per_cpu_offset(cpu), from, size); +} + #else /* ... !CONFIG_SMP */ + static inline void *percpu_modalloc(unsigned long size, unsigned long align, const char *name) { @@ -535,6 +537,7 @@ static inline void percpu_modcopy(void *pcpudst, const void *src, /* pcpusec should be 0, and size of that section should be 0. */ BUG_ON(size != 0); } + #endif /* CONFIG_SMP */ #define MODINFO_ATTR(field) \ -- cgit v1.2.3 From b36128c830a8f5bd7d4981f5b0b69950f5928ee6 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Fri, 20 Feb 2009 16:29:08 +0900 Subject: alloc_percpu: change percpu_ptr to per_cpu_ptr Impact: cleanup There are two allocated per-cpu accessor macros with almost identical spelling. The original and far more popular is per_cpu_ptr (44 files), so change over the other 4 files. 
tj: kill percpu_ptr() and update UP too Signed-off-by: Rusty Russell Cc: mingo@redhat.com Cc: lenb@kernel.org Cc: cpufreq@vger.kernel.org Signed-off-by: Tejun Heo --- arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c | 2 +- drivers/acpi/processor_perflib.c | 4 ++-- include/linux/percpu.h | 23 +++++++++++------------ kernel/sched.c | 6 +++--- kernel/stop_machine.c | 2 +- 5 files changed, 18 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c index 4b1c319d30c3..22590cf688ae 100644 --- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c +++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c @@ -601,7 +601,7 @@ static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy) if (!data) return -ENOMEM; - data->acpi_data = percpu_ptr(acpi_perf_data, cpu); + data->acpi_data = per_cpu_ptr(acpi_perf_data, cpu); per_cpu(drv_data, cpu) = data; if (cpu_has(c, X86_FEATURE_CONSTANT_TSC)) diff --git a/drivers/acpi/processor_perflib.c b/drivers/acpi/processor_perflib.c index 9cc769b587ff..68fd3d292799 100644 --- a/drivers/acpi/processor_perflib.c +++ b/drivers/acpi/processor_perflib.c @@ -516,12 +516,12 @@ int acpi_processor_preregister_performance( continue; } - if (!performance || !percpu_ptr(performance, i)) { + if (!performance || !per_cpu_ptr(performance, i)) { retval = -EINVAL; continue; } - pr->performance = percpu_ptr(performance, i); + pr->performance = per_cpu_ptr(performance, i); cpumask_set_cpu(i, pr->performance->shared_cpu_map); if (acpi_processor_get_psd(pr)) { retval = -EINVAL; diff --git a/include/linux/percpu.h b/include/linux/percpu.h index 3577ffd90d45..c80cfe1260ec 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -81,23 +81,13 @@ struct percpu_data { }; #define __percpu_disguise(pdata) (struct percpu_data *)~(unsigned long)(pdata) -/* - * Use this to get to a cpu's version of the per-cpu object dynamically - * allocated. 
Non-atomic access to the current CPU's version should - * probably be combined with get_cpu()/put_cpu(). - */ -#define percpu_ptr(ptr, cpu) \ -({ \ - struct percpu_data *__p = __percpu_disguise(ptr); \ - (__typeof__(ptr))__p->ptrs[(cpu)]; \ -}) extern void *__percpu_alloc_mask(size_t size, gfp_t gfp, cpumask_t *mask); extern void percpu_free(void *__pdata); #else /* CONFIG_SMP */ -#define percpu_ptr(ptr, cpu) ({ (void)(cpu); (ptr); }) +#define per_cpu_ptr(ptr, cpu) ({ (void)(cpu); (ptr); }) static __always_inline void *__percpu_alloc_mask(size_t size, gfp_t gfp, cpumask_t *mask) { @@ -122,6 +112,15 @@ static inline void percpu_free(void *__pdata) cpu_possible_map) #define alloc_percpu(type) (type *)__alloc_percpu(sizeof(type)) #define free_percpu(ptr) percpu_free((ptr)) -#define per_cpu_ptr(ptr, cpu) percpu_ptr((ptr), (cpu)) +/* + * Use this to get to a cpu's version of the per-cpu object dynamically + * allocated. Non-atomic access to the current CPU's version should + * probably be combined with get_cpu()/put_cpu(). 
+ */ +#define per_cpu_ptr(ptr, cpu) \ +({ \ + struct percpu_data *__p = __percpu_disguise(ptr); \ + (__typeof__(ptr))__p->ptrs[(cpu)]; \ +}) #endif /* __LINUX_PERCPU_H */ diff --git a/kernel/sched.c b/kernel/sched.c index fc17fd91ab57..9d30ac956328 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -9472,7 +9472,7 @@ cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp) static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu) { - u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu); + u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); u64 data; #ifndef CONFIG_64BIT @@ -9491,7 +9491,7 @@ static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu) static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val) { - u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu); + u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); #ifndef CONFIG_64BIT /* @@ -9587,7 +9587,7 @@ static void cpuacct_charge(struct task_struct *tsk, u64 cputime) ca = task_ca(tsk); for (; ca; ca = ca->parent) { - u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu); + u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); *cpuusage += cputime; } } diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 0cd415ee62a2..74541ca49536 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -170,7 +170,7 @@ int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus) * doesn't hit this CPU until we're ready. 
*/ get_cpu(); for_each_online_cpu(i) { - sm_work = percpu_ptr(stop_machine_work, i); + sm_work = per_cpu_ptr(stop_machine_work, i); INIT_WORK(sm_work, stop_cpu); queue_work_on(i, stop_machine_wq, sm_work); } -- cgit v1.2.3 From fbf59bc9d74d1fb30b8e0630743aff2806eafcea Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 20 Feb 2009 16:29:08 +0900 Subject: percpu: implement new dynamic percpu allocator Impact: new scalable dynamic percpu allocator which allows dynamic percpu areas to be accessed the same way as static ones Implement scalable dynamic percpu allocator which can be used for both static and dynamic percpu areas. This will allow static and dynamic areas to share faster direct access methods. This feature is optional and enabled only when CONFIG_HAVE_DYNAMIC_PER_CPU_AREA is defined by arch. Please read comment on top of mm/percpu.c for details. Signed-off-by: Tejun Heo Cc: Andrew Morton --- include/linux/percpu.h | 22 +- kernel/module.c | 31 ++ mm/Makefile | 4 + mm/percpu.c | 890 +++++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 943 insertions(+), 4 deletions(-) create mode 100644 mm/percpu.c (limited to 'kernel') diff --git a/include/linux/percpu.h b/include/linux/percpu.h index d99e24ae1811..18080995ff3e 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -76,23 +76,37 @@ #ifdef CONFIG_SMP -struct percpu_data { - void *ptrs[1]; -}; +#ifdef CONFIG_HAVE_DYNAMIC_PER_CPU_AREA -#define __percpu_disguise(pdata) (struct percpu_data *)~(unsigned long)(pdata) +extern void *pcpu_base_addr; +typedef void (*pcpu_populate_pte_fn_t)(unsigned long addr); + +extern size_t __init pcpu_setup_static(pcpu_populate_pte_fn_t populate_pte_fn, + struct page **pages, size_t cpu_size); /* * Use this to get to a cpu's version of the per-cpu object * dynamically allocated. Non-atomic access to the current CPU's * version should probably be combined with get_cpu()/put_cpu(). 
*/ +#define per_cpu_ptr(ptr, cpu) SHIFT_PERCPU_PTR((ptr), per_cpu_offset((cpu))) + +#else /* CONFIG_HAVE_DYNAMIC_PER_CPU_AREA */ + +struct percpu_data { + void *ptrs[1]; +}; + +#define __percpu_disguise(pdata) (struct percpu_data *)~(unsigned long)(pdata) + #define per_cpu_ptr(ptr, cpu) \ ({ \ struct percpu_data *__p = __percpu_disguise(ptr); \ (__typeof__(ptr))__p->ptrs[(cpu)]; \ }) +#endif /* CONFIG_HAVE_DYNAMIC_PER_CPU_AREA */ + extern void *__alloc_percpu(size_t size, size_t align); extern void free_percpu(void *__pdata); diff --git a/kernel/module.c b/kernel/module.c index 52b3497b8748..1f0657ae555b 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -51,6 +51,7 @@ #include #include #include +#include #if 0 #define DEBUGP printk @@ -366,6 +367,34 @@ static struct module *find_module(const char *name) } #ifdef CONFIG_SMP + +#ifdef CONFIG_HAVE_DYNAMIC_PER_CPU_AREA + +static void *percpu_modalloc(unsigned long size, unsigned long align, + const char *name) +{ + void *ptr; + + if (align > PAGE_SIZE) { + printk(KERN_WARNING "%s: per-cpu alignment %li > %li\n", + name, align, PAGE_SIZE); + align = PAGE_SIZE; + } + + ptr = __alloc_percpu(size, align); + if (!ptr) + printk(KERN_WARNING + "Could not allocate %lu bytes percpu data\n", size); + return ptr; +} + +static void percpu_modfree(void *freeme) +{ + free_percpu(freeme); +} + +#else /* ... !CONFIG_HAVE_DYNAMIC_PER_CPU_AREA */ + /* Number of blocks used and allocated. */ static unsigned int pcpu_num_used, pcpu_num_allocated; /* Size of each block. -ve means used. 
*/ @@ -499,6 +528,8 @@ static int percpu_modinit(void) } __initcall(percpu_modinit); +#endif /* CONFIG_HAVE_DYNAMIC_PER_CPU_AREA */ + static unsigned int find_pcpusec(Elf_Ehdr *hdr, Elf_Shdr *sechdrs, const char *secstrings) diff --git a/mm/Makefile b/mm/Makefile index 72255be57f89..818569b68f46 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -30,6 +30,10 @@ obj-$(CONFIG_FAILSLAB) += failslab.o obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o obj-$(CONFIG_FS_XIP) += filemap_xip.o obj-$(CONFIG_MIGRATION) += migrate.o +ifdef CONFIG_HAVE_DYNAMIC_PER_CPU_AREA +obj-$(CONFIG_SMP) += percpu.o +else obj-$(CONFIG_SMP) += allocpercpu.o +endif obj-$(CONFIG_QUICKLIST) += quicklist.o obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o diff --git a/mm/percpu.c b/mm/percpu.c new file mode 100644 index 000000000000..4617d97e877c --- /dev/null +++ b/mm/percpu.c @@ -0,0 +1,890 @@ +/* + * linux/mm/percpu.c - percpu memory allocator + * + * Copyright (C) 2009 SUSE Linux Products GmbH + * Copyright (C) 2009 Tejun Heo + * + * This file is released under the GPLv2. + * + * This is percpu allocator which can handle both static and dynamic + * areas. Percpu areas are allocated in chunks in vmalloc area. Each + * chunk is consisted of num_possible_cpus() units and the first chunk + * is used for static percpu variables in the kernel image (special + * boot time alloc/init handling necessary as these areas need to be + * brought up before allocation services are running). Unit grows as + * necessary and all units grow or shrink in unison. When a chunk is + * filled up, another chunk is allocated. ie. in vmalloc area + * + * c0 c1 c2 + * ------------------- ------------------- ------------ + * | u0 | u1 | u2 | u3 | | u0 | u1 | u2 | u3 | | u0 | u1 | u + * ------------------- ...... ------------------- .... ------------ + * + * Allocation is done in offset-size areas of single unit space. 
Ie, + * an area of 512 bytes at 6k in c1 occupies 512 bytes at 6k of c1:u0, + * c1:u1, c1:u2 and c1:u3. Percpu access can be done by configuring + * percpu base registers UNIT_SIZE apart. + * + * There are usually many small percpu allocations, many of them as + * small as 4 bytes. The allocator organizes chunks into lists + * according to free size and tries to allocate from the fullest one. + * Each chunk keeps the maximum contiguous area size hint which is + * guaranteed to be equal to or larger than the maximum contiguous + * area in the chunk. This helps the allocator not to iterate the + * chunk maps unnecessarily. + * + * Allocation state in each chunk is kept using an array of integers + * on chunk->map. A positive value in the map represents a free + * region and negative allocated. Allocation inside a chunk is done + * by scanning this map sequentially and serving the first matching + * entry. This is mostly copied from the percpu_modalloc() allocator. + * Chunks are also linked into a rb tree to ease address to chunk + * mapping during free. + * + * To use this allocator, arch code should do the following.
+ * + * - define CONFIG_HAVE_DYNAMIC_PER_CPU_AREA + * + * - define __addr_to_pcpu_ptr() and __pcpu_ptr_to_addr() to translate + * regular address to percpu pointer and back + * + * - use pcpu_setup_static() during percpu area initialization to + * setup kernel static percpu area + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#define PCPU_MIN_UNIT_PAGES_SHIFT 4 /* also max alloc size */ +#define PCPU_SLOT_BASE_SHIFT 5 /* 1-31 shares the same slot */ +#define PCPU_DFL_MAP_ALLOC 16 /* start a map with 16 ents */ + +struct pcpu_chunk { + struct list_head list; /* linked to pcpu_slot lists */ + struct rb_node rb_node; /* key is chunk->vm->addr */ + int free_size; /* free bytes in the chunk */ + int contig_hint; /* max contiguous size hint */ + struct vm_struct *vm; /* mapped vmalloc region */ + int map_used; /* # of map entries used */ + int map_alloc; /* # of map entries allocated */ + int *map; /* allocation map */ + struct page *page[]; /* #cpus * UNIT_PAGES */ +}; + +static int pcpu_unit_pages_shift; +static int pcpu_unit_pages; +static int pcpu_unit_shift; +static int pcpu_unit_size; +static int pcpu_chunk_size; +static int pcpu_nr_slots; +static size_t pcpu_chunk_struct_size; + +/* the address of the first chunk which starts with the kernel static area */ +void *pcpu_base_addr; +EXPORT_SYMBOL_GPL(pcpu_base_addr); + +/* the size of kernel static area */ +static int pcpu_static_size; + +/* + * One mutex to rule them all. + * + * The following mutex is grabbed in the outermost public alloc/free + * interface functions and released only when the operation is + * complete. As such, every function in this file other than the + * outermost functions are called under pcpu_mutex. + * + * It can easily be switched to use spinlock such that only the area + * allocation and page population commit are protected with it doing + * actual [de]allocation without holding any lock. 
However, given + * what this allocator does, I think it's better to let them run + * sequentially. + */ +static DEFINE_MUTEX(pcpu_mutex); + +static struct list_head *pcpu_slot; /* chunk list slots */ +static struct rb_root pcpu_addr_root = RB_ROOT; /* chunks by address */ + +static int pcpu_size_to_slot(int size) +{ + int highbit = fls(size); + return max(highbit - PCPU_SLOT_BASE_SHIFT + 2, 1); +} + +static int pcpu_chunk_slot(const struct pcpu_chunk *chunk) +{ + if (chunk->free_size < sizeof(int) || chunk->contig_hint < sizeof(int)) + return 0; + + return pcpu_size_to_slot(chunk->free_size); +} + +static int pcpu_page_idx(unsigned int cpu, int page_idx) +{ + return (cpu << pcpu_unit_pages_shift) + page_idx; +} + +static struct page **pcpu_chunk_pagep(struct pcpu_chunk *chunk, + unsigned int cpu, int page_idx) +{ + return &chunk->page[pcpu_page_idx(cpu, page_idx)]; +} + +static unsigned long pcpu_chunk_addr(struct pcpu_chunk *chunk, + unsigned int cpu, int page_idx) +{ + return (unsigned long)chunk->vm->addr + + (pcpu_page_idx(cpu, page_idx) << PAGE_SHIFT); +} + +static bool pcpu_chunk_page_occupied(struct pcpu_chunk *chunk, + int page_idx) +{ + return *pcpu_chunk_pagep(chunk, 0, page_idx) != NULL; +} + +/** + * pcpu_realloc - versatile realloc + * @p: the current pointer (can be NULL for new allocations) + * @size: the current size (can be 0 for new allocations) + * @new_size: the wanted new size (can be 0 for free) + * + * More robust realloc which can be used to allocate, resize or free a + * memory area of arbitrary size. If the needed size goes over + * PAGE_SIZE, kernel VM is used. + * + * RETURNS: + * The new pointer on success, NULL on failure. 
+ */ +static void *pcpu_realloc(void *p, size_t size, size_t new_size) +{ + void *new; + + if (new_size <= PAGE_SIZE) + new = kmalloc(new_size, GFP_KERNEL); + else + new = vmalloc(new_size); + if (new_size && !new) + return NULL; + + memcpy(new, p, min(size, new_size)); + if (new_size > size) + memset(new + size, 0, new_size - size); + + if (size <= PAGE_SIZE) + kfree(p); + else + vfree(p); + + return new; +} + +/** + * pcpu_chunk_relocate - put chunk in the appropriate chunk slot + * @chunk: chunk of interest + * @oslot: the previous slot it was on + * + * This function is called after an allocation or free changed @chunk. + * New slot according to the changed state is determined and @chunk is + * moved to the slot. + */ +static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot) +{ + int nslot = pcpu_chunk_slot(chunk); + + if (oslot != nslot) { + if (oslot < nslot) + list_move(&chunk->list, &pcpu_slot[nslot]); + else + list_move_tail(&chunk->list, &pcpu_slot[nslot]); + } +} + +static struct rb_node **pcpu_chunk_rb_search(void *addr, + struct rb_node **parentp) +{ + struct rb_node **p = &pcpu_addr_root.rb_node; + struct rb_node *parent = NULL; + struct pcpu_chunk *chunk; + + while (*p) { + parent = *p; + chunk = rb_entry(parent, struct pcpu_chunk, rb_node); + + if (addr < chunk->vm->addr) + p = &(*p)->rb_left; + else if (addr > chunk->vm->addr) + p = &(*p)->rb_right; + else + break; + } + + if (parentp) + *parentp = parent; + return p; +} + +/** + * pcpu_chunk_addr_search - search for chunk containing specified address + * @addr: address to search for + * + * Look for chunk which might contain @addr. More specifically, it + * searches for the chunk with the highest start address which isn't + * beyond @addr. + * + * RETURNS: + * The address of the found chunk.
+ */ +static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr) +{ + struct rb_node *n, *parent; + struct pcpu_chunk *chunk; + + n = *pcpu_chunk_rb_search(addr, &parent); + if (!n) { + /* no exactly matching chunk, the parent is the closest */ + n = parent; + BUG_ON(!n); + } + chunk = rb_entry(n, struct pcpu_chunk, rb_node); + + if (addr < chunk->vm->addr) { + /* the parent was the next one, look for the previous one */ + n = rb_prev(n); + BUG_ON(!n); + chunk = rb_entry(n, struct pcpu_chunk, rb_node); + } + + return chunk; +} + +/** + * pcpu_chunk_addr_insert - insert chunk into address rb tree + * @new: chunk to insert + * + * Insert @new into address rb tree. + */ +static void pcpu_chunk_addr_insert(struct pcpu_chunk *new) +{ + struct rb_node **p, *parent; + + p = pcpu_chunk_rb_search(new->vm->addr, &parent); + BUG_ON(*p); + rb_link_node(&new->rb_node, parent, p); + rb_insert_color(&new->rb_node, &pcpu_addr_root); +} + +/** + * pcpu_split_block - split a map block + * @chunk: chunk of interest + * @i: index of map block to split + * @head: head size (can be 0) + * @tail: tail size (can be 0) + * + * Split the @i'th map block into two or three blocks. If @head is + * non-zero, @head bytes block is inserted before block @i moving it + * to @i+1 and reducing its size by @head bytes. + * + * If @tail is non-zero, the target block, which can be @i or @i+1 + * depending on @head, is reduced by @tail bytes and @tail byte block + * is inserted after the target block. + * + * RETURNS: + * 0 on success, -errno on failure. + */ +static int pcpu_split_block(struct pcpu_chunk *chunk, int i, int head, int tail) +{ + int nr_extra = !!head + !!tail; + int target = chunk->map_used + nr_extra; + + /* reallocation required? 
*/ + if (chunk->map_alloc < target) { + int new_alloc = chunk->map_alloc; + int *new; + + while (new_alloc < target) + new_alloc *= 2; + + new = pcpu_realloc(chunk->map, + chunk->map_alloc * sizeof(new[0]), + new_alloc * sizeof(new[0])); + if (!new) + return -ENOMEM; + + chunk->map_alloc = new_alloc; + chunk->map = new; + } + + /* insert a new subblock */ + memmove(&chunk->map[i + nr_extra], &chunk->map[i], + sizeof(chunk->map[0]) * (chunk->map_used - i)); + chunk->map_used += nr_extra; + + if (head) { + chunk->map[i + 1] = chunk->map[i] - head; + chunk->map[i++] = head; + } + if (tail) { + chunk->map[i++] -= tail; + chunk->map[i] = tail; + } + return 0; +} + +/** + * pcpu_alloc_area - allocate area from a pcpu_chunk + * @chunk: chunk of interest + * @size: wanted size + * @align: wanted align + * + * Try to allocate @size bytes area aligned at @align from @chunk. + * Note that this function only allocates the offset. It doesn't + * populate or map the area. + * + * RETURNS: + * Allocated offset in @chunk on success, -errno on failure. + */ +static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align) +{ + int oslot = pcpu_chunk_slot(chunk); + int max_contig = 0; + int i, off; + + /* + * The static chunk initially doesn't have map attached + * because kmalloc wasn't available during init. Give it one. 
+ */ + if (unlikely(!chunk->map)) { + chunk->map = pcpu_realloc(NULL, 0, + PCPU_DFL_MAP_ALLOC * sizeof(chunk->map[0])); + if (!chunk->map) + return -ENOMEM; + + chunk->map_alloc = PCPU_DFL_MAP_ALLOC; + chunk->map[chunk->map_used++] = -pcpu_static_size; + if (chunk->free_size) + chunk->map[chunk->map_used++] = chunk->free_size; + } + + for (i = 0, off = 0; i < chunk->map_used; off += abs(chunk->map[i++])) { + bool is_last = i + 1 == chunk->map_used; + int head, tail; + + /* extra for alignment requirement */ + head = ALIGN(off, align) - off; + BUG_ON(i == 0 && head != 0); + + if (chunk->map[i] < 0) + continue; + if (chunk->map[i] < head + size) { + max_contig = max(chunk->map[i], max_contig); + continue; + } + + /* + * If head is small or the previous block is free, + * merge'em. Note that 'small' is defined as smaller + * than sizeof(int), which is very small but isn't too + * uncommon for percpu allocations. + */ + if (head && (head < sizeof(int) || chunk->map[i - 1] > 0)) { + if (chunk->map[i - 1] > 0) + chunk->map[i - 1] += head; + else { + chunk->map[i - 1] -= head; + chunk->free_size -= head; + } + chunk->map[i] -= head; + off += head; + head = 0; + } + + /* if tail is small, just keep it around */ + tail = chunk->map[i] - head - size; + if (tail < sizeof(int)) + tail = 0; + + /* split if warranted */ + if (head || tail) { + if (pcpu_split_block(chunk, i, head, tail)) + return -ENOMEM; + if (head) { + i++; + off += head; + max_contig = max(chunk->map[i - 1], max_contig); + } + if (tail) + max_contig = max(chunk->map[i + 1], max_contig); + } + + /* update hint and mark allocated */ + if (is_last) + chunk->contig_hint = max_contig; /* fully scanned */ + else + chunk->contig_hint = max(chunk->contig_hint, + max_contig); + + chunk->free_size -= chunk->map[i]; + chunk->map[i] = -chunk->map[i]; + + pcpu_chunk_relocate(chunk, oslot); + return off; + } + + chunk->contig_hint = max_contig; /* fully scanned */ + pcpu_chunk_relocate(chunk, oslot); + + /* + * Tell the 
upper layer that this chunk has no area left. + * Note that this is not an error condition but a notification + * to upper layer that it needs to look at other chunks. + * -ENOSPC is chosen as it isn't used in memory subsystem and + * matches the meaning in a way. + */ + return -ENOSPC; +} + +/** + * pcpu_free_area - free area to a pcpu_chunk + * @chunk: chunk of interest + * @freeme: offset of area to free + * + * Free area starting from @freeme to @chunk. Note that this function + * only modifies the allocation map. It doesn't depopulate or unmap + * the area. + */ +static void pcpu_free_area(struct pcpu_chunk *chunk, int freeme) +{ + int oslot = pcpu_chunk_slot(chunk); + int i, off; + + for (i = 0, off = 0; i < chunk->map_used; off += abs(chunk->map[i++])) + if (off == freeme) + break; + BUG_ON(off != freeme); + BUG_ON(chunk->map[i] > 0); + + chunk->map[i] = -chunk->map[i]; + chunk->free_size += chunk->map[i]; + + /* merge with previous? */ + if (i > 0 && chunk->map[i - 1] >= 0) { + chunk->map[i - 1] += chunk->map[i]; + chunk->map_used--; + memmove(&chunk->map[i], &chunk->map[i + 1], + (chunk->map_used - i) * sizeof(chunk->map[0])); + i--; + } + /* merge with next? */ + if (i + 1 < chunk->map_used && chunk->map[i + 1] >= 0) { + chunk->map[i] += chunk->map[i + 1]; + chunk->map_used--; + memmove(&chunk->map[i + 1], &chunk->map[i + 2], + (chunk->map_used - (i + 1)) * sizeof(chunk->map[0])); + } + + chunk->contig_hint = max(chunk->map[i], chunk->contig_hint); + pcpu_chunk_relocate(chunk, oslot); +} + +/** + * pcpu_unmap - unmap pages out of a pcpu_chunk + * @chunk: chunk of interest + * @page_start: page index of the first page to unmap + * @page_end: page index of the last page to unmap + 1 + * @flush: whether to flush cache and tlb or not + * + * For each cpu, unmap pages [@page_start,@page_end) out of @chunk. + * If @flush is true, vcache is flushed before unmapping and tlb + * after. 
+ */ +static void pcpu_unmap(struct pcpu_chunk *chunk, int page_start, int page_end, + bool flush) +{ + unsigned int last = num_possible_cpus() - 1; + unsigned int cpu; + + /* + * Each flushing trial can be very expensive, issue flush on + * the whole region at once rather than doing it for each cpu. + * This could be an overkill but is more scalable. + */ + if (flush) + flush_cache_vunmap(pcpu_chunk_addr(chunk, 0, page_start), + pcpu_chunk_addr(chunk, last, page_end)); + + for_each_possible_cpu(cpu) + unmap_kernel_range_noflush( + pcpu_chunk_addr(chunk, cpu, page_start), + (page_end - page_start) << PAGE_SHIFT); + + /* ditto as flush_cache_vunmap() */ + if (flush) + flush_tlb_kernel_range(pcpu_chunk_addr(chunk, 0, page_start), + pcpu_chunk_addr(chunk, last, page_end)); +} + +/** + * pcpu_depopulate_chunk - depopulate and unmap an area of a pcpu_chunk + * @chunk: chunk to depopulate + * @off: offset to the area to depopulate + * @size: size of the area to depopulate + * @flush: whether to flush cache and tlb or not + * + * For each cpu, depopulate and unmap pages [@page_start,@page_end) + * from @chunk. If @flush is true, vcache is flushed before unmapping + * and tlb after. + */ +static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, size_t off, + size_t size, bool flush) +{ + int page_start = PFN_DOWN(off); + int page_end = PFN_UP(off + size); + int unmap_start = -1; + int uninitialized_var(unmap_end); + unsigned int cpu; + int i; + + for (i = page_start; i < page_end; i++) { + for_each_possible_cpu(cpu) { + struct page **pagep = pcpu_chunk_pagep(chunk, cpu, i); + + if (!*pagep) + continue; + + __free_page(*pagep); + + /* + * If it's partial depopulation, it might get + * populated or depopulated again. Mark the + * page gone. + */ + *pagep = NULL; + + unmap_start = unmap_start < 0 ? 
i : unmap_start; + unmap_end = i + 1; + } + } + + if (unmap_start >= 0) + pcpu_unmap(chunk, unmap_start, unmap_end, flush); +} + +/** + * pcpu_map - map pages into a pcpu_chunk + * @chunk: chunk of interest + * @page_start: page index of the first page to map + * @page_end: page index of the last page to map + 1 + * + * For each cpu, map pages [@page_start,@page_end) into @chunk. + * vcache is flushed afterwards. + */ +static int pcpu_map(struct pcpu_chunk *chunk, int page_start, int page_end) +{ + unsigned int last = num_possible_cpus() - 1; + unsigned int cpu; + int err; + + for_each_possible_cpu(cpu) { + err = map_kernel_range_noflush( + pcpu_chunk_addr(chunk, cpu, page_start), + (page_end - page_start) << PAGE_SHIFT, + PAGE_KERNEL, + pcpu_chunk_pagep(chunk, cpu, page_start)); + if (err < 0) + return err; + } + + /* flush at once, please read comments in pcpu_unmap() */ + flush_cache_vmap(pcpu_chunk_addr(chunk, 0, page_start), + pcpu_chunk_addr(chunk, last, page_end)); + return 0; +} + +/** + * pcpu_populate_chunk - populate and map an area of a pcpu_chunk + * @chunk: chunk of interest + * @off: offset to the area to populate + * @size: size of the area to populate + * + * For each cpu, populate and map pages [@page_start,@page_end) into + * @chunk. The area is cleared on return. + */ +static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size) +{ + const gfp_t alloc_mask = GFP_KERNEL | __GFP_HIGHMEM | __GFP_COLD; + int page_start = PFN_DOWN(off); + int page_end = PFN_UP(off + size); + int map_start = -1; + int map_end; + unsigned int cpu; + int i; + + for (i = page_start; i < page_end; i++) { + if (pcpu_chunk_page_occupied(chunk, i)) { + if (map_start >= 0) { + if (pcpu_map(chunk, map_start, map_end)) + goto err; + map_start = -1; + } + continue; + } + + map_start = map_start < 0 ? 
i : map_start; + map_end = i + 1; + + for_each_possible_cpu(cpu) { + struct page **pagep = pcpu_chunk_pagep(chunk, cpu, i); + + *pagep = alloc_pages_node(cpu_to_node(cpu), + alloc_mask, 0); + if (!*pagep) + goto err; + } + } + + if (map_start >= 0 && pcpu_map(chunk, map_start, map_end)) + goto err; + + for_each_possible_cpu(cpu) + memset(chunk->vm->addr + (cpu << pcpu_unit_shift) + off, 0, + size); + + return 0; +err: + /* likely under heavy memory pressure, give memory back */ + pcpu_depopulate_chunk(chunk, off, size, true); + return -ENOMEM; +} + +static void free_pcpu_chunk(struct pcpu_chunk *chunk) +{ + if (!chunk) + return; + if (chunk->vm) + free_vm_area(chunk->vm); + pcpu_realloc(chunk->map, chunk->map_alloc * sizeof(chunk->map[0]), 0); + kfree(chunk); +} + +static struct pcpu_chunk *alloc_pcpu_chunk(void) +{ + struct pcpu_chunk *chunk; + + chunk = kzalloc(pcpu_chunk_struct_size, GFP_KERNEL); + if (!chunk) + return NULL; + + chunk->map = pcpu_realloc(NULL, 0, + PCPU_DFL_MAP_ALLOC * sizeof(chunk->map[0])); + chunk->map_alloc = PCPU_DFL_MAP_ALLOC; + chunk->map[chunk->map_used++] = pcpu_unit_size; + + chunk->vm = get_vm_area(pcpu_chunk_size, GFP_KERNEL); + if (!chunk->vm) { + free_pcpu_chunk(chunk); + return NULL; + } + + INIT_LIST_HEAD(&chunk->list); + chunk->free_size = pcpu_unit_size; + chunk->contig_hint = pcpu_unit_size; + + return chunk; +} + +/** + * __alloc_percpu - allocate percpu area + * @size: size of area to allocate + * @align: alignment of area (max PAGE_SIZE) + * + * Allocate percpu area of @size bytes aligned at @align. Might + * sleep. Might trigger writeouts. + * + * RETURNS: + * Percpu pointer to the allocated area on success, NULL on failure. 
+ */ +void *__alloc_percpu(size_t size, size_t align) +{ + void *ptr = NULL; + struct pcpu_chunk *chunk; + int slot, off; + + if (unlikely(!size || size > PAGE_SIZE << PCPU_MIN_UNIT_PAGES_SHIFT || + align > PAGE_SIZE)) { + WARN(true, "illegal size (%zu) or align (%zu) for " + "percpu allocation\n", size, align); + return NULL; + } + + mutex_lock(&pcpu_mutex); + + /* allocate area */ + for (slot = pcpu_size_to_slot(size); slot < pcpu_nr_slots; slot++) { + list_for_each_entry(chunk, &pcpu_slot[slot], list) { + if (size > chunk->contig_hint) + continue; + off = pcpu_alloc_area(chunk, size, align); + if (off >= 0) + goto area_found; + if (off != -ENOSPC) + goto out_unlock; + } + } + + /* hmmm... no space left, create a new chunk */ + chunk = alloc_pcpu_chunk(); + if (!chunk) + goto out_unlock; + pcpu_chunk_relocate(chunk, -1); + pcpu_chunk_addr_insert(chunk); + + off = pcpu_alloc_area(chunk, size, align); + if (off < 0) + goto out_unlock; + +area_found: + /* populate, map and clear the area */ + if (pcpu_populate_chunk(chunk, off, size)) { + pcpu_free_area(chunk, off); + goto out_unlock; + } + + ptr = __addr_to_pcpu_ptr(chunk->vm->addr + off); +out_unlock: + mutex_unlock(&pcpu_mutex); + return ptr; +} +EXPORT_SYMBOL_GPL(__alloc_percpu); + +static void pcpu_kill_chunk(struct pcpu_chunk *chunk) +{ + pcpu_depopulate_chunk(chunk, 0, pcpu_unit_size, false); + list_del(&chunk->list); + rb_erase(&chunk->rb_node, &pcpu_addr_root); + free_pcpu_chunk(chunk); +} + +/** + * free_percpu - free percpu area + * @ptr: pointer to area to free + * + * Free percpu area @ptr. Might sleep. 
+ */ +void free_percpu(void *ptr) +{ + void *addr = __pcpu_ptr_to_addr(ptr); + struct pcpu_chunk *chunk; + int off; + + if (!ptr) + return; + + mutex_lock(&pcpu_mutex); + + chunk = pcpu_chunk_addr_search(addr); + off = addr - chunk->vm->addr; + + pcpu_free_area(chunk, off); + + /* the chunk became fully free, kill one if there are other free ones */ + if (chunk->free_size == pcpu_unit_size) { + struct pcpu_chunk *pos; + + list_for_each_entry(pos, + &pcpu_slot[pcpu_chunk_slot(chunk)], list) + if (pos != chunk) { + pcpu_kill_chunk(pos); + break; + } + } + + mutex_unlock(&pcpu_mutex); +} +EXPORT_SYMBOL_GPL(free_percpu); + +/** + * pcpu_setup_static - initialize kernel static percpu area + * @populate_pte_fn: callback to allocate pagetable + * @pages: num_possible_cpus() * PFN_UP(cpu_size) pages + * + * Initialize kernel static percpu area. The caller should allocate + * all the necessary pages and pass them in @pages. + * @populate_pte_fn() is called on each page to be used for percpu + * mapping and is responsible for making sure all the necessary page + * tables for the page is allocated. + * + * RETURNS: + * The determined pcpu_unit_size which can be used to initialize + * percpu access. 
+ */ +size_t __init pcpu_setup_static(pcpu_populate_pte_fn_t populate_pte_fn, + struct page **pages, size_t cpu_size) +{ + static struct vm_struct static_vm; + struct pcpu_chunk *static_chunk; + int nr_cpu_pages = DIV_ROUND_UP(cpu_size, PAGE_SIZE); + unsigned int cpu; + int err, i; + + pcpu_unit_pages_shift = max_t(int, PCPU_MIN_UNIT_PAGES_SHIFT, + order_base_2(cpu_size) - PAGE_SHIFT); + + pcpu_static_size = cpu_size; + pcpu_unit_pages = 1 << pcpu_unit_pages_shift; + pcpu_unit_shift = PAGE_SHIFT + pcpu_unit_pages_shift; + pcpu_unit_size = 1 << pcpu_unit_shift; + pcpu_chunk_size = num_possible_cpus() * pcpu_unit_size; + pcpu_nr_slots = pcpu_size_to_slot(pcpu_unit_size) + 1; + pcpu_chunk_struct_size = sizeof(struct pcpu_chunk) + + (1 << pcpu_unit_pages_shift) * sizeof(struct page *); + + /* allocate chunk slots */ + pcpu_slot = alloc_bootmem(pcpu_nr_slots * sizeof(pcpu_slot[0])); + for (i = 0; i < pcpu_nr_slots; i++) + INIT_LIST_HEAD(&pcpu_slot[i]); + + /* init and register vm area */ + static_vm.flags = VM_ALLOC; + static_vm.size = pcpu_chunk_size; + vm_area_register_early(&static_vm); + + /* init static_chunk */ + static_chunk = alloc_bootmem(pcpu_chunk_struct_size); + INIT_LIST_HEAD(&static_chunk->list); + static_chunk->vm = &static_vm; + static_chunk->free_size = pcpu_unit_size - pcpu_static_size; + static_chunk->contig_hint = static_chunk->free_size; + + /* assign pages and map them */ + for_each_possible_cpu(cpu) { + for (i = 0; i < nr_cpu_pages; i++) { + *pcpu_chunk_pagep(static_chunk, cpu, i) = *pages++; + populate_pte_fn(pcpu_chunk_addr(static_chunk, cpu, i)); + } + } + + err = pcpu_map(static_chunk, 0, nr_cpu_pages); + if (err) + panic("failed to setup static percpu area, err=%d\n", err); + + /* link static_chunk in */ + pcpu_chunk_relocate(static_chunk, -1); + pcpu_chunk_addr_insert(static_chunk); + + /* we're done */ + pcpu_base_addr = (void *)pcpu_chunk_addr(static_chunk, 0, 0); + return pcpu_unit_size; +} -- cgit v1.2.3 From 
edcb463997ed7b2ffa3bac76e3e75957318f2e01 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 6 Mar 2009 14:33:59 +0900 Subject: percpu, module: implement reserved allocation and use it for module percpu variables Impact: add reserved allocation functionality and use it for module percpu variables This patch implements reserved allocation from the first chunk. When setting up the first chunk, arch can ask to set aside a certain number of bytes right after the core static area which is available only through a separate reserved allocator. This will be used primarily for module static percpu variables on architectures with limited relocation range to ensure that the module percpu symbols are inside the relocatable range. If reserved area is requested, the first chunk becomes reserved and isn't available for regular allocation. If the first chunk also includes piggy-back dynamic allocation area, a separate chunk mapping the same region is created to serve dynamic allocation. The first one is called static first chunk and the second dynamic first chunk. Although they share the page map, their different area map initializations guarantee they serve disjoint areas according to their purposes. If arch doesn't set up reserved area, reserved allocation is handled like any other allocation. 
Signed-off-by: Tejun Heo --- arch/x86/kernel/setup_percpu.c | 8 +-- include/linux/percpu.h | 10 +-- kernel/module.c | 2 +- mm/percpu.c | 153 +++++++++++++++++++++++++++++++++++------ 4 files changed, 144 insertions(+), 29 deletions(-) (limited to 'kernel') diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 38e2b2a470a5..dd4eabc747c8 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -217,7 +217,7 @@ proceed: pr_info("PERCPU: Remapped at %p with large pages, static data " "%zu bytes\n", vm.addr, static_size); - ret = pcpu_setup_first_chunk(pcpur_get_page, static_size, PMD_SIZE, + ret = pcpu_setup_first_chunk(pcpur_get_page, static_size, 0, PMD_SIZE, pcpur_size - static_size, vm.addr, NULL); goto out_free_ar; @@ -297,7 +297,7 @@ static ssize_t __init setup_pcpu_embed(size_t static_size) pr_info("PERCPU: Embedded %zu pages at %p, static data %zu bytes\n", pcpue_size >> PAGE_SHIFT, pcpue_ptr, static_size); - return pcpu_setup_first_chunk(pcpue_get_page, static_size, + return pcpu_setup_first_chunk(pcpue_get_page, static_size, 0, pcpue_unit_size, dyn_size, pcpue_ptr, NULL); } @@ -356,8 +356,8 @@ static ssize_t __init setup_pcpu_4k(size_t static_size) pr_info("PERCPU: Allocated %d 4k pages, static data %zu bytes\n", pcpu4k_nr_static_pages, static_size); - ret = pcpu_setup_first_chunk(pcpu4k_get_page, static_size, -1, -1, NULL, - pcpu4k_populate_pte); + ret = pcpu_setup_first_chunk(pcpu4k_get_page, static_size, 0, -1, -1, + NULL, pcpu4k_populate_pte); goto out_free_ar; enomem: diff --git a/include/linux/percpu.h b/include/linux/percpu.h index a96fc53bbd62..8ff15153ae20 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -117,10 +117,10 @@ typedef struct page * (*pcpu_get_page_fn_t)(unsigned int cpu, int pageno); typedef void (*pcpu_populate_pte_fn_t)(unsigned long addr); extern size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, - size_t static_size, - ssize_t unit_size, 
ssize_t dyn_size, - void *base_addr, - pcpu_populate_pte_fn_t populate_pte_fn); + size_t static_size, size_t reserved_size, + ssize_t unit_size, ssize_t dyn_size, + void *base_addr, + pcpu_populate_pte_fn_t populate_pte_fn); /* * Use this to get to a cpu's version of the per-cpu object @@ -129,6 +129,8 @@ extern size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, */ #define per_cpu_ptr(ptr, cpu) SHIFT_PERCPU_PTR((ptr), per_cpu_offset((cpu))) +extern void *__alloc_reserved_percpu(size_t size, size_t align); + #else /* CONFIG_HAVE_DYNAMIC_PER_CPU_AREA */ struct percpu_data { diff --git a/kernel/module.c b/kernel/module.c index 1f0657ae555b..f0e04d6b67d8 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -381,7 +381,7 @@ static void *percpu_modalloc(unsigned long size, unsigned long align, align = PAGE_SIZE; } - ptr = __alloc_percpu(size, align); + ptr = __alloc_reserved_percpu(size, align); if (!ptr) printk(KERN_WARNING "Could not allocate %lu bytes percpu data\n", size); diff --git a/mm/percpu.c b/mm/percpu.c index 5b47d9fe65f5..ef8e169b7731 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -94,6 +94,11 @@ static size_t pcpu_chunk_struct_size __read_mostly; void *pcpu_base_addr __read_mostly; EXPORT_SYMBOL_GPL(pcpu_base_addr); +/* optional reserved chunk, only accessible for reserved allocations */ +static struct pcpu_chunk *pcpu_reserved_chunk; +/* offset limit of the reserved chunk */ +static int pcpu_reserved_chunk_limit; + /* * One mutex to rule them all. * @@ -201,13 +206,14 @@ static void *pcpu_realloc(void *p, size_t size, size_t new_size) * * This function is called after an allocation or free changed @chunk. * New slot according to the changed state is determined and @chunk is - * moved to the slot. + * moved to the slot. Note that the reserved chunk is never put on + * chunk slots. 
*/ static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot) { int nslot = pcpu_chunk_slot(chunk); - if (oslot != nslot) { + if (chunk != pcpu_reserved_chunk && oslot != nslot) { if (oslot < nslot) list_move(&chunk->list, &pcpu_slot[nslot]); else @@ -255,6 +261,15 @@ static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr) struct rb_node *n, *parent; struct pcpu_chunk *chunk; + /* is it in the reserved chunk? */ + if (pcpu_reserved_chunk) { + void *start = pcpu_reserved_chunk->vm->addr; + + if (addr >= start && addr < start + pcpu_reserved_chunk_limit) + return pcpu_reserved_chunk; + } + + /* nah... search the regular ones */ n = *pcpu_chunk_rb_search(addr, &parent); if (!n) { /* no exactly matching chunk, the parent is the closest */ @@ -713,9 +728,10 @@ static struct pcpu_chunk *alloc_pcpu_chunk(void) } /** - * __alloc_percpu - allocate percpu area + * pcpu_alloc - the percpu allocator * @size: size of area to allocate in bytes * @align: alignment of area (max PAGE_SIZE) + * @reserved: allocate from the reserved chunk if available * * Allocate percpu area of @size bytes aligned at @align. Might * sleep. Might trigger writeouts. @@ -723,7 +739,7 @@ static struct pcpu_chunk *alloc_pcpu_chunk(void) * RETURNS: * Percpu pointer to the allocated area on success, NULL on failure. 
*/ -void *__alloc_percpu(size_t size, size_t align) +static void *pcpu_alloc(size_t size, size_t align, bool reserved) { void *ptr = NULL; struct pcpu_chunk *chunk; @@ -737,7 +753,18 @@ void *__alloc_percpu(size_t size, size_t align) mutex_lock(&pcpu_mutex); - /* allocate area */ + /* serve reserved allocations from the reserved chunk if available */ + if (reserved && pcpu_reserved_chunk) { + chunk = pcpu_reserved_chunk; + if (size > chunk->contig_hint) + goto out_unlock; + off = pcpu_alloc_area(chunk, size, align); + if (off >= 0) + goto area_found; + goto out_unlock; + } + + /* search through normal chunks */ for (slot = pcpu_size_to_slot(size); slot < pcpu_nr_slots; slot++) { list_for_each_entry(chunk, &pcpu_slot[slot], list) { if (size > chunk->contig_hint) @@ -773,8 +800,41 @@ out_unlock: mutex_unlock(&pcpu_mutex); return ptr; } + +/** + * __alloc_percpu - allocate dynamic percpu area + * @size: size of area to allocate in bytes + * @align: alignment of area (max PAGE_SIZE) + * + * Allocate percpu area of @size bytes aligned at @align. Might + * sleep. Might trigger writeouts. + * + * RETURNS: + * Percpu pointer to the allocated area on success, NULL on failure. + */ +void *__alloc_percpu(size_t size, size_t align) +{ + return pcpu_alloc(size, align, false); +} EXPORT_SYMBOL_GPL(__alloc_percpu); +/** + * __alloc_reserved_percpu - allocate reserved percpu area + * @size: size of area to allocate in bytes + * @align: alignment of area (max PAGE_SIZE) + * + * Allocate percpu area of @size bytes aligned at @align from reserved + * percpu area if arch has set it up; otherwise, allocation is served + * from the same dynamic area. Might sleep. Might trigger writeouts. + * + * RETURNS: + * Percpu pointer to the allocated area on success, NULL on failure. 
+ */ +void *__alloc_reserved_percpu(size_t size, size_t align) +{ + return pcpu_alloc(size, align, true); +} + static void pcpu_kill_chunk(struct pcpu_chunk *chunk) { WARN_ON(chunk->immutable); @@ -826,6 +886,7 @@ EXPORT_SYMBOL_GPL(free_percpu); * pcpu_setup_first_chunk - initialize the first percpu chunk * @get_page_fn: callback to fetch page pointer * @static_size: the size of static percpu area in bytes + * @reserved_size: the size of reserved percpu area in bytes * @unit_size: unit size in bytes, must be multiple of PAGE_SIZE, -1 for auto * @dyn_size: free size for dynamic allocation in bytes, -1 for auto * @base_addr: mapped address, NULL for auto @@ -844,14 +905,22 @@ EXPORT_SYMBOL_GPL(free_percpu); * indicates end of pages for the cpu. Note that @get_page_fn() must * return the same number of pages for all cpus. * + * @reserved_size, if non-zero, specifies the amount of bytes to + * reserve after the static area in the first chunk. This reserves + * the first chunk such that it's available only through reserved + * percpu allocation. This is primarily used to serve module percpu + * static areas on architectures where the addressing model has + * limited offset range for symbol relocations to guarantee module + * percpu symbols fall inside the relocatable range. + * * @unit_size, if non-negative, specifies unit size and must be * aligned to PAGE_SIZE and equal to or larger than @static_size + - * @dyn_size. + * @reserved_size + @dyn_size. * * @dyn_size, if non-negative, limits the number of bytes available * for dynamic allocation in the first chunk. Specifying non-negative * value make percpu leave alone the area beyond @static_size + - * @dyn_size. + * @reserved_size + @dyn_size. * * Non-null @base_addr means that the caller already allocated virtual * region for the first chunk and mapped it. percpu must not mess @@ -861,28 +930,36 @@ EXPORT_SYMBOL_GPL(free_percpu); * @populate_pte_fn is used to populate the pagetable. 
NULL means the * caller already populated the pagetable. * + * If the first chunk ends up with both reserved and dynamic areas, it + * is served by two chunks - one to serve the core static and reserved + * areas and the other for the dynamic area. They share the same vm + * and page map but uses different area allocation map to stay away + * from each other. The latter chunk is circulated in the chunk slots + * and available for dynamic allocation like any other chunks. + * * RETURNS: * The determined pcpu_unit_size which can be used to initialize * percpu access. */ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, - size_t static_size, + size_t static_size, size_t reserved_size, ssize_t unit_size, ssize_t dyn_size, void *base_addr, pcpu_populate_pte_fn_t populate_pte_fn) { static struct vm_struct first_vm; - static int smap[2]; - struct pcpu_chunk *schunk; + static int smap[2], dmap[2]; + struct pcpu_chunk *schunk, *dchunk = NULL; unsigned int cpu; int nr_pages; int err, i; /* santiy checks */ - BUILD_BUG_ON(ARRAY_SIZE(smap) >= PCPU_DFL_MAP_ALLOC); + BUILD_BUG_ON(ARRAY_SIZE(smap) >= PCPU_DFL_MAP_ALLOC || + ARRAY_SIZE(dmap) >= PCPU_DFL_MAP_ALLOC); BUG_ON(!static_size); if (unit_size >= 0) { - BUG_ON(unit_size < static_size + + BUG_ON(unit_size < static_size + reserved_size + (dyn_size >= 0 ? 
dyn_size : 0)); BUG_ON(unit_size & ~PAGE_MASK); } else { @@ -895,7 +972,7 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, pcpu_unit_pages = unit_size >> PAGE_SHIFT; else pcpu_unit_pages = max_t(int, PCPU_MIN_UNIT_SIZE >> PAGE_SHIFT, - PFN_UP(static_size)); + PFN_UP(static_size + reserved_size)); pcpu_unit_size = pcpu_unit_pages << PAGE_SHIFT; pcpu_chunk_size = num_possible_cpus() * pcpu_unit_size; @@ -903,7 +980,7 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, + num_possible_cpus() * pcpu_unit_pages * sizeof(struct page *); if (dyn_size < 0) - dyn_size = pcpu_unit_size - static_size; + dyn_size = pcpu_unit_size - static_size - reserved_size; /* * Allocate chunk slots. The additional last slot is for @@ -914,20 +991,49 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, for (i = 0; i < pcpu_nr_slots; i++) INIT_LIST_HEAD(&pcpu_slot[i]); - /* init static chunk */ + /* + * Initialize static chunk. If reserved_size is zero, the + * static chunk covers static area + dynamic allocation area + * in the first chunk. If reserved_size is not zero, it + * covers static area + reserved area (mostly used for module + * static percpu allocation). 
+ */ schunk = alloc_bootmem(pcpu_chunk_struct_size); INIT_LIST_HEAD(&schunk->list); schunk->vm = &first_vm; schunk->map = smap; schunk->map_alloc = ARRAY_SIZE(smap); schunk->page = schunk->page_ar; - schunk->free_size = dyn_size; + + if (reserved_size) { + schunk->free_size = reserved_size; + pcpu_reserved_chunk = schunk; /* not for dynamic alloc */ + } else { + schunk->free_size = dyn_size; + dyn_size = 0; /* dynamic area covered */ + } schunk->contig_hint = schunk->free_size; schunk->map[schunk->map_used++] = -static_size; if (schunk->free_size) schunk->map[schunk->map_used++] = schunk->free_size; + pcpu_reserved_chunk_limit = static_size + schunk->free_size; + + /* init dynamic chunk if necessary */ + if (dyn_size) { + dchunk = alloc_bootmem(sizeof(struct pcpu_chunk)); + INIT_LIST_HEAD(&dchunk->list); + dchunk->vm = &first_vm; + dchunk->map = dmap; + dchunk->map_alloc = ARRAY_SIZE(dmap); + dchunk->page = schunk->page_ar; /* share page map with schunk */ + + dchunk->contig_hint = dchunk->free_size = dyn_size; + dchunk->map[dchunk->map_used++] = -pcpu_reserved_chunk_limit; + dchunk->map[dchunk->map_used++] = dchunk->free_size; + } + /* allocate vm address */ first_vm.flags = VM_ALLOC; first_vm.size = pcpu_chunk_size; @@ -937,12 +1043,14 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, else { /* * Pages already mapped. No need to remap into - * vmalloc area. In this case the static chunk can't - * be mapped or unmapped by percpu and is marked + * vmalloc area. In this case the first chunks can't + * be mapped or unmapped by percpu and are marked * immutable. 
*/ first_vm.addr = base_addr; schunk->immutable = true; + if (dchunk) + dchunk->immutable = true; } /* assign pages */ @@ -978,8 +1086,13 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, } /* link the first chunk in */ - pcpu_chunk_relocate(schunk, -1); - pcpu_chunk_addr_insert(schunk); + if (!dchunk) { + pcpu_chunk_relocate(schunk, -1); + pcpu_chunk_addr_insert(schunk); + } else { + pcpu_chunk_relocate(dchunk, -1); + pcpu_chunk_addr_insert(dchunk); + } /* we're done */ pcpu_base_addr = (void *)pcpu_chunk_addr(schunk, 0, 0); -- cgit v1.2.3