From 59ea746337c69f6a5f1bc4d5e8544b3cbf12f801 Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Thu, 12 Jun 2008 13:56:40 +0200 Subject: MM: virtual address debug Add some (configurable) expensive sanity checking to catch wrong address translations on x86. - create linux/mmdebug.h file to be able include this file in asm headers to not get unsolvable loops in header files - __phys_addr on x86_32 became a function in ioremap.c since PAGE_OFFSET, is_vmalloc_addr and VMALLOC_* non-constasts are undefined if declared in page_32.h - add __phys_addr_const for initializing doublefault_tss.__cr3 Tested on 386, 386pae, x86_64 and x86_64 numa=fake=2. Contains Andi's enable numa virtual address debug patch. Signed-off-by: Jiri Slaby Cc: Andi Kleen Signed-off-by: Ingo Molnar --- lib/Kconfig.debug | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'lib') diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index d2099f41aa1e..9d9dc0ddf13a 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -469,6 +469,15 @@ config DEBUG_VM If unsure, say N. +config DEBUG_VIRTUAL + bool "Debug VM translations" + depends on DEBUG_KERNEL && X86 + help + Enable some costly sanity checks in virtual to page code. This can + catch mistakes with virt_to_page() and friends. + + If unsure, say N. + config DEBUG_WRITECOUNT bool "Debug filesystem writers count" depends on DEBUG_KERNEL -- cgit v1.2.3 From d974ae379a2fbe8948f01eabbc6b19c0a80f09bc Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Thu, 24 Jul 2008 16:27:46 -0700 Subject: generic, memparse(): constify argument memparse()'s first argument can be const, so it should be. Signed-off-by: Jeremy Fitzhardinge Cc: Andrew Morton Signed-off-by: Ingo Molnar --- include/linux/kernel.h | 2 +- lib/cmdline.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'lib') diff --git a/include/linux/kernel.h b/include/linux/kernel.h index fdbbf72ca2eb..7889c2f9b75d 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -176,7 +176,7 @@ extern int vsscanf(const char *, const char *, va_list) extern int get_option(char **str, int *pint); extern char *get_options(const char *str, int nints, int *ints); -extern unsigned long long memparse(char *ptr, char **retptr); +extern unsigned long long memparse(const char *ptr, char **retptr); extern int core_kernel_text(unsigned long addr); extern int __kernel_text_address(unsigned long addr); diff --git a/lib/cmdline.c b/lib/cmdline.c index 5ba8a942a478..f5f3ad8b62ff 100644 --- a/lib/cmdline.c +++ b/lib/cmdline.c @@ -126,7 +126,7 @@ char *get_options(const char *str, int nints, int *ints) * megabyte, or one gigabyte, respectively. */ -unsigned long long memparse(char *ptr, char **retptr) +unsigned long long memparse(const char *ptr, char **retptr) { char *endptr; /* local pointer to end of parsed string */ -- cgit v1.2.3 From 67182ae1c42206e516f7efb292b745e826497b24 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 10 Aug 2008 18:35:38 -0700 Subject: rcu, debug: detect stalled grace periods this is a diagnostic patch for Classic RCU. The approach is to record a timestamp at the beginning of the grace period (in rcu_start_batch()), then have rcu_check_callbacks() complain if: 1. it is running on a CPU that has holding up grace periods for a long time (say one second). This will identify the culprit assuming that the culprit has not disabled hardware irqs, instruction execution, or some such. 2. it is running on a CPU that is not holding up grace periods, but grace periods have been held up for an even longer time (say two seconds). It is enabled via the default-off CONFIG_DEBUG_RCU_STALL kernel parameter. Rather than exponential backoff, it backs off to once per 30 seconds. My feeling upon thinking on it was that if you have stalled RCU grace periods for that long, a few extra printk() messages are probably the least of your worries... Signed-off-by: Paul E. McKenney Cc: Peter Zijlstra Cc: Yinghai Lu Cc: David Witbrodt Signed-off-by: Ingo Molnar --- include/linux/rcuclassic.h | 3 ++ kernel/rcuclassic.c | 80 ++++++++++++++++++++++++++++++++++++++++++++++ lib/Kconfig.debug | 13 ++++++++ 3 files changed, 96 insertions(+) (limited to 'lib') diff --git a/include/linux/rcuclassic.h b/include/linux/rcuclassic.h index 04c728147be0..16589958b40e 100644 --- a/include/linux/rcuclassic.h +++ b/include/linux/rcuclassic.h @@ -46,6 +46,9 @@ struct rcu_ctrlblk { long cur; /* Current batch number. */ long completed; /* Number of the last completed batch */ long pending; /* Number of the last pending batch */ +#ifdef CONFIG_DEBUG_RCU_STALL + unsigned long gp_check; /* Time grace period should end, in seconds. */ +#endif /* #ifdef CONFIG_DEBUG_RCU_STALL */ int signaled; diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c index d4271146a9bd..d7ec731de75c 100644 --- a/kernel/rcuclassic.c +++ b/kernel/rcuclassic.c @@ -47,6 +47,7 @@ #include #include #include +#include #ifdef CONFIG_DEBUG_LOCK_ALLOC static struct lock_class_key rcu_lock_key; @@ -286,6 +287,81 @@ static void rcu_do_batch(struct rcu_data *rdp) * rcu_check_quiescent_state calls rcu_start_batch(0) to start the next grace * period (if necessary). */ + +#ifdef CONFIG_DEBUG_RCU_STALL + +static inline void record_gp_check_time(struct rcu_ctrlblk *rcp) +{ + rcp->gp_check = get_seconds() + 3; +} +static void print_other_cpu_stall(struct rcu_ctrlblk *rcp) +{ + int cpu; + long delta; + + /* Only let one CPU complain about others per time interval. */ + + spin_lock(&rcp->lock); + delta = get_seconds() - rcp->gp_check; + if (delta < 2L || + cpus_empty(rcp->cpumask)) { + spin_unlock(&rcp->lock); + return; + rcp->gp_check = get_seconds() + 30; + } + spin_unlock(&rcp->lock); + + /* OK, time to rat on our buddy... */ + + printk(KERN_ERR "RCU detected CPU stalls:"); + for_each_cpu_mask(cpu, rcp->cpumask) + printk(" %d", cpu); + printk(" (detected by %d, t=%lu/%lu)\n", + smp_processor_id(), get_seconds(), rcp->gp_check); +} +static void print_cpu_stall(struct rcu_ctrlblk *rcp) +{ + printk(KERN_ERR "RCU detected CPU %d stall (t=%lu/%lu)\n", + smp_processor_id(), get_seconds(), rcp->gp_check); + dump_stack(); + spin_lock(&rcp->lock); + if ((long)(get_seconds() - rcp->gp_check) >= 0L) + rcp->gp_check = get_seconds() + 30; + spin_unlock(&rcp->lock); +} +static inline void check_cpu_stall(struct rcu_ctrlblk *rcp, + struct rcu_data *rdp) +{ + long delta; + + delta = get_seconds() - rcp->gp_check; + if (cpu_isset(smp_processor_id(), rcp->cpumask) && delta >= 0L) { + + /* We haven't checked in, so go dump stack. */ + + print_cpu_stall(rcp); + + } else if (!cpus_empty(rcp->cpumask) && delta >= 2L) { + + /* They had two seconds to dump stack, so complain. */ + + print_other_cpu_stall(rcp); + + } +} + +#else /* #ifdef CONFIG_DEBUG_RCU_STALL */ + +static inline void record_gp_check_time(struct rcu_ctrlblk *rcp) +{ +} +static inline void check_cpu_stall(struct rcu_ctrlblk *rcp, + struct rcu_data *rdp) +{ +} + +#endif /* #else #ifdef CONFIG_DEBUG_RCU_STALL */ + /* * Register a new batch of callbacks, and start it up if there is currently no * active batch and the batch to be registered has not already occurred. @@ -296,6 +372,7 @@ static void rcu_start_batch(struct rcu_ctrlblk *rcp) if (rcp->cur != rcp->pending && rcp->completed == rcp->cur) { rcp->cur++; + record_gp_check_time(rcp); /* * Accessing nohz_cpu_mask before incrementing rcp->cur needs a @@ -489,6 +566,9 @@ static void rcu_process_callbacks(struct softirq_action *unused) static int __rcu_pending(struct rcu_ctrlblk *rcp, struct rcu_data *rdp) { + /* Check for CPU stalls, if enabled. */ + check_cpu_stall(rcp, rdp); + if (rdp->nxtlist) { /* * This cpu has pending rcu entries and the grace period diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index e1d4764435ed..2fb6d90bf1e6 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -597,6 +597,19 @@ config RCU_TORTURE_TEST_RUNNABLE Say N here if you want the RCU torture tests to start only after being manually enabled via /proc. +config RCU_CPU_STALL + bool "Check for stalled CPUs delaying RCU grace periods" + depends on CLASSIC_RCU + default n + help + This option causes RCU to printk information on which + CPUs are delaying the current grace period, but only when + the grace period extends for excessive time periods. + + Say Y if you want RCU to perform such checks. + + Say N if you are unsure. + config KPROBES_SANITY_TEST bool "Kprobes sanity tests" depends on DEBUG_KERNEL -- cgit v1.2.3 From 1f7c14c62ce63805f9574664a6c6de3633d4a354 Mon Sep 17 00:00:00 2001 From: Mingming Cao Date: Thu, 9 Oct 2008 12:50:59 -0400 Subject: percpu counter: clean up percpu_counter_sum_and_set() percpu_counter_sum_and_set() and percpu_counter_sum() is the same except the former updates the global counter after accounting. Since we are taking the fbc->lock to calculate the precise value of the counter in percpu_counter_sum() anyway, it should simply set fbc->count too, as the percpu_counter_sum_and_set() does. This patch merges these two interfaces into one. Signed-off-by: Mingming Cao Acked-by: Peter Zijlstra Cc: Signed-off-by: Andrew Morton Signed-off-by: "Theodore Ts'o" --- fs/ext4/balloc.c | 2 +- include/linux/percpu_counter.h | 12 +++--------- lib/percpu_counter.c | 8 +++----- 3 files changed, 7 insertions(+), 15 deletions(-) (limited to 'lib') diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index e9fa960ba6da..00a94d5866c2 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -1624,7 +1624,7 @@ ext4_fsblk_t ext4_has_free_blocks(struct ext4_sb_info *sbi, #ifdef CONFIG_SMP if (free_blocks - root_blocks < FBC_BATCH) free_blocks = - percpu_counter_sum_and_set(&sbi->s_freeblocks_counter); + percpu_counter_sum(&sbi->s_freeblocks_counter); #endif if (free_blocks <= root_blocks) /* we don't have free space */ diff --git a/include/linux/percpu_counter.h b/include/linux/percpu_counter.h index 208388835357..9007ccdfc112 100644 --- a/include/linux/percpu_counter.h +++ b/include/linux/percpu_counter.h @@ -35,7 +35,7 @@ int percpu_counter_init_irq(struct percpu_counter *fbc, s64 amount); void percpu_counter_destroy(struct percpu_counter *fbc); void percpu_counter_set(struct percpu_counter *fbc, s64 amount); void __percpu_counter_add(struct percpu_counter *fbc, s64 amount, s32 batch); -s64 __percpu_counter_sum(struct percpu_counter *fbc, int set); +s64 __percpu_counter_sum(struct percpu_counter *fbc); static inline void percpu_counter_add(struct percpu_counter *fbc, s64 amount) { @@ -44,19 +44,13 @@ static inline void percpu_counter_add(struct percpu_counter *fbc, s64 amount) static inline s64 percpu_counter_sum_positive(struct percpu_counter *fbc) { - s64 ret = __percpu_counter_sum(fbc, 0); + s64 ret = __percpu_counter_sum(fbc); return ret < 0 ? 0 : ret; } -static inline s64 percpu_counter_sum_and_set(struct percpu_counter *fbc) -{ - return __percpu_counter_sum(fbc, 1); -} - - static inline s64 percpu_counter_sum(struct percpu_counter *fbc) { - return __percpu_counter_sum(fbc, 0); + return __percpu_counter_sum(fbc); } static inline s64 percpu_counter_read(struct percpu_counter *fbc) diff --git a/lib/percpu_counter.c b/lib/percpu_counter.c index 4a8ba4bf5f6f..a8663890a88c 100644 --- a/lib/percpu_counter.c +++ b/lib/percpu_counter.c @@ -52,7 +52,7 @@ EXPORT_SYMBOL(__percpu_counter_add); * Add up all the per-cpu counts, return the result. This is a more accurate * but much slower version of percpu_counter_read_positive() */ -s64 __percpu_counter_sum(struct percpu_counter *fbc, int set) +s64 __percpu_counter_sum(struct percpu_counter *fbc) { s64 ret; int cpu; @@ -62,11 +62,9 @@ s64 __percpu_counter_sum(struct percpu_counter *fbc, int set) for_each_online_cpu(cpu) { s32 *pcount = per_cpu_ptr(fbc->counters, cpu); ret += *pcount; - if (set) - *pcount = 0; + *pcount = 0; } - if (set) - fbc->count = ret; + fbc->count = ret; spin_unlock(&fbc->lock); return ret; -- cgit v1.2.3 From 673d62cc5ea6fca046650f17f77985b112c62322 Mon Sep 17 00:00:00 2001 From: Vegard Nossum Date: Sun, 31 Aug 2008 23:39:21 +0200 Subject: debugobjects: fix lockdep warning Daniel J. Blueman reported: > ======================================================= > [ INFO: possible circular locking dependency detected ] > 2.6.27-rc4-224c #1 > ------------------------------------------------------- > hald/4680 is trying to acquire lock: > (&n->list_lock){++..}, at: [] add_partial+0x26/0x80 > > but task is already holding lock: > (&obj_hash[i].lock){++..}, at: [] > debug_object_free+0x5c/0x120 We fix it by moving the actual freeing to outside the lock (the lock now only protects the list). The pool lock is also promoted to irq-safe (suggested by Dan). It's necessary because free_pool is now called outside the irq disabled region. So we need to protect against an interrupt handler which calls debug_object_init(). [tglx@linutronix.de: added hlist_move_list helper to avoid looping through the list twice] Reported-by: Daniel J Blueman Signed-off-by: Vegard Nossum Signed-off-by: Thomas Gleixner --- include/linux/list.h | 13 +++++++++++++ lib/debugobjects.c | 31 +++++++++++++++++++++++-------- 2 files changed, 36 insertions(+), 8 deletions(-) (limited to 'lib') diff --git a/include/linux/list.h b/include/linux/list.h index db35ef02e745..969f6e92d089 100644 --- a/include/linux/list.h +++ b/include/linux/list.h @@ -619,6 +619,19 @@ static inline void hlist_add_after(struct hlist_node *n, next->next->pprev = &next->next; } +/* + * Move a list from one list head to another. Fixup the pprev + * reference of the first entry if it exists. + */ +static inline void hlist_move_list(struct hlist_head *old, + struct hlist_head *new) +{ + new->first = old->first; + if (new->first) + new->first->pprev = &new->first; + old->first = NULL; +} + #define hlist_entry(ptr, type, member) container_of(ptr,type,member) #define hlist_for_each(pos, head) \ diff --git a/lib/debugobjects.c b/lib/debugobjects.c index 45a6bde762d1..e3ab374e1334 100644 --- a/lib/debugobjects.c +++ b/lib/debugobjects.c @@ -112,6 +112,7 @@ static struct debug_obj *lookup_object(void *addr, struct debug_bucket *b) /* * Allocate a new object. If the pool is empty, switch off the debugger. + * Must be called with interrupts disabled. */ static struct debug_obj * alloc_object(void *addr, struct debug_bucket *b, struct debug_obj_descr *descr) @@ -148,17 +149,18 @@ alloc_object(void *addr, struct debug_bucket *b, struct debug_obj_descr *descr) static void free_object(struct debug_obj *obj) { unsigned long idx = (unsigned long)(obj - obj_static_pool); + unsigned long flags; if (obj_pool_free < ODEBUG_POOL_SIZE || idx < ODEBUG_POOL_SIZE) { - spin_lock(&pool_lock); + spin_lock_irqsave(&pool_lock, flags); hlist_add_head(&obj->node, &obj_pool); obj_pool_free++; obj_pool_used--; - spin_unlock(&pool_lock); + spin_unlock_irqrestore(&pool_lock, flags); } else { - spin_lock(&pool_lock); + spin_lock_irqsave(&pool_lock, flags); obj_pool_used--; - spin_unlock(&pool_lock); + spin_unlock_irqrestore(&pool_lock, flags); kmem_cache_free(obj_cache, obj); } } @@ -171,6 +173,7 @@ static void debug_objects_oom(void) { struct debug_bucket *db = obj_hash; struct hlist_node *node, *tmp; + HLIST_HEAD(freelist); struct debug_obj *obj; unsigned long flags; int i; @@ -179,11 +182,14 @@ static void debug_objects_oom(void) for (i = 0; i < ODEBUG_HASH_SIZE; i++, db++) { spin_lock_irqsave(&db->lock, flags); - hlist_for_each_entry_safe(obj, node, tmp, &db->list, node) { + hlist_move_list(&db->list, &freelist); + spin_unlock_irqrestore(&db->lock, flags); + + /* Now free them */ + hlist_for_each_entry_safe(obj, node, tmp, &freelist, node) { hlist_del(&obj->node); free_object(obj); } - spin_unlock_irqrestore(&db->lock, flags); } } @@ -498,8 +504,9 @@ void debug_object_free(void *addr, struct debug_obj_descr *descr) return; default: hlist_del(&obj->node); + spin_unlock_irqrestore(&db->lock, flags); free_object(obj); - break; + return; } out_unlock: spin_unlock_irqrestore(&db->lock, flags); @@ -510,6 +517,7 @@ static void __debug_check_no_obj_freed(const void *address, unsigned long size) { unsigned long flags, oaddr, saddr, eaddr, paddr, chunks; struct hlist_node *node, *tmp; + HLIST_HEAD(freelist); struct debug_obj_descr *descr; enum debug_obj_state state; struct debug_bucket *db; @@ -545,11 +553,18 @@ repeat: goto repeat; default: hlist_del(&obj->node); - free_object(obj); + hlist_add_head(&obj->node, &freelist); break; } } spin_unlock_irqrestore(&db->lock, flags); + + /* Now free them */ + hlist_for_each_entry_safe(obj, node, tmp, &freelist, node) { + hlist_del(&obj->node); + free_object(obj); + } + if (cnt > debug_objects_maxchain) debug_objects_maxchain = cnt; } -- cgit v1.2.3 From 7563dc64585324f443f5ac107eb6d89ee813a2d2 Mon Sep 17 00:00:00 2001 From: Tony Breeds Date: Tue, 2 Sep 2008 16:50:38 +1000 Subject: powerpc: Work around gcc's -fno-omit-frame-pointer bug This bug is causing random crashes (http://bugzilla.kernel.org/show_bug.cgi?id=11414). -fno-omit-frame-pointer is only needed on powerpc when -pg is also supplied, and there is a gcc bug that causes incorrect code generation on 32-bit powerpc when -fno-omit-frame-pointer is used---it uses stack locations below the stack pointer, which is not allowed by the ABI because those locations can and sometimes do get corrupted by an interrupt. This ensures that CONFIG_FRAME_POINTER is only selected by ftrace. When CONFIG_FTRACE is enabled we also pass -mno-sched-epilog to work around the gcc codegen bug. Patch based on work by: Andreas Schwab Segher Boessenkool Signed-off-by: Tony Breeds Signed-off-by: Paul Mackerras --- arch/powerpc/Makefile | 5 +++++ arch/powerpc/kernel/Makefile | 7 ++++--- arch/powerpc/platforms/powermac/Makefile | 2 +- lib/Kconfig.debug | 6 +++--- 4 files changed, 13 insertions(+), 7 deletions(-) (limited to 'lib') diff --git a/arch/powerpc/Makefile b/arch/powerpc/Makefile index 9155c9312c1e..c6be19e9ceae 100644 --- a/arch/powerpc/Makefile +++ b/arch/powerpc/Makefile @@ -116,6 +116,11 @@ ifeq ($(CONFIG_6xx),y) KBUILD_CFLAGS += -mcpu=powerpc endif +# Work around a gcc code-gen bug with -fno-omit-frame-pointer. +ifeq ($(CONFIG_FTRACE),y) +KBUILD_CFLAGS += -mno-sched-epilog +endif + cpu-as-$(CONFIG_4xx) += -Wa,-m405 cpu-as-$(CONFIG_6xx) += -Wa,-maltivec cpu-as-$(CONFIG_POWER4) += -Wa,-maltivec diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile index 64f5948ebc9d..946daea780f1 100644 --- a/arch/powerpc/kernel/Makefile +++ b/arch/powerpc/kernel/Makefile @@ -14,12 +14,13 @@ endif ifdef CONFIG_FTRACE # Do not trace early boot code -CFLAGS_REMOVE_cputable.o = -pg -CFLAGS_REMOVE_prom_init.o = -pg +CFLAGS_REMOVE_cputable.o = -pg -mno-sched-epilog +CFLAGS_REMOVE_prom_init.o = -pg -mno-sched-epilog +CFLAGS_REMOVE_btext.o = -pg -mno-sched-epilog ifdef CONFIG_DYNAMIC_FTRACE # dynamic ftrace setup. -CFLAGS_REMOVE_ftrace.o = -pg +CFLAGS_REMOVE_ftrace.o = -pg -mno-sched-epilog endif endif diff --git a/arch/powerpc/platforms/powermac/Makefile b/arch/powerpc/platforms/powermac/Makefile index 58ecdd72630f..be60d64be7ad 100644 --- a/arch/powerpc/platforms/powermac/Makefile +++ b/arch/powerpc/platforms/powermac/Makefile @@ -2,7 +2,7 @@ CFLAGS_bootx_init.o += -fPIC ifdef CONFIG_FTRACE # Do not trace early boot code -CFLAGS_REMOVE_bootx_init.o = -pg +CFLAGS_REMOVE_bootx_init.o = -pg -mno-sched-epilog endif obj-y += pic.o setup.o time.o feature.o pci.o \ diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 8b5a7d304a5f..0b504814e378 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -394,7 +394,7 @@ config LOCKDEP bool depends on DEBUG_KERNEL && TRACE_IRQFLAGS_SUPPORT && STACKTRACE_SUPPORT && LOCKDEP_SUPPORT select STACKTRACE - select FRAME_POINTER if !X86 && !MIPS + select FRAME_POINTER if !X86 && !MIPS && !PPC select KALLSYMS select KALLSYMS_ALL @@ -676,13 +676,13 @@ config FAULT_INJECTION_STACKTRACE_FILTER depends on FAULT_INJECTION_DEBUG_FS && STACKTRACE_SUPPORT depends on !X86_64 select STACKTRACE - select FRAME_POINTER + select FRAME_POINTER if !PPC help Provide stacktrace filter for fault-injection capabilities config LATENCYTOP bool "Latency measuring infrastructure" - select FRAME_POINTER if !MIPS + select FRAME_POINTER if !MIPS && !PPC select KALLSYMS select KALLSYMS_ALL select STACKTRACE -- cgit v1.2.3 From 68e91d61346db4359464d06617500141cfd1442a Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Mon, 8 Sep 2008 18:10:14 +0900 Subject: swiotlb: remove GFP_DMA hack in swiotlb_alloc_coherent The callers are supposed to set up the gfp flags appropriately. Signed-off-by: FUJITA Tomonori Acked-by: Joerg Roedel Signed-off-by: Ingo Molnar --- lib/swiotlb.c | 7 ------- 1 file changed, 7 deletions(-) (limited to 'lib') diff --git a/lib/swiotlb.c b/lib/swiotlb.c index 977edbdbc1de..3066ffe1f9eb 100644 --- a/lib/swiotlb.c +++ b/lib/swiotlb.c @@ -467,13 +467,6 @@ swiotlb_alloc_coherent(struct device *hwdev, size_t size, void *ret; int order = get_order(size); - /* - * XXX fix me: the DMA API should pass us an explicit DMA mask - * instead, or use ZONE_DMA32 (ia64 overloads ZONE_DMA to be a ~32 - * bit range instead of a 16MB one). - */ - flags |= GFP_DMA; - ret = (void *)__get_free_pages(flags, order); if (ret && address_needs_mapping(hwdev, virt_to_bus(ret))) { /* -- cgit v1.2.3 From 9dfda12b8b769e224ca4efbc35ace4524b9c017b Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Mon, 8 Sep 2008 18:53:48 +0900 Subject: swiotlb: use map_single instead of swiotlb_map_single in swiotlb_alloc_coherent We always need swiotlb memory here so address_needs_mapping and swiotlb_force testings are irrelevant. map_single should be used here instead of swiotlb_map_single. Signed-off-by: FUJITA Tomonori Signed-off-by: Ingo Molnar --- lib/swiotlb.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'lib') diff --git a/lib/swiotlb.c b/lib/swiotlb.c index 3066ffe1f9eb..2fb485d0e7eb 100644 --- a/lib/swiotlb.c +++ b/lib/swiotlb.c @@ -483,12 +483,9 @@ swiotlb_alloc_coherent(struct device *hwdev, size_t size, * swiotlb_map_single(), which will grab memory from * the lowest available address range. */ - dma_addr_t handle; - handle = swiotlb_map_single(NULL, NULL, size, DMA_FROM_DEVICE); - if (swiotlb_dma_mapping_error(hwdev, handle)) + ret = map_single(hwdev, NULL, size, DMA_FROM_DEVICE); + if (!ret) return NULL; - - ret = bus_to_virt(handle); } memset(ret, 0, size); -- cgit v1.2.3 From 21f6c4de4c25c4bdd88c75bc97a78e7fbeebac4d Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Mon, 8 Sep 2008 18:53:49 +0900 Subject: swiotlb: use unmap_single instead of swiotlb_unmap_single in swiotlb_free_coherent We don't need any check in swiotlb_unmap_single here. unmap_single is appropriate. Signed-off-by: FUJITA Tomonori Signed-off-by: Ingo Molnar --- lib/swiotlb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'lib') diff --git a/lib/swiotlb.c b/lib/swiotlb.c index 2fb485d0e7eb..bf61c73a3418 100644 --- a/lib/swiotlb.c +++ b/lib/swiotlb.c @@ -513,7 +513,7 @@ swiotlb_free_coherent(struct device *hwdev, size_t size, void *vaddr, free_pages((unsigned long) vaddr, get_order(size)); else /* DMA_TO_DEVICE to avoid memcpy in unmap_single */ - swiotlb_unmap_single (hwdev, dma_handle, size, DMA_TO_DEVICE); + unmap_single(hwdev, vaddr, size, DMA_TO_DEVICE); } static void -- cgit v1.2.3 From 640aebfe014554ced9c38d2564e38862e488d0eb Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Mon, 8 Sep 2008 18:53:50 +0900 Subject: swiotlb: add is_swiotlb_buffer helper function This adds is_swiotlb_buffer() helper function to see whether a buffer belongs to the swiotlb buffer or not. Signed-off-by: FUJITA Tomonori Signed-off-by: Ingo Molnar --- lib/swiotlb.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) (limited to 'lib') diff --git a/lib/swiotlb.c b/lib/swiotlb.c index bf61c73a3418..b5f5d1133042 100644 --- a/lib/swiotlb.c +++ b/lib/swiotlb.c @@ -283,6 +283,11 @@ address_needs_mapping(struct device *hwdev, dma_addr_t addr) return (addr & ~mask) != 0; } +static int is_swiotlb_buffer(char *addr) +{ + return addr >= io_tlb_start && addr < io_tlb_end; +} + /* * Allocates bounce buffer and returns its kernel virtual address. */ @@ -508,8 +513,7 @@ swiotlb_free_coherent(struct device *hwdev, size_t size, void *vaddr, dma_addr_t dma_handle) { WARN_ON(irqs_disabled()); - if (!(vaddr >= (void *)io_tlb_start - && vaddr < (void *)io_tlb_end)) + if (!is_swiotlb_buffer(vaddr)) free_pages((unsigned long) vaddr, get_order(size)); else /* DMA_TO_DEVICE to avoid memcpy in unmap_single */ @@ -602,7 +606,7 @@ swiotlb_unmap_single_attrs(struct device *hwdev, dma_addr_t dev_addr, char *dma_addr = bus_to_virt(dev_addr); BUG_ON(dir == DMA_NONE); - if (dma_addr >= io_tlb_start && dma_addr < io_tlb_end) + if (is_swiotlb_buffer(dma_addr)) unmap_single(hwdev, dma_addr, size, dir); else if (dir == DMA_FROM_DEVICE) dma_mark_clean(dma_addr, size); @@ -632,7 +636,7 @@ swiotlb_sync_single(struct device *hwdev, dma_addr_t dev_addr, char *dma_addr = bus_to_virt(dev_addr); BUG_ON(dir == DMA_NONE); - if (dma_addr >= io_tlb_start && dma_addr < io_tlb_end) + if (is_swiotlb_buffer(dma_addr)) sync_single(hwdev, dma_addr, size, dir, target); else if (dir == DMA_FROM_DEVICE) dma_mark_clean(dma_addr, size); @@ -663,7 +667,7 @@ swiotlb_sync_single_range(struct device *hwdev, dma_addr_t dev_addr, char *dma_addr = bus_to_virt(dev_addr) + offset; BUG_ON(dir == DMA_NONE); - if (dma_addr >= io_tlb_start && dma_addr < io_tlb_end) + if (is_swiotlb_buffer(dma_addr)) sync_single(hwdev, dma_addr, size, dir, target); else if (dir == DMA_FROM_DEVICE) dma_mark_clean(dma_addr, size); -- cgit v1.2.3 From deac93df26b20cf8438339b5935b5f5643bc30c9 Mon Sep 17 00:00:00 2001 From: James Bottomley Date: Wed, 3 Sep 2008 20:43:36 -0500 Subject: lib: Correct printk %pF to work on all architectures It was introduced by "vsprintf: add support for '%pS' and '%pF' pointer formats" in commit 0fe1ef24f7bd0020f29ffe287dfdb9ead33ca0b2. However, the current way its coded doesn't work on parisc64. For two reasons: 1) parisc isn't in the #ifdef and 2) parisc has a different format for function descriptors Make dereference_function_descriptor() more accommodating by allowing architecture overrides. I put the three overrides (for parisc64, ppc64 and ia64) in arch/kernel/module.c because that's where the kernel internal linker which knows how to deal with function descriptors sits. Signed-off-by: James Bottomley Acked-by: Benjamin Herrenschmidt Acked-by: Tony Luck Acked-by: Kyle McMartin Signed-off-by: Linus Torvalds --- arch/ia64/include/asm/sections.h | 3 +++ arch/ia64/kernel/module.c | 12 ++++++++++++ arch/parisc/kernel/module.c | 14 ++++++++++++++ arch/powerpc/include/asm/sections.h | 3 +++ arch/powerpc/kernel/module_64.c | 13 ++++++++++++- include/asm-generic/sections.h | 6 ++++++ include/asm-parisc/sections.h | 5 +++++ lib/vsprintf.c | 11 +---------- 8 files changed, 56 insertions(+), 11 deletions(-) (limited to 'lib') diff --git a/arch/ia64/include/asm/sections.h b/arch/ia64/include/asm/sections.h index 7286e4a9fe84..a7acad2bc2f0 100644 --- a/arch/ia64/include/asm/sections.h +++ b/arch/ia64/include/asm/sections.h @@ -21,5 +21,8 @@ extern char __start_gate_brl_fsys_bubble_down_patchlist[], __end_gate_brl_fsys_b extern char __start_unwind[], __end_unwind[]; extern char __start_ivt_text[], __end_ivt_text[]; +#undef dereference_function_descriptor +void *dereference_function_descriptor(void *); + #endif /* _ASM_IA64_SECTIONS_H */ diff --git a/arch/ia64/kernel/module.c b/arch/ia64/kernel/module.c index 29aad349e0c4..545626f66a4c 100644 --- a/arch/ia64/kernel/module.c +++ b/arch/ia64/kernel/module.c @@ -31,9 +31,11 @@ #include #include #include +#include #include #include +#include #include #define ARCH_MODULE_DEBUG 0 @@ -941,3 +943,13 @@ module_arch_cleanup (struct module *mod) if (mod->arch.core_unw_table) unw_remove_unwind_table(mod->arch.core_unw_table); } + +void *dereference_function_descriptor(void *ptr) +{ + struct fdesc *desc = ptr; + void *p; + + if (!probe_kernel_address(&desc->ip, p)) + ptr = p; + return ptr; +} diff --git a/arch/parisc/kernel/module.c b/arch/parisc/kernel/module.c index fdacdd4341c9..44138c3e6ea7 100644 --- a/arch/parisc/kernel/module.c +++ b/arch/parisc/kernel/module.c @@ -47,7 +47,9 @@ #include #include #include +#include +#include #include #if 0 @@ -860,3 +862,15 @@ void module_arch_cleanup(struct module *mod) deregister_unwind_table(mod); module_bug_cleanup(mod); } + +#ifdef CONFIG_64BIT +void *dereference_function_descriptor(void *ptr) +{ + Elf64_Fdesc *desc = ptr; + void *p; + + if (!probe_kernel_address(&desc->addr, p)) + ptr = p; + return ptr; +} +#endif diff --git a/arch/powerpc/include/asm/sections.h b/arch/powerpc/include/asm/sections.h index 916018e425c4..7710e9e6660f 100644 --- a/arch/powerpc/include/asm/sections.h +++ b/arch/powerpc/include/asm/sections.h @@ -16,6 +16,9 @@ static inline int in_kernel_text(unsigned long addr) return 0; } +#undef dereference_function_descriptor +void *dereference_function_descriptor(void *); + #endif #endif /* __KERNEL__ */ diff --git a/arch/powerpc/kernel/module_64.c b/arch/powerpc/kernel/module_64.c index ee6a2982d567..ad79de272ff3 100644 --- a/arch/powerpc/kernel/module_64.c +++ b/arch/powerpc/kernel/module_64.c @@ -21,8 +21,9 @@ #include #include #include +#include #include -#include +#include #include #include #include @@ -451,3 +452,13 @@ int apply_relocate_add(Elf64_Shdr *sechdrs, return 0; } + +void *dereference_function_descriptor(void *ptr) +{ + struct ppc64_opd_entry *desc = ptr; + void *p; + + if (!probe_kernel_address(&desc->funcaddr, p)) + ptr = p; + return ptr; +} diff --git a/include/asm-generic/sections.h b/include/asm-generic/sections.h index 8feeae1f2369..79a7ff925bf8 100644 --- a/include/asm-generic/sections.h +++ b/include/asm-generic/sections.h @@ -14,4 +14,10 @@ extern char __kprobes_text_start[], __kprobes_text_end[]; extern char __initdata_begin[], __initdata_end[]; extern char __start_rodata[], __end_rodata[]; +/* function descriptor handling (if any). Override + * in asm/sections.h */ +#ifndef dereference_function_descriptor +#define dereference_function_descriptor(p) (p) +#endif + #endif /* _ASM_GENERIC_SECTIONS_H_ */ diff --git a/include/asm-parisc/sections.h b/include/asm-parisc/sections.h index fdd43ec42ec5..9d13c3507ad6 100644 --- a/include/asm-parisc/sections.h +++ b/include/asm-parisc/sections.h @@ -4,4 +4,9 @@ /* nothing to see, move along */ #include +#ifdef CONFIG_64BIT +#undef dereference_function_descriptor +void *dereference_function_descriptor(void *); +#endif + #endif diff --git a/lib/vsprintf.c b/lib/vsprintf.c index d8d1d1142248..c399bc1093cb 100644 --- a/lib/vsprintf.c +++ b/lib/vsprintf.c @@ -27,6 +27,7 @@ #include /* for PAGE_SIZE */ #include +#include /* for dereference_function_descriptor() */ /* Works only for digits and letters, but small and fast */ #define TOLOWER(x) ((x) | 0x20) @@ -513,16 +514,6 @@ static char *string(char *buf, char *end, char *s, int field_width, int precisio return buf; } -static inline void *dereference_function_descriptor(void *ptr) -{ -#if defined(CONFIG_IA64) || defined(CONFIG_PPC64) - void *p; - if (!probe_kernel_address(ptr, p)) - ptr = p; -#endif - return ptr; -} - static char *symbol_string(char *buf, char *end, void *ptr, int field_width, int precision, int flags) { unsigned long value = (unsigned long) ptr; -- cgit v1.2.3 From 2797982ed93c10d5585ee1842ab298cb11326ff5 Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Wed, 10 Sep 2008 01:06:49 +0900 Subject: swiotlb: convert swiotlb to use is_buffer_dma_capable helper function Signed-off-by: FUJITA Tomonori Acked-by: Joerg Roedel Signed-off-by: Ingo Molnar --- lib/swiotlb.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) (limited to 'lib') diff --git a/lib/swiotlb.c b/lib/swiotlb.c index b5f5d1133042..240a67c2c979 100644 --- a/lib/swiotlb.c +++ b/lib/swiotlb.c @@ -274,13 +274,13 @@ cleanup1: } static int -address_needs_mapping(struct device *hwdev, dma_addr_t addr) +address_needs_mapping(struct device *hwdev, dma_addr_t addr, size_t size) { dma_addr_t mask = 0xffffffff; /* If the device has a mask, use it, otherwise default to 32 bits */ if (hwdev && hwdev->dma_mask) mask = *hwdev->dma_mask; - return (addr & ~mask) != 0; + return !is_buffer_dma_capable(mask, addr, size); } static int is_swiotlb_buffer(char *addr) @@ -473,7 +473,7 @@ swiotlb_alloc_coherent(struct device *hwdev, size_t size, int order = get_order(size); ret = (void *)__get_free_pages(flags, order); - if (ret && address_needs_mapping(hwdev, virt_to_bus(ret))) { + if (ret && address_needs_mapping(hwdev, virt_to_bus(ret), size)) { /* * The allocated memory isn't reachable by the device. * Fall back on swiotlb_map_single(). @@ -497,7 +497,7 @@ swiotlb_alloc_coherent(struct device *hwdev, size_t size, dev_addr = virt_to_bus(ret); /* Confirm address can be DMA'd by device */ - if (address_needs_mapping(hwdev, dev_addr)) { + if (address_needs_mapping(hwdev, dev_addr, size)) { printk("hwdev DMA mask = 0x%016Lx, dev_addr = 0x%016Lx\n", (unsigned long long)*hwdev->dma_mask, (unsigned long long)dev_addr); @@ -561,7 +561,7 @@ swiotlb_map_single_attrs(struct device *hwdev, void *ptr, size_t size, * we can safely return the device addr and not worry about bounce * buffering it. */ - if (!address_needs_mapping(hwdev, dev_addr) && !swiotlb_force) + if (!address_needs_mapping(hwdev, dev_addr, size) && !swiotlb_force) return dev_addr; /* @@ -578,7 +578,7 @@ swiotlb_map_single_attrs(struct device *hwdev, void *ptr, size_t size, /* * Ensure that the address returned is DMA'ble */ - if (address_needs_mapping(hwdev, dev_addr)) + if (address_needs_mapping(hwdev, dev_addr, size)) panic("map_single: bounce buffer is not DMA'ble"); return dev_addr; @@ -721,7 +721,8 @@ swiotlb_map_sg_attrs(struct device *hwdev, struct scatterlist *sgl, int nelems, for_each_sg(sgl, sg, nelems, i) { addr = SG_ENT_VIRT_ADDRESS(sg); dev_addr = virt_to_bus(addr); - if (swiotlb_force || address_needs_mapping(hwdev, dev_addr)) { + if (swiotlb_force || + address_needs_mapping(hwdev, dev_addr, sg->length)) { void *map = map_single(hwdev, addr, sg->length, dir); if (!map) { /* Don't panic here, we expect map_sg users -- cgit v1.2.3 From 36223a399f639b13b7a454349565934e6d3e2db0 Mon Sep 17 00:00:00 2001 From: Daniel J Blueman Date: Wed, 10 Sep 2008 21:07:55 +0100 Subject: swiotlb: fix back-off path when memory allocation fails This fixes a SWIOTLB oops With SWIOTLB being enabled and straight-forward page allocation failure [1], the swiotlb_alloc_coherent fall-back path hits an issue [2], resulting in my webcam failing to work. At the time of oops, RDI is clearly a pointer to a structure which has arrived as NULL, leading to the typo in swiotlb_map_single's callsite arguments. Correctly passing the device structure [3] addresses the issue and gets my webcam working again (the allocation failure still occuring). --- [1] skype: page allocation failure. order:3, mode:0x1 Pid: 5895, comm: skype Not tainted 2.6.27-rc6-235c-debug #1 Call Trace: [] __alloc_pages_internal+0x4a0/0x5d0 [] alloc_pages_current+0xad/0x110 [] __get_free_pages+0x1d/0x60 [] swiotlb_alloc_coherent+0x49/0x180 [] dma_alloc_coherent+0x281/0x310 [] hcd_buffer_alloc+0x50/0x90 [] usb_buffer_alloc+0x2d/0x40 [] uvc_alloc_urb_buffers+0x53/0xf0 [uvcvideo] [] uvc_init_video+0x158/0x3e0 [uvcvideo] [] uvc_video_enable+0x37/0x80 [uvcvideo] [] uvc_v4l2_do_ioctl+0x723/0x1260 [uvcvideo] [] ? trace_hardirqs_off_caller+0x21/0xc0 [] ? trace_hardirqs_off_caller+0x21/0xc0 [] video_usercopy+0x19f/0x390 [videodev] [] ? uvc_v4l2_do_ioctl+0x0/0x1260 [uvcvideo] [] ? put_lock_stats+0xe/0x30 [] uvc_v4l2_ioctl+0x4d/0x80 [uvcvideo] [] native_ioctl+0x83/0x90 [compat_ioctl32] [] v4l_compat_ioctl32+0x2be/0x1da4 [compat_ioctl32] [] ? do_page_fault+0x3d1/0xae0 [] ? trace_hardirqs_on+0xd/0x10 [] ? trace_hardirqs_on_caller+0x149/0x1b0 [] ? trace_hardirqs_on+0xd/0x10 [] compat_sys_ioctl+0x8a/0x3c0 [] ? trace_hardirqs_off_thunk+0x3a/0x3c [] sysenter_dispatch+0x7/0x2c [] ? trace_hardirqs_on_thunk+0x3a/0x3f Mem-Info: Node 0 DMA per-cpu: CPU 0: hi: 0, btch: 1 usd: 0 CPU 1: hi: 0, btch: 1 usd: 0 Node 0 DMA32 per-cpu: CPU 0: hi: 186, btch: 31 usd: 3 CPU 1: hi: 186, btch: 31 usd: 0 Node 0 Normal per-cpu: CPU 0: hi: 186, btch: 31 usd: 23 CPU 1: hi: 186, btch: 31 usd: 179 Active:78545 inactive:48683 dirty:31 writeback:0 unstable:2 free:830202 slab:17516 mapped:17473 pagetables:3496 bounce:0 Node 0 DMA free:36kB min:28kB low:32kB high:40kB active:0kB inactive:0kB present:15156kB pages_scanned:0 all_unreclaimable? no lowmem_reserve[]: 0 3207 3956 3956 Node 0 DMA32 free:3197192kB min:6512kB low:8140kB high:9768kB active:0kB inactive:0kB present:3284896kB pages_scanned:0 all_unreclaimable? no lowmem_reserve[]: 0 0 748 748 Node 0 Normal free:123580kB min:1516kB low:1892kB high:2272kB active:314180kB inactive:194732kB present:766464kB pages_scanned:0 all_unreclaimable? no lowmem_reserve[]: 0 0 0 0 Node 0 DMA: 1*4kB 0*8kB 0*16kB 1*32kB 0*64kB 0*128kB 0*256kB 0*512kB 0*1024kB 0*2048kB 0*4096kB = 36kB Node 0 DMA32: 4*4kB 3*8kB 2*16kB 3*32kB 4*64kB 5*128kB 3*256kB 5*512kB 4*1024kB 5*2048kB 776*4096kB = 3197224kB Node 0 Normal: 14*4kB 14*8kB 8*16kB 6*32kB 1*64kB 3*128kB 3*256kB 2*512kB 4*1024kB 1*2048kB 28*4096kB = 123560kB 64847 total pagecache pages 0 pages in swap cache Swap cache stats: add 0, delete 0, find 0/0 Free swap = 502752kB Total swap = 502752kB 1048576 pages RAM 52120 pages reserved 71967 pages shared 143004 pages non-shared --- [2] BUG: unable to handle kernel NULL pointer dereference at 00000000000002c8 IP: [] map_single+0x1c/0x280 PGD 10e54e067 PUD 10e595067 PMD 0 Oops: 0000 [1] PREEMPT SMP DEBUG_PAGEALLOC CPU 0 Modules linked in: kvm_intel kvm microcode uvcvideo compat_ioctl32 videodev v4l1_compat shpchp pci_hotplug Pid: 5895, comm: skype Not tainted 2.6.27-rc6-235c-debug #1 RIP: 0010:[] [] map_single+0x1c/0x280 RSP: 0018:ffff88010e78d988 EFLAGS: 00210296 RAX: 0000780000000000 RBX: 0000000000000000 RCX: 0000000000000002 RDX: 0000000000005000 RSI: 0000000000000000 RDI: 0000000000000000 RBP: ffff88010e78d9e8 R08: 0000000000000000 R09: 0000000000000001 R10: ffff88010e78d698 R11: 0000000000000001 R12: 0000000000000002 R13: 0000000000000000 R14: 0000000000005000 R15: ffff88012f1c9968 FS: 0000000000000000(0000) GS:ffffffff80a6cdc0(0063) knlGS:00000000f6355b90 CS: 0010 DS: 002b ES: 002b CR0: 0000000080050033 CR2: 00000000000002c8 CR3: 000000010e57d000 CR4: 00000000000026e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 Process skype (pid: 5895, threadinfo ffff88010e78c000, task ffff88012b9cc460) Stack: 0000000200000000 0000000000005000 0000000000000000 0000000000000000 00000000000017b8 0000000000000000 ffff88010e78d9c8 0000000000000000 0000000000000002 0000000000000000 0000000000005000 ffff88012f1c9968 Call Trace: [] swiotlb_map_single_attrs+0x60/0xf0 [] swiotlb_map_single+0xc/0x10 [] swiotlb_alloc_coherent+0xfe/0x180 [] dma_alloc_coherent+0x281/0x310 [] hcd_buffer_alloc+0x50/0x90 [] usb_buffer_alloc+0x2d/0x40 [] uvc_alloc_urb_buffers+0x53/0xf0 [uvcvideo] [] uvc_init_video+0x158/0x3e0 [uvcvideo] [] uvc_video_enable+0x37/0x80 [uvcvideo] [] uvc_v4l2_do_ioctl+0x723/0x1260 [uvcvideo] [] ? trace_hardirqs_off_caller+0x21/0xc0 [] ? trace_hardirqs_off_caller+0x21/0xc0 [] video_usercopy+0x19f/0x390 [videodev] [] ? uvc_v4l2_do_ioctl+0x0/0x1260 [uvcvideo] [] ? put_lock_stats+0xe/0x30 [] uvc_v4l2_ioctl+0x4d/0x80 [uvcvideo] [] native_ioctl+0x83/0x90 [compat_ioctl32] [] v4l_compat_ioctl32+0x2be/0x1da4 [compat_ioctl32] [] ? do_page_fault+0x3d1/0xae0 [] ? trace_hardirqs_on+0xd/0x10 [] ? trace_hardirqs_on_caller+0x149/0x1b0 [] ? trace_hardirqs_on+0xd/0x10 [] compat_sys_ioctl+0x8a/0x3c0 [] ? trace_hardirqs_off_thunk+0x3a/0x3c [] sysenter_dispatch+0x7/0x2c [] ? trace_hardirqs_on_thunk+0x3a/0x3f Code: 45 31 c0 48 89 e5 e8 a4 ff ff ff c9 c3 66 90 55 48 89 e5 41 57 41 56 41 55 41 54 53 48 83 ec 38 48 89 75 b0 48 89 55 a8 89 4d a4 <48> 8b 87 c8 02 00 00 48 85 c0 0f 84 1c 02 00 00 48 8b 58 08 48 RIP [] map_single+0x1c/0x280 RSP CR2: 00000000000002c8 ---[ end trace 5d15baeeb7025a0e ]--- --- [3] ffffffff8046c830 : map_single(): /store/kernel/linux/lib/swiotlb.c:291 ffffffff8046c830: 55 push %rbp ffffffff8046c831: 48 89 e5 mov %rsp,%rbp ffffffff8046c834: 41 57 push %r15 ffffffff8046c836: 41 56 push %r14 ffffffff8046c838: 41 55 push %r13 ffffffff8046c83a: 41 54 push %r12 ffffffff8046c83c: 53 push %rbx ffffffff8046c83d: 48 83 ec 38 sub $0x38,%rsp ffffffff8046c841: 48 89 75 b0 mov %rsi,-0x50(%rbp) ffffffff8046c845: 48 89 55 a8 mov %rdx,-0x58(%rbp) ffffffff8046c849: 89 4d a4 mov %ecx,-0x5c(%rbp) dma_get_seg_boundary(): /store/kernel/linux/include/linux/dma-mapping.h:80 ffffffff8046c84c: 48 8b 87 c8 02 00 00 mov 0x2c8(%rdi),%rax <---- --- [4] Signed-off-by: Daniel J Blueman Signed-off-by: Linus Torvalds --- lib/swiotlb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'lib') diff --git a/lib/swiotlb.c b/lib/swiotlb.c index 977edbdbc1de..8826fdf0f180 100644 --- a/lib/swiotlb.c +++ b/lib/swiotlb.c @@ -491,7 +491,7 @@ swiotlb_alloc_coherent(struct device *hwdev, size_t size, * the lowest available address range. */ dma_addr_t handle; - handle = swiotlb_map_single(NULL, NULL, size, DMA_FROM_DEVICE); + handle = swiotlb_map_single(hwdev, NULL, size, DMA_FROM_DEVICE); if (swiotlb_dma_mapping_error(hwdev, handle)) return NULL; -- cgit v1.2.3 From 50bed2e2862a8f3a4f7d683d0d27292e71ef18b9 Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Thu, 11 Sep 2008 18:35:39 +0200 Subject: sg: disable interrupts inside sg_copy_buffer The callers of sg_copy_buffer must disable interrupts before calling it (since it uses kmap_atomic). Some callers use it on interrupt-disabled code but some need to take the trouble to disable interrupts just for this. No wonder they forget about it and we hit a bug like: http://bugzilla.kernel.org/show_bug.cgi?id=11529 James said that it might be better to disable interrupts inside the function rather than risk the callers getting it wrong. Signed-off-by: FUJITA Tomonori Signed-off-by: Jens Axboe --- lib/scatterlist.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'lib') diff --git a/lib/scatterlist.c b/lib/scatterlist.c index 876ba6d5b670..8d2688ff1352 100644 --- a/lib/scatterlist.c +++ b/lib/scatterlist.c @@ -422,9 +422,12 @@ static size_t sg_copy_buffer(struct scatterlist *sgl, unsigned int nents, { unsigned int offset = 0; struct sg_mapping_iter miter; + unsigned long flags; sg_miter_start(&miter, sgl, nents, SG_MITER_ATOMIC); + local_irq_save(flags); + while (sg_miter_next(&miter) && offset < buflen) { unsigned int len; @@ -442,6 +445,7 @@ static size_t sg_copy_buffer(struct scatterlist *sgl, unsigned int nents, sg_miter_stop(&miter); + local_irq_restore(flags); return offset; } -- cgit v1.2.3 From 07a2c01a0c2a0cb4581a67d50d4f17cb4d2457c4 Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Fri, 19 Sep 2008 02:02:05 +0900 Subject: convert swiotlb to use dma_get_mask swiotlb can use dma_get_mask() instead of the homegrown function. Signed-off-by: FUJITA Tomonori Cc: tony.luck@intel.com Signed-off-by: Ingo Molnar --- include/linux/dma-mapping.h | 2 +- lib/swiotlb.c | 6 +----- 2 files changed, 2 insertions(+), 6 deletions(-) (limited to 'lib') diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h index 0dba7433af18..ba9114ec5d3a 100644 --- a/include/linux/dma-mapping.h +++ b/include/linux/dma-mapping.h @@ -65,7 +65,7 @@ static inline int is_buffer_dma_capable(u64 mask, dma_addr_t addr, size_t size) static inline u64 dma_get_mask(struct device *dev) { - if (dev->dma_mask && *dev->dma_mask) + if (dev && dev->dma_mask && *dev->dma_mask) return *dev->dma_mask; return DMA_32BIT_MASK; } diff --git a/lib/swiotlb.c b/lib/swiotlb.c index 240a67c2c979..f8eebd489149 100644 --- a/lib/swiotlb.c +++ b/lib/swiotlb.c @@ -276,11 +276,7 @@ cleanup1: static int address_needs_mapping(struct device *hwdev, dma_addr_t addr, size_t size) { - dma_addr_t mask = 0xffffffff; - /* If the device has a mask, use it, otherwise default to 32 bits */ - if (hwdev && hwdev->dma_mask) - mask = *hwdev->dma_mask; - return !is_buffer_dma_capable(mask, addr, size); + return !is_buffer_dma_capable(dma_get_mask(hwdev), addr, size); } static int is_swiotlb_buffer(char *addr) -- cgit v1.2.3 From d26dbc5cf94b0a28acc947285c3b54814a73cb2e Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Mon, 22 Sep 2008 22:35:07 +0900 Subject: iommu: export iommu_area_reserve helper function x86 has set_bit_string() that does the exact same thing that set_bit_area() in lib/iommu-helper.c does. This patch exports set_bit_area() in lib/iommu-helper.c as iommu_area_reserve(), converts GART, Calgary, and AMD IOMMU to use it. Signed-off-by: FUJITA Tomonori Acked-by: Joerg Roedel Signed-off-by: Ingo Molnar --- arch/x86/kernel/amd_iommu.c | 2 +- arch/x86/kernel/pci-calgary_64.c | 2 +- arch/x86/kernel/pci-gart_64.c | 2 +- include/linux/iommu-helper.h | 1 + lib/iommu-helper.c | 5 ++--- 5 files changed, 6 insertions(+), 6 deletions(-) (limited to 'lib') diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 6f7b97445738..70537d117a96 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -572,7 +572,7 @@ static void dma_ops_reserve_addresses(struct dma_ops_domain *dom, if (start_page + pages > last_page) pages = last_page - start_page; - set_bit_string(dom->bitmap, start_page, pages); + iommu_area_reserve(dom->bitmap, start_page, pages); } static void dma_ops_free_pagetable(struct dma_ops_domain *dma_dom) diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c index fe7695e4caae..080d1d27f37a 100644 --- a/arch/x86/kernel/pci-calgary_64.c +++ b/arch/x86/kernel/pci-calgary_64.c @@ -261,7 +261,7 @@ static void iommu_range_reserve(struct iommu_table *tbl, badbit, tbl, start_addr, npages); } - set_bit_string(tbl->it_map, index, npages); + iommu_area_reserve(tbl->it_map, index, npages); spin_unlock_irqrestore(&tbl->it_lock, flags); } diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c index 508ef470b27f..3dcb1ad86e38 100644 --- a/arch/x86/kernel/pci-gart_64.c +++ b/arch/x86/kernel/pci-gart_64.c @@ -827,7 +827,7 @@ void __init gart_iommu_init(void) * Out of IOMMU space handling. * Reserve some invalid pages at the beginning of the GART. */ - set_bit_string(iommu_gart_bitmap, 0, EMERGENCY_PAGES); + iommu_area_reserve(iommu_gart_bitmap, 0, EMERGENCY_PAGES); agp_memory_reserved = iommu_size; printk(KERN_INFO diff --git a/include/linux/iommu-helper.h b/include/linux/iommu-helper.h index 58f41107e4ae..786539e432d7 100644 --- a/include/linux/iommu-helper.h +++ b/include/linux/iommu-helper.h @@ -11,6 +11,7 @@ static inline unsigned long iommu_device_max_index(unsigned long size, extern int iommu_is_span_boundary(unsigned int index, unsigned int nr, unsigned long shift, unsigned long boundary_size); +extern void iommu_area_reserve(unsigned long *map, unsigned long i, int len); extern unsigned long iommu_area_alloc(unsigned long *map, unsigned long size, unsigned long start, unsigned int nr, unsigned long shift, diff --git a/lib/iommu-helper.c b/lib/iommu-helper.c index a3b8d4c3f77a..5d90074dca75 100644 --- a/lib/iommu-helper.c +++ b/lib/iommu-helper.c @@ -30,8 +30,7 @@ again: return index; } -static inline void set_bit_area(unsigned long *map, unsigned long i, - int len) +void iommu_area_reserve(unsigned long *map, unsigned long i, int len) { unsigned long end = i + len; while (i < end) { @@ -64,7 +63,7 @@ again: start = index + 1; goto again; } - set_bit_area(map, index, nr); + iommu_area_reserve(map, index, nr); } return index; } -- cgit v1.2.3 From 2133b5d7ff531bc15a923db4a6a50bf96c561be9 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 2 Oct 2008 16:06:39 -0700 Subject: rcu: RCU-based detection of stalled CPUs for Classic RCU This patch adds stalled-CPU detection to Classic RCU. This capability is enabled by a new config variable CONFIG_RCU_CPU_STALL_DETECTOR, which defaults disabled. This is a debugging feature to detect infinite loops in kernel code, not something that non-kernel-hackers would be expected to care about. This feature can detect looping CPUs in !PREEMPT builds and looping CPUs with preemption disabled in PREEMPT builds. This is essentially a port of this functionality from the treercu patch, replacing the stall debug patch that is already in tip/core/rcu (commit 67182ae1c4). The changes from the patch in tip/core/rcu include making the config variable name match that in treercu, changing from seconds to jiffies to avoid spurious warnings, and printing a boot message when this feature is enabled. Signed-off-by: Paul E. McKenney Signed-off-by: Ingo Molnar --- include/linux/rcuclassic.h | 12 +++- kernel/rcuclassic.c | 166 +++++++++++++++++++++++---------------------- lib/Kconfig.debug | 2 +- 3 files changed, 96 insertions(+), 84 deletions(-) (limited to 'lib') diff --git a/include/linux/rcuclassic.h b/include/linux/rcuclassic.h index 29bf528c7dcc..5f89b62e6983 100644 --- a/include/linux/rcuclassic.h +++ b/include/linux/rcuclassic.h @@ -40,15 +40,21 @@ #include #include +#ifdef CONFIG_RCU_CPU_STALL_DETECTOR +#define RCU_SECONDS_TILL_STALL_CHECK ( 3 * HZ) /* for rcp->jiffies_stall */ +#define RCU_SECONDS_TILL_STALL_RECHECK (30 * HZ) /* for rcp->jiffies_stall */ +#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ /* Global control variables for rcupdate callback mechanism. */ struct rcu_ctrlblk { long cur; /* Current batch number. */ long completed; /* Number of the last completed batch */ long pending; /* Number of the last pending batch */ -#ifdef CONFIG_DEBUG_RCU_STALL - unsigned long gp_check; /* Time grace period should end, in seconds. */ -#endif /* #ifdef CONFIG_DEBUG_RCU_STALL */ +#ifdef CONFIG_RCU_CPU_STALL_DETECTOR + unsigned long gp_start; /* Time at which GP started in jiffies. */ + unsigned long jiffies_stall; + /* Time at which to check for CPU stalls. */ +#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ int signaled; diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c index ed15128ca2c9..0d07e6e51578 100644 --- a/kernel/rcuclassic.c +++ b/kernel/rcuclassic.c @@ -164,6 +164,87 @@ static void __call_rcu(struct rcu_head *head, struct rcu_ctrlblk *rcp, } } +#ifdef CONFIG_RCU_CPU_STALL_DETECTOR + +static void record_gp_stall_check_time(struct rcu_ctrlblk *rcp) +{ + rcp->gp_start = jiffies; + rcp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_CHECK; +} + +static void print_other_cpu_stall(struct rcu_ctrlblk *rcp) +{ + int cpu; + long delta; + unsigned long flags; + + /* Only let one CPU complain about others per time interval. */ + + spin_lock_irqsave(&rcp->lock, flags); + delta = jiffies - rcp->jiffies_stall; + if (delta < 2 || rcp->cur != rcp->completed) { + spin_unlock_irqrestore(&rcp->lock, flags); + return; + } + rcp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_RECHECK; + spin_unlock_irqrestore(&rcp->lock, flags); + + /* OK, time to rat on our buddy... */ + + printk(KERN_ERR "RCU detected CPU stalls:"); + for_each_possible_cpu(cpu) { + if (cpu_isset(cpu, rcp->cpumask)) + printk(" %d", cpu); + } + printk(" (detected by %d, t=%ld jiffies)\n", + smp_processor_id(), (long)(jiffies - rcp->gp_start)); +} + +static void print_cpu_stall(struct rcu_ctrlblk *rcp) +{ + unsigned long flags; + + printk(KERN_ERR "RCU detected CPU %d stall (t=%lu/%lu jiffies)\n", + smp_processor_id(), jiffies, + jiffies - rcp->gp_start); + dump_stack(); + spin_lock_irqsave(&rcp->lock, flags); + if ((long)(jiffies - rcp->jiffies_stall) >= 0) + rcp->jiffies_stall = + jiffies + RCU_SECONDS_TILL_STALL_RECHECK; + spin_unlock_irqrestore(&rcp->lock, flags); + set_need_resched(); /* kick ourselves to get things going. */ +} + +static void check_cpu_stall(struct rcu_ctrlblk *rcp) +{ + long delta; + + delta = jiffies - rcp->jiffies_stall; + if (cpu_isset(smp_processor_id(), rcp->cpumask) && delta >= 0) { + + /* We haven't checked in, so go dump stack. */ + print_cpu_stall(rcp); + + } else if (rcp->cur != rcp->completed && delta >= 2) { + + /* They had two seconds to dump stack, so complain. */ + print_other_cpu_stall(rcp); + } +} + +#else /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ + +static void record_gp_stall_check_time(struct rcu_ctrlblk *rcp) +{ +} + +static void check_cpu_stall(struct rcu_ctrlblk *rcp, struct rcu_data *rdp) +{ +} + +#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ + /** * call_rcu - Queue an RCU callback for invocation after a grace period. * @head: structure to be used for queueing the RCU updates. @@ -293,84 +374,6 @@ static void rcu_do_batch(struct rcu_data *rdp) * period (if necessary). */ -#ifdef CONFIG_DEBUG_RCU_STALL - -static inline void record_gp_check_time(struct rcu_ctrlblk *rcp) -{ - rcp->gp_check = get_seconds() + 3; -} - -static void print_other_cpu_stall(struct rcu_ctrlblk *rcp) -{ - int cpu; - long delta; - unsigned long flags; - - /* Only let one CPU complain about others per time interval. */ - - spin_lock_irqsave(&rcp->lock, flags); - delta = get_seconds() - rcp->gp_check; - if (delta < 2L || cpus_empty(rcp->cpumask)) { - spin_unlock(&rcp->lock); - return; - } - rcp->gp_check = get_seconds() + 30; - spin_unlock_irqrestore(&rcp->lock, flags); - - /* OK, time to rat on our buddy... */ - - printk(KERN_ERR "RCU detected CPU stalls:"); - for_each_cpu_mask(cpu, rcp->cpumask) - printk(" %d", cpu); - printk(" (detected by %d, t=%lu/%lu)\n", - smp_processor_id(), get_seconds(), rcp->gp_check); -} - -static void print_cpu_stall(struct rcu_ctrlblk *rcp) -{ - unsigned long flags; - - printk(KERN_ERR "RCU detected CPU %d stall (t=%lu/%lu)\n", - smp_processor_id(), get_seconds(), rcp->gp_check); - dump_stack(); - spin_lock_irqsave(&rcp->lock, flags); - if ((long)(get_seconds() - rcp->gp_check) >= 0L) - rcp->gp_check = get_seconds() + 30; - spin_unlock_irqrestore(&rcp->lock, flags); -} - -static void check_cpu_stall(struct rcu_ctrlblk *rcp, struct rcu_data *rdp) -{ - long delta; - - delta = get_seconds() - rcp->gp_check; - if (cpu_isset(smp_processor_id(), rcp->cpumask) && delta >= 0L) { - - /* We haven't checked in, so go dump stack. */ - - print_cpu_stall(rcp); - - } else { - if (!cpus_empty(rcp->cpumask) && delta >= 2L) { - /* They had two seconds to dump stack, so complain. */ - print_other_cpu_stall(rcp); - } - } -} - -#else /* #ifdef CONFIG_DEBUG_RCU_STALL */ - -static inline void record_gp_check_time(struct rcu_ctrlblk *rcp) -{ -} - -static inline void -check_cpu_stall(struct rcu_ctrlblk *rcp, struct rcu_data *rdp) -{ -} - -#endif /* #else #ifdef CONFIG_DEBUG_RCU_STALL */ - /* * Register a new batch of callbacks, and start it up if there is currently no * active batch and the batch to be registered has not already occurred. @@ -381,7 +384,7 @@ static void rcu_start_batch(struct rcu_ctrlblk *rcp) if (rcp->cur != rcp->pending && rcp->completed == rcp->cur) { rcp->cur++; - record_gp_check_time(rcp); + record_gp_stall_check_time(rcp); /* * Accessing nohz_cpu_mask before incrementing rcp->cur needs a @@ -603,7 +606,7 @@ static void rcu_process_callbacks(struct softirq_action *unused) static int __rcu_pending(struct rcu_ctrlblk *rcp, struct rcu_data *rdp) { /* Check for CPU stalls, if enabled. */ - check_cpu_stall(rcp, rdp); + check_cpu_stall(rcp); if (rdp->nxtlist) { long completed_snap = ACCESS_ONCE(rcp->completed); @@ -769,6 +772,9 @@ static struct notifier_block __cpuinitdata rcu_nb = { */ void __init __rcu_init(void) { +#ifdef CONFIG_RCU_CPU_STALL_DETECTOR + printk(KERN_INFO "RCU-based detection of stalled CPUs is enabled.\n"); +#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE, (void *)(long)smp_processor_id()); /* Register notifier for non-boot CPUs */ diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index ccede1aeab38..9fee969dd60e 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -597,7 +597,7 @@ config RCU_TORTURE_TEST_RUNNABLE Say N here if you want the RCU torture tests to start only after being manually enabled via /proc. -config RCU_CPU_STALL +config RCU_CPU_STALL_DETECTOR bool "Check for stalled CPUs delaying RCU grace periods" depends on CLASSIC_RCU default n -- cgit v1.2.3 From 3c9f3681d0b4af09c1cbf04f92fdfb72bd81ad7b Mon Sep 17 00:00:00 2001 From: James Bottomley Date: Sun, 31 Aug 2008 10:13:54 -0500 Subject: [SCSI] lib: add generic helper to print sizes rounded to the correct SI range This patch adds the ability to print sizes in either units of 10^3 (SI) or 2^10 (Binary) units. It rounds up to three significant figures and can be used for either memory or storage capacities. Oh, and I'm fully aware that 64 bits is only 16EiB ... the Zetta and Yotta units are added for future proofing against the day we have 128 bit computers ... [fujita.tomonori@lab.ntt.co.jp: fix missed unsigned long long cast] Signed-off-by: James Bottomley --- include/linux/string_helpers.h | 16 +++++++++++ lib/Makefile | 3 +- lib/string_helpers.c | 64 ++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 82 insertions(+), 1 deletion(-) create mode 100644 include/linux/string_helpers.h create mode 100644 lib/string_helpers.c (limited to 'lib') diff --git a/include/linux/string_helpers.h b/include/linux/string_helpers.h new file mode 100644 index 000000000000..a3eb2f65b656 --- /dev/null +++ b/include/linux/string_helpers.h @@ -0,0 +1,16 @@ +#ifndef _LINUX_STRING_HELPERS_H_ +#define _LINUX_STRING_HELPERS_H_ + +#include + +/* Descriptions of the types of units to + * print in */ +enum string_size_units { + STRING_UNITS_10, /* use powers of 10^3 (standard SI) */ + STRING_UNITS_2, /* use binary powers of 2^10 */ +}; + +int string_get_size(u64 size, enum string_size_units units, + char *buf, int len); + +#endif diff --git a/lib/Makefile b/lib/Makefile index 3b1f94bbe9de..44001af76a7d 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -19,7 +19,8 @@ lib-$(CONFIG_SMP) += cpumask.o lib-y += kobject.o kref.o klist.o obj-y += bcd.o div64.o sort.o parser.o halfmd4.o debug_locks.o random32.o \ - bust_spinlocks.o hexdump.o kasprintf.o bitmap.o scatterlist.o + bust_spinlocks.o hexdump.o kasprintf.o bitmap.o scatterlist.o \ + string_helpers.o ifeq ($(CONFIG_DEBUG_KOBJECT),y) CFLAGS_kobject.o += -DDEBUG diff --git a/lib/string_helpers.c b/lib/string_helpers.c new file mode 100644 index 000000000000..8347925030ff --- /dev/null +++ b/lib/string_helpers.c @@ -0,0 +1,64 @@ +/* + * Helpers for formatting and printing strings + * + * Copyright 31 August 2008 James Bottomley + */ +#include +#include +#include +#include + +/** + * string_get_size - get the size in the specified units + * @size: The size to be converted + * @units: units to use (powers of 1000 or 1024) + * @buf: buffer to format to + * @len: length of buffer + * + * This function returns a string formatted to 3 significant figures + * giving the size in the required units. Returns 0 on success or + * error on failure. @buf is always zero terminated. + * + */ +int string_get_size(u64 size, const enum string_size_units units, + char *buf, int len) +{ + const char *units_10[] = { "B", "KB", "MB", "GB", "TB", "PB", + "EB", "ZB", "YB", NULL}; + const char *units_2[] = {"B", "KiB", "MiB", "GiB", "TiB", "PiB", + "EiB", "ZiB", "YiB", NULL }; + const char **units_str[] = { + [STRING_UNITS_10] = units_10, + [STRING_UNITS_2] = units_2, + }; + const int divisor[] = { + [STRING_UNITS_10] = 1000, + [STRING_UNITS_2] = 1024, + }; + int i, j; + u64 remainder = 0, sf_cap; + char tmp[8]; + + tmp[0] = '\0'; + + for (i = 0; size > divisor[units] && units_str[units][i]; i++) + remainder = do_div(size, divisor[units]); + + sf_cap = size; + for (j = 0; sf_cap*10 < 1000; j++) + sf_cap *= 10; + + if (j) { + remainder *= 1000; + do_div(remainder, divisor[units]); + snprintf(tmp, sizeof(tmp), ".%03lld", + (unsigned long long)remainder); + tmp[j+1] = '\0'; + } + + snprintf(buf, len, "%lld%s%s", (unsigned long long)size, + tmp, units_str[units][i]); + + return 0; +} +EXPORT_SYMBOL(string_get_size); -- cgit v1.2.3 From a1ed5b0cffe4b16a93a6a3390e8cee0fbef94f86 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 25 Aug 2008 19:50:16 +0200 Subject: klist: don't iterate over deleted entries A klist entry is kept on the list till all its current iterations are finished; however, a new iteration after deletion also iterates over deleted entries as long as their reference count stays above zero. This causes problems for cases where there are users which iterate over the list while synchronized against list manipulations and natuarally expect already deleted entries to not show up during iteration. This patch implements dead flag which gets set on deletion so that iteration can skip already deleted entries. The dead flag piggy backs on the lowest bit of knode->n_klist and only visible to klist implementation proper. While at it, drop klist_iter->i_head as it's redundant and doesn't offer anything in semantics or performance wise as klist_iter->i_klist is dereferenced on every iteration anyway. Signed-off-by: Tejun Heo Cc: Greg Kroah-Hartman Cc: Alan Stern Cc: Jens Axboe Signed-off-by: Jens Axboe --- include/linux/klist.h | 3 +- lib/klist.c | 96 +++++++++++++++++++++++++++++++++++++-------------- 2 files changed, 71 insertions(+), 28 deletions(-) (limited to 'lib') diff --git a/include/linux/klist.h b/include/linux/klist.h index 06c338ef7f1b..8ea98db223e5 100644 --- a/include/linux/klist.h +++ b/include/linux/klist.h @@ -38,7 +38,7 @@ extern void klist_init(struct klist *k, void (*get)(struct klist_node *), void (*put)(struct klist_node *)); struct klist_node { - struct klist *n_klist; + void *n_klist; /* never access directly */ struct list_head n_node; struct kref n_ref; struct completion n_removed; @@ -57,7 +57,6 @@ extern int klist_node_attached(struct klist_node *n); struct klist_iter { struct klist *i_klist; - struct list_head *i_head; struct klist_node *i_cur; }; diff --git a/lib/klist.c b/lib/klist.c index cca37f96faa2..bbdd3015c2c7 100644 --- a/lib/klist.c +++ b/lib/klist.c @@ -37,6 +37,37 @@ #include #include +/* + * Use the lowest bit of n_klist to mark deleted nodes and exclude + * dead ones from iteration. + */ +#define KNODE_DEAD 1LU +#define KNODE_KLIST_MASK ~KNODE_DEAD + +static struct klist *knode_klist(struct klist_node *knode) +{ + return (struct klist *) + ((unsigned long)knode->n_klist & KNODE_KLIST_MASK); +} + +static bool knode_dead(struct klist_node *knode) +{ + return (unsigned long)knode->n_klist & KNODE_DEAD; +} + +static void knode_set_klist(struct klist_node *knode, struct klist *klist) +{ + knode->n_klist = klist; + /* no knode deserves to start its life dead */ + WARN_ON(knode_dead(knode)); +} + +static void knode_kill(struct klist_node *knode) +{ + /* and no knode should die twice ever either, see we're very humane */ + WARN_ON(knode_dead(knode)); + *(unsigned long *)&knode->n_klist |= KNODE_DEAD; +} /** * klist_init - Initialize a klist structure. @@ -79,7 +110,7 @@ static void klist_node_init(struct klist *k, struct klist_node *n) INIT_LIST_HEAD(&n->n_node); init_completion(&n->n_removed); kref_init(&n->n_ref); - n->n_klist = k; + knode_set_klist(n, k); if (k->get) k->get(n); } @@ -115,7 +146,7 @@ EXPORT_SYMBOL_GPL(klist_add_tail); */ void klist_add_after(struct klist_node *n, struct klist_node *pos) { - struct klist *k = pos->n_klist; + struct klist *k = knode_klist(pos); klist_node_init(k, n); spin_lock(&k->k_lock); @@ -131,7 +162,7 @@ EXPORT_SYMBOL_GPL(klist_add_after); */ void klist_add_before(struct klist_node *n, struct klist_node *pos) { - struct klist *k = pos->n_klist; + struct klist *k = knode_klist(pos); klist_node_init(k, n); spin_lock(&k->k_lock); @@ -144,9 +175,10 @@ static void klist_release(struct kref *kref) { struct klist_node *n = container_of(kref, struct klist_node, n_ref); + WARN_ON(!knode_dead(n)); list_del(&n->n_node); complete(&n->n_removed); - n->n_klist = NULL; + knode_set_klist(n, NULL); } static int klist_dec_and_del(struct klist_node *n) @@ -154,22 +186,29 @@ static int klist_dec_and_del(struct klist_node *n) return kref_put(&n->n_ref, klist_release); } -/** - * klist_del - Decrement the reference count of node and try to remove. - * @n: node we're deleting. - */ -void klist_del(struct klist_node *n) +static void klist_put(struct klist_node *n, bool kill) { - struct klist *k = n->n_klist; + struct klist *k = knode_klist(n); void (*put)(struct klist_node *) = k->put; spin_lock(&k->k_lock); + if (kill) + knode_kill(n); if (!klist_dec_and_del(n)) put = NULL; spin_unlock(&k->k_lock); if (put) put(n); } + +/** + * klist_del - Decrement the reference count of node and try to remove. + * @n: node we're deleting. + */ +void klist_del(struct klist_node *n) +{ + klist_put(n, true); +} EXPORT_SYMBOL_GPL(klist_del); /** @@ -206,7 +245,6 @@ void klist_iter_init_node(struct klist *k, struct klist_iter *i, struct klist_node *n) { i->i_klist = k; - i->i_head = &k->k_list; i->i_cur = n; if (n) kref_get(&n->n_ref); @@ -237,7 +275,7 @@ EXPORT_SYMBOL_GPL(klist_iter_init); void klist_iter_exit(struct klist_iter *i) { if (i->i_cur) { - klist_del(i->i_cur); + klist_put(i->i_cur, false); i->i_cur = NULL; } } @@ -258,27 +296,33 @@ static struct klist_node *to_klist_node(struct list_head *n) */ struct klist_node *klist_next(struct klist_iter *i) { - struct list_head *next; - struct klist_node *lnode = i->i_cur; - struct klist_node *knode = NULL; void (*put)(struct klist_node *) = i->i_klist->put; + struct klist_node *last = i->i_cur; + struct klist_node *next; spin_lock(&i->i_klist->k_lock); - if (lnode) { - next = lnode->n_node.next; - if (!klist_dec_and_del(lnode)) + + if (last) { + next = to_klist_node(last->n_node.next); + if (!klist_dec_and_del(last)) put = NULL; } else - next = i->i_head->next; + next = to_klist_node(i->i_klist->k_list.next); - if (next != i->i_head) { - knode = to_klist_node(next); - kref_get(&knode->n_ref); + i->i_cur = NULL; + while (next != to_klist_node(&i->i_klist->k_list)) { + if (likely(!knode_dead(next))) { + kref_get(&next->n_ref); + i->i_cur = next; + break; + } + next = to_klist_node(next->n_node.next); } - i->i_cur = knode; + spin_unlock(&i->i_klist->k_lock); - if (put && lnode) - put(lnode); - return knode; + + if (put && last) + put(last); + return i->i_cur; } EXPORT_SYMBOL_GPL(klist_next); -- cgit v1.2.3 From 870d6656126add8e383645732b03df2b7ccd4f94 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 25 Aug 2008 19:47:25 +0900 Subject: block: implement CONFIG_DEBUG_BLOCK_EXT_DEVT Extended devt introduces non-contiguos device numbers. This patch implements a debug option which forces most devt allocations to be from the extended area and spreads them out. This is enabled by default if DEBUG_KERNEL is set and achieves... 1. Detects code paths in kernel or userland which expect predetermined consecutive device numbers. 2. When something goes wrong, avoid corruption as adding to the minor of earlier partition won't lead to the wrong but valid device. Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- block/genhd.c | 38 +++++++++++++++++++++++++++++++++++--- drivers/ide/ide-disk.c | 6 ++++++ drivers/scsi/sd.c | 6 ++++++ lib/Kconfig.debug | 16 ++++++++++++++++ 4 files changed, 63 insertions(+), 3 deletions(-) (limited to 'lib') diff --git a/block/genhd.c b/block/genhd.c index ee4b13520e59..67e5a59ced2a 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -298,6 +298,38 @@ EXPORT_SYMBOL(unregister_blkdev); static struct kobj_map *bdev_map; +/** + * blk_mangle_minor - scatter minor numbers apart + * @minor: minor number to mangle + * + * Scatter consecutively allocated @minor number apart if MANGLE_DEVT + * is enabled. Mangling twice gives the original value. + * + * RETURNS: + * Mangled value. + * + * CONTEXT: + * Don't care. + */ +static int blk_mangle_minor(int minor) +{ +#ifdef CONFIG_DEBUG_BLOCK_EXT_DEVT + int i; + + for (i = 0; i < MINORBITS / 2; i++) { + int low = minor & (1 << i); + int high = minor & (1 << (MINORBITS - 1 - i)); + int distance = MINORBITS - 1 - 2 * i; + + minor ^= low | high; /* clear both bits */ + low <<= distance; /* swap the positions */ + high >>= distance; + minor |= low | high; /* and set */ + } +#endif + return minor; +} + /** * blk_alloc_devt - allocate a dev_t for a partition * @part: partition to allocate dev_t for @@ -339,7 +371,7 @@ int blk_alloc_devt(struct hd_struct *part, dev_t *devt) return -EBUSY; } - *devt = MKDEV(BLOCK_EXT_MAJOR, idx); + *devt = MKDEV(BLOCK_EXT_MAJOR, blk_mangle_minor(idx)); return 0; } @@ -361,7 +393,7 @@ void blk_free_devt(dev_t devt) if (MAJOR(devt) == BLOCK_EXT_MAJOR) { mutex_lock(&ext_devt_mutex); - idr_remove(&ext_devt_idr, MINOR(devt)); + idr_remove(&ext_devt_idr, blk_mangle_minor(MINOR(devt))); mutex_unlock(&ext_devt_mutex); } } @@ -473,7 +505,7 @@ struct gendisk *get_gendisk(dev_t devt, int *partno) struct hd_struct *part; mutex_lock(&ext_devt_mutex); - part = idr_find(&ext_devt_idr, MINOR(devt)); + part = idr_find(&ext_devt_idr, blk_mangle_minor(MINOR(devt))); if (part && get_disk(part_to_disk(part))) { *partno = part->partno; disk = part_to_disk(part); diff --git a/drivers/ide/ide-disk.c b/drivers/ide/ide-disk.c index 7a88de9ada29..a072df5053ae 100644 --- a/drivers/ide/ide-disk.c +++ b/drivers/ide/ide-disk.c @@ -42,7 +42,13 @@ #include #define IDE_DISK_PARTS (1 << PARTN_BITS) + +#if !defined(CONFIG_DEBUG_BLOCK_EXT_DEVT) #define IDE_DISK_MINORS IDE_DISK_PARTS +#else +#define IDE_DISK_MINORS 1 +#endif + #define IDE_DISK_EXT_MINORS (IDE_DISK_PARTS - IDE_DISK_MINORS) struct ide_disk_obj { diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index d1bb0e1d2d28..280d231a86ed 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -87,7 +87,13 @@ MODULE_ALIAS_SCSI_DEVICE(TYPE_MOD); MODULE_ALIAS_SCSI_DEVICE(TYPE_RBC); #define SD_PARTS 64 + +#if !defined(CONFIG_DEBUG_BLOCK_EXT_DEVT) #define SD_MINORS 16 +#else +#define SD_MINORS 1 +#endif + #define SD_EXT_MINORS (SD_PARTS - SD_MINORS) static int sd_revalidate_disk(struct gendisk *); diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 0b504814e378..5a536f703a83 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -624,6 +624,22 @@ config BACKTRACE_SELF_TEST Say N if you are unsure. +config DEBUG_BLOCK_EXT_DEVT + bool "Force extended block device numbers and spread them" + depends on DEBUG_KERNEL + depends on BLOCK + default y + help + Conventionally, block device numbers are allocated from + predetermined contiguous area. However, extended block area + may introduce non-contiguous block device numbers. This + option forces most block device numbers to be allocated from + the extended space and spreads them to discover kernel or + userland code paths which assume predetermined contiguous + device number allocation. + + Say N if you are unsure. + config LKDTM tristate "Linux Kernel Dump Test Tool Module" depends on DEBUG_KERNEL -- cgit v1.2.3 From 759f8ca3048f7438aa3129268d7252552505d662 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 29 Aug 2008 09:06:29 +0200 Subject: Change default value of CONFIG_DEBUG_BLOCK_EXT_DEVT to 'n' It's a debug option that you would explicitly enable to test this feature, we should default it to 'n' to prevent accidental surprises for now. Signed-off-by: Jens Axboe --- lib/Kconfig.debug | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'lib') diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 5a536f703a83..4378d5e923ca 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -628,7 +628,7 @@ config DEBUG_BLOCK_EXT_DEVT bool "Force extended block device numbers and spread them" depends on DEBUG_KERNEL depends on BLOCK - default y + default n help Conventionally, block device numbers are allocated from predetermined contiguous area. However, extended block area -- cgit v1.2.3 From 55dc7db70a73a3809a2334063c9b5b0d8ccebdaa Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 1 Sep 2008 13:44:35 +0200 Subject: init: DEBUG_BLOCK_EXT_DEVT requires explicit root= param DEBUG_BLOCK_EXT_DEVT shuffles SCSI and IDE device numbers and root device number set using rdev become meaningless. Root devices should be explicitly specified using textual names. Warn about it if root can't be found and DEBUG_BLOCK_EXT_DEVT is enabled. Also, add warning to the help text. Signed-off-by: Tejun Heo Cc: Bartlomiej Zolnierkiewicz Signed-off-by: Jens Axboe --- init/do_mounts.c | 4 ++++ lib/Kconfig.debug | 6 ++++++ 2 files changed, 10 insertions(+) (limited to 'lib') diff --git a/init/do_mounts.c b/init/do_mounts.c index 3715feb8446d..d055b1914c3d 100644 --- a/init/do_mounts.c +++ b/init/do_mounts.c @@ -263,6 +263,10 @@ retry: printk("Please append a correct \"root=\" boot option; here are the available partitions:\n"); printk_all_partitions(); +#ifdef CONFIG_DEBUG_BLOCK_EXT_DEVT + printk("DEBUG_BLOCK_EXT_DEVT is enabled, you need to specify " + "explicit textual name for \"root=\" boot option.\n"); +#endif panic("VFS: Unable to mount root fs on %s", b); } diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 4378d5e923ca..c556896abe57 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -638,6 +638,12 @@ config DEBUG_BLOCK_EXT_DEVT userland code paths which assume predetermined contiguous device number allocation. + Note that turning on this debug option shuffles all the + device numbers for all IDE and SCSI devices including libata + ones, so root partition specified using device number + directly (via rdev or root=MAJ:MIN) won't work anymore. + Textual device names (root=/dev/sdXn) will continue to work. + Say N if you are unsure. config LKDTM -- cgit v1.2.3 From 581d4e28d9195aa8b2231383dbabc288988d615e Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sun, 14 Sep 2008 05:56:33 -0700 Subject: block: add fault injection mechanism for faking request timeouts Only works for the generic request timer handling. Allows one to sporadically ignore request completions, thus exercising the timeout handling. Signed-off-by: Jens Axboe --- block/blk-softirq.c | 2 ++ block/blk-timeout.c | 59 ++++++++++++++++++++++++++++++++++++++++++++++++++ block/blk.h | 12 ++++++++++ block/genhd.c | 8 +++++++ include/linux/blkdev.h | 1 + lib/Kconfig.debug | 13 ++++++++++- 6 files changed, 94 insertions(+), 1 deletion(-) (limited to 'lib') diff --git a/block/blk-softirq.c b/block/blk-softirq.c index 7ab344afb16f..e660d26ca656 100644 --- a/block/blk-softirq.c +++ b/block/blk-softirq.c @@ -154,6 +154,8 @@ do_local: **/ void blk_complete_request(struct request *req) { + if (unlikely(blk_should_fake_timeout(req->q))) + return; if (!blk_mark_rq_complete(req)) __blk_complete_request(req); } diff --git a/block/blk-timeout.c b/block/blk-timeout.c index 6e5c781c5af1..9b4ad138bb33 100644 --- a/block/blk-timeout.c +++ b/block/blk-timeout.c @@ -4,9 +4,68 @@ #include #include #include +#include #include "blk.h" +#ifdef CONFIG_FAIL_IO_TIMEOUT + +static DECLARE_FAULT_ATTR(fail_io_timeout); + +static int __init setup_fail_io_timeout(char *str) +{ + return setup_fault_attr(&fail_io_timeout, str); +} +__setup("fail_io_timeout=", setup_fail_io_timeout); + +int blk_should_fake_timeout(struct request_queue *q) +{ + if (!test_bit(QUEUE_FLAG_FAIL_IO, &q->queue_flags)) + return 0; + + return should_fail(&fail_io_timeout, 1); +} + +static int __init fail_io_timeout_debugfs(void) +{ + return init_fault_attr_dentries(&fail_io_timeout, "fail_io_timeout"); +} + +late_initcall(fail_io_timeout_debugfs); + +ssize_t part_timeout_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct gendisk *disk = dev_to_disk(dev); + int set = test_bit(QUEUE_FLAG_FAIL_IO, &disk->queue->queue_flags); + + return sprintf(buf, "%d\n", set != 0); +} + +ssize_t part_timeout_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count) +{ + struct gendisk *disk = dev_to_disk(dev); + int val; + + if (count) { + struct request_queue *q = disk->queue; + char *p = (char *) buf; + + val = simple_strtoul(p, &p, 10); + spin_lock_irq(q->queue_lock); + if (val) + queue_flag_set(QUEUE_FLAG_FAIL_IO, q); + else + queue_flag_clear(QUEUE_FLAG_FAIL_IO, q); + spin_unlock_irq(q->queue_lock); + } + + return count; +} + +#endif /* CONFIG_FAIL_IO_TIMEOUT */ + /* * blk_delete_timer - Delete/cancel timer for a given function. * @req: request that we are canceling timer for diff --git a/block/blk.h b/block/blk.h index a4f4a50aefaa..e5c579769963 100644 --- a/block/blk.h +++ b/block/blk.h @@ -42,6 +42,18 @@ static inline void blk_clear_rq_complete(struct request *rq) clear_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags); } +#ifdef CONFIG_FAIL_IO_TIMEOUT +int blk_should_fake_timeout(struct request_queue *); +ssize_t part_timeout_show(struct device *, struct device_attribute *, char *); +ssize_t part_timeout_store(struct device *, struct device_attribute *, + const char *, size_t); +#else +static inline int blk_should_fake_timeout(struct request_queue *q) +{ + return 0; +} +#endif + struct io_context *current_io_context(gfp_t gfp_flags, int node); int ll_back_merge_fn(struct request_queue *q, struct request *req, diff --git a/block/genhd.c b/block/genhd.c index 8acaff0154e3..4cd3433c99ac 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -817,6 +817,11 @@ static DEVICE_ATTR(stat, S_IRUGO, part_stat_show, NULL); static struct device_attribute dev_attr_fail = __ATTR(make-it-fail, S_IRUGO|S_IWUSR, part_fail_show, part_fail_store); #endif +#ifdef CONFIG_FAIL_IO_TIMEOUT +static struct device_attribute dev_attr_fail_timeout = + __ATTR(io-timeout-fail, S_IRUGO|S_IWUSR, part_timeout_show, + part_timeout_store); +#endif static struct attribute *disk_attrs[] = { &dev_attr_range.attr, @@ -828,6 +833,9 @@ static struct attribute *disk_attrs[] = { &dev_attr_stat.attr, #ifdef CONFIG_FAIL_MAKE_REQUEST &dev_attr_fail.attr, +#endif +#ifdef CONFIG_FAIL_IO_TIMEOUT + &dev_attr_fail_timeout.attr, #endif NULL }; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index b47767c72ce3..e34999d14c16 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -440,6 +440,7 @@ struct request_queue #define QUEUE_FLAG_BIDI 9 /* queue supports bidi requests */ #define QUEUE_FLAG_NOMERGES 10 /* disable merge attempts */ #define QUEUE_FLAG_SAME_COMP 11 /* force complete on same CPU */ +#define QUEUE_FLAG_FAIL_IO 12 /* fake timeout */ static inline int queue_is_locked(struct request_queue *q) { diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index c556896abe57..7d7a31d0ddeb 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -683,10 +683,21 @@ config FAIL_PAGE_ALLOC config FAIL_MAKE_REQUEST bool "Fault-injection capability for disk IO" - depends on FAULT_INJECTION + depends on FAULT_INJECTION && BLOCK help Provide fault-injection capability for disk IO. +config FAIL_IO_TIMEOUT + bool "Faul-injection capability for faking disk interrupts" + depends on FAULT_INJECTION && BLOCK + help + Provide fault-injection capability on end IO handling. This + will make the block layer "forget" an interrupt as configured, + thus exercising the error handling. + + Only works with drivers that use the generic timeout handling, + for others it wont do anything. + config FAULT_INJECTION_DEBUG_FS bool "Debugfs entries for fault-injection capabilities" depends on FAULT_INJECTION && SYSFS && DEBUG_FS -- cgit v1.2.3