From 454ed842d55740160334efc9ad56cfef54ed37bc Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Mon, 11 Aug 2008 09:30:25 +0200
Subject: lockdep: annotate mm_take_all_locks()

The nesting is correct due to holding mmap_sem; use the new annotation
to annotate this.

Signed-off-by: Peter Zijlstra
Signed-off-by: Ingo Molnar
---
 mm/mmap.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'mm')

diff --git a/mm/mmap.c b/mm/mmap.c
index 245c3d69067b..5d09d08a4120 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -2273,14 +2273,14 @@ int install_special_mapping(struct mm_struct *mm,
 
 static DEFINE_MUTEX(mm_all_locks_mutex);
 
-static void vm_lock_anon_vma(struct anon_vma *anon_vma)
+static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma)
 {
         if (!test_bit(0, (unsigned long *) &anon_vma->head.next)) {
                 /*
                  * The LSB of head.next can't change from under us
                  * because we hold the mm_all_locks_mutex.
                  */
-                spin_lock(&anon_vma->lock);
+                spin_lock_nest_lock(&anon_vma->lock, &mm->mmap_sem);
                 /*
                  * We can safely modify head.next after taking the
                  * anon_vma->lock. If some other vma in this mm shares
@@ -2296,7 +2296,7 @@ static void vm_lock_anon_vma(struct anon_vma *anon_vma)
         }
 }
 
-static void vm_lock_mapping(struct address_space *mapping)
+static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping)
 {
         if (!test_bit(AS_MM_ALL_LOCKS, &mapping->flags)) {
                 /*
@@ -2310,7 +2310,7 @@ static void vm_lock_mapping(struct address_space *mapping)
                  */
                 if (test_and_set_bit(AS_MM_ALL_LOCKS, &mapping->flags))
                         BUG();
-                spin_lock(&mapping->i_mmap_lock);
+                spin_lock_nest_lock(&mapping->i_mmap_lock, &mm->mmap_sem);
         }
 }
 
@@ -2359,9 +2359,9 @@ int mm_take_all_locks(struct mm_struct *mm)
                 if (signal_pending(current))
                         goto out_unlock;
                 if (vma->anon_vma)
-                        vm_lock_anon_vma(vma->anon_vma);
+                        vm_lock_anon_vma(mm, vma->anon_vma);
                 if (vma->vm_file && vma->vm_file->f_mapping)
-                        vm_lock_mapping(vma->vm_file->f_mapping);
+                        vm_lock_mapping(mm, vma->vm_file->f_mapping);
         }
 
         ret = 0;
--
cgit v1.2.3

From 7cd5a02f54f4c9d16cf7fdffa2122bc73bb09b43 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Mon, 11 Aug 2008 09:30:25 +0200
Subject: mm: fix mm_take_all_locks() locking order

Lockdep spotted:

=======================================================
[ INFO: possible circular locking dependency detected ]
2.6.27-rc1 #270
-------------------------------------------------------
qemu-kvm/2033 is trying to acquire lock:
 (&inode->i_data.i_mmap_lock){----}, at: [] mm_take_all_locks+0xc2/0xea

but task is already holding lock:
 (&anon_vma->lock){----}, at: [] mm_take_all_locks+0x70/0xea

which lock already depends on the new lock.

the existing dependency chain (in reverse order) is:

-> #1 (&anon_vma->lock){----}:
       [] __lock_acquire+0x11be/0x14d2
       [] lock_acquire+0x5e/0x7a
       [] _spin_lock+0x3b/0x47
       [] vma_adjust+0x200/0x444
       [] split_vma+0x12f/0x146
       [] mprotect_fixup+0x13c/0x536
       [] sys_mprotect+0x1a9/0x21e
       [] system_call_fastpath+0x16/0x1b
       [] 0xffffffffffffffff

-> #0 (&inode->i_data.i_mmap_lock){----}:
       [] __lock_acquire+0xedb/0x14d2
       [] lock_release_non_nested+0x1c2/0x219
       [] lock_release+0x127/0x14a
       [] _spin_unlock+0x1e/0x50
       [] mm_drop_all_locks+0x7f/0xb0
       [] do_mmu_notifier_register+0xe2/0x112
       [] mmu_notifier_register+0xe/0x10
       [] kvm_dev_ioctl+0x11e/0x287 [kvm]
       [] vfs_ioctl+0x2a/0x78
       [] do_vfs_ioctl+0x257/0x274
       [] sys_ioctl+0x55/0x78
       [] system_call_fastpath+0x16/0x1b
       [] 0xffffffffffffffff

other info that might help us debug this:

5 locks held by qemu-kvm/2033:
 #0:  (&mm->mmap_sem){----}, at: [] do_mmu_notifier_register+0x55/0x112
 #1:  (mm_all_locks_mutex){--..}, at: [] mm_take_all_locks+0x34/0xea
 #2:  (&anon_vma->lock){----}, at: [] mm_take_all_locks+0x70/0xea
 #3:  (&anon_vma->lock){----}, at: [] mm_take_all_locks+0x70/0xea
 #4:  (&anon_vma->lock){----}, at: [] mm_take_all_locks+0x70/0xea

stack backtrace:
Pid: 2033, comm: qemu-kvm Not tainted 2.6.27-rc1 #270

Call Trace:
 [] print_circular_bug_tail+0xb8/0xc3
 [] __lock_acquire+0xedb/0x14d2
 [] ? add_lock_to_list+0x7e/0xad
 [] ? mm_take_all_locks+0x70/0xea
 [] ? mm_take_all_locks+0x70/0xea
 [] lock_release_non_nested+0x1c2/0x219
 [] ? mm_take_all_locks+0xc2/0xea
 [] ? mm_take_all_locks+0xc2/0xea
 [] ? trace_hardirqs_on_caller+0x4d/0x115
 [] ? mm_drop_all_locks+0x7f/0xb0
 [] lock_release+0x127/0x14a
 [] _spin_unlock+0x1e/0x50
 [] mm_drop_all_locks+0x7f/0xb0
 [] do_mmu_notifier_register+0xe2/0x112
 [] mmu_notifier_register+0xe/0x10
 [] kvm_dev_ioctl+0x11e/0x287 [kvm]
 [] ? file_has_perm+0x83/0x8e
 [] vfs_ioctl+0x2a/0x78
 [] do_vfs_ioctl+0x257/0x274
 [] sys_ioctl+0x55/0x78
 [] system_call_fastpath+0x16/0x1b

Which the locking hierarchy in mm/rmap.c confirms as valid.

Fix this by first taking all the mapping->i_mmap_lock instances and then
taking all anon_vma->lock instances.

Signed-off-by: Peter Zijlstra
Acked-by: Hugh Dickins
Signed-off-by: Ingo Molnar
---
 mm/mmap.c | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

(limited to 'mm')

diff --git a/mm/mmap.c b/mm/mmap.c
index 5d09d08a4120..32a287b631d4 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -2358,11 +2358,17 @@ int mm_take_all_locks(struct mm_struct *mm)
         for (vma = mm->mmap; vma; vma = vma->vm_next) {
                 if (signal_pending(current))
                         goto out_unlock;
-                if (vma->anon_vma)
-                        vm_lock_anon_vma(mm, vma->anon_vma);
                 if (vma->vm_file && vma->vm_file->f_mapping)
                         vm_lock_mapping(mm, vma->vm_file->f_mapping);
         }
+
+        for (vma = mm->mmap; vma; vma = vma->vm_next) {
+                if (signal_pending(current))
+                        goto out_unlock;
+                if (vma->anon_vma)
+                        vm_lock_anon_vma(mm, vma->anon_vma);
+        }
+
         ret = 0;
 
 out_unlock:
--
cgit v1.2.3

From 912985dce45ef18fcdd9f5439fef054e0e22302a Mon Sep 17 00:00:00 2001
From: Rusty Russell
Date: Tue, 12 Aug 2008 17:52:52 -0500
Subject: mm: Make generic weak get_user_pages_fast and EXPORT_GPL it

Out of line get_user_pages_fast fallback implementation, make it a
weak symbol, get rid of CONFIG_HAVE_GET_USER_PAGES_FAST.

Export the symbol to modules so lguest can use it.
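As background, the weak-symbol mechanism used here is standard GCC/ELF
linking behaviour: a definition marked __attribute__((weak)) is used only
when no strong definition of the same symbol is linked in. A minimal
standalone sketch of that behaviour (the function name and values are
invented for illustration; this is not kernel code):

/* fallback.c - a generic implementation, marked weak */
#include <stdio.h>

int __attribute__((weak)) do_fast_path(int n)
{
        /* Portable fallback. Any strong do_fast_path() in another object
         * file silently replaces this at link time, just as an arch's
         * gup.o replaces the weak get_user_pages_fast() below. */
        return n + 1;
}

int main(void)
{
        printf("%d\n", do_fast_path(41));
        return 0;
}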
Signed-off-by: Nick Piggin
Signed-off-by: Rusty Russell
---
 arch/powerpc/Kconfig |  3 ---
 arch/x86/Kconfig     |  1 -
 arch/x86/mm/Makefile |  3 +--
 include/linux/mm.h   | 20 --------------------
 mm/Kconfig           |  3 ---
 mm/util.c            | 15 +++++++++++++++
 6 files changed, 16 insertions(+), 29 deletions(-)

(limited to 'mm')

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 63c9cafda9c4..587da5e0990f 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -42,9 +42,6 @@ config GENERIC_HARDIRQS
         bool
         default y
 
-config HAVE_GET_USER_PAGES_FAST
-        def_bool PPC64
-
 config HAVE_SETUP_PER_CPU_AREA
         def_bool PPC64
 
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 3d0f2b6a5a16..ac2fb0641a04 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -22,7 +22,6 @@ config X86
         select HAVE_IDE
         select HAVE_OPROFILE
         select HAVE_IOREMAP_PROT
-        select HAVE_GET_USER_PAGES_FAST
         select HAVE_KPROBES
         select ARCH_WANT_OPTIONAL_GPIOLIB
         select HAVE_KRETPROBES
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index 2977ea37791f..dfb932dcf136 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -1,7 +1,6 @@
 obj-y := init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \
-        pat.o pgtable.o
+        pat.o pgtable.o gup.o
 
-obj-$(CONFIG_HAVE_GET_USER_PAGES_FAST) += gup.o
 obj-$(CONFIG_X86_32) += pgtable_32.o
 
 obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 335288bff1b7..fa651609b65d 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -834,7 +834,6 @@ extern int mprotect_fixup(struct vm_area_struct *vma,
                           struct vm_area_struct **pprev, unsigned long start,
                           unsigned long end, unsigned long newflags);
 
-#ifdef CONFIG_HAVE_GET_USER_PAGES_FAST
 /*
  * get_user_pages_fast provides equivalent functionality to get_user_pages,
  * operating on current and current->mm (force=0 and doesn't return any vmas).
@@ -848,25 +847,6 @@ extern int mprotect_fixup(struct vm_area_struct *vma,
 int get_user_pages_fast(unsigned long start, int nr_pages, int write,
                         struct page **pages);
 
-#else
-/*
- * Should probably be moved to asm-generic, and architectures can include it if
- * they don't implement their own get_user_pages_fast.
- */
-#define get_user_pages_fast(start, nr_pages, write, pages)     \
-({                                                              \
-        struct mm_struct *mm = current->mm;                     \
-        int ret;                                                \
-                                                                \
-        down_read(&mm->mmap_sem);                               \
-        ret = get_user_pages(current, mm, start, nr_pages,      \
-                             write, 0, pages, NULL);            \
-        up_read(&mm->mmap_sem);                                 \
-                                                                \
-        ret;                                                    \
-})
-#endif
-
 /*
  * A callback you can register to apply pressure to ageable caches.
  *
diff --git a/mm/Kconfig b/mm/Kconfig
index 446c6588c753..0bd9c2dbb2a0 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -77,9 +77,6 @@ config FLAT_NODE_MEM_MAP
         def_bool y
         depends on !SPARSEMEM
 
-config HAVE_GET_USER_PAGES_FAST
-        bool
-
 #
 # Both the NUMA code and DISCONTIGMEM use arrays of pg_data_t's
 # to represent different areas of memory.  This variable allows
diff --git a/mm/util.c b/mm/util.c
index 9341ca77bd88..cb00b748ce47 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -171,3 +171,18 @@ void arch_pick_mmap_layout(struct mm_struct *mm)
         mm->unmap_area = arch_unmap_area;
 }
 #endif
+
+int __attribute__((weak)) get_user_pages_fast(unsigned long start,
+                                int nr_pages, int write, struct page **pages)
+{
+        struct mm_struct *mm = current->mm;
+        int ret;
+
+        down_read(&mm->mmap_sem);
+        ret = get_user_pages(current, mm, start, nr_pages,
+                             write, 0, pages, NULL);
+        up_read(&mm->mmap_sem);
+
+        return ret;
+}
+EXPORT_SYMBOL_GPL(get_user_pages_fast);
--
cgit v1.2.3

From caff3a2c333e11a794308bd9a875a09b94fee24a Mon Sep 17 00:00:00 2001
From: Gerald Schaefer
Date: Tue, 12 Aug 2008 15:08:38 -0700
Subject: hugetlb: call arch_prepare_hugepage() for surplus pages

The s390 software large page emulation implements shared page tables by
using page->index of the first tail page from a compound large page to
store page table information. This is set up in arch_prepare_hugepage(),
which is called from alloc_fresh_huge_page_node().

A similar call to arch_prepare_hugepage() is missing for surplus large
pages that are allocated in alloc_buddy_huge_page(), which breaks the
software emulation mode for (surplus) large pages on s390. This patch
adds the missing call to arch_prepare_hugepage(). It will have no effect
on other architectures where arch_prepare_hugepage() is a nop.

Also, use the correct order in the error path in
alloc_fresh_huge_page_node().

Acked-by: Martin Schwidefsky
Signed-off-by: Gerald Schaefer
Acked-by: Nick Piggin
Acked-by: Adam Litke
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/hugetlb.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'mm')

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 757ca983fd99..92155db888b9 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -565,7 +565,7 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid)
                 huge_page_order(h));
         if (page) {
                 if (arch_prepare_hugepage(page)) {
-                        __free_pages(page, HUGETLB_PAGE_ORDER);
+                        __free_pages(page, huge_page_order(h));
                         return NULL;
                 }
                 prep_new_huge_page(h, page, nid);
@@ -665,6 +665,11 @@ static struct page *alloc_buddy_huge_page(struct hstate *h,
                                         __GFP_REPEAT|__GFP_NOWARN,
                                         huge_page_order(h));
 
+        if (page && arch_prepare_hugepage(page)) {
+                __free_pages(page, huge_page_order(h));
+                return NULL;
+        }
+
         spin_lock(&hugetlb_lock);
         if (page) {
                 /*
--
cgit v1.2.3

From 74768ed833344bb0f82b97cee46320a3d7f09ecd Mon Sep 17 00:00:00 2001
From: Jan Beulich
Date: Tue, 12 Aug 2008 15:08:39 -0700
Subject: page allocator: use no-panic variant of alloc_bootmem() in alloc_large_system_hash()

..since a failed allocation is initially handled gracefully, and the
function only panic()s explicitly once retries with smaller sizes have
also failed.
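To illustrate why the no-panic variant is safe in this caller, here is a
simplified userspace sketch of the retry-then-give-up shape used by
alloc_large_system_hash() (names and sizes invented for illustration;
malloc() stands in for alloc_bootmem_nopanic()):

#include <stdio.h>
#include <stdlib.h>

/* Try progressively smaller sizes; NULL from the allocator is handled
 * by retrying, and the caller only gives up once every size has failed. */
static void *alloc_table(size_t want, size_t min, size_t *got)
{
        for (size_t size = want; size >= min; size >>= 1) {
                void *p = malloc(size);
                if (p) {
                        *got = size;
                        return p;
                }
        }
        return NULL;
}

int main(void)
{
        size_t got;
        void *table = alloc_table(1UL << 20, 4096, &got);

        if (!table) {
                fprintf(stderr, "table allocation failed at every size\n");
                return 1;       /* stand-in for the explicit panic() */
        }
        printf("allocated %zu bytes\n", got);
        free(table);
        return 0;
}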
Signed-off-by: Jan Beulich
Signed-off-by: David Howells
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 include/linux/bootmem.h | 4 ++++
 mm/page_alloc.c         | 2 +-
 2 files changed, 5 insertions(+), 1 deletion(-)

(limited to 'mm')

diff --git a/include/linux/bootmem.h b/include/linux/bootmem.h
index 652470b687c9..95837bfb5256 100644
--- a/include/linux/bootmem.h
+++ b/include/linux/bootmem.h
@@ -97,10 +97,14 @@ extern void *__alloc_bootmem_low_node(pg_data_t *pgdat,
 #ifndef CONFIG_HAVE_ARCH_BOOTMEM_NODE
 #define alloc_bootmem(x) \
         __alloc_bootmem(x, SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS))
+#define alloc_bootmem_nopanic(x) \
+        __alloc_bootmem_nopanic(x, SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS))
 #define alloc_bootmem_low(x) \
         __alloc_bootmem_low(x, SMP_CACHE_BYTES, 0)
 #define alloc_bootmem_pages(x) \
         __alloc_bootmem(x, PAGE_SIZE, __pa(MAX_DMA_ADDRESS))
+#define alloc_bootmem_pages_nopanic(x) \
+        __alloc_bootmem_nopanic(x, PAGE_SIZE, __pa(MAX_DMA_ADDRESS))
 #define alloc_bootmem_low_pages(x) \
         __alloc_bootmem_low(x, PAGE_SIZE, 0)
 #define alloc_bootmem_node(pgdat, x) \
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 401d104d2bb6..af982f7cdb2a 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4437,7 +4437,7 @@ void *__init alloc_large_system_hash(const char *tablename,
         do {
                 size = bucketsize << log2qty;
                 if (flags & HASH_EARLY)
-                        table = alloc_bootmem(size);
+                        table = alloc_bootmem_nopanic(size);
                 else if (hashdist)
                         table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL);
                 else {
--
cgit v1.2.3

From 9623e078c1f4692a91531af2f639ec8aff8f0472 Mon Sep 17 00:00:00 2001
From: Hugh Dickins
Date: Tue, 12 Aug 2008 15:08:41 -0700
Subject: memcg: fix oops in mem_cgroup_shrink_usage

Got an oops in mem_cgroup_shrink_usage() when testing loop over tmpfs:
yes, of course, loop0 has no mm: other entry points check for this, but
this one didn't.

Signed-off-by: Hugh Dickins
Cc: KAMEZAWA Hiroyuki
Acked-by: Balbir Singh
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/memcontrol.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'mm')

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 7056c3bdb478..0f1f7a7374ba 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -796,6 +796,8 @@ int mem_cgroup_shrink_usage(struct mm_struct *mm, gfp_t gfp_mask)
 
         if (mem_cgroup_subsys.disabled)
                 return 0;
+        if (!mm)
+                return 0;
 
         rcu_read_lock();
         mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
--
cgit v1.2.3

From 57303d80175e10056bf51206f9961d586f02f967 Mon Sep 17 00:00:00 2001
From: Andy Whitcroft
Date: Tue, 12 Aug 2008 15:08:47 -0700
Subject: hugetlbfs: allocate structures for reservation tracking outside of spinlocks

In the normal case, hugetlbfs reserves hugepages at map time so that the
pages exist for future faults.  A struct file_region is used to track
when reservations have been consumed and where.  These file_regions are
allocated as necessary with kmalloc() which can sleep with the
mm->page_table_lock held.  This is wrong and triggers a may-sleep warning
when PREEMPT is enabled.

Updates to the underlying file_region are done in two phases.  The first
phase prepares the region for the change, allocating any necessary
memory, without actually making the change.  The second phase actually
commits the change.  This patch makes use of this by checking the
reservations before the page_table_lock is taken; triggering any
necessary allocations.  This may then be safely repeated within the locks
without any allocations being required.

Credit to Mel Gorman for diagnosing this failure and initial versions of
the patch.
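The fix follows the classic two-phase shape: perform any allocation that
can sleep before taking the spinlock, then only commit state while
holding it. A minimal userspace sketch of the pattern (pthread mutex
standing in for mm->page_table_lock, malloc() for the file_region
kmalloc(); all names invented):

#include <pthread.h>
#include <stdlib.h>

struct region {
        long from, to;
        struct region *next;
};

static pthread_mutex_t table_lock = PTHREAD_MUTEX_INITIALIZER;
static struct region *regions;

static int record_reservation(long from, long to)
{
        /* Phase 1: prepare outside the lock; this may block. */
        struct region *r = malloc(sizeof(*r));
        if (!r)
                return -1;
        r->from = from;
        r->to = to;

        /* Phase 2: commit under the lock; no allocation, cannot sleep. */
        pthread_mutex_lock(&table_lock);
        r->next = regions;
        regions = r;
        pthread_mutex_unlock(&table_lock);
        return 0;
}

int main(void)
{
        return record_reservation(0, 4096) ? 1 : 0;
}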
Signed-off-by: Andy Whitcroft
Tested-by: Gerald Schaefer
Cc: Mel Gorman
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/hugetlb.c | 44 +++++++++++++++++++++++++++++++++++---------
 1 file changed, 35 insertions(+), 9 deletions(-)

(limited to 'mm')

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 92155db888b9..4c97c174e2e1 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1942,6 +1942,15 @@ retry:
                 lock_page(page);
         }
 
+        /*
+         * If we are going to COW a private mapping later, we examine the
+         * pending reservations for this page now. This will ensure that
+         * any allocations necessary to record that reservation occur outside
+         * the spinlock.
+         */
+        if (write_access && !(vma->vm_flags & VM_SHARED))
+                vma_needs_reservation(h, vma, address);
+
         spin_lock(&mm->page_table_lock);
         size = i_size_read(mapping->host) >> huge_page_shift(h);
         if (idx >= size)
@@ -1978,6 +1987,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
         pte_t *ptep;
         pte_t entry;
         int ret;
+        struct page *pagecache_page = NULL;
         static DEFINE_MUTEX(hugetlb_instantiation_mutex);
         struct hstate *h = hstate_vma(vma);
 
@@ -2000,19 +2010,35 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 
         ret = 0;
 
+        /*
+         * If we are going to COW the mapping later, we examine the pending
+         * reservations for this page now. This will ensure that any
+         * allocations necessary to record that reservation occur outside the
+         * spinlock. For private mappings, we also lookup the pagecache
+         * page now as it is used to determine if a reservation has been
+         * consumed.
+         */
+        if (write_access && !pte_write(entry)) {
+                vma_needs_reservation(h, vma, address);
+
+                if (!(vma->vm_flags & VM_SHARED))
+                        pagecache_page = hugetlbfs_pagecache_page(h,
+                                                                vma, address);
+        }
+
         spin_lock(&mm->page_table_lock);
         /* Check for a racing update before calling hugetlb_cow */
         if (likely(pte_same(entry, huge_ptep_get(ptep))))
-                if (write_access && !pte_write(entry)) {
-                        struct page *page;
-                        page = hugetlbfs_pagecache_page(h, vma, address);
-                        ret = hugetlb_cow(mm, vma, address, ptep, entry, page);
-                        if (page) {
-                                unlock_page(page);
-                                put_page(page);
-                        }
-                }
+                if (write_access && !pte_write(entry))
+                        ret = hugetlb_cow(mm, vma, address, ptep, entry,
+                                                        pagecache_page);
         spin_unlock(&mm->page_table_lock);
+
+        if (pagecache_page) {
+                unlock_page(pagecache_page);
+                put_page(pagecache_page);
+        }
+
         mutex_unlock(&hugetlb_instantiation_mutex);
 
         return ret;
--
cgit v1.2.3

From 2b26736c88db85c038e04c2306d0745553e69602 Mon Sep 17 00:00:00 2001
From: Andy Whitcroft
Date: Tue, 12 Aug 2008 15:08:49 -0700
Subject: allocate structures for reservation tracking in hugetlbfs outside of spinlocks v2

[Andrew this should replace the previous version which did not check
the returns from the region prepare for errors.  This has been tested by
us and Gerald and it looks good.

Bah, while reviewing the locking based on your previous email I spotted
that we need to check the return from the vma_needs_reservation call for
allocation errors.  Here is an updated patch to correct this.  This passes
testing here.]

Signed-off-by: Andy Whitcroft
Tested-by: Gerald Schaefer
Cc: Mel Gorman
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/hugetlb.c | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

(limited to 'mm')

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 4c97c174e2e1..67a71191136e 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1949,7 +1949,10 @@ retry:
          * the spinlock.
          */
         if (write_access && !(vma->vm_flags & VM_SHARED))
-                vma_needs_reservation(h, vma, address);
+                if (vma_needs_reservation(h, vma, address) < 0) {
+                        ret = VM_FAULT_OOM;
+                        goto backout_unlocked;
+                }
 
         spin_lock(&mm->page_table_lock);
         size = i_size_read(mapping->host) >> huge_page_shift(h);
@@ -1976,6 +1979,7 @@ out:
 
 backout:
         spin_unlock(&mm->page_table_lock);
+backout_unlocked:
         unlock_page(page);
         put_page(page);
         goto out;
@@ -2004,8 +2008,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
         entry = huge_ptep_get(ptep);
         if (huge_pte_none(entry)) {
                 ret = hugetlb_no_page(mm, vma, address, ptep, write_access);
-                mutex_unlock(&hugetlb_instantiation_mutex);
-                return ret;
+                goto out_unlock;
         }
 
         ret = 0;
@@ -2019,7 +2022,10 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
          * consumed.
          */
         if (write_access && !pte_write(entry)) {
-                vma_needs_reservation(h, vma, address);
+                if (vma_needs_reservation(h, vma, address) < 0) {
+                        ret = VM_FAULT_OOM;
+                        goto out_unlock;
+                }
 
                 if (!(vma->vm_flags & VM_SHARED))
                         pagecache_page = hugetlbfs_pagecache_page(h,
@@ -2039,6 +2045,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
                 put_page(pagecache_page);
         }
 
+out_unlock:
         mutex_unlock(&hugetlb_instantiation_mutex);
 
         return ret;
--
cgit v1.2.3

From d6bf73e4340f52159c1d9f13836b62e20fcd12d3 Mon Sep 17 00:00:00 2001
From: MinChan Kim
Date: Tue, 12 Aug 2008 15:08:52 -0700
Subject: do_migrate_pages(): remove unused variable

Signed-off-by: MinChan Kim
Acked-by: Christoph Lameter
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/mempolicy.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'mm')

diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index e550bec20582..83369058ec13 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -803,7 +803,6 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest,
 int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from_nodes,
                      const nodemask_t *to_nodes, int flags)
 {
-        LIST_HEAD(pagelist);
         int busy = 0;
         int err = 0;
         nodemask_t tmp;
--
cgit v1.2.3

From fc1efbdb7a1175759b099d74b67921396e5e8e3d Mon Sep 17 00:00:00 2001
From: Huang Weiyi
Date: Tue, 12 Aug 2008 15:09:00 -0700
Subject: mm/sparse.c: removed duplicated include

Signed-off-by: Huang Weiyi
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/sparse.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'mm')

diff --git a/mm/sparse.c b/mm/sparse.c
index 5d9dbbb9d39e..39db301b920d 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -12,7 +12,6 @@
 #include 
 #include 
 #include 
-#include "internal.h"
 
 /*
  * Permanent SPARSEMEM data:
--
cgit v1.2.3

From 5cd9c58fbe9ec92b45b27e131719af4f2bd9eb40 Mon Sep 17 00:00:00 2001
From: David Howells
Date: Thu, 14 Aug 2008 11:37:28 +0100
Subject: security: Fix setting of PF_SUPERPRIV by __capable()

Fix the setting of PF_SUPERPRIV by __capable() as it could corrupt the
flags of the target process if that is not the current process and it is
trying to change its own flags in a different way at the same time.

__capable() is using neither atomic ops nor locking to protect t->flags.
This patch removes __capable() and introduces has_capability() that
doesn't set PF_SUPERPRIV on the process being queried.

This patch further splits security_ptrace() in two:

 (1) security_ptrace_may_access().  This passes judgement on whether one
     process may access another only (PTRACE_MODE_ATTACH for ptrace() and
     PTRACE_MODE_READ for /proc), and takes a pointer to the child
     process.  current is the parent.

 (2) security_ptrace_traceme().
     This passes judgement on PTRACE_TRACEME only, and takes only a
     pointer to the parent process.  current is the child.

     In Smack and commoncap, this uses has_capability() to determine
     whether the parent will be permitted to use PTRACE_ATTACH if normal
     checks fail.  This does not set PF_SUPERPRIV.

Two of the instances of __capable() actually only act on current, and so
have been changed to calls to capable().

Of the places that were using __capable():

 (1) The OOM killer calls __capable() thrice when weighing the killability
     of a process.  All of these now use has_capability().

 (2) cap_ptrace() and smack_ptrace() were using __capable() to check to
     see whether the parent was allowed to trace any process.  As
     mentioned above, these have been split.  For PTRACE_ATTACH and /proc,
     capable() is now used, and for PTRACE_TRACEME, has_capability() is
     used.

 (3) cap_safe_nice() only ever saw current, so now uses capable().

 (4) smack_setprocattr() rejected accesses to tasks other than current
     just after calling __capable(), so the order of these two tests has
     been switched and capable() is used instead.

 (5) In smack_file_send_sigiotask(), we need to allow privileged processes
     to receive SIGIO on files they're manipulating.

 (6) In smack_task_wait(), we let a process wait for a privileged process,
     whether or not the process doing the waiting is privileged.

I've tested this with the LTP SELinux and syscalls test scripts.

Signed-off-by: David Howells
Acked-by: Serge Hallyn
Acked-by: Casey Schaufler
Acked-by: Andrew G. Morgan
Acked-by: Al Viro
Signed-off-by: James Morris
---
 include/linux/capability.h | 15 ++++++++++++--
 include/linux/security.h   | 39 +++++++++++++++++++++++-------------
 kernel/capability.c        | 21 ++++++++++++--------
 kernel/ptrace.c            |  5 ++---
 mm/oom_kill.c              |  6 ++++--
 security/capability.c      |  3 ++-
 security/commoncap.c       | 24 ++++++++++++++++-------
 security/root_plug.c       |  3 ++-
 security/security.c        | 10 +++++++---
 security/selinux/hooks.c   | 25 ++++++++++++++++-------
 security/smack/smack_lsm.c | 49 ++++++++++++++++++++++++++++++--------------
 11 files changed, 137 insertions(+), 63 deletions(-)

(limited to 'mm')

diff --git a/include/linux/capability.h b/include/linux/capability.h
index 02673846d205..9d1fe30b6f6c 100644
--- a/include/linux/capability.h
+++ b/include/linux/capability.h
@@ -503,8 +503,19 @@ extern const kernel_cap_t __cap_init_eff_set;
 
 kernel_cap_t cap_set_effective(const kernel_cap_t pE_new);
 
-int capable(int cap);
-int __capable(struct task_struct *t, int cap);
+/**
+ * has_capability - Determine if a task has a superior capability available
+ * @t: The task in question
+ * @cap: The capability to be tested for
+ *
+ * Return true if the specified task has the given superior capability
+ * currently in effect, false if not.
+ *
+ * Note that this does not set PF_SUPERPRIV on the task.
+ */
+#define has_capability(t, cap) (security_capable((t), (cap)) == 0)
+
+extern int capable(int cap);
 
 #endif /* __KERNEL__ */
diff --git a/include/linux/security.h b/include/linux/security.h
index fd96e7f8a6f9..2ee5ecfb2393 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -46,8 +46,8 @@ struct audit_krule;
  */
 extern int cap_capable(struct task_struct *tsk, int cap);
 extern int cap_settime(struct timespec *ts, struct timezone *tz);
-extern int cap_ptrace(struct task_struct *parent, struct task_struct *child,
-                      unsigned int mode);
+extern int cap_ptrace_may_access(struct task_struct *child, unsigned int mode);
+extern int cap_ptrace_traceme(struct task_struct *parent);
 extern int cap_capget(struct task_struct *target, kernel_cap_t *effective, kernel_cap_t *inheritable, kernel_cap_t *permitted);
 extern int cap_capset_check(struct task_struct *target, kernel_cap_t *effective, kernel_cap_t *inheritable, kernel_cap_t *permitted);
 extern void cap_capset_set(struct task_struct *target, kernel_cap_t *effective, kernel_cap_t *inheritable, kernel_cap_t *permitted);
@@ -1157,17 +1157,24 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts)
  *      @alter contains the flag indicating whether changes are to be made.
  *      Return 0 if permission is granted.
  *
- * @ptrace:
- *      Check permission before allowing the @parent process to trace the
+ * @ptrace_may_access:
+ *      Check permission before allowing the current process to trace the
  *      @child process.
  *      Security modules may also want to perform a process tracing check
  *      during an execve in the set_security or apply_creds hooks of
  *      binprm_security_ops if the process is being traced and its security
  *      attributes would be changed by the execve.
- *      @parent contains the task_struct structure for parent process.
- *      @child contains the task_struct structure for child process.
+ *      @child contains the task_struct structure for the target process.
  *      @mode contains the PTRACE_MODE flags indicating the form of access.
  *      Return 0 if permission is granted.
+ * @ptrace_traceme:
+ *      Check that the @parent process has sufficient permission to trace the
+ *      current process before allowing the current process to present itself
+ *      to the @parent process for tracing.
+ *      The parent process will still have to undergo the ptrace_may_access
+ *      checks before it is allowed to trace this one.
+ *      @parent contains the task_struct structure for debugger process.
+ *      Return 0 if permission is granted.
  * @capget:
  *      Get the @effective, @inheritable, and @permitted capability sets for
  *      the @target process.  The hook may also perform permission checking to
@@ -1287,8 +1294,8 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts)
 struct security_operations {
         char name[SECURITY_NAME_MAX + 1];
 
-        int (*ptrace) (struct task_struct *parent, struct task_struct *child,
-                       unsigned int mode);
+        int (*ptrace_may_access) (struct task_struct *child, unsigned int mode);
+        int (*ptrace_traceme) (struct task_struct *parent);
         int (*capget) (struct task_struct *target,
                        kernel_cap_t *effective,
                        kernel_cap_t *inheritable, kernel_cap_t *permitted);
@@ -1560,8 +1567,8 @@ extern struct dentry *securityfs_create_dir(const char *name, struct dentry *par
 extern void securityfs_remove(struct dentry *dentry);
 
 /* Security operations */
-int security_ptrace(struct task_struct *parent, struct task_struct *child,
-                    unsigned int mode);
+int security_ptrace_may_access(struct task_struct *child, unsigned int mode);
+int security_ptrace_traceme(struct task_struct *parent);
 int security_capget(struct task_struct *target,
                     kernel_cap_t *effective,
                     kernel_cap_t *inheritable,
@@ -1742,11 +1749,15 @@ static inline int security_init(void)
         return 0;
 }
 
-static inline int security_ptrace(struct task_struct *parent,
-                                  struct task_struct *child,
-                                  unsigned int mode)
+static inline int security_ptrace_may_access(struct task_struct *child,
+                                             unsigned int mode)
+{
+        return cap_ptrace_may_access(child, mode);
+}
+
+static inline int security_ptrace_traceme(struct task_struct *child)
 {
-        return cap_ptrace(parent, child, mode);
+        return cap_ptrace_traceme(parent);
 }
 
 static inline int security_capget(struct task_struct *target,
diff --git a/kernel/capability.c b/kernel/capability.c
index 0101e847603e..33e51e78c2d8 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -486,17 +486,22 @@ asmlinkage long sys_capset(cap_user_header_t header, const cap_user_data_t data)
         return ret;
 }
 
-int __capable(struct task_struct *t, int cap)
+/**
+ * capable - Determine if the current task has a superior capability in effect
+ * @cap: The capability to be tested for
+ *
+ * Return true if the current task has the given superior capability currently
+ * available for use, false if not.
+ *
+ * This sets PF_SUPERPRIV on the task if the capability is available on the
+ * assumption that it's about to be used.
+ */
+int capable(int cap)
 {
-        if (security_capable(t, cap) == 0) {
-                t->flags |= PF_SUPERPRIV;
+        if (has_capability(current, cap)) {
+                current->flags |= PF_SUPERPRIV;
                 return 1;
         }
         return 0;
 }
-
-int capable(int cap)
-{
-        return __capable(current, cap);
-}
 EXPORT_SYMBOL(capable);
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 082b3fcb32a0..356699a96d56 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -140,7 +140,7 @@ int __ptrace_may_access(struct task_struct *task, unsigned int mode)
         if (!dumpable && !capable(CAP_SYS_PTRACE))
                 return -EPERM;
 
-        return security_ptrace(current, task, mode);
+        return security_ptrace_may_access(task, mode);
 }
 
 bool ptrace_may_access(struct task_struct *task, unsigned int mode)
@@ -499,8 +499,7 @@ repeat:
                 goto repeat;
         }
 
-        ret = security_ptrace(current->parent, current,
-                              PTRACE_MODE_ATTACH);
+        ret = security_ptrace_traceme(current->parent);
 
         /*
          * Set the ptrace bit in the process ptrace flags.
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 8a5467ee6265..64e5b4bcd964 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -26,6 +26,7 @@
 #include 
 #include 
 #include 
+#include 
 
 int sysctl_panic_on_oom;
 int sysctl_oom_kill_allocating_task;
@@ -128,7 +129,8 @@ unsigned long badness(struct task_struct *p, unsigned long uptime)
          * Superuser processes are usually more important, so we make it
          * less likely that we kill those.
          */
-        if (__capable(p, CAP_SYS_ADMIN) || __capable(p, CAP_SYS_RESOURCE))
+        if (has_capability(p, CAP_SYS_ADMIN) ||
+            has_capability(p, CAP_SYS_RESOURCE))
                 points /= 4;
 
         /*
@@ -137,7 +139,7 @@ unsigned long badness(struct task_struct *p, unsigned long uptime)
          * tend to only have this flag set on applications they think
          * of as important.
          */
-        if (__capable(p, CAP_SYS_RAWIO))
+        if (has_capability(p, CAP_SYS_RAWIO))
                 points /= 4;
 
         /*
diff --git a/security/capability.c b/security/capability.c
index 63d10da515a5..245874819036 100644
--- a/security/capability.c
+++ b/security/capability.c
@@ -811,7 +811,8 @@ struct security_operations default_security_ops = {
 
 void security_fixup_ops(struct security_operations *ops)
 {
-        set_to_cap_if_null(ops, ptrace);
+        set_to_cap_if_null(ops, ptrace_may_access);
+        set_to_cap_if_null(ops, ptrace_traceme);
         set_to_cap_if_null(ops, capget);
         set_to_cap_if_null(ops, capset_check);
         set_to_cap_if_null(ops, capset_set);
diff --git a/security/commoncap.c b/security/commoncap.c
index 4afbece37a08..e4c4b3fc0c04 100644
--- a/security/commoncap.c
+++ b/security/commoncap.c
@@ -63,14 +63,24 @@ int cap_settime(struct timespec *ts, struct timezone *tz)
         return 0;
 }
 
-int cap_ptrace (struct task_struct *parent, struct task_struct *child,
-                unsigned int mode)
+int cap_ptrace_may_access(struct task_struct *child, unsigned int mode)
 {
         /* Derived from arch/i386/kernel/ptrace.c:sys_ptrace. */
-        if (!cap_issubset(child->cap_permitted, parent->cap_permitted) &&
-            !__capable(parent, CAP_SYS_PTRACE))
-                return -EPERM;
-        return 0;
+        if (cap_issubset(child->cap_permitted, current->cap_permitted))
+                return 0;
+        if (capable(CAP_SYS_PTRACE))
+                return 0;
+        return -EPERM;
+}
+
+int cap_ptrace_traceme(struct task_struct *parent)
+{
+        /* Derived from arch/i386/kernel/ptrace.c:sys_ptrace. */
+        if (cap_issubset(current->cap_permitted, parent->cap_permitted))
+                return 0;
+        if (has_capability(parent, CAP_SYS_PTRACE))
+                return 0;
+        return -EPERM;
 }
 
 int cap_capget (struct task_struct *target, kernel_cap_t *effective,
@@ -534,7 +544,7 @@ int cap_task_post_setuid (uid_t old_ruid, uid_t old_euid, uid_t old_suid,
 static inline int cap_safe_nice(struct task_struct *p)
 {
         if (!cap_issubset(p->cap_permitted, current->cap_permitted) &&
-            !__capable(current, CAP_SYS_NICE))
+            !capable(CAP_SYS_NICE))
                 return -EPERM;
         return 0;
 }
diff --git a/security/root_plug.c b/security/root_plug.c
index be0ebec2580b..c3f68b5b372d 100644
--- a/security/root_plug.c
+++ b/security/root_plug.c
@@ -72,7 +72,8 @@ static int rootplug_bprm_check_security (struct linux_binprm *bprm)
 static struct security_operations rootplug_security_ops = {
         /* Use the capability functions for some of the hooks */
-        .ptrace = cap_ptrace,
+        .ptrace_may_access = cap_ptrace_may_access,
+        .ptrace_traceme = cap_ptrace_traceme,
         .capget = cap_capget,
         .capset_check = cap_capset_check,
         .capset_set = cap_capset_set,
diff --git a/security/security.c b/security/security.c
index ff7068727757..3a4b4f55b33f 100644
--- a/security/security.c
+++ b/security/security.c
@@ -127,10 +127,14 @@ int register_security(struct security_operations *ops)
 
 /* Security operations */
 
-int security_ptrace(struct task_struct *parent, struct task_struct *child,
-                    unsigned int mode)
+int security_ptrace_may_access(struct task_struct *child, unsigned int mode)
 {
-        return security_ops->ptrace(parent, child, mode);
+        return security_ops->ptrace_may_access(child, mode);
+}
+
+int security_ptrace_traceme(struct task_struct *parent)
+{
+        return security_ops->ptrace_traceme(parent);
 }
 
 int security_capget(struct task_struct *target,
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index 3ae9bec5a508..03fc6a81ae32 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -1738,24 +1738,34 @@ static inline u32 file_to_av(struct file *file)
 
 /* Hook functions begin here. */
 
-static int selinux_ptrace(struct task_struct *parent,
-                          struct task_struct *child,
-                          unsigned int mode)
+static int selinux_ptrace_may_access(struct task_struct *child,
+                                     unsigned int mode)
 {
         int rc;
 
-        rc = secondary_ops->ptrace(parent, child, mode);
+        rc = secondary_ops->ptrace_may_access(child, mode);
         if (rc)
                 return rc;
 
         if (mode == PTRACE_MODE_READ) {
-                struct task_security_struct *tsec = parent->security;
+                struct task_security_struct *tsec = current->security;
                 struct task_security_struct *csec = child->security;
                 return avc_has_perm(tsec->sid, csec->sid,
                                     SECCLASS_FILE, FILE__READ, NULL);
         }
 
-        return task_has_perm(parent, child, PROCESS__PTRACE);
+        return task_has_perm(current, child, PROCESS__PTRACE);
+}
+
+static int selinux_ptrace_traceme(struct task_struct *parent)
+{
+        int rc;
+
+        rc = secondary_ops->ptrace_traceme(parent);
+        if (rc)
+                return rc;
+
+        return task_has_perm(parent, current, PROCESS__PTRACE);
 }
 
 static int selinux_capget(struct task_struct *target, kernel_cap_t *effective,
@@ -5346,7 +5356,8 @@ static int selinux_key_getsecurity(struct key *key, char **_buffer)
 static struct security_operations selinux_ops = {
         .name = "selinux",
 
-        .ptrace = selinux_ptrace,
+        .ptrace_may_access = selinux_ptrace_may_access,
+        .ptrace_traceme = selinux_ptrace_traceme,
         .capget = selinux_capget,
         .capset_check = selinux_capset_check,
         .capset_set = selinux_capset_set,
diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c
index 1b40e558f983..87d75417ea93 100644
--- a/security/smack/smack_lsm.c
+++ b/security/smack/smack_lsm.c
@@ -87,27 +87,46 @@ struct inode_smack *new_inode_smack(char *smack)
  */
 
 /**
- * smack_ptrace - Smack approval on ptrace
- * @ptp: parent task pointer
+ * smack_ptrace_may_access - Smack approval on PTRACE_ATTACH
  * @ctp: child task pointer
  *
  * Returns 0 if access is OK, an error code otherwise
  *
  * Do the capability checks, and require read and write.
  */
-static int smack_ptrace(struct task_struct *ptp, struct task_struct *ctp,
-                        unsigned int mode)
+static int smack_ptrace_may_access(struct task_struct *ctp, unsigned int mode)
 {
         int rc;
 
-        rc = cap_ptrace(ptp, ctp, mode);
+        rc = cap_ptrace_may_access(ctp, mode);
         if (rc != 0)
                 return rc;
 
-        rc = smk_access(ptp->security, ctp->security, MAY_READWRITE);
-        if (rc != 0 && __capable(ptp, CAP_MAC_OVERRIDE))
+        rc = smk_access(current->security, ctp->security, MAY_READWRITE);
+        if (rc != 0 && capable(CAP_MAC_OVERRIDE))
                 return 0;
+        return rc;
+}
+
+/**
+ * smack_ptrace_traceme - Smack approval on PTRACE_TRACEME
+ * @ptp: parent task pointer
+ *
+ * Returns 0 if access is OK, an error code otherwise
+ *
+ * Do the capability checks, and require read and write.
+ */
+static int smack_ptrace_traceme(struct task_struct *ptp)
+{
+        int rc;
+
+        rc = cap_ptrace_traceme(ptp);
+        if (rc != 0)
+                return rc;
 
+        rc = smk_access(ptp->security, current->security, MAY_READWRITE);
+        if (rc != 0 && has_capability(ptp, CAP_MAC_OVERRIDE))
+                return 0;
         return rc;
 }
@@ -923,7 +942,7 @@ static int smack_file_send_sigiotask(struct task_struct *tsk,
          */
         file = container_of(fown, struct file, f_owner);
         rc = smk_access(file->f_security, tsk->security, MAY_WRITE);
-        if (rc != 0 && __capable(tsk, CAP_MAC_OVERRIDE))
+        if (rc != 0 && has_capability(tsk, CAP_MAC_OVERRIDE))
                 return 0;
         return rc;
 }
@@ -1164,12 +1183,12 @@ static int smack_task_wait(struct task_struct *p)
          * account for the smack labels having gotten to
          * be different in the first place.
          *
-         * This breaks the strict subjet/object access
+         * This breaks the strict subject/object access
          * control ideal, taking the object's privilege
         * state into account in the decision as well as
         * the smack value.
         */
-        if (capable(CAP_MAC_OVERRIDE) || __capable(p, CAP_MAC_OVERRIDE))
+        if (capable(CAP_MAC_OVERRIDE) || has_capability(p, CAP_MAC_OVERRIDE))
                 return 0;
 
         return rc;
@@ -2016,9 +2035,6 @@ static int smack_setprocattr(struct task_struct *p, char *name,
 {
         char *newsmack;
 
-        if (!__capable(p, CAP_MAC_ADMIN))
-                return -EPERM;
-
         /*
          * Changing another process' Smack value is too dangerous
          * and supports no sane use case.
@@ -2026,6 +2042,9 @@ static int smack_setprocattr(struct task_struct *p, char *name,
         if (p != current)
                 return -EPERM;
 
+        if (!capable(CAP_MAC_ADMIN))
+                return -EPERM;
+
         if (value == NULL || size == 0 || size >= SMK_LABELLEN)
                 return -EINVAL;
 
@@ -2552,7 +2571,8 @@ static void smack_release_secctx(char *secdata, u32 seclen)
 struct security_operations smack_ops = {
         .name = "smack",
 
-        .ptrace = smack_ptrace,
+        .ptrace_may_access = smack_ptrace_may_access,
+        .ptrace_traceme = smack_ptrace_traceme,
         .capget = cap_capget,
         .capset_check = cap_capset_check,
         .capset_set = cap_capset_set,
@@ -2729,4 +2749,3 @@ static __init int smack_init(void)
  * all processes and objects when they are created.
  */
 security_initcall(smack_init);
-
--
cgit v1.2.3

From 627240aaa92a4dc00d25584910b5f205e963747b Mon Sep 17 00:00:00 2001
From: Mikulas Patocka
Date: Fri, 15 Aug 2008 00:40:17 -0700
Subject: bootmem allocator: alloc_bootmem_core(): page-align the end offset

This is the minimal sequence that jams the allocator:

void *p, *q, *r;
p = alloc_bootmem(PAGE_SIZE);
q = alloc_bootmem(64);
free_bootmem(p, PAGE_SIZE);
p = alloc_bootmem(PAGE_SIZE);
r = alloc_bootmem(64);

After this sequence (assuming that the allocator was empty or
page-aligned before), pointer "q" will be equal to pointer "r".

What's happening inside the allocator:

p = alloc_bootmem(PAGE_SIZE);
in allocator: last_end_off == PAGE_SIZE, bitmap contains bits 10000...

q = alloc_bootmem(64);
in allocator: last_end_off == PAGE_SIZE + 64, bitmap contains 11000...

free_bootmem(p, PAGE_SIZE);
in allocator: last_end_off == PAGE_SIZE + 64, bitmap contains 01000...

p = alloc_bootmem(PAGE_SIZE);
in allocator: last_end_off == PAGE_SIZE, bitmap contains 11000...

r = alloc_bootmem(64);

And now: it finds bit "2" as a place to allocate (sidx), and it hits the
condition

if (bdata->last_end_off && PFN_DOWN(bdata->last_end_off) + 1 == sidx)
        start_off = ALIGN(bdata->last_end_off, align);

You can see that the condition is true, so it assigns start_off =
ALIGN(bdata->last_end_off, align); (that is PAGE_SIZE) and allocates
over an already allocated block.

With the patch it tries to continue at the end of the previous
allocation only if the previous allocation ended in the middle of the
page.

Signed-off-by: Mikulas Patocka
Acked-by: Johannes Weiner
Cc: David Miller
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/bootmem.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'mm')

diff --git a/mm/bootmem.c b/mm/bootmem.c
index 4af15d0340ad..e023c68b0255 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -473,7 +473,7 @@ find_block:
                 goto find_block;
         }
 
-        if (bdata->last_end_off &&
+        if (bdata->last_end_off & (PAGE_SIZE - 1) &&
                         PFN_DOWN(bdata->last_end_off) + 1 == sidx)
                 start_off = ALIGN(bdata->last_end_off, align);
         else
--
cgit v1.2.3
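The repaired condition turns on whether the previous allocation ended
mid-page. A standalone sketch of what last_end_off & (PAGE_SIZE - 1)
evaluates to (PAGE_SIZE assumed to be 4096 for the illustration; not
kernel code):

#include <stdio.h>

#define PAGE_SIZE 4096UL

int main(void)
{
        unsigned long ends[] = { PAGE_SIZE, PAGE_SIZE + 64, 2 * PAGE_SIZE };

        for (int i = 0; i < 3; i++)
                printf("last_end_off=%5lu  mid-page=%s\n", ends[i],
                       (ends[i] & (PAGE_SIZE - 1)) ? "yes" : "no");

        /* Only the mid-page case (4160) may merge with the previous
         * allocation; page-aligned offsets no longer take that path. */
        return 0;
}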