From 0531b2aac59c2296570ac52bfc032ef2ace7d5e1 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Wed, 27 Jan 2010 09:20:03 -0800 Subject: mm: add new 'read_cache_page_gfp()' helper function It's a simplified 'read_cache_page()' which takes a page allocation flag, so that different paths can control how aggressive the memory allocations are that populate a address space. In particular, the intel GPU object mapping code wants to be able to do a certain amount of own internal memory management by automatically shrinking the address space when memory starts getting tight. This allows it to dynamically use different memory allocation policies on a per-allocation basis, rather than depend on the (static) address space gfp policy. The actual new function is a one-liner, but re-organizing the helper functions to the point where you can do this with a single line of code is what most of the patch is all about. Tested-by: Chris Wilson Signed-off-by: Linus Torvalds --- mm/filemap.c | 100 ++++++++++++++++++++++++++++++++++++++++------------------- 1 file changed, 68 insertions(+), 32 deletions(-) (limited to 'mm') diff --git a/mm/filemap.c b/mm/filemap.c index 96ac6b0eb6cb..e3736923220e 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1634,14 +1634,15 @@ EXPORT_SYMBOL(generic_file_readonly_mmap); static struct page *__read_cache_page(struct address_space *mapping, pgoff_t index, int (*filler)(void *,struct page*), - void *data) + void *data, + gfp_t gfp) { struct page *page; int err; repeat: page = find_get_page(mapping, index); if (!page) { - page = page_cache_alloc_cold(mapping); + page = __page_cache_alloc(gfp | __GFP_COLD); if (!page) return ERR_PTR(-ENOMEM); err = add_to_page_cache_lru(page, mapping, index, GFP_KERNEL); @@ -1661,31 +1662,18 @@ repeat: return page; } -/** - * read_cache_page_async - read into page cache, fill it if needed - * @mapping: the page's address_space - * @index: the page index - * @filler: function to perform the read - * @data: destination for read data - * - * Same as read_cache_page, but don't wait for page to become unlocked - * after submitting it to the filler. - * - * Read into the page cache. If a page already exists, and PageUptodate() is - * not set, try to fill the page but don't wait for it to become unlocked. - * - * If the page does not get brought uptodate, return -EIO. - */ -struct page *read_cache_page_async(struct address_space *mapping, +static struct page *do_read_cache_page(struct address_space *mapping, pgoff_t index, int (*filler)(void *,struct page*), - void *data) + void *data, + gfp_t gfp) + { struct page *page; int err; retry: - page = __read_cache_page(mapping, index, filler, data); + page = __read_cache_page(mapping, index, filler, data, gfp); if (IS_ERR(page)) return page; if (PageUptodate(page)) @@ -1710,8 +1698,67 @@ out: mark_page_accessed(page); return page; } + +/** + * read_cache_page_async - read into page cache, fill it if needed + * @mapping: the page's address_space + * @index: the page index + * @filler: function to perform the read + * @data: destination for read data + * + * Same as read_cache_page, but don't wait for page to become unlocked + * after submitting it to the filler. + * + * Read into the page cache. If a page already exists, and PageUptodate() is + * not set, try to fill the page but don't wait for it to become unlocked. + * + * If the page does not get brought uptodate, return -EIO. + */ +struct page *read_cache_page_async(struct address_space *mapping, + pgoff_t index, + int (*filler)(void *,struct page*), + void *data) +{ + return do_read_cache_page(mapping, index, filler, data, mapping_gfp_mask(mapping)); +} EXPORT_SYMBOL(read_cache_page_async); +static struct page *wait_on_page_read(struct page *page) +{ + if (!IS_ERR(page)) { + wait_on_page_locked(page); + if (!PageUptodate(page)) { + page_cache_release(page); + page = ERR_PTR(-EIO); + } + } + return page; +} + +/** + * read_cache_page_gfp - read into page cache, using specified page allocation flags. + * @mapping: the page's address_space + * @index: the page index + * @gfp: the page allocator flags to use if allocating + * + * This is the same as "read_mapping_page(mapping, index, NULL)", but with + * any new page allocations done using the specified allocation flags. Note + * that the Radix tree operations will still use GFP_KERNEL, so you can't + * expect to do this atomically or anything like that - but you can pass in + * other page requirements. + * + * If the page does not get brought uptodate, return -EIO. + */ +struct page *read_cache_page_gfp(struct address_space *mapping, + pgoff_t index, + gfp_t gfp) +{ + filler_t *filler = (filler_t *)mapping->a_ops->readpage; + + return wait_on_page_read(do_read_cache_page(mapping, index, filler, NULL, gfp)); +} +EXPORT_SYMBOL(read_cache_page_gfp); + /** * read_cache_page - read into page cache, fill it if needed * @mapping: the page's address_space @@ -1729,18 +1776,7 @@ struct page *read_cache_page(struct address_space *mapping, int (*filler)(void *,struct page*), void *data) { - struct page *page; - - page = read_cache_page_async(mapping, index, filler, data); - if (IS_ERR(page)) - goto out; - wait_on_page_locked(page); - if (!PageUptodate(page)) { - page_cache_release(page); - page = ERR_PTR(-EIO); - } - out: - return page; + return wait_on_page_read(read_cache_page_async(mapping, index, filler, data)); } EXPORT_SYMBOL(read_cache_page); -- cgit v1.2.3 From a7016235a61d520e6806f38129001d935c4b6661 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Fri, 29 Jan 2010 17:46:34 +0000 Subject: mm: fix migratetype bug which slowed swapping After memory pressure has forced it to dip into the reserves, 2.6.32's 5f8dcc21211a3d4e3a7a5ca366b469fb88117f61 "page-allocator: split per-cpu list into one-list-per-migrate-type" has been returning MIGRATE_RESERVE pages to the MIGRATE_MOVABLE free_list: in some sense depleting reserves. Fix that in the most straightforward way (which, considering the overheads of alternative approaches, is Mel's preference): the right migratetype is already in page_private(page), but free_pcppages_bulk() wasn't using it. How did this bug show up? As a 20% slowdown in my tmpfs loop kbuild swapping tests, on PowerMac G5 with SLUB allocator. Bisecting to that commit was easy, but explaining the magnitude of the slowdown not easy. The same effect appears, but much less markedly, with SLAB, and even less markedly on other machines (the PowerMac divides into fewer zones than x86, I think that may be a factor). We guess that lumpy reclaim of short-lived high-order pages is implicated in some way, and probably this bug has been tickling a poor decision somewhere in page reclaim. But instrumentation hasn't told me much, I've run out of time and imagination to determine exactly what's going on, and shouldn't hold up the fix any longer: it's valid, and might even fix other misbehaviours. Signed-off-by: Hugh Dickins Acked-by: Mel Gorman Cc: stable@kernel.org Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index d2a8889b4c58..8deb9d0fd5b1 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -556,8 +556,9 @@ static void free_pcppages_bulk(struct zone *zone, int count, page = list_entry(list->prev, struct page, lru); /* must delete as __free_one_page list manipulates */ list_del(&page->lru); - __free_one_page(page, zone, 0, migratetype); - trace_mm_page_pcpu_drain(page, 0, migratetype); + /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */ + __free_one_page(page, zone, 0, page_private(page)); + trace_mm_page_pcpu_drain(page, 0, page_private(page)); } while (--count && --batch_free && !list_empty(list)); } spin_unlock(&zone->lock); -- cgit v1.2.3 From de5604231ce4bc8db1bc1dcd27d8540cbedf1518 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Mon, 1 Feb 2010 22:24:18 +1100 Subject: mm: percpu-vmap fix RCU list walking RCU list walking of the per-cpu vmap cache was broken. It did not use RCU primitives, and also the union of free_list and rcu_head is obviously wrong (because free_list is indeed the list we are RCU walking). While we are there, remove a couple of unused fields from an earlier iteration. These APIs aren't actually used anywhere, because of problems with the XFS conversion. Christoph has now verified that the problems are solved with these patches. Also it is an exported interface, so I think it will be good to be merged now (and Christoph wants to get the XFS changes into their local tree). Cc: stable@kernel.org Cc: linux-mm@kvack.org Tested-by: Christoph Hellwig Signed-off-by: Nick Piggin -- Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 20 ++++++-------------- 1 file changed, 6 insertions(+), 14 deletions(-) (limited to 'mm') diff --git a/mm/vmalloc.c b/mm/vmalloc.c index d55d905463eb..cf76ff6ba596 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -667,8 +667,6 @@ static bool vmap_initialized __read_mostly = false; struct vmap_block_queue { spinlock_t lock; struct list_head free; - struct list_head dirty; - unsigned int nr_dirty; }; struct vmap_block { @@ -678,10 +676,8 @@ struct vmap_block { unsigned long free, dirty; DECLARE_BITMAP(alloc_map, VMAP_BBMAP_BITS); DECLARE_BITMAP(dirty_map, VMAP_BBMAP_BITS); - union { - struct list_head free_list; - struct rcu_head rcu_head; - }; + struct list_head free_list; + struct rcu_head rcu_head; }; /* Queue of free and dirty vmap blocks, for allocation and flushing purposes */ @@ -757,7 +753,7 @@ static struct vmap_block *new_vmap_block(gfp_t gfp_mask) vbq = &get_cpu_var(vmap_block_queue); vb->vbq = vbq; spin_lock(&vbq->lock); - list_add(&vb->free_list, &vbq->free); + list_add_rcu(&vb->free_list, &vbq->free); spin_unlock(&vbq->lock); put_cpu_var(vmap_block_queue); @@ -776,8 +772,6 @@ static void free_vmap_block(struct vmap_block *vb) struct vmap_block *tmp; unsigned long vb_idx; - BUG_ON(!list_empty(&vb->free_list)); - vb_idx = addr_to_vb_idx(vb->va->va_start); spin_lock(&vmap_block_tree_lock); tmp = radix_tree_delete(&vmap_block_tree, vb_idx); @@ -816,7 +810,7 @@ again: vb->free -= 1UL << order; if (vb->free == 0) { spin_lock(&vbq->lock); - list_del_init(&vb->free_list); + list_del_rcu(&vb->free_list); spin_unlock(&vbq->lock); } spin_unlock(&vb->lock); @@ -860,11 +854,11 @@ static void vb_free(const void *addr, unsigned long size) BUG_ON(!vb); spin_lock(&vb->lock); - bitmap_allocate_region(vb->dirty_map, offset >> PAGE_SHIFT, order); + BUG_ON(bitmap_allocate_region(vb->dirty_map, offset >> PAGE_SHIFT, order)); vb->dirty += 1UL << order; if (vb->dirty == VMAP_BBMAP_BITS) { - BUG_ON(vb->free || !list_empty(&vb->free_list)); + BUG_ON(vb->free); spin_unlock(&vb->lock); free_vmap_block(vb); } else @@ -1033,8 +1027,6 @@ void __init vmalloc_init(void) vbq = &per_cpu(vmap_block_queue, i); spin_lock_init(&vbq->lock); INIT_LIST_HEAD(&vbq->free); - INIT_LIST_HEAD(&vbq->dirty); - vbq->nr_dirty = 0; } /* Import existing vmlist entries. */ -- cgit v1.2.3 From 02b709df817c0db174f249cc59e5f7fd01b64d92 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Mon, 1 Feb 2010 22:25:57 +1100 Subject: mm: purge fragmented percpu vmap blocks Improve handling of fragmented per-CPU vmaps. We previously don't free up per-CPU maps until all its addresses have been used and freed. So fragmented blocks could fill up vmalloc space even if they actually had no active vmap regions within them. Add some logic to allow all CPUs to have these blocks purged in the case of failure to allocate a new vm area, and also put some logic to trim such blocks of a current CPU if we hit them in the allocation path (so as to avoid a large build up of them). Christoph reported some vmap allocation failures when using the per CPU vmap APIs in XFS, which cannot be reproduced after this patch and the previous bug fix. Cc: linux-mm@kvack.org Cc: stable@kernel.org Tested-by: Christoph Hellwig Signed-off-by: Nick Piggin -- Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 92 ++++++++++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 81 insertions(+), 11 deletions(-) (limited to 'mm') diff --git a/mm/vmalloc.c b/mm/vmalloc.c index cf76ff6ba596..ae007462b7f6 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -509,6 +509,9 @@ static unsigned long lazy_max_pages(void) static atomic_t vmap_lazy_nr = ATOMIC_INIT(0); +/* for per-CPU blocks */ +static void purge_fragmented_blocks_allcpus(void); + /* * Purges all lazily-freed vmap areas. * @@ -539,6 +542,9 @@ static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end, } else spin_lock(&purge_lock); + if (sync) + purge_fragmented_blocks_allcpus(); + rcu_read_lock(); list_for_each_entry_rcu(va, &vmap_area_list, list) { if (va->flags & VM_LAZY_FREE) { @@ -678,6 +684,7 @@ struct vmap_block { DECLARE_BITMAP(dirty_map, VMAP_BBMAP_BITS); struct list_head free_list; struct rcu_head rcu_head; + struct list_head purge; }; /* Queue of free and dirty vmap blocks, for allocation and flushing purposes */ @@ -782,12 +789,61 @@ static void free_vmap_block(struct vmap_block *vb) call_rcu(&vb->rcu_head, rcu_free_vb); } +static void purge_fragmented_blocks(int cpu) +{ + LIST_HEAD(purge); + struct vmap_block *vb; + struct vmap_block *n_vb; + struct vmap_block_queue *vbq = &per_cpu(vmap_block_queue, cpu); + + rcu_read_lock(); + list_for_each_entry_rcu(vb, &vbq->free, free_list) { + + if (!(vb->free + vb->dirty == VMAP_BBMAP_BITS && vb->dirty != VMAP_BBMAP_BITS)) + continue; + + spin_lock(&vb->lock); + if (vb->free + vb->dirty == VMAP_BBMAP_BITS && vb->dirty != VMAP_BBMAP_BITS) { + vb->free = 0; /* prevent further allocs after releasing lock */ + vb->dirty = VMAP_BBMAP_BITS; /* prevent purging it again */ + bitmap_fill(vb->alloc_map, VMAP_BBMAP_BITS); + bitmap_fill(vb->dirty_map, VMAP_BBMAP_BITS); + spin_lock(&vbq->lock); + list_del_rcu(&vb->free_list); + spin_unlock(&vbq->lock); + spin_unlock(&vb->lock); + list_add_tail(&vb->purge, &purge); + } else + spin_unlock(&vb->lock); + } + rcu_read_unlock(); + + list_for_each_entry_safe(vb, n_vb, &purge, purge) { + list_del(&vb->purge); + free_vmap_block(vb); + } +} + +static void purge_fragmented_blocks_thiscpu(void) +{ + purge_fragmented_blocks(smp_processor_id()); +} + +static void purge_fragmented_blocks_allcpus(void) +{ + int cpu; + + for_each_possible_cpu(cpu) + purge_fragmented_blocks(cpu); +} + static void *vb_alloc(unsigned long size, gfp_t gfp_mask) { struct vmap_block_queue *vbq; struct vmap_block *vb; unsigned long addr = 0; unsigned int order; + int purge = 0; BUG_ON(size & ~PAGE_MASK); BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC); @@ -800,24 +856,38 @@ again: int i; spin_lock(&vb->lock); + if (vb->free < 1UL << order) + goto next; + i = bitmap_find_free_region(vb->alloc_map, VMAP_BBMAP_BITS, order); - if (i >= 0) { - addr = vb->va->va_start + (i << PAGE_SHIFT); - BUG_ON(addr_to_vb_idx(addr) != - addr_to_vb_idx(vb->va->va_start)); - vb->free -= 1UL << order; - if (vb->free == 0) { - spin_lock(&vbq->lock); - list_del_rcu(&vb->free_list); - spin_unlock(&vbq->lock); + if (i < 0) { + if (vb->free + vb->dirty == VMAP_BBMAP_BITS) { + /* fragmented and no outstanding allocations */ + BUG_ON(vb->dirty != VMAP_BBMAP_BITS); + purge = 1; } - spin_unlock(&vb->lock); - break; + goto next; } + addr = vb->va->va_start + (i << PAGE_SHIFT); + BUG_ON(addr_to_vb_idx(addr) != + addr_to_vb_idx(vb->va->va_start)); + vb->free -= 1UL << order; + if (vb->free == 0) { + spin_lock(&vbq->lock); + list_del_rcu(&vb->free_list); + spin_unlock(&vbq->lock); + } + spin_unlock(&vb->lock); + break; +next: spin_unlock(&vb->lock); } + + if (purge) + purge_fragmented_blocks_thiscpu(); + put_cpu_var(vmap_block_queue); rcu_read_unlock(); -- cgit v1.2.3 From 931e80e4b3263db75c8e34f078d22f11bbabd3a3 Mon Sep 17 00:00:00 2001 From: anfei zhou Date: Tue, 2 Feb 2010 13:44:02 -0800 Subject: mm: flush dcache before writing into page to avoid alias The cache alias problem will happen if the changes of user shared mapping is not flushed before copying, then user and kernel mapping may be mapped into two different cache line, it is impossible to guarantee the coherence after iov_iter_copy_from_user_atomic. So the right steps should be: flush_dcache_page(page); kmap_atomic(page); write to page; kunmap_atomic(page); flush_dcache_page(page); More precisely, we might create two new APIs flush_dcache_user_page and flush_dcache_kern_page to replace the two flush_dcache_page accordingly. Here is a snippet tested on omap2430 with VIPT cache, and I think it is not ARM-specific: int val = 0x11111111; fd = open("abc", O_RDWR); addr = mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0); *(addr+0) = 0x44444444; tmp = *(addr+0); *(addr+1) = 0x77777777; write(fd, &val, sizeof(int)); close(fd); The results are not always 0x11111111 0x77777777 at the beginning as expected. Sometimes we see 0x44444444 0x77777777. Signed-off-by: Anfei Cc: Russell King Cc: Miklos Szeredi Cc: Nick Piggin Cc: Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fuse/file.c | 3 +++ mm/filemap.c | 3 +++ 2 files changed, 6 insertions(+) (limited to 'mm') diff --git a/fs/fuse/file.c b/fs/fuse/file.c index c18913a777ae..a9f5e137f1d3 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -828,6 +828,9 @@ static ssize_t fuse_fill_write_pages(struct fuse_req *req, if (!page) break; + if (mapping_writably_mapped(mapping)) + flush_dcache_page(page); + pagefault_disable(); tmp = iov_iter_copy_from_user_atomic(page, ii, offset, bytes); pagefault_enable(); diff --git a/mm/filemap.c b/mm/filemap.c index e3736923220e..698ea80f2102 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2232,6 +2232,9 @@ again: if (unlikely(status)) break; + if (mapping_writably_mapped(mapping)) + flush_dcache_page(page); + pagefault_disable(); copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes); pagefault_enable(); -- cgit v1.2.3 From 094e9539bd24bbe23b8e2741e903b0f3f1f85b03 Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Tue, 2 Feb 2010 13:44:14 -0800 Subject: hugetlb: fix section mismatches hugetlb_sysfs_add_hstate is called by hugetlb_register_node directly during init and also indirectly via sysfs after init. This patch removes the __init tag from hugetlb_sysfs_add_hstate. Signed-off-by: Jeff Mahoney Cc: Lee Schermerhorn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index e91b81b63670..2d16fa6b8c2d 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1515,10 +1515,9 @@ static struct attribute_group hstate_attr_group = { .attrs = hstate_attrs, }; -static int __init hugetlb_sysfs_add_hstate(struct hstate *h, - struct kobject *parent, - struct kobject **hstate_kobjs, - struct attribute_group *hstate_attr_group) +static int hugetlb_sysfs_add_hstate(struct hstate *h, struct kobject *parent, + struct kobject **hstate_kobjs, + struct attribute_group *hstate_attr_group) { int retval; int hi = h - hstates; -- cgit v1.2.3 From 6f5a55f1a6c5abee15a0e878e5c74d9f1569b8b0 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Fri, 5 Feb 2010 16:16:50 -0800 Subject: Fix potential crash with sys_move_pages We incorrectly depended on the 'node_state/node_isset()' functions testing the node range, rather than checking it explicitly. That's not reliable, even if it might often happen to work. So do the proper explicit test. Reported-by: Marcus Meissner Acked-and-tested-by: Brice Goglin Acked-by: Hugh Dickins Cc: stable@kernel.org Signed-off-by: Linus Torvalds --- mm/migrate.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'mm') diff --git a/mm/migrate.c b/mm/migrate.c index efddbf0926b2..9a0db5bbabe4 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -912,6 +912,9 @@ static int do_pages_move(struct mm_struct *mm, struct task_struct *task, goto out_pm; err = -ENODEV; + if (node < 0 || node >= MAX_NUMNODES) + goto out_pm; + if (!node_state(node, N_HIGH_MEMORY)) goto out_pm; -- cgit v1.2.3