From 184101bf143ac96d62b3dcc17e7b3550f98d3350 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Wed, 6 May 2009 16:02:55 -0700 Subject: oom: prevent livelock when oom_kill_allocating_task is set When /proc/sys/vm/oom_kill_allocating_task is set for large systems that want to avoid the lengthy tasklist scan, it's possible to livelock if current is ineligible for oom kill. This normally happens when it is set to OOM_DISABLE, but is also possible if any threads are sharing the same ->mm with a different tgid. So change __out_of_memory() to fall back to the full task-list scan if it was unable to kill `current'. Cc: Nick Piggin Signed-off-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/oom_kill.c | 44 +++++++++++++++++++++----------------------- 1 file changed, 21 insertions(+), 23 deletions(-) (limited to 'mm') diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 2f3166e308d9..92bcf1db16b2 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -514,34 +514,32 @@ void clear_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_mask) */ static void __out_of_memory(gfp_t gfp_mask, int order) { - if (sysctl_oom_kill_allocating_task) { - oom_kill_process(current, gfp_mask, order, 0, NULL, - "Out of memory (oom_kill_allocating_task)"); - - } else { - unsigned long points; - struct task_struct *p; - -retry: - /* - * Rambo mode: Shoot down a process and hope it solves whatever - * issues we may have. - */ - p = select_bad_process(&points, NULL); + struct task_struct *p; + unsigned long points; - if (PTR_ERR(p) == -1UL) + if (sysctl_oom_kill_allocating_task) + if (!oom_kill_process(current, gfp_mask, order, 0, NULL, + "Out of memory (oom_kill_allocating_task)")) return; +retry: + /* + * Rambo mode: Shoot down a process and hope it solves whatever + * issues we may have. + */ + p = select_bad_process(&points, NULL); - /* Found nothing?!?! Either we hang forever, or we panic. */ - if (!p) { - read_unlock(&tasklist_lock); - panic("Out of memory and no killable processes...\n"); - } + if (PTR_ERR(p) == -1UL) + return; - if (oom_kill_process(p, gfp_mask, order, points, NULL, - "Out of memory")) - goto retry; + /* Found nothing?!?! Either we hang forever, or we panic. */ + if (!p) { + read_unlock(&tasklist_lock); + panic("Out of memory and no killable processes...\n"); } + + if (oom_kill_process(p, gfp_mask, order, points, NULL, + "Out of memory")) + goto retry; } /* -- cgit v1.2.3 From 2498ce42d3a4d1a498f1df4884da960087547db7 Mon Sep 17 00:00:00 2001 From: Ralph Wuerthner Date: Wed, 6 May 2009 16:02:59 -0700 Subject: alloc_vmap_area: fix memory leak If alloc_vmap_area() fails the allocated struct vmap_area has to be freed. 
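Illustration (not part of the patch): a minimal sketch of the allocate/fail/free shape the fix restores. The function and helper names (example_alloc_area(), example_find_free_range()) are hypothetical; the real alloc_vmap_area() searches the vmap address space under a lock, but only the error-path cleanup matters here.

	static struct vmap_area *example_alloc_area(unsigned long size)
	{
		struct vmap_area *va;
		unsigned long addr;

		va = kmalloc(sizeof(*va), GFP_KERNEL);
		if (unlikely(!va))
			return ERR_PTR(-ENOMEM);

		/* hypothetical helper; returns 0 when no free range is found */
		addr = example_find_free_range(size);
		if (!addr) {
			/* without this, the struct leaks on every failed allocation */
			kfree(va);
			return ERR_PTR(-EBUSY);
		}

		va->va_start = addr;
		va->va_end = addr + size;
		return va;
	}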
Signed-off-by: Ralph Wuerthner Reviewed-by: Christoph Lameter Reviewed-by: Minchan Kim Cc: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 1 + 1 file changed, 1 insertion(+) (limited to 'mm') diff --git a/mm/vmalloc.c b/mm/vmalloc.c index fab19876b4d1..083716ea38c9 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -402,6 +402,7 @@ overflow: printk(KERN_WARNING "vmap allocation for size %lu failed: " "use vmalloc= to increase size.\n", size); + kfree(va); return ERR_PTR(-EBUSY); } -- cgit v1.2.3 From 9155203a5de94278525647b16733f0c315f3b786 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 6 May 2009 16:03:02 -0700 Subject: mm: use rounddown_pow_of_two() in zone_batchsize() Use rounddown_pow_of_two(N) in zone_batchsize() rather than (1 << (fls(N)-1)) as they are equivalent, and with the former it is easier to see what is going on. Signed-off-by: David Howells Tested-by: Lanttor Guo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index e2f26991fff1..8add7daf98b0 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2706,7 +2706,7 @@ static int zone_batchsize(struct zone *zone) * of pages of one half of the possible page colors * and the other with pages of the other colors. */ - batch = (1 << (fls(batch + batch/2)-1)) - 1; + batch = rounddown_pow_of_two(batch + batch/2) - 1; return batch; } -- cgit v1.2.3 From 3a6be87fd1e5cdbbc3b6a14d02a3efa9ecba1d3f Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 6 May 2009 16:03:03 -0700 Subject: nommu: clamp zone_batchsize() to 0 under NOMMU conditions Clamp zone_batchsize() to 0 under NOMMU conditions to stop free_hot_cold_page() from queueing and batching frees. The problem is that under NOMMU conditions it is really important to be able to allocate large contiguous chunks of memory, but when munmap() or exit_mmap() releases big stretches of memory, return of these to the buddy allocator can be deferred, and when it does finally happen, it can be in small chunks. Whilst the fragmentation this incurs isn't so much of a problem under MMU conditions as userspace VM is glued together from individual pages with the aid of the MMU, it is a real problem if there isn't an MMU. By clamping the page freeing queue size to 0, pages are returned to the allocator immediately, and the buddy detector is more likely to be able to glue them together into large chunks immediately, and fragmentation is less likely to occur. By disabling batching of frees, and by turning off the trimming of excess space during boot, Coldfire can manage to boot. Reported-by: Lanttor Guo Signed-off-by: David Howells Tested-by: Lanttor Guo Cc: Greg Ungerer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 8add7daf98b0..fe753ecf2aa5 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2681,6 +2681,7 @@ static void __meminit zone_init_free_lists(struct zone *zone) static int zone_batchsize(struct zone *zone) { +#ifdef CONFIG_MMU int batch; /* @@ -2709,6 +2710,23 @@ static int zone_batchsize(struct zone *zone) batch = rounddown_pow_of_two(batch + batch/2) - 1; return batch; + +#else + /* The deferral and batching of frees should be suppressed under NOMMU * conditions.
+ * + * The problem is that NOMMU needs to be able to allocate large chunks + * of contiguous memory as there's no hardware page translation to + * assemble apparent contiguous memory from discontiguous pages. + * + * Queueing large contiguous runs of pages for batching, however, + * causes the pages to actually be freed in smaller chunks. As there + * can be a significant delay between the individual batches being + * recycled, this leads to the once large chunks of space being + * fragmented and becoming unavailable for high-order allocations. + */ + return 0; +#endif } static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch) -- cgit v1.2.3 From fc4d5c292b68ef02514d2072dcbf82d090c34875 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 6 May 2009 16:03:05 -0700 Subject: nommu: make the initial mmap allocation excess behaviour Kconfig configurable NOMMU mmap() has an option controlled by a sysctl variable that determines whether the allocations made by do_mmap_private() should have the excess space trimmed off and returned to the allocator. Make the initial setting of this variable a Kconfig configuration option. The reason there can be excess space is that the allocator only allocates in power-of-2 size chunks, but mmap()'s can be made in sizes that aren't a power of 2. There are two alternatives: (1) Keep the excess as dead space. The dead space then remains unused for the lifetime of the mapping. Mappings of shared objects such as libc, ld.so or busybox's text segment may retain their dead space forever. (2) Return the excess to the allocator. This means that the dead space is limited to less than a page per mapping, but it means that for a transient process, there's more chance of fragmentation as the excess space may be reused fairly quickly. During the boot process, a lot of transient processes are created, and this can cause a lot of fragmentation as the pagecache and various slabs grow greatly during this time. By turning off the trimming of excess space during boot and disabling batching of frees, Coldfire can manage to boot. A better way of doing things might be to have /sbin/init turn this option off. By that point libc, ld.so and init - which are all long-duration processes - have all been loaded and trimmed. Reported-by: Lanttor Guo Signed-off-by: David Howells Tested-by: Lanttor Guo Cc: Greg Ungerer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/Kconfig | 28 ++++++++++++++++++++++++++++ mm/nommu.c | 2 +- 2 files changed, 29 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/Kconfig b/mm/Kconfig index 57971d2ab848..c2b57d81e153 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -225,3 +225,31 @@ config HAVE_MLOCKED_PAGE_BIT config MMU_NOTIFIER bool + +config NOMMU_INITIAL_TRIM_EXCESS + int "Turn on mmap() excess space trimming before booting" + depends on !MMU + default 1 + help + The NOMMU mmap() frequently needs to allocate large contiguous chunks + of memory on which to store mappings, but it can only ask the system + allocator for chunks in 2^N*PAGE_SIZE amounts - which is frequently + more than it requires. To deal with this, mmap() is able to trim off + the excess and return it to the allocator. + + If trimming is enabled, the excess is trimmed off and returned to the + system allocator, which can cause extra fragmentation, particularly + if there are a lot of transient processes. + + If trimming is disabled, the excess is kept, but not used, which for + long-term mappings means that the space is wasted. 
+ + Trimming can be dynamically controlled through a sysctl option + (/proc/sys/vm/nr_trim_pages) which specifies the minimum number of + excess pages there must be before trimming should occur, or zero if + no trimming is to occur. + + This option specifies the initial value of this option. The default + of 1 says that all excess pages should be trimmed. + + See Documentation/nommu-mmap.txt for more information. diff --git a/mm/nommu.c b/mm/nommu.c index 809998aa7b50..67cd1a487ee6 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -66,7 +66,7 @@ struct percpu_counter vm_committed_as; int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */ int sysctl_overcommit_ratio = 50; /* default is 50% */ int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT; -int sysctl_nr_trim_pages = 1; /* page trimming behaviour */ +int sysctl_nr_trim_pages = CONFIG_NOMMU_INITIAL_TRIM_EXCESS; int heap_stack_gap = 0; atomic_long_t mmap_pages_allocated; -- cgit v1.2.3 From 8c9ed899b44c19e81859fbb0e9d659fe2f8630fc Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 7 May 2009 11:41:37 +0100 Subject: NOMMU: Don't check vm_region::vm_start is page aligned in add_nommu_region() Don't check vm_region::vm_start is page aligned in add_nommu_region() because the region may reflect some non-page-aligned mapped file, such as could be obtained from RomFS XIP. Signed-off-by: David Howells Acked-by: Greg Ungerer Signed-off-by: Linus Torvalds --- mm/nommu.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'mm') diff --git a/mm/nommu.c b/mm/nommu.c index 67cd1a487ee6..b571ef707428 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -515,8 +515,6 @@ static void add_nommu_region(struct vm_region *region) validate_nommu_regions(); - BUG_ON(region->vm_start & ~PAGE_MASK); - parent = NULL; p = &nommu_region_tree.rb_node; while (*p) { -- cgit v1.2.3 From 0f181328287db30671e9997329cff71395d4af8b Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Wed, 13 May 2009 08:29:12 -0700 Subject: Revert "Ignore madvise(MADV_WILLNEED) for hugetlbfs-backed regions" This reverts commit a425a638c858fd10370b573bde81df3ba500e271. Now that the previous commit removed the "readpage" actor for hugetlb files, read-ahead will no longer mess up the mapping, and there's no longer any reason to treat hugetlbfs mappings specially. Tested-and-acked-by: Mel Gorman Signed-off-by: Linus Torvalds --- mm/madvise.c | 8 -------- 1 file changed, 8 deletions(-) (limited to 'mm') diff --git a/mm/madvise.c b/mm/madvise.c index 36d6ea2b6340..b9ce574827c8 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -112,14 +112,6 @@ static long madvise_willneed(struct vm_area_struct * vma, if (!file) return -EBADF; - /* - * Page cache readahead assumes page cache pages are order-0 which - * is not the case for hugetlbfs. Do not give a bad return value - * but ignore the advice. - */ - if (vma->vm_flags & VM_HUGETLB) - return 0; - if (file->f_mapping->a_ops->get_xip_mem) { /* no bad return value, but ignore advice */ return 0; -- cgit v1.2.3 From cd17cbfda004fe5f406c01b318c6378d9895896f Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 15 May 2009 11:32:24 +0200 Subject: Revert "mm: add /proc controls for pdflush threads" This reverts commit fafd688e4c0c34da0f3de909881117d374e4c7af. Work is progressing to switch away from pdflush as the process backing for flushing out dirty data. So it seems pointless to add more knobs to control pdflush threads. 
The original author of the patch did not have any specific use cases for adding the knobs, so we can easily revert this before 2.6.30 to avoid having to maintain this API forever. Signed-off-by: Jens Axboe --- Documentation/sysctl/vm.txt | 28 ---------------------------- include/linux/writeback.h | 2 -- kernel/sysctl.c | 23 ----------------------- mm/pdflush.c | 31 ++++++++++++------------------- 4 files changed, 12 insertions(+), 72 deletions(-) (limited to 'mm') diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt index b716d33912d8..c302ddf629a0 100644 --- a/Documentation/sysctl/vm.txt +++ b/Documentation/sysctl/vm.txt @@ -39,8 +39,6 @@ Currently, these files are in /proc/sys/vm: - nr_hugepages - nr_overcommit_hugepages - nr_pdflush_threads -- nr_pdflush_threads_min -- nr_pdflush_threads_max - nr_trim_pages (only if CONFIG_MMU=n) - numa_zonelist_order - oom_dump_tasks @@ -469,32 +467,6 @@ The default value is 0. ============================================================== -nr_pdflush_threads_min - -This value controls the minimum number of pdflush threads. - -At boot time, the kernel will create and maintain 'nr_pdflush_threads_min' -threads for the kernel's lifetime. - -The default value is 2. The minimum value you can specify is 1, and -the maximum value is the current setting of 'nr_pdflush_threads_max'. - -See 'nr_pdflush_threads_max' below for more information. - -============================================================== - -nr_pdflush_threads_max - -This value controls the maximum number of pdflush threads that can be -created. The pdflush algorithm will create a new pdflush thread (up to -this maximum) if no pdflush threads have been available for >= 1 second. - -The default value is 8. The minimum value you can specify is the -current value of 'nr_pdflush_threads_min' and the -maximum is 1000. - -============================================================== - overcommit_memory: This value contains a flag that enables memory overcommitment. diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 9c1ed1fb6ddb..93445477f86a 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -168,8 +168,6 @@ void writeback_set_ratelimit(void); /* pdflush.c */ extern int nr_pdflush_threads; /* Global so it can be exported to sysctl read-only. 
*/ -extern int nr_pdflush_threads_max; /* Global so it can be exported to sysctl */ -extern int nr_pdflush_threads_min; /* Global so it can be exported to sysctl */ #endif /* WRITEBACK_H */ diff --git a/kernel/sysctl.c b/kernel/sysctl.c index ea78fa101ad6..b2970d56fb76 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -101,7 +101,6 @@ static int __maybe_unused one = 1; static int __maybe_unused two = 2; static unsigned long one_ul = 1; static int one_hundred = 100; -static int one_thousand = 1000; /* this is needed for the proc_doulongvec_minmax of vm_dirty_bytes */ static unsigned long dirty_bytes_min = 2 * PAGE_SIZE; @@ -1033,28 +1032,6 @@ static struct ctl_table vm_table[] = { .mode = 0444 /* read-only*/, .proc_handler = &proc_dointvec, }, - { - .ctl_name = CTL_UNNUMBERED, - .procname = "nr_pdflush_threads_min", - .data = &nr_pdflush_threads_min, - .maxlen = sizeof nr_pdflush_threads_min, - .mode = 0644 /* read-write */, - .proc_handler = &proc_dointvec_minmax, - .strategy = &sysctl_intvec, - .extra1 = &one, - .extra2 = &nr_pdflush_threads_max, - }, - { - .ctl_name = CTL_UNNUMBERED, - .procname = "nr_pdflush_threads_max", - .data = &nr_pdflush_threads_max, - .maxlen = sizeof nr_pdflush_threads_max, - .mode = 0644 /* read-write */, - .proc_handler = &proc_dointvec_minmax, - .strategy = &sysctl_intvec, - .extra1 = &nr_pdflush_threads_min, - .extra2 = &one_thousand, - }, { .ctl_name = VM_SWAPPINESS, .procname = "swappiness", diff --git a/mm/pdflush.c b/mm/pdflush.c index f2caf96993f8..235ac440c44e 100644 --- a/mm/pdflush.c +++ b/mm/pdflush.c @@ -57,14 +57,6 @@ static DEFINE_SPINLOCK(pdflush_lock); */ int nr_pdflush_threads = 0; -/* - * The max/min number of pdflush threads. R/W by sysctl at - * /proc/sys/vm/nr_pdflush_threads_max/min - */ -int nr_pdflush_threads_max __read_mostly = MAX_PDFLUSH_THREADS; -int nr_pdflush_threads_min __read_mostly = MIN_PDFLUSH_THREADS; - - /* * The time at which the pdflush thread pool last went empty */ @@ -76,7 +68,7 @@ static unsigned long last_empty_jifs; * Thread pool management algorithm: * * - The minimum and maximum number of pdflush instances are bound - * by nr_pdflush_threads_min and nr_pdflush_threads_max. + * by MIN_PDFLUSH_THREADS and MAX_PDFLUSH_THREADS. * * - If there have been no idle pdflush instances for 1 second, create * a new one. @@ -142,13 +134,14 @@ static int __pdflush(struct pdflush_work *my_work) * To throttle creation, we reset last_empty_jifs. */ if (time_after(jiffies, last_empty_jifs + 1 * HZ)) { - if (list_empty(&pdflush_list) && - nr_pdflush_threads < nr_pdflush_threads_max) { - last_empty_jifs = jiffies; - nr_pdflush_threads++; - spin_unlock_irq(&pdflush_lock); - start_one_pdflush_thread(); - spin_lock_irq(&pdflush_lock); + if (list_empty(&pdflush_list)) { + if (nr_pdflush_threads < MAX_PDFLUSH_THREADS) { + last_empty_jifs = jiffies; + nr_pdflush_threads++; + spin_unlock_irq(&pdflush_lock); + start_one_pdflush_thread(); + spin_lock_irq(&pdflush_lock); + } } } @@ -160,7 +153,7 @@ static int __pdflush(struct pdflush_work *my_work) */ if (list_empty(&pdflush_list)) continue; - if (nr_pdflush_threads <= nr_pdflush_threads_min) + if (nr_pdflush_threads <= MIN_PDFLUSH_THREADS) continue; pdf = list_entry(pdflush_list.prev, struct pdflush_work, list); if (time_after(jiffies, pdf->when_i_went_to_sleep + 1 * HZ)) { @@ -266,9 +259,9 @@ static int __init pdflush_init(void) * Pre-set nr_pdflush_threads... If we fail to create, * the count will be decremented. 
*/ - nr_pdflush_threads = nr_pdflush_threads_min; + nr_pdflush_threads = MIN_PDFLUSH_THREADS; - for (i = 0; i < nr_pdflush_threads_min; i++) + for (i = 0; i < MIN_PDFLUSH_THREADS; i++) start_one_pdflush_thread(); return 0; } -- cgit v1.2.3
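Illustration (not part of any patch above): the zone_batchsize() change relies on the identity that, for x > 0, (1 << (fls(x) - 1)) is the largest power of two not exceeding x, i.e. rounddown_pow_of_two(x). A small standalone userspace sketch checking that claim against a naive reference; fls() is modeled with a compiler builtin here, which is an assumption, as the kernel supplies its own fls() in <linux/bitops.h>.

	#include <assert.h>
	#include <stdio.h>

	/* fls(x): position of the most significant set bit, 1-based; 0 for x == 0 */
	static int fls_model(unsigned int x)
	{
		return x ? 32 - __builtin_clz(x) : 0;
	}

	/* naive reference: largest power of two <= x, for x > 0 */
	static unsigned int naive_rounddown_pow2(unsigned int x)
	{
		unsigned int p = 1;

		while (p <= x / 2)
			p *= 2;
		return p;
	}

	int main(void)
	{
		unsigned int x;

		for (x = 1; x <= 1000000; x++)
			assert((1u << (fls_model(x) - 1)) == naive_rounddown_pow2(x));

		/* the zone_batchsize() step itself, e.g. batch = 11:
		 * 11 + 11/2 = 16, rounddown_pow_of_two(16) - 1 = 15 */
		printf("%u\n", (1u << (fls_model(11 + 11 / 2) - 1)) - 1);
		return 0;
	}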