From b40607fc02f8248828d52d88f91b7d68df1933b0 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Wed, 22 Mar 2006 00:07:39 -0800
Subject: [PATCH] __get_page_state() cpumask cleanup and fix

__get_page_state() has an open-coded for_each_cpu_mask() loop in it.

Tidy that up, then notice that the code was buggy:

	while (cpu < NR_CPUS) {
		unsigned long *in, *out, off;

		if (!cpu_isset(cpu, *cpumask))
			continue;

an obvious infinite loop.  I guess we just never call it with a holey cpu
mask.

Even after my cpumask size-reduction work, this patch increases code size :(

Cc: Paul Jackson <pj@sgi.com>
Cc: Christoph Lameter <clameter@engr.sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/page_alloc.c | 20 +++++++++-----------
 1 file changed, 9 insertions(+), 11 deletions(-)

(limited to 'mm')

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 234bd4895d14..61775866ea18 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1214,24 +1214,22 @@ DEFINE_PER_CPU(long, nr_pagecache_local) = 0;
 
 static void __get_page_state(struct page_state *ret, int nr, cpumask_t *cpumask)
 {
-	int cpu = 0;
+	unsigned cpu;
 
 	memset(ret, 0, nr * sizeof(unsigned long));
 	cpus_and(*cpumask, *cpumask, cpu_online_map);
 
-	cpu = first_cpu(*cpumask);
-	while (cpu < NR_CPUS) {
-		unsigned long *in, *out, off;
-
-		if (!cpu_isset(cpu, *cpumask))
-			continue;
+	for_each_cpu_mask(cpu, *cpumask) {
+		unsigned long *in;
+		unsigned long *out;
+		unsigned off;
+		unsigned next_cpu;
 
 		in = (unsigned long *)&per_cpu(page_states, cpu);
 
-		cpu = next_cpu(cpu, *cpumask);
-
-		if (likely(cpu < NR_CPUS))
-			prefetch(&per_cpu(page_states, cpu));
+		next_cpu = next_cpu(cpu, *cpumask);
+		if (likely(next_cpu < NR_CPUS))
+			prefetch(&per_cpu(page_states, next_cpu));
 
 		out = (unsigned long *)ret;
 		for (off = 0; off < nr; off++)
-- 
cgit v1.2.3


From 46453a6e194a8c55fe6cf3dc8e1c4f24e2abc013 Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@suse.de>
Date: Wed, 22 Mar 2006 00:07:58 -0800
Subject: [PATCH] mm: never ClearPageLRU released pages

If vmscan finds a zero refcount page on the lru list, never ClearPageLRU
it.  This means the release code need not hold ->lru_lock to stabilise
PageLRU, so that lock may be skipped entirely when releasing !PageLRU pages
(because we know PageLRU won't have been temporarily cleared by vmscan,
which was previously guaranteed by holding the lock to synchronise against
vmscan).

Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/swap.c   | 52 +++++++++++++++++++++++++++-------------------------
 mm/vmscan.c | 18 +++++++++++-------
 2 files changed, 38 insertions(+), 32 deletions(-)

(limited to 'mm')

diff --git a/mm/swap.c b/mm/swap.c
index b524ea90bddb..3045a0f4c451 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -209,19 +209,18 @@ int lru_add_drain_all(void)
  */
 void fastcall __page_cache_release(struct page *page)
 {
-	unsigned long flags;
-	struct zone *zone = page_zone(page);
+	if (PageLRU(page)) {
+		unsigned long flags;
+		struct zone *zone = page_zone(page);
 
-	spin_lock_irqsave(&zone->lru_lock, flags);
-	if (TestClearPageLRU(page))
+		spin_lock_irqsave(&zone->lru_lock, flags);
+		if (!TestClearPageLRU(page))
+			BUG();
 		del_page_from_lru(zone, page);
-	if (page_count(page) != 0)
-		page = NULL;
-	spin_unlock_irqrestore(&zone->lru_lock, flags);
-	if (page)
-		free_hot_page(page);
+		spin_unlock_irqrestore(&zone->lru_lock, flags);
+	}
+	free_hot_page(page);
 }
-
 EXPORT_SYMBOL(__page_cache_release);
 
 /*
@@ -245,7 +244,6 @@ void release_pages(struct page **pages, int nr, int cold)
 	pagevec_init(&pages_to_free, cold);
 	for (i = 0; i < nr; i++) {
 		struct page *page = pages[i];
-		struct zone *pagezone;
 
 		if (unlikely(PageCompound(page))) {
 			if (zone) {
@@ -259,23 +257,27 @@ void release_pages(struct page **pages, int nr, int cold)
 		if (!put_page_testzero(page))
 			continue;
 
-		pagezone = page_zone(page);
-		if (pagezone != zone) {
-			if (zone)
-				spin_unlock_irq(&zone->lru_lock);
-			zone = pagezone;
-			spin_lock_irq(&zone->lru_lock);
-		}
-		if (TestClearPageLRU(page))
+		if (PageLRU(page)) {
+			struct zone *pagezone = page_zone(page);
+			if (pagezone != zone) {
+				if (zone)
+					spin_unlock_irq(&zone->lru_lock);
+				zone = pagezone;
+				spin_lock_irq(&zone->lru_lock);
+			}
+			if (!TestClearPageLRU(page))
+				BUG();
 			del_page_from_lru(zone, page);
-		if (page_count(page) == 0) {
-			if (!pagevec_add(&pages_to_free, page)) {
+		}
+
+		if (!pagevec_add(&pages_to_free, page)) {
+			if (zone) {
 				spin_unlock_irq(&zone->lru_lock);
-				__pagevec_free(&pages_to_free);
-				pagevec_reinit(&pages_to_free);
-				zone = NULL;	/* No lock is held */
+				zone = NULL;
 			}
-		}
+			__pagevec_free(&pages_to_free);
+			pagevec_reinit(&pages_to_free);
+  		}
 	}
 	if (zone)
 		spin_unlock_irq(&zone->lru_lock);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 4fe7e3aa02e2..acb7611cd525 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1085,21 +1085,25 @@ static int isolate_lru_pages(int nr_to_scan, struct list_head *src,
 		page = lru_to_page(src);
 		prefetchw_prev_lru_page(page, src, flags);
 
-		if (!TestClearPageLRU(page))
-			BUG();
 		list_del(&page->lru);
-		if (get_page_testone(page)) {
+		if (unlikely(get_page_testone(page))) {
 			/*
 			 * It is being freed elsewhere
 			 */
 			__put_page(page);
-			SetPageLRU(page);
 			list_add(&page->lru, src);
 			continue;
-		} else {
-			list_add(&page->lru, dst);
-			nr_taken++;
 		}
+
+		/*
+		 * Be careful not to clear PageLRU until after we're sure
+		 * the page is not being freed elsewhere -- the page release
+		 * code relies on it.
+		 */
+		if (!TestClearPageLRU(page))
+			BUG();
+		list_add(&page->lru, dst);
+		nr_taken++;
 	}
 
 	*scanned = scan;
-- 
cgit v1.2.3


From 8d438f96d2b8eade6cbcd8adfc22dae6f5cbd6c0 Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@suse.de>
Date: Wed, 22 Mar 2006 00:07:59 -0800
Subject: [PATCH] mm: PageLRU no testset

PG_lru is protected by zone->lru_lock. It does not need TestSet/TestClear
operations.

Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/page-flags.h |  5 ++---
 mm/swap.c                  | 16 ++++++++--------
 mm/vmscan.c                | 20 +++++++++++---------
 3 files changed, 21 insertions(+), 20 deletions(-)

(limited to 'mm')

diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index d52999c43336..58856c823f8b 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -239,10 +239,9 @@ extern void __mod_page_state_offset(unsigned long offset, unsigned long delta);
 #define __ClearPageDirty(page)	__clear_bit(PG_dirty, &(page)->flags)
 #define TestClearPageDirty(page) test_and_clear_bit(PG_dirty, &(page)->flags)
 
-#define SetPageLRU(page)	set_bit(PG_lru, &(page)->flags)
 #define PageLRU(page)		test_bit(PG_lru, &(page)->flags)
-#define TestSetPageLRU(page)	test_and_set_bit(PG_lru, &(page)->flags)
-#define TestClearPageLRU(page)	test_and_clear_bit(PG_lru, &(page)->flags)
+#define SetPageLRU(page)	set_bit(PG_lru, &(page)->flags)
+#define ClearPageLRU(page)	clear_bit(PG_lru, &(page)->flags)
 
 #define PageActive(page)	test_bit(PG_active, &(page)->flags)
 #define SetPageActive(page)	set_bit(PG_active, &(page)->flags)
diff --git a/mm/swap.c b/mm/swap.c
index 3045a0f4c451..985324ee9368 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -214,8 +214,8 @@ void fastcall __page_cache_release(struct page *page)
 		struct zone *zone = page_zone(page);
 
 		spin_lock_irqsave(&zone->lru_lock, flags);
-		if (!TestClearPageLRU(page))
-			BUG();
+		BUG_ON(!PageLRU(page));
+		ClearPageLRU(page);
 		del_page_from_lru(zone, page);
 		spin_unlock_irqrestore(&zone->lru_lock, flags);
 	}
@@ -265,8 +265,8 @@ void release_pages(struct page **pages, int nr, int cold)
 				zone = pagezone;
 				spin_lock_irq(&zone->lru_lock);
 			}
-			if (!TestClearPageLRU(page))
-				BUG();
+			BUG_ON(!PageLRU(page));
+			ClearPageLRU(page);
 			del_page_from_lru(zone, page);
 		}
 
@@ -345,8 +345,8 @@ void __pagevec_lru_add(struct pagevec *pvec)
 			zone = pagezone;
 			spin_lock_irq(&zone->lru_lock);
 		}
-		if (TestSetPageLRU(page))
-			BUG();
+		BUG_ON(PageLRU(page));
+		SetPageLRU(page);
 		add_page_to_inactive_list(zone, page);
 	}
 	if (zone)
@@ -372,8 +372,8 @@ void __pagevec_lru_add_active(struct pagevec *pvec)
 			zone = pagezone;
 			spin_lock_irq(&zone->lru_lock);
 		}
-		if (TestSetPageLRU(page))
-			BUG();
+		BUG_ON(PageLRU(page));
+		SetPageLRU(page);
 		if (TestSetPageActive(page))
 			BUG();
 		add_page_to_active_list(zone, page);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index acb7611cd525..40fb37828e8c 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1042,9 +1042,10 @@ int isolate_lru_page(struct page *page)
 	if (PageLRU(page)) {
 		struct zone *zone = page_zone(page);
 		spin_lock_irq(&zone->lru_lock);
-		if (TestClearPageLRU(page)) {
+		if (PageLRU(page)) {
 			ret = 1;
 			get_page(page);
+			ClearPageLRU(page);
 			if (PageActive(page))
 				del_page_from_active_list(zone, page);
 			else
@@ -1085,6 +1086,8 @@ static int isolate_lru_pages(int nr_to_scan, struct list_head *src,
 		page = lru_to_page(src);
 		prefetchw_prev_lru_page(page, src, flags);
 
+		BUG_ON(!PageLRU(page));
+
 		list_del(&page->lru);
 		if (unlikely(get_page_testone(page))) {
 			/*
@@ -1100,8 +1103,7 @@ static int isolate_lru_pages(int nr_to_scan, struct list_head *src,
 		 * the page is not being freed elsewhere -- the page release
 		 * code relies on it.
 		 */
-		if (!TestClearPageLRU(page))
-			BUG();
+		ClearPageLRU(page);
 		list_add(&page->lru, dst);
 		nr_taken++;
 	}
@@ -1156,8 +1158,8 @@ static void shrink_cache(struct zone *zone, struct scan_control *sc)
 		 */
 		while (!list_empty(&page_list)) {
 			page = lru_to_page(&page_list);
-			if (TestSetPageLRU(page))
-				BUG();
+			BUG_ON(PageLRU(page));
+			SetPageLRU(page);
 			list_del(&page->lru);
 			if (PageActive(page))
 				add_page_to_active_list(zone, page);
@@ -1276,8 +1278,8 @@ refill_inactive_zone(struct zone *zone, struct scan_control *sc)
 	while (!list_empty(&l_inactive)) {
 		page = lru_to_page(&l_inactive);
 		prefetchw_prev_lru_page(page, &l_inactive, flags);
-		if (TestSetPageLRU(page))
-			BUG();
+		BUG_ON(PageLRU(page));
+		SetPageLRU(page);
 		if (!TestClearPageActive(page))
 			BUG();
 		list_move(&page->lru, &zone->inactive_list);
@@ -1305,8 +1307,8 @@ refill_inactive_zone(struct zone *zone, struct scan_control *sc)
 	while (!list_empty(&l_active)) {
 		page = lru_to_page(&l_active);
 		prefetchw_prev_lru_page(page, &l_active, flags);
-		if (TestSetPageLRU(page))
-			BUG();
+		BUG_ON(PageLRU(page));
+		SetPageLRU(page);
 		BUG_ON(!PageActive(page));
 		list_move(&page->lru, &zone->active_list);
 		pgmoved++;
-- 
cgit v1.2.3


From 4c84cacfa424264f7ad5287298d3ea4a3e935278 Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@suse.de>
Date: Wed, 22 Mar 2006 00:08:00 -0800
Subject: [PATCH] mm: PageActive no testset

PG_active is protected by zone->lru_lock, it does not need TestSet/TestClear
operations.

Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/page-flags.h | 2 --
 mm/swap.c                  | 4 ++--
 mm/vmscan.c                | 5 +++--
 3 files changed, 5 insertions(+), 6 deletions(-)

(limited to 'mm')

diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 58856c823f8b..5d1e7bd85107 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -246,8 +246,6 @@ extern void __mod_page_state_offset(unsigned long offset, unsigned long delta);
 #define PageActive(page)	test_bit(PG_active, &(page)->flags)
 #define SetPageActive(page)	set_bit(PG_active, &(page)->flags)
 #define ClearPageActive(page)	clear_bit(PG_active, &(page)->flags)
-#define TestClearPageActive(page) test_and_clear_bit(PG_active, &(page)->flags)
-#define TestSetPageActive(page) test_and_set_bit(PG_active, &(page)->flags)
 
 #define PageSlab(page)		test_bit(PG_slab, &(page)->flags)
 #define SetPageSlab(page)	set_bit(PG_slab, &(page)->flags)
diff --git a/mm/swap.c b/mm/swap.c
index 985324ee9368..cf88226cf96d 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -374,8 +374,8 @@ void __pagevec_lru_add_active(struct pagevec *pvec)
 		}
 		BUG_ON(PageLRU(page));
 		SetPageLRU(page);
-		if (TestSetPageActive(page))
-			BUG();
+		BUG_ON(PageActive(page));
+		SetPageActive(page);
 		add_page_to_active_list(zone, page);
 	}
 	if (zone)
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 40fb37828e8c..8e477b1a4838 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1280,8 +1280,9 @@ refill_inactive_zone(struct zone *zone, struct scan_control *sc)
 		prefetchw_prev_lru_page(page, &l_inactive, flags);
 		BUG_ON(PageLRU(page));
 		SetPageLRU(page);
-		if (!TestClearPageActive(page))
-			BUG();
+		BUG_ON(!PageActive(page));
+		ClearPageActive(page);
+
 		list_move(&page->lru, &zone->inactive_list);
 		pgmoved++;
 		if (!pagevec_add(&pvec, page)) {
-- 
cgit v1.2.3


From 674539115cc88473f623581e1d53c0e2ecef2179 Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@suse.de>
Date: Wed, 22 Mar 2006 00:08:00 -0800
Subject: [PATCH] mm: less atomic ops

In the page release paths, we can be sure that nobody will mess with our
page->flags because the refcount has dropped to 0.  So no need for atomic
operations here.

Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/mm_inline.h  | 2 +-
 include/linux/page-flags.h | 2 ++
 mm/swap.c                  | 4 ++--
 3 files changed, 5 insertions(+), 3 deletions(-)

(limited to 'mm')

diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h
index 8ac854f7f190..3b6723dfaff3 100644
--- a/include/linux/mm_inline.h
+++ b/include/linux/mm_inline.h
@@ -32,7 +32,7 @@ del_page_from_lru(struct zone *zone, struct page *page)
 {
 	list_del(&page->lru);
 	if (PageActive(page)) {
-		ClearPageActive(page);
+		__ClearPageActive(page);
 		zone->nr_active--;
 	} else {
 		zone->nr_inactive--;
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 5d1e7bd85107..da71d63df465 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -242,10 +242,12 @@ extern void __mod_page_state_offset(unsigned long offset, unsigned long delta);
 #define PageLRU(page)		test_bit(PG_lru, &(page)->flags)
 #define SetPageLRU(page)	set_bit(PG_lru, &(page)->flags)
 #define ClearPageLRU(page)	clear_bit(PG_lru, &(page)->flags)
+#define __ClearPageLRU(page)	__clear_bit(PG_lru, &(page)->flags)
 
 #define PageActive(page)	test_bit(PG_active, &(page)->flags)
 #define SetPageActive(page)	set_bit(PG_active, &(page)->flags)
 #define ClearPageActive(page)	clear_bit(PG_active, &(page)->flags)
+#define __ClearPageActive(page)	__clear_bit(PG_active, &(page)->flags)
 
 #define PageSlab(page)		test_bit(PG_slab, &(page)->flags)
 #define SetPageSlab(page)	set_bit(PG_slab, &(page)->flags)
diff --git a/mm/swap.c b/mm/swap.c
index cf88226cf96d..91b7e2026f69 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -215,7 +215,7 @@ void fastcall __page_cache_release(struct page *page)
 
 		spin_lock_irqsave(&zone->lru_lock, flags);
 		BUG_ON(!PageLRU(page));
-		ClearPageLRU(page);
+		__ClearPageLRU(page);
 		del_page_from_lru(zone, page);
 		spin_unlock_irqrestore(&zone->lru_lock, flags);
 	}
@@ -266,7 +266,7 @@ void release_pages(struct page **pages, int nr, int cold)
 				spin_lock_irq(&zone->lru_lock);
 			}
 			BUG_ON(!PageLRU(page));
-			ClearPageLRU(page);
+			__ClearPageLRU(page);
 			del_page_from_lru(zone, page);
 		}
 
-- 
cgit v1.2.3


From 5e9dace8d386def04219134d7160e8a778824764 Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@suse.de>
Date: Wed, 22 Mar 2006 00:08:01 -0800
Subject: [PATCH] mm: page_alloc less atomics

More atomic operation removal from page allocator

Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/page-flags.h | 4 ++--
 mm/page_alloc.c            | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'mm')

diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index da71d63df465..76c7ffdd0424 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -328,8 +328,8 @@ extern void __mod_page_state_offset(unsigned long offset, unsigned long delta);
 #define TestClearPageReclaim(page) test_and_clear_bit(PG_reclaim, &(page)->flags)
 
 #define PageCompound(page)	test_bit(PG_compound, &(page)->flags)
-#define SetPageCompound(page)	set_bit(PG_compound, &(page)->flags)
-#define ClearPageCompound(page)	clear_bit(PG_compound, &(page)->flags)
+#define __SetPageCompound(page)	__set_bit(PG_compound, &(page)->flags)
+#define __ClearPageCompound(page) __clear_bit(PG_compound, &(page)->flags)
 
 #ifdef CONFIG_SWAP
 #define PageSwapCache(page)	test_bit(PG_swapcache, &(page)->flags)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 61775866ea18..102919851353 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -190,7 +190,7 @@ static void prep_compound_page(struct page *page, unsigned long order)
 	for (i = 0; i < nr_pages; i++) {
 		struct page *p = page + i;
 
-		SetPageCompound(p);
+		__SetPageCompound(p);
 		set_page_private(p, (unsigned long)page);
 	}
 }
@@ -209,7 +209,7 @@ static void destroy_compound_page(struct page *page, unsigned long order)
 		if (unlikely(!PageCompound(p) |
 				(page_private(p) != (unsigned long)page)))
 			bad_page(page);
-		ClearPageCompound(p);
+		__ClearPageCompound(p);
 	}
 }
 
-- 
cgit v1.2.3


From f205b2fe62d321403525065a4cb31b6bff1bbe53 Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@suse.de>
Date: Wed, 22 Mar 2006 00:08:02 -0800
Subject: [PATCH] mm: slab less atomics

Atomic operation removal from slab

Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/page-flags.h | 6 ++----
 mm/slab.c                  | 6 +++---
 2 files changed, 5 insertions(+), 7 deletions(-)

(limited to 'mm')

diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 76c7ffdd0424..8cef69d462f2 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -250,10 +250,8 @@ extern void __mod_page_state_offset(unsigned long offset, unsigned long delta);
 #define __ClearPageActive(page)	__clear_bit(PG_active, &(page)->flags)
 
 #define PageSlab(page)		test_bit(PG_slab, &(page)->flags)
-#define SetPageSlab(page)	set_bit(PG_slab, &(page)->flags)
-#define ClearPageSlab(page)	clear_bit(PG_slab, &(page)->flags)
-#define TestClearPageSlab(page)	test_and_clear_bit(PG_slab, &(page)->flags)
-#define TestSetPageSlab(page)	test_and_set_bit(PG_slab, &(page)->flags)
+#define __SetPageSlab(page)	__set_bit(PG_slab, &(page)->flags)
+#define __ClearPageSlab(page)	__clear_bit(PG_slab, &(page)->flags)
 
 #ifdef CONFIG_HIGHMEM
 #define PageHighMem(page)	is_highmem(page_zone(page))
diff --git a/mm/slab.c b/mm/slab.c
index d0bd7f07ab04..5988adf010c5 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -1402,7 +1402,7 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid)
 		atomic_add(i, &slab_reclaim_pages);
 	add_page_state(nr_slab, i);
 	while (i--) {
-		SetPageSlab(page);
+		__SetPageSlab(page);
 		page++;
 	}
 	return addr;
@@ -1418,8 +1418,8 @@ static void kmem_freepages(struct kmem_cache *cachep, void *addr)
 	const unsigned long nr_freed = i;
 
 	while (i--) {
-		if (!TestClearPageSlab(page))
-			BUG();
+		BUG_ON(!PageSlab(page));
+		__ClearPageSlab(page);
 		page++;
 	}
 	sub_page_state(nr_slab, nr_freed);
-- 
cgit v1.2.3


From 7c8ee9a86340db686cd4314e9944dc9b6111bda9 Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@suse.de>
Date: Wed, 22 Mar 2006 00:08:03 -0800
Subject: [PATCH] mm: simplify vmscan vs release refcounting

The VM has an interesting race where a page refcount can drop to zero, but it
is still on the LRU lists for a short time.  This was solved by testing a 0->1
refcount transition when picking up pages from the LRU, and dropping the
refcount in that case.

Instead, use atomic_add_unless to ensure we never pick up a 0 refcount page
from the LRU, thus a 0 refcount page will never have its refcount elevated
until it is allocated again.

Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/mm.h | 19 +++++++++++--------
 mm/vmscan.c        | 25 +++++++++++--------------
 2 files changed, 22 insertions(+), 22 deletions(-)

(limited to 'mm')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 498ff8778fb6..b12d5c76420d 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -301,17 +301,20 @@ struct page {
  * Drop a ref, return true if the logical refcount fell to zero (the page has
  * no users)
  */
-#define put_page_testzero(p)				\
-	({						\
-		BUG_ON(atomic_read(&(p)->_count) == -1);\
-		atomic_add_negative(-1, &(p)->_count);	\
-	})
+static inline int put_page_testzero(struct page *page)
+{
+	BUG_ON(atomic_read(&page->_count) == -1);
+	return atomic_add_negative(-1, &page->_count);
+}
 
 /*
- * Grab a ref, return true if the page previously had a logical refcount of
- * zero.  ie: returns true if we just grabbed an already-deemed-to-be-free page
+ * Try to grab a ref unless the page has a refcount of zero, return false if
+ * that is the case.
  */
-#define get_page_testone(p)	atomic_inc_and_test(&(p)->_count)
+static inline int get_page_unless_zero(struct page *page)
+{
+	return atomic_add_unless(&page->_count, 1, -1);
+}
 
 #define set_page_count(p,v) 	atomic_set(&(p)->_count, (v) - 1)
 #define __put_page(p)		atomic_dec(&(p)->_count)
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 8e477b1a4838..e21bab4deda6 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1083,29 +1083,26 @@ static int isolate_lru_pages(int nr_to_scan, struct list_head *src,
 	int scan = 0;
 
 	while (scan++ < nr_to_scan && !list_empty(src)) {
+		struct list_head *target;
 		page = lru_to_page(src);
 		prefetchw_prev_lru_page(page, src, flags);
 
 		BUG_ON(!PageLRU(page));
 
 		list_del(&page->lru);
-		if (unlikely(get_page_testone(page))) {
+		target = src;
+		if (likely(get_page_unless_zero(page))) {
 			/*
-			 * It is being freed elsewhere
+			 * Be careful not to clear PageLRU until after we're
+			 * sure the page is not being freed elsewhere -- the
+			 * page release code relies on it.
 			 */
-			__put_page(page);
-			list_add(&page->lru, src);
-			continue;
-		}
+			ClearPageLRU(page);
+			target = dst;
+			nr_taken++;
+		} /* else it is being freed elsewhere */
 
-		/*
-		 * Be careful not to clear PageLRU until after we're sure
-		 * the page is not being freed elsewhere -- the page release
-		 * code relies on it.
-		 */
-		ClearPageLRU(page);
-		list_add(&page->lru, dst);
-		nr_taken++;
+		list_add(&page->lru, target);
 	}
 
 	*scanned = scan;
-- 
cgit v1.2.3


From 8dfcc9ba27e2ed257e5de9539f7f03e57c2c0e33 Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@suse.de>
Date: Wed, 22 Mar 2006 00:08:05 -0800
Subject: [PATCH] mm: split highorder pages

Have an explicit mm call to split higher order pages into individual pages.
 Should help to avoid bugs and be more explicit about the code's intention.

Signed-off-by: Nick Piggin <npiggin@suse.de>
Cc: Russell King <rmk@arm.linux.org.uk>
Cc: David Howells <dhowells@redhat.com>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mundt <lethal@linux-sh.org>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Chris Zankel <chris@zankel.net>
Signed-off-by: Yoichi Yuasa <yoichi_yuasa@tripeaks.co.jp>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/arm/mm/consistent.c      |  4 ++--
 arch/frv/mm/dma-alloc.c       |  4 +---
 arch/mips/mm/init.c           |  5 +++--
 arch/ppc/kernel/dma-mapping.c |  4 ++--
 arch/sh/mm/consistent.c       |  3 +--
 arch/xtensa/mm/pgtable.c      | 10 +++-------
 include/linux/mm.h            |  6 ++++++
 mm/memory.c                   |  4 +---
 mm/page_alloc.c               | 22 ++++++++++++++++++++++
 9 files changed, 41 insertions(+), 21 deletions(-)

(limited to 'mm')

diff --git a/arch/arm/mm/consistent.c b/arch/arm/mm/consistent.c
index c2ee18d2075e..8a1bfcd50087 100644
--- a/arch/arm/mm/consistent.c
+++ b/arch/arm/mm/consistent.c
@@ -223,6 +223,8 @@ __dma_alloc(struct device *dev, size_t size, dma_addr_t *handle, gfp_t gfp,
 		pte = consistent_pte[idx] + off;
 		c->vm_pages = page;
 
+		split_page(page, order);
+
 		/*
 		 * Set the "dma handle"
 		 */
@@ -231,7 +233,6 @@ __dma_alloc(struct device *dev, size_t size, dma_addr_t *handle, gfp_t gfp,
 		do {
 			BUG_ON(!pte_none(*pte));
 
-			set_page_count(page, 1);
 			/*
 			 * x86 does not mark the pages reserved...
 			 */
@@ -250,7 +251,6 @@ __dma_alloc(struct device *dev, size_t size, dma_addr_t *handle, gfp_t gfp,
 		 * Free the otherwise unused pages.
 		 */
 		while (page < end) {
-			set_page_count(page, 1);
 			__free_page(page);
 			page++;
 		}
diff --git a/arch/frv/mm/dma-alloc.c b/arch/frv/mm/dma-alloc.c
index 342823aad758..636b2f8b5d98 100644
--- a/arch/frv/mm/dma-alloc.c
+++ b/arch/frv/mm/dma-alloc.c
@@ -115,9 +115,7 @@ void *consistent_alloc(gfp_t gfp, size_t size, dma_addr_t *dma_handle)
 	 */
 	if (order > 0) {
 		struct page *rpage = virt_to_page(page);
-
-		for (i = 1; i < (1 << order); i++)
-			set_page_count(rpage + i, 1);
+		split_page(rpage, order);
 	}
 
 	err = 0;
diff --git a/arch/mips/mm/init.c b/arch/mips/mm/init.c
index 0ff9a348b843..a140da9732db 100644
--- a/arch/mips/mm/init.c
+++ b/arch/mips/mm/init.c
@@ -54,7 +54,8 @@ unsigned long empty_zero_page, zero_page_mask;
  */
 unsigned long setup_zero_pages(void)
 {
-	unsigned long order, size;
+	unsigned int order;
+	unsigned long size;
 	struct page *page;
 
 	if (cpu_has_vce)
@@ -67,9 +68,9 @@ unsigned long setup_zero_pages(void)
 		panic("Oh boy, that early out of memory?");
 
 	page = virt_to_page(empty_zero_page);
+	split_page(page, order);
 	while (page < virt_to_page(empty_zero_page + (PAGE_SIZE << order))) {
 		SetPageReserved(page);
-		set_page_count(page, 1);
 		page++;
 	}
 
diff --git a/arch/ppc/kernel/dma-mapping.c b/arch/ppc/kernel/dma-mapping.c
index 685fd0defe23..61465ec88bc7 100644
--- a/arch/ppc/kernel/dma-mapping.c
+++ b/arch/ppc/kernel/dma-mapping.c
@@ -223,6 +223,8 @@ __dma_alloc_coherent(size_t size, dma_addr_t *handle, gfp_t gfp)
 		pte_t *pte = consistent_pte + CONSISTENT_OFFSET(vaddr);
 		struct page *end = page + (1 << order);
 
+		split_page(page, order);
+
 		/*
 		 * Set the "dma handle"
 		 */
@@ -231,7 +233,6 @@ __dma_alloc_coherent(size_t size, dma_addr_t *handle, gfp_t gfp)
 		do {
 			BUG_ON(!pte_none(*pte));
 
-			set_page_count(page, 1);
 			SetPageReserved(page);
 			set_pte_at(&init_mm, vaddr,
 				   pte, mk_pte(page, pgprot_noncached(PAGE_KERNEL)));
@@ -244,7 +245,6 @@ __dma_alloc_coherent(size_t size, dma_addr_t *handle, gfp_t gfp)
 		 * Free the otherwise unused pages.
 		 */
 		while (page < end) {
-			set_page_count(page, 1);
 			__free_page(page);
 			page++;
 		}
diff --git a/arch/sh/mm/consistent.c b/arch/sh/mm/consistent.c
index df3a9e452cc5..ee73e30263af 100644
--- a/arch/sh/mm/consistent.c
+++ b/arch/sh/mm/consistent.c
@@ -23,6 +23,7 @@ void *consistent_alloc(gfp_t gfp, size_t size, dma_addr_t *handle)
 	page = alloc_pages(gfp, order);
 	if (!page)
 		return NULL;
+	split_page(page, order);
 
 	ret = page_address(page);
 	*handle = virt_to_phys(ret);
@@ -37,8 +38,6 @@ void *consistent_alloc(gfp_t gfp, size_t size, dma_addr_t *handle)
 	end  = page + (1 << order);
 
 	while (++page < end) {
-		set_page_count(page, 1);
-
 		/* Free any unused pages */
 		if (page >= free) {
 			__free_page(page);
diff --git a/arch/xtensa/mm/pgtable.c b/arch/xtensa/mm/pgtable.c
index cbc56aedf13e..7d28914d11cb 100644
--- a/arch/xtensa/mm/pgtable.c
+++ b/arch/xtensa/mm/pgtable.c
@@ -21,13 +21,9 @@ pte_t* pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
 	p = (pte_t*) __get_free_pages(GFP_KERNEL|__GFP_REPEAT, COLOR_ORDER);
 
 	if (likely(p)) {
-		struct page *page;
+		split_page(virt_to_page(p), COLOR_ORDER);
 
 		for (i = 0; i < COLOR_SIZE; i++) {
-			page = virt_to_page(p);
-
-			set_page_count(page, 1);
-
 			if (ADDR_COLOR(p) == color)
 				pte = p;
 			else
@@ -55,9 +51,9 @@ struct page* pte_alloc_one(struct mm_struct *mm, unsigned long address)
 	p = alloc_pages(GFP_KERNEL | __GFP_REPEAT, PTE_ORDER);
 
 	if (likely(p)) {
-		for (i = 0; i < PAGE_ORDER; i++) {
-			set_page_count(p, 1);
+		split_page(p, COLOR_ORDER);
 
+		for (i = 0; i < PAGE_ORDER; i++) {
 			if (PADDR_COLOR(page_address(p)) == color)
 				page = p;
 			else
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 9bbddf228cd9..e67980654c49 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -328,6 +328,12 @@ static inline void get_page(struct page *page)
 
 void put_page(struct page *page);
 
+#ifdef CONFIG_MMU
+void split_page(struct page *page, unsigned int order);
+#else
+static inline void split_page(struct page *page, unsigned int order) {}
+#endif
+
 /*
  * Multiple processes may "see" the same page. E.g. for untouched
  * mappings of /dev/null, all processes see the same page full of
diff --git a/mm/memory.c b/mm/memory.c
index 85e80a57db29..6af555c1c42a 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1221,9 +1221,7 @@ out:
  * The page has to be a nice clean _individual_ kernel allocation.
  * If you allocate a compound page, you need to have marked it as
  * such (__GFP_COMP), or manually just split the page up yourself
- * (which is mainly an issue of doing "set_page_count(page, 1)" for
- * each sub-page, and then freeing them one by one when you free
- * them rather than freeing it as a compound page).
+ * (see split_page()).
  *
  * NOTE! Traditionally this was done with "remap_pfn_range()" which
  * took an arbitrary page protection parameter. This doesn't allow
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 102919851353..fc65e87368b3 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -752,6 +752,28 @@ static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags)
 		clear_highpage(page + i);
 }
 
+#ifdef CONFIG_MMU
+/*
+ * split_page takes a non-compound higher-order page, and splits it into
+ * n (1<<order) sub-pages: page[0..n]
+ * Each sub-page must be freed individually.
+ *
+ * Note: this is probably too low level an operation for use in drivers.
+ * Please consult with lkml before using this in your driver.
+ */
+void split_page(struct page *page, unsigned int order)
+{
+	int i;
+
+	BUG_ON(PageCompound(page));
+	BUG_ON(!page_count(page));
+	for (i = 1; i < (1 << order); i++) {
+		BUG_ON(page_count(page + i));
+		set_page_count(page + i, 1);
+	}
+}
+#endif
+
 /*
  * Really, prep_compound_page() should be called from __rmqueue_bulk().  But
  * we cheat by calling it from here, in the order > 0 path.  Saves a branch
-- 
cgit v1.2.3


From 545b1ea9bfa5a8ca9af33d63144bd4f2faaea8dd Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@suse.de>
Date: Wed, 22 Mar 2006 00:08:07 -0800
Subject: [PATCH] mm: cleanup bootmem

The bootmem code added to page_alloc.c duplicated some page freeing code
that it really doesn't need to because it is not so performance critical.

While we're here, make prefetching work properly by actually prefetching
the page we're about to use before prefetching ahead to the next one (ie.
get the most important transaction started first).  Also prefetch just a
single page ahead rather than leaving a gap of 16.

Jack Steiner reported no problems with SGI's ia64 simulator.

Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/page_alloc.c | 20 +++++++-------------
 1 file changed, 7 insertions(+), 13 deletions(-)

(limited to 'mm')

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index fc65e87368b3..7aa0181287e1 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -55,7 +55,6 @@ unsigned long totalhigh_pages __read_mostly;
 long nr_swap_pages;
 int percpu_pagelist_fraction;
 
-static void fastcall free_hot_cold_page(struct page *page, int cold);
 static void __free_pages_ok(struct page *page, unsigned int order);
 
 /*
@@ -448,28 +447,23 @@ void fastcall __init __free_pages_bootmem(struct page *page, unsigned int order)
 	if (order == 0) {
 		__ClearPageReserved(page);
 		set_page_count(page, 0);
-
-		free_hot_cold_page(page, 0);
+		set_page_refs(page, 0);
+		__free_page(page);
 	} else {
-		LIST_HEAD(list);
 		int loop;
 
+		prefetchw(page);
 		for (loop = 0; loop < BITS_PER_LONG; loop++) {
 			struct page *p = &page[loop];
 
-			if (loop + 16 < BITS_PER_LONG)
-				prefetchw(p + 16);
+			if (loop + 1 < BITS_PER_LONG)
+				prefetchw(p + 1);
 			__ClearPageReserved(p);
 			set_page_count(p, 0);
 		}
 
-		arch_free_page(page, order);
-
-		mod_page_state(pgfree, 1 << order);
-
-		list_add(&page->lru, &list);
-		kernel_map_pages(page, 1 << order, 0);
-		free_pages_bulk(page_zone(page), 1, &list, order);
+		set_page_refs(page, order);
+		__free_pages(page, order);
 	}
 }
 
-- 
cgit v1.2.3


From a482289d46587ffcda4c85aab109fb74910d7a48 Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@suse.de>
Date: Wed, 22 Mar 2006 00:08:08 -0800
Subject: [PATCH] hugepage allocator cleanup

Insert "fresh" huge pages into the hugepage allocator by the same means as
they are freed back into it.  This reduces code size and allows
enqueue_huge_page to be inlined into the hugepage free fastpath.

Eliminate occurances of hugepages on the free list with non-zero refcount.
This can allow stricter refcount checks in future.  Also required for
lockless pagecache.

Signed-off-by: Nick Piggin <npiggin@suse.de>

"This patch also eliminates a leak "cleaned up" by re-clobbering the
refcount on every allocation from the hugepage freelists.  With respect to
the lockless pagecache, the crucial aspect is to eliminate unconditional
set_page_count() to 0 on pages with potentially nonzero refcounts, though
closer inspection suggests the assignments removed are entirely spurious."

Acked-by: William Irwin <wli@holomorphy.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/hugetlb.c | 24 ++++++++----------------
 1 file changed, 8 insertions(+), 16 deletions(-)

(limited to 'mm')

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 508707704d2c..39d49ecea8e8 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -64,7 +64,7 @@ static struct page *dequeue_huge_page(struct vm_area_struct *vma,
 	return page;
 }
 
-static struct page *alloc_fresh_huge_page(void)
+static int alloc_fresh_huge_page(void)
 {
 	static int nid = 0;
 	struct page *page;
@@ -72,12 +72,15 @@ static struct page *alloc_fresh_huge_page(void)
 					HUGETLB_PAGE_ORDER);
 	nid = (nid + 1) % num_online_nodes();
 	if (page) {
+		page[1].lru.next = (void *)free_huge_page;	/* dtor */
 		spin_lock(&hugetlb_lock);
 		nr_huge_pages++;
 		nr_huge_pages_node[page_to_nid(page)]++;
 		spin_unlock(&hugetlb_lock);
+		put_page(page); /* free it into the hugepage allocator */
+		return 1;
 	}
-	return page;
+	return 0;
 }
 
 void free_huge_page(struct page *page)
@@ -85,7 +88,6 @@ void free_huge_page(struct page *page)
 	BUG_ON(page_count(page));
 
 	INIT_LIST_HEAD(&page->lru);
-	page[1].lru.next = NULL;			/* reset dtor */
 
 	spin_lock(&hugetlb_lock);
 	enqueue_huge_page(page);
@@ -105,7 +107,6 @@ struct page *alloc_huge_page(struct vm_area_struct *vma, unsigned long addr)
 	}
 	spin_unlock(&hugetlb_lock);
 	set_page_count(page, 1);
-	page[1].lru.next = (void *)free_huge_page;	/* set dtor */
 	for (i = 0; i < (HPAGE_SIZE/PAGE_SIZE); ++i)
 		clear_user_highpage(&page[i], addr);
 	return page;
@@ -114,7 +115,6 @@ struct page *alloc_huge_page(struct vm_area_struct *vma, unsigned long addr)
 static int __init hugetlb_init(void)
 {
 	unsigned long i;
-	struct page *page;
 
 	if (HPAGE_SHIFT == 0)
 		return 0;
@@ -123,12 +123,8 @@ static int __init hugetlb_init(void)
 		INIT_LIST_HEAD(&hugepage_freelists[i]);
 
 	for (i = 0; i < max_huge_pages; ++i) {
-		page = alloc_fresh_huge_page();
-		if (!page)
+		if (!alloc_fresh_huge_page())
 			break;
-		spin_lock(&hugetlb_lock);
-		enqueue_huge_page(page);
-		spin_unlock(&hugetlb_lock);
 	}
 	max_huge_pages = free_huge_pages = nr_huge_pages = i;
 	printk("Total HugeTLB memory allocated, %ld\n", free_huge_pages);
@@ -154,8 +150,8 @@ static void update_and_free_page(struct page *page)
 		page[i].flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced |
 				1 << PG_dirty | 1 << PG_active | 1 << PG_reserved |
 				1 << PG_private | 1<< PG_writeback);
-		set_page_count(&page[i], 0);
 	}
+	page[1].lru.next = NULL;
 	set_page_count(page, 1);
 	__free_pages(page, HUGETLB_PAGE_ORDER);
 }
@@ -188,12 +184,8 @@ static inline void try_to_free_low(unsigned long count)
 static unsigned long set_max_huge_pages(unsigned long count)
 {
 	while (count > nr_huge_pages) {
-		struct page *page = alloc_fresh_huge_page();
-		if (!page)
+		if (!alloc_fresh_huge_page())
 			return nr_huge_pages;
-		spin_lock(&hugetlb_lock);
-		enqueue_huge_page(page);
-		spin_unlock(&hugetlb_lock);
 	}
 	if (count >= nr_huge_pages)
 		return nr_huge_pages;
-- 
cgit v1.2.3


From 8fea4e96a8f29ccc34c244f54574680ce9b43631 Mon Sep 17 00:00:00 2001
From: Pekka Enberg <penberg@cs.helsinki.fi>
Date: Wed, 22 Mar 2006 00:08:10 -0800
Subject: [PATCH] slab: object to index mapping cleanup

Clean up the object to index mapping that has been spread around mm/slab.c.

Signed-off-by: Pekka Enberg <penberg@cs.helsinki.fi>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/slab.c | 34 +++++++++++++++++++++++-----------
 1 file changed, 23 insertions(+), 11 deletions(-)

(limited to 'mm')

diff --git a/mm/slab.c b/mm/slab.c
index 5988adf010c5..3d18b711ab82 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -609,6 +609,18 @@ static inline struct slab *virt_to_slab(const void *obj)
 	return page_get_slab(page);
 }
 
+static inline void *index_to_obj(struct kmem_cache *cache, struct slab *slab,
+				 unsigned int idx)
+{
+	return slab->s_mem + cache->buffer_size * idx;
+}
+
+static inline unsigned int obj_to_index(struct kmem_cache *cache,
+					struct slab *slab, void *obj)
+{
+	return (unsigned)(obj - slab->s_mem) / cache->buffer_size;
+}
+
 /* These are the default caches for kmalloc. Custom caches can have other sizes. */
 struct cache_sizes malloc_sizes[] = {
 #define CACHE(x) { .cs_size = (x) },
@@ -1568,18 +1580,18 @@ static void check_poison_obj(struct kmem_cache *cachep, void *objp)
 		 * exist:
 		 */
 		struct slab *slabp = virt_to_slab(objp);
-		int objnr;
+		unsigned int objnr;
 
-		objnr = (unsigned)(objp - slabp->s_mem) / cachep->buffer_size;
+		objnr = obj_to_index(cachep, slabp, objp);
 		if (objnr) {
-			objp = slabp->s_mem + (objnr - 1) * cachep->buffer_size;
+			objp = index_to_obj(cachep, slabp, objnr - 1);
 			realobj = (char *)objp + obj_offset(cachep);
 			printk(KERN_ERR "Prev obj: start=%p, len=%d\n",
 			       realobj, size);
 			print_objinfo(cachep, objp, 2);
 		}
 		if (objnr + 1 < cachep->num) {
-			objp = slabp->s_mem + (objnr + 1) * cachep->buffer_size;
+			objp = index_to_obj(cachep, slabp, objnr + 1);
 			realobj = (char *)objp + obj_offset(cachep);
 			printk(KERN_ERR "Next obj: start=%p, len=%d\n",
 			       realobj, size);
@@ -1598,7 +1610,7 @@ static void slab_destroy_objs(struct kmem_cache *cachep, struct slab *slabp)
 {
 	int i;
 	for (i = 0; i < cachep->num; i++) {
-		void *objp = slabp->s_mem + cachep->buffer_size * i;
+		void *objp = index_to_obj(cachep, slabp, i);
 
 		if (cachep->flags & SLAB_POISON) {
 #ifdef CONFIG_DEBUG_PAGEALLOC
@@ -1631,7 +1643,7 @@ static void slab_destroy_objs(struct kmem_cache *cachep, struct slab *slabp)
 	if (cachep->dtor) {
 		int i;
 		for (i = 0; i < cachep->num; i++) {
-			void *objp = slabp->s_mem + cachep->buffer_size * i;
+			void *objp = index_to_obj(cachep, slabp, i);
 			(cachep->dtor) (objp, cachep, 0);
 		}
 	}
@@ -2307,7 +2319,7 @@ static void cache_init_objs(struct kmem_cache *cachep,
 	int i;
 
 	for (i = 0; i < cachep->num; i++) {
-		void *objp = slabp->s_mem + cachep->buffer_size * i;
+		void *objp = index_to_obj(cachep, slabp, i);
 #if DEBUG
 		/* need to poison the objs? */
 		if (cachep->flags & SLAB_POISON)
@@ -2363,7 +2375,7 @@ static void kmem_flagcheck(struct kmem_cache *cachep, gfp_t flags)
 
 static void *slab_get_obj(struct kmem_cache *cachep, struct slab *slabp, int nodeid)
 {
-	void *objp = slabp->s_mem + (slabp->free * cachep->buffer_size);
+	void *objp = index_to_obj(cachep, slabp, slabp->free);
 	kmem_bufctl_t next;
 
 	slabp->inuse++;
@@ -2380,7 +2392,7 @@ static void *slab_get_obj(struct kmem_cache *cachep, struct slab *slabp, int nod
 static void slab_put_obj(struct kmem_cache *cachep, struct slab *slabp, void *objp,
 			  int nodeid)
 {
-	unsigned int objnr = (unsigned)(objp-slabp->s_mem) / cachep->buffer_size;
+	unsigned int objnr = obj_to_index(cachep, slabp, objp);
 
 #if DEBUG
 	/* Verify that the slab belongs to the intended node */
@@ -2565,10 +2577,10 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp,
 	if (cachep->flags & SLAB_STORE_USER)
 		*dbg_userword(cachep, objp) = caller;
 
-	objnr = (unsigned)(objp - slabp->s_mem) / cachep->buffer_size;
+	objnr = obj_to_index(cachep, slabp, objp);
 
 	BUG_ON(objnr >= cachep->num);
-	BUG_ON(objp != slabp->s_mem + objnr * cachep->buffer_size);
+	BUG_ON(objp != index_to_obj(cachep, slabp, objnr));
 
 	if (cachep->flags & SLAB_DEBUG_INITIAL) {
 		/* Need to call the slab's constructor so the
-- 
cgit v1.2.3


From f30cf7d13eee420f5249b4d7709b46570098ab92 Mon Sep 17 00:00:00 2001
From: Pekka Enberg <penberg@cs.helsinki.fi>
Date: Wed, 22 Mar 2006 00:08:11 -0800
Subject: [PATCH] slab: extract setup_cpu_cache

Extract setup_cpu_cache() function from kmem_cache_create() to make the
latter a little less complex.

Signed-off-by: Pekka Enberg <penberg@cs.helsinki.fi>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/slab.c | 109 +++++++++++++++++++++++++++++++-------------------------------
 1 file changed, 55 insertions(+), 54 deletions(-)

(limited to 'mm')

diff --git a/mm/slab.c b/mm/slab.c
index 3d18b711ab82..4d5c4b93e0eb 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -1748,6 +1748,60 @@ static inline size_t calculate_slab_order(struct kmem_cache *cachep,
 	return left_over;
 }
 
+static void setup_cpu_cache(struct kmem_cache *cachep)
+{
+	if (g_cpucache_up == FULL) {
+		enable_cpucache(cachep);
+		return;
+	}
+	if (g_cpucache_up == NONE) {
+		/*
+		 * Note: the first kmem_cache_create must create the cache
+		 * that's used by kmalloc(24), otherwise the creation of
+		 * further caches will BUG().
+		 */
+		cachep->array[smp_processor_id()] = &initarray_generic.cache;
+
+		/*
+		 * If the cache that's used by kmalloc(sizeof(kmem_list3)) is
+		 * the first cache, then we need to set up all its list3s,
+		 * otherwise the creation of further caches will BUG().
+		 */
+		set_up_list3s(cachep, SIZE_AC);
+		if (INDEX_AC == INDEX_L3)
+			g_cpucache_up = PARTIAL_L3;
+		else
+			g_cpucache_up = PARTIAL_AC;
+	} else {
+		cachep->array[smp_processor_id()] =
+			kmalloc(sizeof(struct arraycache_init), GFP_KERNEL);
+
+		if (g_cpucache_up == PARTIAL_AC) {
+			set_up_list3s(cachep, SIZE_L3);
+			g_cpucache_up = PARTIAL_L3;
+		} else {
+			int node;
+			for_each_online_node(node) {
+				cachep->nodelists[node] =
+				    kmalloc_node(sizeof(struct kmem_list3),
+						GFP_KERNEL, node);
+				BUG_ON(!cachep->nodelists[node]);
+				kmem_list3_init(cachep->nodelists[node]);
+			}
+		}
+	}
+	cachep->nodelists[numa_node_id()]->next_reap =
+			jiffies + REAPTIMEOUT_LIST3 +
+			((unsigned long)cachep) % REAPTIMEOUT_LIST3;
+
+	cpu_cache_get(cachep)->avail = 0;
+	cpu_cache_get(cachep)->limit = BOOT_CPUCACHE_ENTRIES;
+	cpu_cache_get(cachep)->batchcount = 1;
+	cpu_cache_get(cachep)->touched = 0;
+	cachep->batchcount = 1;
+	cachep->limit = BOOT_CPUCACHE_ENTRIES;
+}
+
 /**
  * kmem_cache_create - Create a cache.
  * @name: A string which is used in /proc/slabinfo to identify this cache.
@@ -2000,60 +2054,7 @@ kmem_cache_create (const char *name, size_t size, size_t align,
 	cachep->name = name;
 
 
-	if (g_cpucache_up == FULL) {
-		enable_cpucache(cachep);
-	} else {
-		if (g_cpucache_up == NONE) {
-			/* Note: the first kmem_cache_create must create
-			 * the cache that's used by kmalloc(24), otherwise
-			 * the creation of further caches will BUG().
-			 */
-			cachep->array[smp_processor_id()] =
-			    &initarray_generic.cache;
-
-			/* If the cache that's used by
-			 * kmalloc(sizeof(kmem_list3)) is the first cache,
-			 * then we need to set up all its list3s, otherwise
-			 * the creation of further caches will BUG().
-			 */
-			set_up_list3s(cachep, SIZE_AC);
-			if (INDEX_AC == INDEX_L3)
-				g_cpucache_up = PARTIAL_L3;
-			else
-				g_cpucache_up = PARTIAL_AC;
-		} else {
-			cachep->array[smp_processor_id()] =
-			    kmalloc(sizeof(struct arraycache_init), GFP_KERNEL);
-
-			if (g_cpucache_up == PARTIAL_AC) {
-				set_up_list3s(cachep, SIZE_L3);
-				g_cpucache_up = PARTIAL_L3;
-			} else {
-				int node;
-				for_each_online_node(node) {
-
-					cachep->nodelists[node] =
-					    kmalloc_node(sizeof
-							 (struct kmem_list3),
-							 GFP_KERNEL, node);
-					BUG_ON(!cachep->nodelists[node]);
-					kmem_list3_init(cachep->
-							nodelists[node]);
-				}
-			}
-		}
-		cachep->nodelists[numa_node_id()]->next_reap =
-		    jiffies + REAPTIMEOUT_LIST3 +
-		    ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
-
-		BUG_ON(!cpu_cache_get(cachep));
-		cpu_cache_get(cachep)->avail = 0;
-		cpu_cache_get(cachep)->limit = BOOT_CPUCACHE_ENTRIES;
-		cpu_cache_get(cachep)->batchcount = 1;
-		cpu_cache_get(cachep)->touched = 0;
-		cachep->batchcount = 1;
-		cachep->limit = BOOT_CPUCACHE_ENTRIES;
-	}
+	setup_cpu_cache(cachep);
 
 	/* cache setup completed, link it into the list */
 	list_add(&cachep->next, &cache_chain);
-- 
cgit v1.2.3


From a737b3e2fcf96f576fa3e2e382236d9ee94f383f Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Wed, 22 Mar 2006 00:08:11 -0800
Subject: [PATCH] slab cleanup

slab.c has become a bit revolting again.  Try to repair it.

- Coding style fixes

- Don't do assignments-in-if-statements.

- Don't typecast assignments to/from void*

Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/slab.c | 596 ++++++++++++++++++++++++++++++++------------------------------
 1 file changed, 304 insertions(+), 292 deletions(-)

(limited to 'mm')

diff --git a/mm/slab.c b/mm/slab.c
index 4d5c4b93e0eb..7b6f9f10e757 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -50,7 +50,7 @@
  * The head array is strictly LIFO and should improve the cache hit rates.
  * On SMP, it additionally reduces the spinlock operations.
  *
- * The c_cpuarray may not be read with enabled local interrupts - 
+ * The c_cpuarray may not be read with enabled local interrupts -
  * it's changed with a smp_call_function().
  *
  * SMP synchronization:
@@ -266,16 +266,17 @@ struct array_cache {
 	unsigned int batchcount;
 	unsigned int touched;
 	spinlock_t lock;
-	void *entry[0];		/*
-				 * Must have this definition in here for the proper
-				 * alignment of array_cache. Also simplifies accessing
-				 * the entries.
-				 * [0] is for gcc 2.95. It should really be [].
-				 */
+	void *entry[0];	/*
+			 * Must have this definition in here for the proper
+			 * alignment of array_cache. Also simplifies accessing
+			 * the entries.
+			 * [0] is for gcc 2.95. It should really be [].
+			 */
 };
 
-/* bootstrap: The caches do not work without cpuarrays anymore,
- * but the cpuarrays are allocated from the generic caches...
+/*
+ * bootstrap: The caches do not work without cpuarrays anymore, but the
+ * cpuarrays are allocated from the generic caches...
  */
 #define BOOT_CPUCACHE_ENTRIES	1
 struct arraycache_init {
@@ -310,10 +311,8 @@ struct kmem_list3 __initdata initkmem_list3[NUM_INIT_LISTS];
 #define	SIZE_L3 (1 + MAX_NUMNODES)
 
 /*
- * This function must be completely optimized away if
- * a constant is passed to it. Mostly the same as
- * what is in linux/slab.h except it returns an
- * index.
+ * This function must be completely optimized away if a constant is passed to
+ * it.  Mostly the same as what is in linux/slab.h except it returns an index.
  */
 static __always_inline int index_of(const size_t size)
 {
@@ -351,14 +350,14 @@ static void kmem_list3_init(struct kmem_list3 *parent)
 	parent->free_touched = 0;
 }
 
-#define MAKE_LIST(cachep, listp, slab, nodeid)	\
-	do {	\
-		INIT_LIST_HEAD(listp);		\
-		list_splice(&(cachep->nodelists[nodeid]->slab), listp); \
+#define MAKE_LIST(cachep, listp, slab, nodeid)				\
+	do {								\
+		INIT_LIST_HEAD(listp);					\
+		list_splice(&(cachep->nodelists[nodeid]->slab), listp);	\
 	} while (0)
 
-#define	MAKE_ALL_LISTS(cachep, ptr, nodeid)			\
-	do {					\
+#define	MAKE_ALL_LISTS(cachep, ptr, nodeid)				\
+	do {								\
 	MAKE_LIST((cachep), (&(ptr)->slabs_full), slabs_full, nodeid);	\
 	MAKE_LIST((cachep), (&(ptr)->slabs_partial), slabs_partial, nodeid); \
 	MAKE_LIST((cachep), (&(ptr)->slabs_free), slabs_free, nodeid);	\
@@ -379,8 +378,8 @@ struct kmem_cache {
 	unsigned int buffer_size;
 /* 2) touched by every alloc & free from the backend */
 	struct kmem_list3 *nodelists[MAX_NUMNODES];
-	unsigned int flags;	/* constant flags */
-	unsigned int num;	/* # of objs per slab */
+	unsigned int flags;		/* constant flags */
+	unsigned int num;		/* # of objs per slab */
 	spinlock_t spinlock;
 
 /* 3) cache_grow/shrink */
@@ -390,11 +389,11 @@ struct kmem_cache {
 	/* force GFP flags, e.g. GFP_DMA */
 	gfp_t gfpflags;
 
-	size_t colour;		/* cache colouring range */
+	size_t colour;			/* cache colouring range */
 	unsigned int colour_off;	/* colour offset */
 	struct kmem_cache *slabp_cache;
 	unsigned int slab_size;
-	unsigned int dflags;	/* dynamic flags */
+	unsigned int dflags;		/* dynamic flags */
 
 	/* constructor func */
 	void (*ctor) (void *, struct kmem_cache *, unsigned long);
@@ -438,8 +437,9 @@ struct kmem_cache {
 #define	OFF_SLAB(x)	((x)->flags & CFLGS_OFF_SLAB)
 
 #define BATCHREFILL_LIMIT	16
-/* Optimization question: fewer reaps means less 
- * probability for unnessary cpucache drain/refill cycles.
+/*
+ * Optimization question: fewer reaps means less probability for unnessary
+ * cpucache drain/refill cycles.
  *
  * OTOH the cpuarrays can contain lots of objects,
  * which could lock up otherwise freeable slabs.
@@ -453,17 +453,19 @@ struct kmem_cache {
 #define	STATS_INC_ALLOCED(x)	((x)->num_allocations++)
 #define	STATS_INC_GROWN(x)	((x)->grown++)
 #define	STATS_INC_REAPED(x)	((x)->reaped++)
-#define	STATS_SET_HIGH(x)	do { if ((x)->num_active > (x)->high_mark) \
-					(x)->high_mark = (x)->num_active; \
-				} while (0)
+#define	STATS_SET_HIGH(x)						\
+	do {								\
+		if ((x)->num_active > (x)->high_mark)			\
+			(x)->high_mark = (x)->num_active;		\
+	} while (0)
 #define	STATS_INC_ERR(x)	((x)->errors++)
 #define	STATS_INC_NODEALLOCS(x)	((x)->node_allocs++)
 #define	STATS_INC_NODEFREES(x)	((x)->node_frees++)
-#define	STATS_SET_FREEABLE(x, i) \
-				do { if ((x)->max_freeable < i) \
-					(x)->max_freeable = i; \
-				} while (0)
-
+#define	STATS_SET_FREEABLE(x, i)					\
+	do {								\
+		if ((x)->max_freeable < i)				\
+			(x)->max_freeable = i;				\
+	} while (0)
 #define STATS_INC_ALLOCHIT(x)	atomic_inc(&(x)->allochit)
 #define STATS_INC_ALLOCMISS(x)	atomic_inc(&(x)->allocmiss)
 #define STATS_INC_FREEHIT(x)	atomic_inc(&(x)->freehit)
@@ -478,9 +480,7 @@ struct kmem_cache {
 #define	STATS_INC_ERR(x)	do { } while (0)
 #define	STATS_INC_NODEALLOCS(x)	do { } while (0)
 #define	STATS_INC_NODEFREES(x)	do { } while (0)
-#define	STATS_SET_FREEABLE(x, i) \
-				do { } while (0)
-
+#define	STATS_SET_FREEABLE(x, i) do { } while (0)
 #define STATS_INC_ALLOCHIT(x)	do { } while (0)
 #define STATS_INC_ALLOCMISS(x)	do { } while (0)
 #define STATS_INC_FREEHIT(x)	do { } while (0)
@@ -488,7 +488,8 @@ struct kmem_cache {
 #endif
 
 #if DEBUG
-/* Magic nums for obj red zoning.
+/*
+ * Magic nums for obj red zoning.
  * Placed in the first word before and the first word after an obj.
  */
 #define	RED_INACTIVE	0x5A2CF071UL	/* when obj is inactive */
@@ -499,7 +500,8 @@ struct kmem_cache {
 #define POISON_FREE	0x6b	/* for use-after-free poisoning */
 #define	POISON_END	0xa5	/* end-byte of poisoning */
 
-/* memory layout of objects:
+/*
+ * memory layout of objects:
  * 0		: objp
  * 0 .. cachep->obj_offset - BYTES_PER_WORD - 1: padding. This ensures that
  * 		the end of an object is aligned with the end of the real
@@ -508,7 +510,8 @@ struct kmem_cache {
  * 		redzone word.
  * cachep->obj_offset: The real object.
  * cachep->buffer_size - 2* BYTES_PER_WORD: redzone word [BYTES_PER_WORD long]
- * cachep->buffer_size - 1* BYTES_PER_WORD: last caller address [BYTES_PER_WORD long]
+ * cachep->buffer_size - 1* BYTES_PER_WORD: last caller address
+ *					[BYTES_PER_WORD long]
  */
 static int obj_offset(struct kmem_cache *cachep)
 {
@@ -552,8 +555,8 @@ static void **dbg_userword(struct kmem_cache *cachep, void *objp)
 #endif
 
 /*
- * Maximum size of an obj (in 2^order pages)
- * and absolute limit for the gfp order.
+ * Maximum size of an obj (in 2^order pages) and absolute limit for the gfp
+ * order.
  */
 #if defined(CONFIG_LARGE_ALLOCS)
 #define	MAX_OBJ_ORDER	13	/* up to 32Mb */
@@ -573,9 +576,10 @@ static void **dbg_userword(struct kmem_cache *cachep, void *objp)
 #define	BREAK_GFP_ORDER_LO	0
 static int slab_break_gfp_order = BREAK_GFP_ORDER_LO;
 
-/* Functions for storing/retrieving the cachep and or slab from the
- * global 'mem_map'. These are used to find the slab an obj belongs to.
- * With kfree(), these are used to find the cache which an obj belongs to.
+/*
+ * Functions for storing/retrieving the cachep and or slab from the page
+ * allocator.  These are used to find the slab an obj belongs to.  With kfree(),
+ * these are used to find the cache which an obj belongs to.
  */
 static inline void page_set_cache(struct page *page, struct kmem_cache *cache)
 {
@@ -621,7 +625,9 @@ static inline unsigned int obj_to_index(struct kmem_cache *cache,
 	return (unsigned)(obj - slab->s_mem) / cache->buffer_size;
 }
 
-/* These are the default caches for kmalloc. Custom caches can have other sizes. */
+/*
+ * These are the default caches for kmalloc. Custom caches can have other sizes.
+ */
 struct cache_sizes malloc_sizes[] = {
 #define CACHE(x) { .cs_size = (x) },
 #include <linux/kmalloc_sizes.h>
@@ -667,8 +673,8 @@ static DEFINE_MUTEX(cache_chain_mutex);
 static struct list_head cache_chain;
 
 /*
- * vm_enough_memory() looks at this to determine how many
- * slab-allocated pages are possibly freeable under pressure
+ * vm_enough_memory() looks at this to determine how many slab-allocated pages
+ * are possibly freeable under pressure
  *
  * SLAB_RECLAIM_ACCOUNT turns this on per-slab
  */
@@ -687,7 +693,8 @@ static enum {
 
 static DEFINE_PER_CPU(struct work_struct, reap_work);
 
-static void free_block(struct kmem_cache *cachep, void **objpp, int len, int node);
+static void free_block(struct kmem_cache *cachep, void **objpp, int len,
+			int node);
 static void enable_cpucache(struct kmem_cache *cachep);
 static void cache_reap(void *unused);
 static int __node_shrink(struct kmem_cache *cachep, int node);
@@ -697,7 +704,8 @@ static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep)
 	return cachep->array[smp_processor_id()];
 }
 
-static inline struct kmem_cache *__find_general_cachep(size_t size, gfp_t gfpflags)
+static inline struct kmem_cache *__find_general_cachep(size_t size,
+							gfp_t gfpflags)
 {
 	struct cache_sizes *csizep = malloc_sizes;
 
@@ -732,8 +740,9 @@ static size_t slab_mgmt_size(size_t nr_objs, size_t align)
 	return ALIGN(sizeof(struct slab)+nr_objs*sizeof(kmem_bufctl_t), align);
 }
 
-/* Calculate the number of objects and left-over bytes for a given
-   buffer size. */
+/*
+ * Calculate the number of objects and left-over bytes for a given buffer size.
+ */
 static void cache_estimate(unsigned long gfporder, size_t buffer_size,
 			   size_t align, int flags, size_t *left_over,
 			   unsigned int *num)
@@ -794,7 +803,8 @@ static void cache_estimate(unsigned long gfporder, size_t buffer_size,
 
 #define slab_error(cachep, msg) __slab_error(__FUNCTION__, cachep, msg)
 
-static void __slab_error(const char *function, struct kmem_cache *cachep, char *msg)
+static void __slab_error(const char *function, struct kmem_cache *cachep,
+			char *msg)
 {
 	printk(KERN_ERR "slab error in %s(): cache `%s': %s\n",
 	       function, cachep->name, msg);
@@ -918,10 +928,8 @@ static void free_alien_cache(struct array_cache **ac_ptr)
 
 	if (!ac_ptr)
 		return;
-
 	for_each_node(i)
 	    kfree(ac_ptr[i]);
-
 	kfree(ac_ptr);
 }
 
@@ -955,7 +963,8 @@ static void reap_alien(struct kmem_cache *cachep, struct kmem_list3 *l3)
 	}
 }
 
-static void drain_alien_cache(struct kmem_cache *cachep, struct array_cache **alien)
+static void drain_alien_cache(struct kmem_cache *cachep,
+				struct array_cache **alien)
 {
 	int i = 0;
 	struct array_cache *ac;
@@ -998,20 +1007,22 @@ static int __devinit cpuup_callback(struct notifier_block *nfb,
 	switch (action) {
 	case CPU_UP_PREPARE:
 		mutex_lock(&cache_chain_mutex);
-		/* we need to do this right in the beginning since
+		/*
+		 * We need to do this right in the beginning since
 		 * alloc_arraycache's are going to use this list.
 		 * kmalloc_node allows us to add the slab to the right
 		 * kmem_list3 and not this cpu's kmem_list3
 		 */
 
 		list_for_each_entry(cachep, &cache_chain, next) {
-			/* setup the size64 kmemlist for cpu before we can
+			/*
+			 * Set up the size64 kmemlist for cpu before we can
 			 * begin anything. Make sure some other cpu on this
 			 * node has not already allocated this
 			 */
 			if (!cachep->nodelists[node]) {
-				if (!(l3 = kmalloc_node(memsize,
-							GFP_KERNEL, node)))
+				l3 = kmalloc_node(memsize, GFP_KERNEL, node);
+				if (!l3)
 					goto bad;
 				kmem_list3_init(l3);
 				l3->next_reap = jiffies + REAPTIMEOUT_LIST3 +
@@ -1027,13 +1038,15 @@ static int __devinit cpuup_callback(struct notifier_block *nfb,
 
 			spin_lock_irq(&cachep->nodelists[node]->list_lock);
 			cachep->nodelists[node]->free_limit =
-			    (1 + nr_cpus_node(node)) *
-			    cachep->batchcount + cachep->num;
+				(1 + nr_cpus_node(node)) *
+				cachep->batchcount + cachep->num;
 			spin_unlock_irq(&cachep->nodelists[node]->list_lock);
 		}
 
-		/* Now we can go ahead with allocating the shared array's
-		   & array cache's */
+		/*
+		 * Now we can go ahead with allocating the shared arrays and
+		 * array caches
+		 */
 		list_for_each_entry(cachep, &cache_chain, next) {
 			struct array_cache *nc;
 			struct array_cache *shared;
@@ -1053,7 +1066,6 @@ static int __devinit cpuup_callback(struct notifier_block *nfb,
 			if (!alien)
 				goto bad;
 			cachep->array[cpu] = nc;
-
 			l3 = cachep->nodelists[node];
 			BUG_ON(!l3);
 
@@ -1073,7 +1085,6 @@ static int __devinit cpuup_callback(struct notifier_block *nfb,
 			}
 #endif
 			spin_unlock_irq(&l3->list_lock);
-
 			kfree(shared);
 			free_alien_cache(alien);
 		}
@@ -1095,7 +1106,6 @@ static int __devinit cpuup_callback(struct notifier_block *nfb,
 		/* fall thru */
 	case CPU_UP_CANCELED:
 		mutex_lock(&cache_chain_mutex);
-
 		list_for_each_entry(cachep, &cache_chain, next) {
 			struct array_cache *nc;
 			struct array_cache *shared;
@@ -1162,7 +1172,7 @@ free_array_cache:
 #endif
 	}
 	return NOTIFY_OK;
-      bad:
+bad:
 	mutex_unlock(&cache_chain_mutex);
 	return NOTIFY_BAD;
 }
@@ -1172,7 +1182,8 @@ static struct notifier_block cpucache_notifier = { &cpuup_callback, NULL, 0 };
 /*
  * swap the static kmem_list3 with kmalloced memory
  */
-static void init_list(struct kmem_cache *cachep, struct kmem_list3 *list, int nodeid)
+static void init_list(struct kmem_cache *cachep, struct kmem_list3 *list,
+			int nodeid)
 {
 	struct kmem_list3 *ptr;
 
@@ -1187,8 +1198,9 @@ static void init_list(struct kmem_cache *cachep, struct kmem_list3 *list, int no
 	local_irq_enable();
 }
 
-/* Initialisation.
- * Called after the gfp() functions have been enabled, and before smp_init().
+/*
+ * Initialisation.  Called after the page allocator have been initialised and
+ * before smp_init().
  */
 void __init kmem_cache_init(void)
 {
@@ -1213,9 +1225,9 @@ void __init kmem_cache_init(void)
 
 	/* Bootstrap is tricky, because several objects are allocated
 	 * from caches that do not exist yet:
-	 * 1) initialize the cache_cache cache: it contains the struct kmem_cache
-	 *    structures of all caches, except cache_cache itself: cache_cache
-	 *    is statically allocated.
+	 * 1) initialize the cache_cache cache: it contains the struct
+	 *    kmem_cache structures of all caches, except cache_cache itself:
+	 *    cache_cache is statically allocated.
 	 *    Initially an __init data area is used for the head array and the
 	 *    kmem_list3 structures, it's replaced with a kmalloc allocated
 	 *    array at the end of the bootstrap.
@@ -1238,7 +1250,8 @@ void __init kmem_cache_init(void)
 	cache_cache.array[smp_processor_id()] = &initarray_cache.cache;
 	cache_cache.nodelists[numa_node_id()] = &initkmem_list3[CACHE_CACHE];
 
-	cache_cache.buffer_size = ALIGN(cache_cache.buffer_size, cache_line_size());
+	cache_cache.buffer_size = ALIGN(cache_cache.buffer_size,
+					cache_line_size());
 
 	for (order = 0; order < MAX_ORDER; order++) {
 		cache_estimate(order, cache_cache.buffer_size,
@@ -1257,24 +1270,26 @@ void __init kmem_cache_init(void)
 	sizes = malloc_sizes;
 	names = cache_names;
 
-	/* Initialize the caches that provide memory for the array cache
-	 * and the kmem_list3 structures first.
-	 * Without this, further allocations will bug
+	/*
+	 * Initialize the caches that provide memory for the array cache and the
+	 * kmem_list3 structures first.  Without this, further allocations will
+	 * bug.
 	 */
 
 	sizes[INDEX_AC].cs_cachep = kmem_cache_create(names[INDEX_AC].name,
-						      sizes[INDEX_AC].cs_size,
-						      ARCH_KMALLOC_MINALIGN,
-						      (ARCH_KMALLOC_FLAGS |
-						       SLAB_PANIC), NULL, NULL);
+					sizes[INDEX_AC].cs_size,
+					ARCH_KMALLOC_MINALIGN,
+					ARCH_KMALLOC_FLAGS|SLAB_PANIC,
+					NULL, NULL);
 
-	if (INDEX_AC != INDEX_L3)
+	if (INDEX_AC != INDEX_L3) {
 		sizes[INDEX_L3].cs_cachep =
-		    kmem_cache_create(names[INDEX_L3].name,
-				      sizes[INDEX_L3].cs_size,
-				      ARCH_KMALLOC_MINALIGN,
-				      (ARCH_KMALLOC_FLAGS | SLAB_PANIC), NULL,
-				      NULL);
+			kmem_cache_create(names[INDEX_L3].name,
+				sizes[INDEX_L3].cs_size,
+				ARCH_KMALLOC_MINALIGN,
+				ARCH_KMALLOC_FLAGS|SLAB_PANIC,
+				NULL, NULL);
+	}
 
 	while (sizes->cs_size != ULONG_MAX) {
 		/*
@@ -1284,13 +1299,13 @@ void __init kmem_cache_init(void)
 		 * Note for systems short on memory removing the alignment will
 		 * allow tighter packing of the smaller caches.
 		 */
-		if (!sizes->cs_cachep)
+		if (!sizes->cs_cachep) {
 			sizes->cs_cachep = kmem_cache_create(names->name,
-							     sizes->cs_size,
-							     ARCH_KMALLOC_MINALIGN,
-							     (ARCH_KMALLOC_FLAGS
-							      | SLAB_PANIC),
-							     NULL, NULL);
+					sizes->cs_size,
+					ARCH_KMALLOC_MINALIGN,
+					ARCH_KMALLOC_FLAGS|SLAB_PANIC,
+					NULL, NULL);
+		}
 
 		/* Inc off-slab bufctl limit until the ceiling is hit. */
 		if (!(OFF_SLAB(sizes->cs_cachep))) {
@@ -1299,13 +1314,11 @@ void __init kmem_cache_init(void)
 		}
 
 		sizes->cs_dmacachep = kmem_cache_create(names->name_dma,
-							sizes->cs_size,
-							ARCH_KMALLOC_MINALIGN,
-							(ARCH_KMALLOC_FLAGS |
-							 SLAB_CACHE_DMA |
-							 SLAB_PANIC), NULL,
-							NULL);
-
+					sizes->cs_size,
+					ARCH_KMALLOC_MINALIGN,
+					ARCH_KMALLOC_FLAGS|SLAB_CACHE_DMA|
+						SLAB_PANIC,
+					NULL, NULL);
 		sizes++;
 		names++;
 	}
@@ -1357,20 +1370,22 @@ void __init kmem_cache_init(void)
 		struct kmem_cache *cachep;
 		mutex_lock(&cache_chain_mutex);
 		list_for_each_entry(cachep, &cache_chain, next)
-		    enable_cpucache(cachep);
+			enable_cpucache(cachep);
 		mutex_unlock(&cache_chain_mutex);
 	}
 
 	/* Done! */
 	g_cpucache_up = FULL;
 
-	/* Register a cpu startup notifier callback
-	 * that initializes cpu_cache_get for all new cpus
+	/*
+	 * Register a cpu startup notifier callback that initializes
+	 * cpu_cache_get for all new cpus
 	 */
 	register_cpu_notifier(&cpucache_notifier);
 
-	/* The reap timers are started later, with a module init call:
-	 * That part of the kernel is not yet operational.
+	/*
+	 * The reap timers are started later, with a module init call: That part
+	 * of the kernel is not yet operational.
 	 */
 }
 
@@ -1378,16 +1393,13 @@ static int __init cpucache_init(void)
 {
 	int cpu;
 
-	/* 
-	 * Register the timers that return unneeded
-	 * pages to gfp.
+	/*
+	 * Register the timers that return unneeded pages to the page allocator
 	 */
 	for_each_online_cpu(cpu)
-	    start_cpu_timer(cpu);
-
+		start_cpu_timer(cpu);
 	return 0;
 }
-
 __initcall(cpucache_init);
 
 /*
@@ -1501,9 +1513,8 @@ static void dump_line(char *data, int offset, int limit)
 {
 	int i;
 	printk(KERN_ERR "%03x:", offset);
-	for (i = 0; i < limit; i++) {
+	for (i = 0; i < limit; i++)
 		printk(" %02x", (unsigned char)data[offset + i]);
-	}
 	printk("\n");
 }
 #endif
@@ -1517,15 +1528,15 @@ static void print_objinfo(struct kmem_cache *cachep, void *objp, int lines)
 
 	if (cachep->flags & SLAB_RED_ZONE) {
 		printk(KERN_ERR "Redzone: 0x%lx/0x%lx.\n",
-		       *dbg_redzone1(cachep, objp),
-		       *dbg_redzone2(cachep, objp));
+			*dbg_redzone1(cachep, objp),
+			*dbg_redzone2(cachep, objp));
 	}
 
 	if (cachep->flags & SLAB_STORE_USER) {
 		printk(KERN_ERR "Last user: [<%p>]",
-		       *dbg_userword(cachep, objp));
+			*dbg_userword(cachep, objp));
 		print_symbol("(%s)",
-			     (unsigned long)*dbg_userword(cachep, objp));
+				(unsigned long)*dbg_userword(cachep, objp));
 		printk("\n");
 	}
 	realobj = (char *)objp + obj_offset(cachep);
@@ -1558,8 +1569,8 @@ static void check_poison_obj(struct kmem_cache *cachep, void *objp)
 			/* Print header */
 			if (lines == 0) {
 				printk(KERN_ERR
-				       "Slab corruption: start=%p, len=%d\n",
-				       realobj, size);
+					"Slab corruption: start=%p, len=%d\n",
+					realobj, size);
 				print_objinfo(cachep, objp, 0);
 			}
 			/* Hexdump the affected line */
@@ -1614,11 +1625,10 @@ static void slab_destroy_objs(struct kmem_cache *cachep, struct slab *slabp)
 
 		if (cachep->flags & SLAB_POISON) {
 #ifdef CONFIG_DEBUG_PAGEALLOC
-			if ((cachep->buffer_size % PAGE_SIZE) == 0
-			    && OFF_SLAB(cachep))
+			if (cachep->buffer_size % PAGE_SIZE == 0 &&
+					OFF_SLAB(cachep))
 				kernel_map_pages(virt_to_page(objp),
-						 cachep->buffer_size / PAGE_SIZE,
-						 1);
+					cachep->buffer_size / PAGE_SIZE, 1);
 			else
 				check_poison_obj(cachep, objp);
 #else
@@ -1650,10 +1660,10 @@ static void slab_destroy_objs(struct kmem_cache *cachep, struct slab *slabp)
 }
 #endif
 
-/**
+/*
  * Destroy all the objs in a slab, and release the mem back to the system.
- * Before calling the slab must have been unlinked from the cache.
- * The cache-lock is not held/needed.
+ * Before calling the slab must have been unlinked from the cache.  The
+ * cache-lock is not held/needed.
  */
 static void slab_destroy(struct kmem_cache *cachep, struct slab *slabp)
 {
@@ -1674,8 +1684,10 @@ static void slab_destroy(struct kmem_cache *cachep, struct slab *slabp)
 	}
 }
 
-/* For setting up all the kmem_list3s for cache whose buffer_size is same
-   as size of kmem_list3. */
+/*
+ * For setting up all the kmem_list3s for cache whose buffer_size is same as
+ * size of kmem_list3.
+ */
 static void set_up_list3s(struct kmem_cache *cachep, int index)
 {
 	int node;
@@ -1701,13 +1713,13 @@ static void set_up_list3s(struct kmem_cache *cachep, int index)
  * high order pages for slabs.  When the gfp() functions are more friendly
  * towards high-order requests, this should be changed.
  */
-static inline size_t calculate_slab_order(struct kmem_cache *cachep,
+static size_t calculate_slab_order(struct kmem_cache *cachep,
 			size_t size, size_t align, unsigned long flags)
 {
 	size_t left_over = 0;
 	int gfporder;
 
-	for (gfporder = 0 ; gfporder <= MAX_GFP_ORDER; gfporder++) {
+	for (gfporder = 0; gfporder <= MAX_GFP_ORDER; gfporder++) {
 		unsigned int num;
 		size_t remainder;
 
@@ -1742,7 +1754,7 @@ static inline size_t calculate_slab_order(struct kmem_cache *cachep,
 		/*
 		 * Acceptable internal fragmentation?
 		 */
-		if ((left_over * 8) <= (PAGE_SIZE << gfporder))
+		if (left_over * 8 <= (PAGE_SIZE << gfporder))
 			break;
 	}
 	return left_over;
@@ -1817,9 +1829,8 @@ static void setup_cpu_cache(struct kmem_cache *cachep)
  * and the @dtor is run before the pages are handed back.
  *
  * @name must be valid until the cache is destroyed. This implies that
- * the module calling this has to destroy the cache before getting 
- * unloaded.
- * 
+ * the module calling this has to destroy the cache before getting unloaded.
+ *
  * The flags are
  *
  * %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5)
@@ -1837,7 +1848,8 @@ static void setup_cpu_cache(struct kmem_cache *cachep)
  */
 struct kmem_cache *
 kmem_cache_create (const char *name, size_t size, size_t align,
-	unsigned long flags, void (*ctor)(void*, struct kmem_cache *, unsigned long),
+	unsigned long flags,
+	void (*ctor)(void*, struct kmem_cache *, unsigned long),
 	void (*dtor)(void*, struct kmem_cache *, unsigned long))
 {
 	size_t left_over, slab_size, ralign;
@@ -1847,12 +1859,10 @@ kmem_cache_create (const char *name, size_t size, size_t align,
 	/*
 	 * Sanity checks... these are all serious usage bugs.
 	 */
-	if ((!name) ||
-	    in_interrupt() ||
-	    (size < BYTES_PER_WORD) ||
+	if (!name || in_interrupt() || (size < BYTES_PER_WORD) ||
 	    (size > (1 << MAX_OBJ_ORDER) * PAGE_SIZE) || (dtor && !ctor)) {
-		printk(KERN_ERR "%s: Early error in slab %s\n",
-		       __FUNCTION__, name);
+		printk(KERN_ERR "%s: Early error in slab %s\n", __FUNCTION__,
+				name);
 		BUG();
 	}
 
@@ -1906,8 +1916,7 @@ kmem_cache_create (const char *name, size_t size, size_t align,
 	 * above the next power of two: caches with object sizes just above a
 	 * power of two have a significant amount of internal fragmentation.
 	 */
-	if ((size < 4096
-	     || fls(size - 1) == fls(size - 1 + 3 * BYTES_PER_WORD)))
+	if (size < 4096 || fls(size - 1) == fls(size-1 + 3 * BYTES_PER_WORD))
 		flags |= SLAB_RED_ZONE | SLAB_STORE_USER;
 	if (!(flags & SLAB_DESTROY_BY_RCU))
 		flags |= SLAB_POISON;
@@ -1919,13 +1928,14 @@ kmem_cache_create (const char *name, size_t size, size_t align,
 		BUG_ON(dtor);
 
 	/*
-	 * Always checks flags, a caller might be expecting debug
-	 * support which isn't available.
+	 * Always checks flags, a caller might be expecting debug support which
+	 * isn't available.
 	 */
 	if (flags & ~CREATE_MASK)
 		BUG();
 
-	/* Check that size is in terms of words.  This is needed to avoid
+	/*
+	 * Check that size is in terms of words.  This is needed to avoid
 	 * unaligned accesses for some archs when redzoning is used, and makes
 	 * sure any on-slab bufctl's are also correctly aligned.
 	 */
@@ -1934,12 +1944,14 @@ kmem_cache_create (const char *name, size_t size, size_t align,
 		size &= ~(BYTES_PER_WORD - 1);
 	}
 
-	/* calculate out the final buffer alignment: */
+	/* calculate the final buffer alignment: */
+
 	/* 1) arch recommendation: can be overridden for debug */
 	if (flags & SLAB_HWCACHE_ALIGN) {
-		/* Default alignment: as specified by the arch code.
-		 * Except if an object is really small, then squeeze multiple
-		 * objects into one cacheline.
+		/*
+		 * Default alignment: as specified by the arch code.  Except if
+		 * an object is really small, then squeeze multiple objects into
+		 * one cacheline.
 		 */
 		ralign = cache_line_size();
 		while (size <= ralign / 2)
@@ -1959,7 +1971,8 @@ kmem_cache_create (const char *name, size_t size, size_t align,
 		if (ralign > BYTES_PER_WORD)
 			flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER);
 	}
-	/* 4) Store it. Note that the debug code below can reduce
+	/*
+	 * 4) Store it. Note that the debug code below can reduce
 	 *    the alignment to BYTES_PER_WORD.
 	 */
 	align = ralign;
@@ -2058,7 +2071,7 @@ kmem_cache_create (const char *name, size_t size, size_t align,
 
 	/* cache setup completed, link it into the list */
 	list_add(&cachep->next, &cache_chain);
-      oops:
+oops:
 	if (!cachep && (flags & SLAB_PANIC))
 		panic("kmem_cache_create(): failed to create slab `%s'\n",
 		      name);
@@ -2109,7 +2122,6 @@ static void smp_call_function_all_cpus(void (*func)(void *arg), void *arg)
 {
 	check_irq_on();
 	preempt_disable();
-
 	local_irq_disable();
 	func(arg);
 	local_irq_enable();
@@ -2120,12 +2132,12 @@ static void smp_call_function_all_cpus(void (*func)(void *arg), void *arg)
 	preempt_enable();
 }
 
-static void drain_array_locked(struct kmem_cache *cachep, struct array_cache *ac,
-				int force, int node);
+static void drain_array_locked(struct kmem_cache *cachep,
+			struct array_cache *ac, int force, int node);
 
 static void do_drain(void *arg)
 {
-	struct kmem_cache *cachep = (struct kmem_cache *) arg;
+	struct kmem_cache *cachep = arg;
 	struct array_cache *ac;
 	int node = numa_node_id();
 
@@ -2273,16 +2285,15 @@ int kmem_cache_destroy(struct kmem_cache *cachep)
 
 	/* NUMA: free the list3 structures */
 	for_each_online_node(i) {
-		if ((l3 = cachep->nodelists[i])) {
+		l3 = cachep->nodelists[i];
+		if (l3) {
 			kfree(l3->shared);
 			free_alien_cache(l3->alien);
 			kfree(l3);
 		}
 	}
 	kmem_cache_free(&cache_cache, cachep);
-
 	unlock_cpu_hotplug();
-
 	return 0;
 }
 EXPORT_SYMBOL(kmem_cache_destroy);
@@ -2305,7 +2316,6 @@ static struct slab *alloc_slabmgmt(struct kmem_cache *cachep, void *objp,
 	slabp->inuse = 0;
 	slabp->colouroff = colour_off;
 	slabp->s_mem = objp + colour_off;
-
 	return slabp;
 }
 
@@ -2333,9 +2343,9 @@ static void cache_init_objs(struct kmem_cache *cachep,
 			*dbg_redzone2(cachep, objp) = RED_INACTIVE;
 		}
 		/*
-		 * Constructors are not allowed to allocate memory from
-		 * the same cache which they are a constructor for.
-		 * Otherwise, deadlock. They must also be threaded.
+		 * Constructors are not allowed to allocate memory from the same
+		 * cache which they are a constructor for.  Otherwise, deadlock.
+		 * They must also be threaded.
 		 */
 		if (cachep->ctor && !(cachep->flags & SLAB_POISON))
 			cachep->ctor(objp + obj_offset(cachep), cachep,
@@ -2349,8 +2359,8 @@ static void cache_init_objs(struct kmem_cache *cachep,
 				slab_error(cachep, "constructor overwrote the"
 					   " start of an object");
 		}
-		if ((cachep->buffer_size % PAGE_SIZE) == 0 && OFF_SLAB(cachep)
-		    && cachep->flags & SLAB_POISON)
+		if ((cachep->buffer_size % PAGE_SIZE) == 0 &&
+			    OFF_SLAB(cachep) && cachep->flags & SLAB_POISON)
 			kernel_map_pages(virt_to_page(objp),
 					 cachep->buffer_size / PAGE_SIZE, 0);
 #else
@@ -2365,16 +2375,14 @@ static void cache_init_objs(struct kmem_cache *cachep,
 
 static void kmem_flagcheck(struct kmem_cache *cachep, gfp_t flags)
 {
-	if (flags & SLAB_DMA) {
-		if (!(cachep->gfpflags & GFP_DMA))
-			BUG();
-	} else {
-		if (cachep->gfpflags & GFP_DMA)
-			BUG();
-	}
+	if (flags & SLAB_DMA)
+		BUG_ON(!(cachep->gfpflags & GFP_DMA));
+	else
+		BUG_ON(cachep->gfpflags & GFP_DMA);
 }
 
-static void *slab_get_obj(struct kmem_cache *cachep, struct slab *slabp, int nodeid)
+static void *slab_get_obj(struct kmem_cache *cachep, struct slab *slabp,
+				int nodeid)
 {
 	void *objp = index_to_obj(cachep, slabp, slabp->free);
 	kmem_bufctl_t next;
@@ -2390,8 +2398,8 @@ static void *slab_get_obj(struct kmem_cache *cachep, struct slab *slabp, int nod
 	return objp;
 }
 
-static void slab_put_obj(struct kmem_cache *cachep, struct slab *slabp, void *objp,
-			  int nodeid)
+static void slab_put_obj(struct kmem_cache *cachep, struct slab *slabp,
+				void *objp, int nodeid)
 {
 	unsigned int objnr = obj_to_index(cachep, slabp, objp);
 
@@ -2401,7 +2409,7 @@ static void slab_put_obj(struct kmem_cache *cachep, struct slab *slabp, void *ob
 
 	if (slab_bufctl(slabp)[objnr] != BUFCTL_FREE) {
 		printk(KERN_ERR "slab: double free detected in cache "
-		       "'%s', objp %p\n", cachep->name, objp);
+				"'%s', objp %p\n", cachep->name, objp);
 		BUG();
 	}
 #endif
@@ -2410,7 +2418,8 @@ static void slab_put_obj(struct kmem_cache *cachep, struct slab *slabp, void *ob
 	slabp->inuse--;
 }
 
-static void set_slab_attr(struct kmem_cache *cachep, struct slab *slabp, void *objp)
+static void set_slab_attr(struct kmem_cache *cachep, struct slab *slabp,
+			void *objp)
 {
 	int i;
 	struct page *page;
@@ -2438,8 +2447,9 @@ static int cache_grow(struct kmem_cache *cachep, gfp_t flags, int nodeid)
 	unsigned long ctor_flags;
 	struct kmem_list3 *l3;
 
-	/* Be lazy and only check for valid flags here,
-	 * keeping it out of the critical path in kmem_cache_alloc().
+	/*
+	 * Be lazy and only check for valid flags here,  keeping it out of the
+	 * critical path in kmem_cache_alloc().
 	 */
 	if (flags & ~(SLAB_DMA | SLAB_LEVEL_MASK | SLAB_NO_GROW))
 		BUG();
@@ -2480,14 +2490,17 @@ static int cache_grow(struct kmem_cache *cachep, gfp_t flags, int nodeid)
 	 */
 	kmem_flagcheck(cachep, flags);
 
-	/* Get mem for the objs.
-	 * Attempt to allocate a physical page from 'nodeid',
+	/*
+	 * Get mem for the objs.  Attempt to allocate a physical page from
+	 * 'nodeid'.
 	 */
-	if (!(objp = kmem_getpages(cachep, flags, nodeid)))
+	objp = kmem_getpages(cachep, flags, nodeid);
+	if (!objp)
 		goto failed;
 
 	/* Get slab management. */
-	if (!(slabp = alloc_slabmgmt(cachep, objp, offset, local_flags)))
+	slabp = alloc_slabmgmt(cachep, objp, offset, local_flags);
+	if (!slabp)
 		goto opps1;
 
 	slabp->nodeid = nodeid;
@@ -2506,9 +2519,9 @@ static int cache_grow(struct kmem_cache *cachep, gfp_t flags, int nodeid)
 	l3->free_objects += cachep->num;
 	spin_unlock(&l3->list_lock);
 	return 1;
-      opps1:
+opps1:
 	kmem_freepages(cachep, objp);
-      failed:
+failed:
 	if (local_flags & __GFP_WAIT)
 		local_irq_disable();
 	return 0;
@@ -2551,8 +2564,8 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp,
 	page = virt_to_page(objp);
 
 	if (page_get_cache(page) != cachep) {
-		printk(KERN_ERR
-		       "mismatch in kmem_cache_free: expected cache %p, got %p\n",
+		printk(KERN_ERR "mismatch in kmem_cache_free: expected "
+				"cache %p, got %p\n",
 		       page_get_cache(page), cachep);
 		printk(KERN_ERR "%p is %s.\n", cachep, cachep->name);
 		printk(KERN_ERR "%p is %s.\n", page_get_cache(page),
@@ -2562,13 +2575,12 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp,
 	slabp = page_get_slab(page);
 
 	if (cachep->flags & SLAB_RED_ZONE) {
-		if (*dbg_redzone1(cachep, objp) != RED_ACTIVE
-		    || *dbg_redzone2(cachep, objp) != RED_ACTIVE) {
-			slab_error(cachep,
-				   "double free, or memory outside"
-				   " object was overwritten");
-			printk(KERN_ERR
-			       "%p: redzone 1: 0x%lx, redzone 2: 0x%lx.\n",
+		if (*dbg_redzone1(cachep, objp) != RED_ACTIVE ||
+				*dbg_redzone2(cachep, objp) != RED_ACTIVE) {
+			slab_error(cachep, "double free, or memory outside"
+						" object was overwritten");
+			printk(KERN_ERR "%p: redzone 1:0x%lx, "
+					"redzone 2:0x%lx.\n",
 			       objp, *dbg_redzone1(cachep, objp),
 			       *dbg_redzone2(cachep, objp));
 		}
@@ -2584,9 +2596,10 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp,
 	BUG_ON(objp != index_to_obj(cachep, slabp, objnr));
 
 	if (cachep->flags & SLAB_DEBUG_INITIAL) {
-		/* Need to call the slab's constructor so the
-		 * caller can perform a verify of its state (debugging).
-		 * Called without the cache-lock held.
+		/*
+		 * Need to call the slab's constructor so the caller can
+		 * perform a verify of its state (debugging).  Called without
+		 * the cache-lock held.
 		 */
 		cachep->ctor(objp + obj_offset(cachep),
 			     cachep, SLAB_CTOR_CONSTRUCTOR | SLAB_CTOR_VERIFY);
@@ -2599,7 +2612,7 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp,
 	}
 	if (cachep->flags & SLAB_POISON) {
 #ifdef CONFIG_DEBUG_PAGEALLOC
-		if ((cachep->buffer_size % PAGE_SIZE) == 0 && OFF_SLAB(cachep)) {
+		if ((cachep->buffer_size % PAGE_SIZE)==0 && OFF_SLAB(cachep)) {
 			store_stackinfo(cachep, objp, (unsigned long)caller);
 			kernel_map_pages(virt_to_page(objp),
 					 cachep->buffer_size / PAGE_SIZE, 0);
@@ -2625,14 +2638,14 @@ static void check_slabp(struct kmem_cache *cachep, struct slab *slabp)
 			goto bad;
 	}
 	if (entries != cachep->num - slabp->inuse) {
-	      bad:
-		printk(KERN_ERR
-		       "slab: Internal list corruption detected in cache '%s'(%d), slabp %p(%d). Hexdump:\n",
-		       cachep->name, cachep->num, slabp, slabp->inuse);
+bad:
+		printk(KERN_ERR "slab: Internal list corruption detected in "
+				"cache '%s'(%d), slabp %p(%d). Hexdump:\n",
+			cachep->name, cachep->num, slabp, slabp->inuse);
 		for (i = 0;
 		     i < sizeof(*slabp) + cachep->num * sizeof(kmem_bufctl_t);
 		     i++) {
-			if ((i % 16) == 0)
+			if (i % 16 == 0)
 				printk("\n%03x:", i);
 			printk(" %02x", ((unsigned char *)slabp)[i]);
 		}
@@ -2654,12 +2667,13 @@ static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags)
 
 	check_irq_off();
 	ac = cpu_cache_get(cachep);
-      retry:
+retry:
 	batchcount = ac->batchcount;
 	if (!ac->touched && batchcount > BATCHREFILL_LIMIT) {
-		/* if there was little recent activity on this
-		 * cache, then perform only a partial refill.
-		 * Otherwise we could generate refill bouncing.
+		/*
+		 * If there was little recent activity on this cache, then
+		 * perform only a partial refill.  Otherwise we could generate
+		 * refill bouncing.
 		 */
 		batchcount = BATCHREFILL_LIMIT;
 	}
@@ -2715,29 +2729,29 @@ static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags)
 			list_add(&slabp->list, &l3->slabs_partial);
 	}
 
-      must_grow:
+must_grow:
 	l3->free_objects -= ac->avail;
-      alloc_done:
+alloc_done:
 	spin_unlock(&l3->list_lock);
 
 	if (unlikely(!ac->avail)) {
 		int x;
 		x = cache_grow(cachep, flags, numa_node_id());
 
-		// cache_grow can reenable interrupts, then ac could change.
+		/* cache_grow can reenable interrupts, then ac could change. */
 		ac = cpu_cache_get(cachep);
-		if (!x && ac->avail == 0)	// no objects in sight? abort
+		if (!x && ac->avail == 0)	/* no objects in sight? abort */
 			return NULL;
 
-		if (!ac->avail)	// objects refilled by interrupt?
+		if (!ac->avail)		/* objects refilled by interrupt? */
 			goto retry;
 	}
 	ac->touched = 1;
 	return ac->entry[--ac->avail];
 }
 
-static inline void
-cache_alloc_debugcheck_before(struct kmem_cache *cachep, gfp_t flags)
+static inline void cache_alloc_debugcheck_before(struct kmem_cache *cachep,
+						gfp_t flags)
 {
 	might_sleep_if(flags & __GFP_WAIT);
 #if DEBUG
@@ -2746,8 +2760,8 @@ cache_alloc_debugcheck_before(struct kmem_cache *cachep, gfp_t flags)
 }
 
 #if DEBUG
-static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, gfp_t flags,
-					void *objp, void *caller)
+static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep,
+				gfp_t flags, void *objp, void *caller)
 {
 	if (!objp)
 		return objp;
@@ -2767,15 +2781,14 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, gfp_t flags
 		*dbg_userword(cachep, objp) = caller;
 
 	if (cachep->flags & SLAB_RED_ZONE) {
-		if (*dbg_redzone1(cachep, objp) != RED_INACTIVE
-		    || *dbg_redzone2(cachep, objp) != RED_INACTIVE) {
-			slab_error(cachep,
-				   "double free, or memory outside"
-				   " object was overwritten");
+		if (*dbg_redzone1(cachep, objp) != RED_INACTIVE ||
+				*dbg_redzone2(cachep, objp) != RED_INACTIVE) {
+			slab_error(cachep, "double free, or memory outside"
+						" object was overwritten");
 			printk(KERN_ERR
-			       "%p: redzone 1: 0x%lx, redzone 2: 0x%lx.\n",
-			       objp, *dbg_redzone1(cachep, objp),
-			       *dbg_redzone2(cachep, objp));
+				"%p: redzone 1:0x%lx, redzone 2:0x%lx\n",
+				objp, *dbg_redzone1(cachep, objp),
+				*dbg_redzone2(cachep, objp));
 		}
 		*dbg_redzone1(cachep, objp) = RED_ACTIVE;
 		*dbg_redzone2(cachep, objp) = RED_ACTIVE;
@@ -2822,8 +2835,8 @@ static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags)
 	return objp;
 }
 
-static __always_inline void *
-__cache_alloc(struct kmem_cache *cachep, gfp_t flags, void *caller)
+static __always_inline void *__cache_alloc(struct kmem_cache *cachep,
+						gfp_t flags, void *caller)
 {
 	unsigned long save_flags;
 	void *objp;
@@ -2843,7 +2856,8 @@ __cache_alloc(struct kmem_cache *cachep, gfp_t flags, void *caller)
 /*
  * A interface to enable slab creation on nodeid
  */
-static void *__cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid)
+static void *__cache_alloc_node(struct kmem_cache *cachep, gfp_t flags,
+				int nodeid)
 {
 	struct list_head *entry;
 	struct slab *slabp;
@@ -2854,7 +2868,7 @@ static void *__cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int node
 	l3 = cachep->nodelists[nodeid];
 	BUG_ON(!l3);
 
-      retry:
+retry:
 	check_irq_off();
 	spin_lock(&l3->list_lock);
 	entry = l3->slabs_partial.next;
@@ -2881,16 +2895,15 @@ static void *__cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int node
 	/* move slabp to correct slabp list: */
 	list_del(&slabp->list);
 
-	if (slabp->free == BUFCTL_END) {
+	if (slabp->free == BUFCTL_END)
 		list_add(&slabp->list, &l3->slabs_full);
-	} else {
+	else
 		list_add(&slabp->list, &l3->slabs_partial);
-	}
 
 	spin_unlock(&l3->list_lock);
 	goto done;
 
-      must_grow:
+must_grow:
 	spin_unlock(&l3->list_lock);
 	x = cache_grow(cachep, flags, nodeid);
 
@@ -2898,7 +2911,7 @@ static void *__cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int node
 		return NULL;
 
 	goto retry;
-      done:
+done:
 	return obj;
 }
 #endif
@@ -2971,7 +2984,7 @@ static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac)
 	}
 
 	free_block(cachep, ac->entry, batchcount, node);
-      free_done:
+free_done:
 #if STATS
 	{
 		int i = 0;
@@ -2992,16 +3005,12 @@ static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac)
 #endif
 	spin_unlock(&l3->list_lock);
 	ac->avail -= batchcount;
-	memmove(ac->entry, &(ac->entry[batchcount]),
-		sizeof(void *) * ac->avail);
+	memmove(ac->entry, &(ac->entry[batchcount]), sizeof(void *)*ac->avail);
 }
 
 /*
- * __cache_free
- * Release an obj back to its cache. If the obj has a constructed
- * state, it must be in this state _before_ it is released.
- *
- * Called with disabled ints.
+ * Release an obj back to its cache. If the obj has a constructed state, it must
+ * be in this state _before_ it is released.  Called with disabled ints.
  */
 static inline void __cache_free(struct kmem_cache *cachep, void *objp)
 {
@@ -3020,9 +3029,9 @@ static inline void __cache_free(struct kmem_cache *cachep, void *objp)
 		if (unlikely(slabp->nodeid != numa_node_id())) {
 			struct array_cache *alien = NULL;
 			int nodeid = slabp->nodeid;
-			struct kmem_list3 *l3 =
-			    cachep->nodelists[numa_node_id()];
+			struct kmem_list3 *l3;
 
+			l3 = cachep->nodelists[numa_node_id()];
 			STATS_INC_NODEFREES(cachep);
 			if (l3->alien && l3->alien[nodeid]) {
 				alien = l3->alien[nodeid];
@@ -3106,7 +3115,7 @@ int fastcall kmem_ptr_validate(struct kmem_cache *cachep, void *ptr)
 	if (unlikely(page_get_cache(page) != cachep))
 		goto out;
 	return 1;
-      out:
+out:
 	return 0;
 }
 
@@ -3132,7 +3141,7 @@ void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid)
 	local_irq_save(save_flags);
 
 	if (nodeid == -1 || nodeid == numa_node_id() ||
-	    !cachep->nodelists[nodeid])
+			!cachep->nodelists[nodeid])
 		ptr = ____cache_alloc(cachep, flags);
 	else
 		ptr = __cache_alloc_node(cachep, flags, nodeid);
@@ -3249,7 +3258,7 @@ void *__alloc_percpu(size_t size)
 	/* Catch derefs w/o wrappers */
 	return (void *)(~(unsigned long)pdata);
 
-      unwind_oom:
+unwind_oom:
 	while (--i >= 0) {
 		if (!cpu_possible(i))
 			continue;
@@ -3352,18 +3361,20 @@ static int alloc_kmemlist(struct kmem_cache *cachep)
 		struct array_cache *nc = NULL, *new;
 		struct array_cache **new_alien = NULL;
 #ifdef CONFIG_NUMA
-		if (!(new_alien = alloc_alien_cache(node, cachep->limit)))
+		new_alien = alloc_alien_cache(node, cachep->limit);
+		if (!new_alien)
 			goto fail;
 #endif
-		if (!(new = alloc_arraycache(node, (cachep->shared *
-						    cachep->batchcount),
-					     0xbaadf00d)))
+		new = alloc_arraycache(node, cachep->shared*cachep->batchcount,
+					0xbaadf00d);
+		if (!new)
 			goto fail;
-		if ((l3 = cachep->nodelists[node])) {
-
+		l3 = cachep->nodelists[node];
+		if (l3) {
 			spin_lock_irq(&l3->list_lock);
 
-			if ((nc = cachep->nodelists[node]->shared))
+			nc = cachep->nodelists[node]->shared;
+			if (nc)
 				free_block(cachep, nc->entry, nc->avail, node);
 
 			l3->shared = new;
@@ -3372,27 +3383,27 @@ static int alloc_kmemlist(struct kmem_cache *cachep)
 				new_alien = NULL;
 			}
 			l3->free_limit = (1 + nr_cpus_node(node)) *
-			    cachep->batchcount + cachep->num;
+					cachep->batchcount + cachep->num;
 			spin_unlock_irq(&l3->list_lock);
 			kfree(nc);
 			free_alien_cache(new_alien);
 			continue;
 		}
-		if (!(l3 = kmalloc_node(sizeof(struct kmem_list3),
-					GFP_KERNEL, node)))
+		l3 = kmalloc_node(sizeof(struct kmem_list3), GFP_KERNEL, node);
+		if (!l3)
 			goto fail;
 
 		kmem_list3_init(l3);
 		l3->next_reap = jiffies + REAPTIMEOUT_LIST3 +
-		    ((unsigned long)cachep) % REAPTIMEOUT_LIST3;
+				((unsigned long)cachep) % REAPTIMEOUT_LIST3;
 		l3->shared = new;
 		l3->alien = new_alien;
 		l3->free_limit = (1 + nr_cpus_node(node)) *
-		    cachep->batchcount + cachep->num;
+					cachep->batchcount + cachep->num;
 		cachep->nodelists[node] = l3;
 	}
 	return err;
-      fail:
+fail:
 	err = -ENOMEM;
 	return err;
 }
@@ -3404,7 +3415,7 @@ struct ccupdate_struct {
 
 static void do_ccupdate_local(void *info)
 {
-	struct ccupdate_struct *new = (struct ccupdate_struct *)info;
+	struct ccupdate_struct *new = info;
 	struct array_cache *old;
 
 	check_irq_off();
@@ -3414,16 +3425,16 @@ static void do_ccupdate_local(void *info)
 	new->new[smp_processor_id()] = old;
 }
 
-static int do_tune_cpucache(struct kmem_cache *cachep, int limit, int batchcount,
-			    int shared)
+static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
+				int batchcount, int shared)
 {
 	struct ccupdate_struct new;
 	int i, err;
 
 	memset(&new.new, 0, sizeof(new.new));
 	for_each_online_cpu(i) {
-		new.new[i] =
-		    alloc_arraycache(cpu_to_node(i), limit, batchcount);
+		new.new[i] = alloc_arraycache(cpu_to_node(i), limit,
+						batchcount);
 		if (!new.new[i]) {
 			for (i--; i >= 0; i--)
 				kfree(new.new[i]);
@@ -3465,10 +3476,11 @@ static void enable_cpucache(struct kmem_cache *cachep)
 	int err;
 	int limit, shared;
 
-	/* The head array serves three purposes:
+	/*
+	 * The head array serves three purposes:
 	 * - create a LIFO ordering, i.e. return objects that are cache-warm
 	 * - reduce the number of spinlock operations.
-	 * - reduce the number of linked list operations on the slab and 
+	 * - reduce the number of linked list operations on the slab and
 	 *   bufctl chains: array operations are cheaper.
 	 * The numbers are guessed, we should auto-tune as described by
 	 * Bonwick.
@@ -3484,7 +3496,8 @@ static void enable_cpucache(struct kmem_cache *cachep)
 	else
 		limit = 120;
 
-	/* Cpu bound tasks (e.g. network routing) can exhibit cpu bound
+	/*
+	 * CPU bound tasks (e.g. network routing) can exhibit cpu bound
 	 * allocation behaviour: Most allocs on one cpu, most free operations
 	 * on another cpu. For these cases, an efficient object passing between
 	 * cpus is necessary. This is provided by a shared array. The array
@@ -3499,9 +3512,9 @@ static void enable_cpucache(struct kmem_cache *cachep)
 #endif
 
 #if DEBUG
-	/* With debugging enabled, large batchcount lead to excessively
-	 * long periods with disabled local interrupts. Limit the 
-	 * batchcount
+	/*
+	 * With debugging enabled, large batchcount lead to excessively long
+	 * periods with disabled local interrupts. Limit the batchcount
 	 */
 	if (limit > 32)
 		limit = 32;
@@ -3512,8 +3525,8 @@ static void enable_cpucache(struct kmem_cache *cachep)
 		       cachep->name, -err);
 }
 
-static void drain_array_locked(struct kmem_cache *cachep, struct array_cache *ac,
-				int force, int node)
+static void drain_array_locked(struct kmem_cache *cachep,
+				struct array_cache *ac, int force, int node)
 {
 	int tofree;
 
@@ -3522,9 +3535,8 @@ static void drain_array_locked(struct kmem_cache *cachep, struct array_cache *ac
 		ac->touched = 0;
 	} else if (ac->avail) {
 		tofree = force ? ac->avail : (ac->limit + 4) / 5;
-		if (tofree > ac->avail) {
+		if (tofree > ac->avail)
 			tofree = (ac->avail + 1) / 2;
-		}
 		free_block(cachep, ac->entry, tofree, node);
 		ac->avail -= tofree;
 		memmove(ac->entry, &(ac->entry[tofree]),
@@ -3541,8 +3553,8 @@ static void drain_array_locked(struct kmem_cache *cachep, struct array_cache *ac
  * - clear the per-cpu caches for this CPU.
  * - return freeable pages to the main free memory pool.
  *
- * If we cannot acquire the cache chain mutex then just give up - we'll
- * try again on the next iteration.
+ * If we cannot acquire the cache chain mutex then just give up - we'll try
+ * again on the next iteration.
  */
 static void cache_reap(void *unused)
 {
@@ -3590,9 +3602,8 @@ static void cache_reap(void *unused)
 			goto next_unlock;
 		}
 
-		tofree =
-		    (l3->free_limit + 5 * searchp->num -
-		     1) / (5 * searchp->num);
+		tofree = (l3->free_limit + 5 * searchp->num - 1) /
+				(5 * searchp->num);
 		do {
 			p = l3->slabs_free.next;
 			if (p == &(l3->slabs_free))
@@ -3603,9 +3614,9 @@ static void cache_reap(void *unused)
 			list_del(&slabp->list);
 			STATS_INC_REAPED(searchp);
 
-			/* Safe to drop the lock. The slab is no longer
-			 * linked to the cache.
-			 * searchp cannot disappear, we hold
+			/*
+			 * Safe to drop the lock. The slab is no longer linked
+			 * to the cache. searchp cannot disappear, we hold
 			 * cache_chain_lock
 			 */
 			l3->free_objects -= searchp->num;
@@ -3613,15 +3624,15 @@ static void cache_reap(void *unused)
 			slab_destroy(searchp, slabp);
 			spin_lock_irq(&l3->list_lock);
 		} while (--tofree > 0);
-	      next_unlock:
+next_unlock:
 		spin_unlock_irq(&l3->list_lock);
-	      next:
+next:
 		cond_resched();
 	}
 	check_irq_on();
 	mutex_unlock(&cache_chain_mutex);
 	next_reap_node();
-	/* Setup the next iteration */
+	/* Set up the next iteration */
 	schedule_delayed_work(&__get_cpu_var(reap_work), REAPTIMEOUT_CPUC);
 }
 
@@ -3671,8 +3682,8 @@ static void *s_next(struct seq_file *m, void *p, loff_t *pos)
 {
 	struct kmem_cache *cachep = p;
 	++*pos;
-	return cachep->next.next == &cache_chain ? NULL
-	    : list_entry(cachep->next.next, struct kmem_cache, next);
+	return cachep->next.next == &cache_chain ?
+		NULL : list_entry(cachep->next.next, struct kmem_cache, next);
 }
 
 static void s_stop(struct seq_file *m, void *p)
@@ -3761,7 +3772,9 @@ static int s_show(struct seq_file *m, void *p)
 		unsigned long node_frees = cachep->node_frees;
 
 		seq_printf(m, " : globalstat %7lu %6lu %5lu %4lu \
-				%4lu %4lu %4lu %4lu", allocs, high, grown, reaped, errors, max_freeable, node_allocs, node_frees);
+				%4lu %4lu %4lu %4lu", allocs, high, grown,
+				reaped, errors, max_freeable, node_allocs,
+				node_frees);
 	}
 	/* cpu stats */
 	{
@@ -3833,13 +3846,12 @@ ssize_t slabinfo_write(struct file *file, const char __user * buffer,
 	mutex_lock(&cache_chain_mutex);
 	res = -EINVAL;
 	list_for_each(p, &cache_chain) {
-		struct kmem_cache *cachep = list_entry(p, struct kmem_cache,
-						       next);
+		struct kmem_cache *cachep;
 
+		cachep = list_entry(p, struct kmem_cache, next);
 		if (!strcmp(cachep->name, kbuf)) {
-			if (limit < 1 ||
-			    batchcount < 1 ||
-			    batchcount > limit || shared < 0) {
+			if (limit < 1 || batchcount < 1 ||
+					batchcount > limit || shared < 0) {
 				res = 0;
 			} else {
 				res = do_tune_cpucache(cachep, limit,
-- 
cgit v1.2.3


From b5d8ca7c50826c0b456b4a646875dc573adfde2b Mon Sep 17 00:00:00 2001
From: Ravikiran G Thirumalai <kiran@scalex86.org>
Date: Wed, 22 Mar 2006 00:08:12 -0800
Subject: [PATCH] slab: remove cachep->spinlock

Remove cachep->spinlock.  Locking has moved to the kmem_list3 and most of
the structures protected earlier by cachep->spinlock is now protected by
the l3->list_lock.  slab cache tunables like batchcount are accessed always
with the cache_chain_mutex held.

Patch tested on SMP and NUMA kernels with dbench processes running,
constant onlining/offlining, and constant cache tuning, all at the same
time.

Signed-off-by: Ravikiran Thirumalai <kiran@scalex86.org>
Cc: Christoph Lameter <christoph@lameter.com>
Cc: Pekka Enberg <penberg@cs.helsinki.fi>
Cc: Manfred Spraul <manfred@colorfullife.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/slab.c | 20 +++++++++-----------
 1 file changed, 9 insertions(+), 11 deletions(-)

(limited to 'mm')

diff --git a/mm/slab.c b/mm/slab.c
index 7b6f9f10e757..2cd80203984b 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -372,17 +372,19 @@ static void kmem_list3_init(struct kmem_list3 *parent)
 struct kmem_cache {
 /* 1) per-cpu data, touched during every alloc/free */
 	struct array_cache *array[NR_CPUS];
+/* 2) Cache tunables. Protected by cache_chain_mutex */
 	unsigned int batchcount;
 	unsigned int limit;
 	unsigned int shared;
+
 	unsigned int buffer_size;
-/* 2) touched by every alloc & free from the backend */
+/* 3) touched by every alloc & free from the backend */
 	struct kmem_list3 *nodelists[MAX_NUMNODES];
+
 	unsigned int flags;		/* constant flags */
 	unsigned int num;		/* # of objs per slab */
-	spinlock_t spinlock;
 
-/* 3) cache_grow/shrink */
+/* 4) cache_grow/shrink */
 	/* order of pgs per slab (2^n) */
 	unsigned int gfporder;
 
@@ -401,11 +403,11 @@ struct kmem_cache {
 	/* de-constructor func */
 	void (*dtor) (void *, struct kmem_cache *, unsigned long);
 
-/* 4) cache creation/removal */
+/* 5) cache creation/removal */
 	const char *name;
 	struct list_head next;
 
-/* 5) statistics */
+/* 6) statistics */
 #if STATS
 	unsigned long num_active;
 	unsigned long num_allocations;
@@ -661,7 +663,6 @@ static struct kmem_cache cache_cache = {
 	.shared = 1,
 	.buffer_size = sizeof(struct kmem_cache),
 	.flags = SLAB_NO_REAP,
-	.spinlock = SPIN_LOCK_UNLOCKED,
 	.name = "kmem_cache",
 #if DEBUG
 	.obj_size = sizeof(struct kmem_cache),
@@ -2057,7 +2058,6 @@ kmem_cache_create (const char *name, size_t size, size_t align,
 	cachep->gfpflags = 0;
 	if (flags & SLAB_CACHE_DMA)
 		cachep->gfpflags |= GFP_DMA;
-	spin_lock_init(&cachep->spinlock);
 	cachep->buffer_size = size;
 
 	if (flags & CFLGS_OFF_SLAB)
@@ -3425,6 +3425,7 @@ static void do_ccupdate_local(void *info)
 	new->new[smp_processor_id()] = old;
 }
 
+/* Always called with the cache_chain_mutex held */
 static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
 				int batchcount, int shared)
 {
@@ -3446,11 +3447,9 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
 	smp_call_function_all_cpus(do_ccupdate_local, (void *)&new);
 
 	check_irq_on();
-	spin_lock(&cachep->spinlock);
 	cachep->batchcount = batchcount;
 	cachep->limit = limit;
 	cachep->shared = shared;
-	spin_unlock(&cachep->spinlock);
 
 	for_each_online_cpu(i) {
 		struct array_cache *ccold = new.new[i];
@@ -3471,6 +3470,7 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
 	return 0;
 }
 
+/* Called with cache_chain_mutex held always */
 static void enable_cpucache(struct kmem_cache *cachep)
 {
 	int err;
@@ -3705,7 +3705,6 @@ static int s_show(struct seq_file *m, void *p)
 	int node;
 	struct kmem_list3 *l3;
 
-	spin_lock(&cachep->spinlock);
 	active_objs = 0;
 	num_slabs = 0;
 	for_each_online_node(node) {
@@ -3788,7 +3787,6 @@ static int s_show(struct seq_file *m, void *p)
 	}
 #endif
 	seq_putc(m, '\n');
-	spin_unlock(&cachep->spinlock);
 	return 0;
 }
 
-- 
cgit v1.2.3


From fcc234f888ba2365c44ba0507eb8a18eebf1f594 Mon Sep 17 00:00:00 2001
From: Pekka Enberg <penberg@cs.helsinki.fi>
Date: Wed, 22 Mar 2006 00:08:13 -0800
Subject: [PATCH] mm: kill kmem_cache_t usage

We have struct kmem_cache now so use it instead of the old typedef.

Signed-off-by: Pekka Enberg <penberg@cs.helsinki.fi>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/mempolicy.c | 4 ++--
 mm/mempool.c   | 4 ++--
 mm/rmap.c      | 5 +++--
 mm/shmem.c     | 5 +++--
 4 files changed, 10 insertions(+), 8 deletions(-)

(limited to 'mm')

diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index b21869a39f0b..96195dcb62e1 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -98,8 +98,8 @@
 /* The number of pages to migrate per call to migrate_pages() */
 #define MIGRATE_CHUNK_SIZE 256
 
-static kmem_cache_t *policy_cache;
-static kmem_cache_t *sn_cache;
+static struct kmem_cache *policy_cache;
+static struct kmem_cache *sn_cache;
 
 #define PDprintk(fmt...)
 
diff --git a/mm/mempool.c b/mm/mempool.c
index 1a99b80480d3..f71893ed3543 100644
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -278,14 +278,14 @@ EXPORT_SYMBOL(mempool_free);
  */
 void *mempool_alloc_slab(gfp_t gfp_mask, void *pool_data)
 {
-	kmem_cache_t *mem = (kmem_cache_t *) pool_data;
+	struct kmem_cache *mem = pool_data;
 	return kmem_cache_alloc(mem, gfp_mask);
 }
 EXPORT_SYMBOL(mempool_alloc_slab);
 
 void mempool_free_slab(void *element, void *pool_data)
 {
-	kmem_cache_t *mem = (kmem_cache_t *) pool_data;
+	struct kmem_cache *mem = pool_data;
 	kmem_cache_free(mem, element);
 }
 EXPORT_SYMBOL(mempool_free_slab);
diff --git a/mm/rmap.c b/mm/rmap.c
index 67f0e20b101f..134aef9d66c5 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -58,7 +58,7 @@
 
 //#define RMAP_DEBUG /* can be enabled only for debugging */
 
-kmem_cache_t *anon_vma_cachep;
+struct kmem_cache *anon_vma_cachep;
 
 static inline void validate_anon_vma(struct vm_area_struct *find_vma)
 {
@@ -166,7 +166,8 @@ void anon_vma_unlink(struct vm_area_struct *vma)
 		anon_vma_free(anon_vma);
 }
 
-static void anon_vma_ctor(void *data, kmem_cache_t *cachep, unsigned long flags)
+static void anon_vma_ctor(void *data, struct kmem_cache *cachep,
+			  unsigned long flags)
 {
 	if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
 						SLAB_CTOR_CONSTRUCTOR) {
diff --git a/mm/shmem.c b/mm/shmem.c
index 7c455fbaff7b..f523a1533ce1 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -2119,7 +2119,7 @@ failed:
 	return err;
 }
 
-static kmem_cache_t *shmem_inode_cachep;
+static struct kmem_cache *shmem_inode_cachep;
 
 static struct inode *shmem_alloc_inode(struct super_block *sb)
 {
@@ -2139,7 +2139,8 @@ static void shmem_destroy_inode(struct inode *inode)
 	kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode));
 }
 
-static void init_once(void *foo, kmem_cache_t *cachep, unsigned long flags)
+static void init_once(void *foo, struct kmem_cache *cachep,
+		      unsigned long flags)
 {
 	struct shmem_inode_info *p = (struct shmem_inode_info *) foo;
 
-- 
cgit v1.2.3


From 911851e6ee6ac4e26f07be342a89632f78494fef Mon Sep 17 00:00:00 2001
From: Randy Dunlap <rdunlap@xenotime.net>
Date: Wed, 22 Mar 2006 00:08:14 -0800
Subject: [PATCH] slab: fix kernel-doc warnings

Fix kernel-doc warnings in mm/slab.c.

Signed-off-by: Randy Dunlap <rdunlap@xenotime.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/slab.c | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

(limited to 'mm')

diff --git a/mm/slab.c b/mm/slab.c
index 2cd80203984b..5c2574989834 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -1615,8 +1615,12 @@ static void check_poison_obj(struct kmem_cache *cachep, void *objp)
 
 #if DEBUG
 /**
- * slab_destroy_objs - call the registered destructor for each object in
- *      a slab that is to be destroyed.
+ * slab_destroy_objs - destroy a slab and its objects
+ * @cachep: cache pointer being destroyed
+ * @slabp: slab pointer being destroyed
+ *
+ * Call the registered destructor for each object in a slab that is being
+ * destroyed.
  */
 static void slab_destroy_objs(struct kmem_cache *cachep, struct slab *slabp)
 {
@@ -1661,7 +1665,11 @@ static void slab_destroy_objs(struct kmem_cache *cachep, struct slab *slabp)
 }
 #endif
 
-/*
+/**
+ * slab_destroy - destroy and release all objects in a slab
+ * @cachep: cache pointer being destroyed
+ * @slabp: slab pointer being destroyed
+ *
  * Destroy all the objs in a slab, and release the mem back to the system.
  * Before calling the slab must have been unlinked from the cache.  The
  * cache-lock is not held/needed.
@@ -3170,6 +3178,7 @@ EXPORT_SYMBOL(kmalloc_node);
  * kmalloc - allocate memory
  * @size: how many bytes of memory are required.
  * @flags: the type of memory to allocate.
+ * @caller: function caller for debug tracking of the caller
  *
  * kmalloc is the normal method of allocating memory
  * in the kernel.
-- 
cgit v1.2.3


From ac2b898ca6fb06196a26869c23b66afe7944e52e Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@engr.sgi.com>
Date: Wed, 22 Mar 2006 00:08:15 -0800
Subject: [PATCH] slab: Remove SLAB_NO_REAP option

SLAB_NO_REAP is documented as an option that will cause this slab not to be
reaped under memory pressure.  However, that is not what happens.  The only
thing that SLAB_NO_REAP controls at the moment is the reclaim of the unused
slab elements that were allocated in batch in cache_reap().  Cache_reap()
is run every few seconds independently of memory pressure.

Could we remove the whole thing?  Its only used by three slabs anyways and
I cannot find a reason for having this option.

There is an additional problem with SLAB_NO_REAP.  If set then the recovery
of objects from alien caches is switched off.  Objects not freed on the
same node where they were initially allocated will only be reused if a
certain amount of objects accumulates from one alien node (not very likely)
or if the cache is explicitly shrunk.  (Strangely __cache_shrink does not
check for SLAB_NO_REAP)

Getting rid of SLAB_NO_REAP fixes the problems with alien cache freeing.

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Cc: Pekka Enberg <penberg@cs.helsinki.fi>
Cc: Manfred Spraul <manfred@colorfullife.com>
Cc: Mark Fasheh <mark.fasheh@oracle.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/scsi/iscsi_tcp.c |  2 +-
 fs/ocfs2/super.c         |  2 +-
 include/linux/slab.h     |  1 -
 mm/slab.c                | 13 ++-----------
 4 files changed, 4 insertions(+), 14 deletions(-)

(limited to 'mm')

diff --git a/drivers/scsi/iscsi_tcp.c b/drivers/scsi/iscsi_tcp.c
index ff79e68b347c..7b82ff090d42 100644
--- a/drivers/scsi/iscsi_tcp.c
+++ b/drivers/scsi/iscsi_tcp.c
@@ -3639,7 +3639,7 @@ iscsi_tcp_init(void)
 
 	taskcache = kmem_cache_create("iscsi_taskcache",
 			sizeof(struct iscsi_data_task), 0,
-			SLAB_HWCACHE_ALIGN | SLAB_NO_REAP, NULL, NULL);
+			SLAB_HWCACHE_ALIGN, NULL, NULL);
 	if (!taskcache)
 		return -ENOMEM;
 
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 8dd3aafec499..09e1c57a86a0 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -959,7 +959,7 @@ static int ocfs2_initialize_mem_caches(void)
 	ocfs2_lock_cache = kmem_cache_create("ocfs2_lock",
 					     sizeof(struct ocfs2_journal_lock),
 					     0,
-					     SLAB_NO_REAP|SLAB_HWCACHE_ALIGN,
+					     SLAB_HWCACHE_ALIGN,
 					     NULL, NULL);
 	if (!ocfs2_lock_cache)
 		return -ENOMEM;
diff --git a/include/linux/slab.h b/include/linux/slab.h
index 38bed95dda7a..2b28c849d75a 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -38,7 +38,6 @@ typedef struct kmem_cache kmem_cache_t;
 #define	SLAB_DEBUG_INITIAL	0x00000200UL	/* Call constructor (as verifier) */
 #define	SLAB_RED_ZONE		0x00000400UL	/* Red zone objs in a cache */
 #define	SLAB_POISON		0x00000800UL	/* Poison objects */
-#define	SLAB_NO_REAP		0x00001000UL	/* never reap from the cache */
 #define	SLAB_HWCACHE_ALIGN	0x00002000UL	/* align objs on a h/w cache lines */
 #define SLAB_CACHE_DMA		0x00004000UL	/* use GFP_DMA memory */
 #define SLAB_MUST_HWCACHE_ALIGN	0x00008000UL	/* force alignment */
diff --git a/mm/slab.c b/mm/slab.c
index 5c2574989834..24235506b2a0 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -170,12 +170,12 @@
 #if DEBUG
 # define CREATE_MASK	(SLAB_DEBUG_INITIAL | SLAB_RED_ZONE | \
 			 SLAB_POISON | SLAB_HWCACHE_ALIGN | \
-			 SLAB_NO_REAP | SLAB_CACHE_DMA | \
+			 SLAB_CACHE_DMA | \
 			 SLAB_MUST_HWCACHE_ALIGN | SLAB_STORE_USER | \
 			 SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \
 			 SLAB_DESTROY_BY_RCU)
 #else
-# define CREATE_MASK	(SLAB_HWCACHE_ALIGN | SLAB_NO_REAP | \
+# define CREATE_MASK	(SLAB_HWCACHE_ALIGN | \
 			 SLAB_CACHE_DMA | SLAB_MUST_HWCACHE_ALIGN | \
 			 SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \
 			 SLAB_DESTROY_BY_RCU)
@@ -662,7 +662,6 @@ static struct kmem_cache cache_cache = {
 	.limit = BOOT_CPUCACHE_ENTRIES,
 	.shared = 1,
 	.buffer_size = sizeof(struct kmem_cache),
-	.flags = SLAB_NO_REAP,
 	.name = "kmem_cache",
 #if DEBUG
 	.obj_size = sizeof(struct kmem_cache),
@@ -1848,9 +1847,6 @@ static void setup_cpu_cache(struct kmem_cache *cachep)
  * %SLAB_RED_ZONE - Insert `Red' zones around the allocated memory to check
  * for buffer overruns.
  *
- * %SLAB_NO_REAP - Don't automatically reap this cache when we're under
- * memory pressure.
- *
  * %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware
  * cacheline.  This can be beneficial if you're counting cycles as closely
  * as davem.
@@ -3584,10 +3580,6 @@ static void cache_reap(void *unused)
 		struct slab *slabp;
 
 		searchp = list_entry(walk, struct kmem_cache, next);
-
-		if (searchp->flags & SLAB_NO_REAP)
-			goto next;
-
 		check_irq_on();
 
 		l3 = searchp->nodelists[numa_node_id()];
@@ -3635,7 +3627,6 @@ static void cache_reap(void *unused)
 		} while (--tofree > 0);
 next_unlock:
 		spin_unlock_irq(&l3->list_lock);
-next:
 		cond_resched();
 	}
 	check_irq_on();
-- 
cgit v1.2.3


From a07fa3944bf924881450884224cbb2f1269cb9fa Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Wed, 22 Mar 2006 00:08:17 -0800
Subject: [PATCH] slab: use on_each_cpu()

Slab duplicates on_each_cpu().

Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/slab.c | 21 ++-------------------
 1 file changed, 2 insertions(+), 19 deletions(-)

(limited to 'mm')

diff --git a/mm/slab.c b/mm/slab.c
index 24235506b2a0..f477acfb732f 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -2119,23 +2119,6 @@ static void check_spinlock_acquired_node(struct kmem_cache *cachep, int node)
 #define check_spinlock_acquired_node(x, y) do { } while(0)
 #endif
 
-/*
- * Waits for all CPUs to execute func().
- */
-static void smp_call_function_all_cpus(void (*func)(void *arg), void *arg)
-{
-	check_irq_on();
-	preempt_disable();
-	local_irq_disable();
-	func(arg);
-	local_irq_enable();
-
-	if (smp_call_function(func, arg, 1, 1))
-		BUG();
-
-	preempt_enable();
-}
-
 static void drain_array_locked(struct kmem_cache *cachep,
 			struct array_cache *ac, int force, int node);
 
@@ -2158,7 +2141,7 @@ static void drain_cpu_caches(struct kmem_cache *cachep)
 	struct kmem_list3 *l3;
 	int node;
 
-	smp_call_function_all_cpus(do_drain, cachep);
+	on_each_cpu(do_drain, cachep, 1, 1);
 	check_irq_on();
 	for_each_online_node(node) {
 		l3 = cachep->nodelists[node];
@@ -3449,7 +3432,7 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
 	}
 	new.cachep = cachep;
 
-	smp_call_function_all_cpus(do_ccupdate_local, (void *)&new);
+	on_each_cpu(do_ccupdate_local, (void *)&new, 1, 1);
 
 	check_irq_on();
 	cachep->batchcount = batchcount;
-- 
cgit v1.2.3


From 8695949a1d7c99e039595db00af8e0fe4722307d Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@engr.sgi.com>
Date: Wed, 22 Mar 2006 00:08:18 -0800
Subject: [PATCH] Thin out scan_control: remove nr_to_scan and priority

Make nr_to_scan and priority a parameter instead of putting it into scan
control.  This allows various small optimizations and IMHO makes the code
easier to read.

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/vmscan.c | 59 +++++++++++++++++++++++++----------------------------------
 1 file changed, 25 insertions(+), 34 deletions(-)

(limited to 'mm')

diff --git a/mm/vmscan.c b/mm/vmscan.c
index e21bab4deda6..f7c4f37c3b18 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -52,9 +52,6 @@ typedef enum {
 } pageout_t;
 
 struct scan_control {
-	/* Ask refill_inactive_zone, or shrink_cache to scan this many pages */
-	unsigned long nr_to_scan;
-
 	/* Incremented by the number of inactive pages that were scanned */
 	unsigned long nr_scanned;
 
@@ -63,9 +60,6 @@ struct scan_control {
 
 	unsigned long nr_mapped;	/* From page_state */
 
-	/* Ask shrink_caches, or shrink_zone to scan at this priority */
-	unsigned int priority;
-
 	/* This context's GFP mask */
 	gfp_t gfp_mask;
 
@@ -1112,11 +1106,10 @@ static int isolate_lru_pages(int nr_to_scan, struct list_head *src,
 /*
  * shrink_cache() adds the number of pages reclaimed to sc->nr_reclaimed
  */
-static void shrink_cache(struct zone *zone, struct scan_control *sc)
+static void shrink_cache(int max_scan, struct zone *zone, struct scan_control *sc)
 {
 	LIST_HEAD(page_list);
 	struct pagevec pvec;
-	int max_scan = sc->nr_to_scan;
 
 	pagevec_init(&pvec, 1);
 
@@ -1192,12 +1185,11 @@ done:
  * But we had to alter page->flags anyway.
  */
 static void
-refill_inactive_zone(struct zone *zone, struct scan_control *sc)
+refill_inactive_zone(int nr_pages, struct zone *zone, struct scan_control *sc)
 {
 	int pgmoved;
 	int pgdeactivate = 0;
 	int pgscanned;
-	int nr_pages = sc->nr_to_scan;
 	LIST_HEAD(l_hold);	/* The pages which were snipped off */
 	LIST_HEAD(l_inactive);	/* Pages to go onto the inactive_list */
 	LIST_HEAD(l_active);	/* Pages to go onto the active_list */
@@ -1332,10 +1324,11 @@ refill_inactive_zone(struct zone *zone, struct scan_control *sc)
  * This is a basic per-zone page freer.  Used by both kswapd and direct reclaim.
  */
 static void
-shrink_zone(struct zone *zone, struct scan_control *sc)
+shrink_zone(int priority, struct zone *zone, struct scan_control *sc)
 {
 	unsigned long nr_active;
 	unsigned long nr_inactive;
+	unsigned long nr_to_scan;
 
 	atomic_inc(&zone->reclaim_in_progress);
 
@@ -1343,14 +1336,14 @@ shrink_zone(struct zone *zone, struct scan_control *sc)
 	 * Add one to `nr_to_scan' just to make sure that the kernel will
 	 * slowly sift through the active list.
 	 */
-	zone->nr_scan_active += (zone->nr_active >> sc->priority) + 1;
+	zone->nr_scan_active += (zone->nr_active >> priority) + 1;
 	nr_active = zone->nr_scan_active;
 	if (nr_active >= sc->swap_cluster_max)
 		zone->nr_scan_active = 0;
 	else
 		nr_active = 0;
 
-	zone->nr_scan_inactive += (zone->nr_inactive >> sc->priority) + 1;
+	zone->nr_scan_inactive += (zone->nr_inactive >> priority) + 1;
 	nr_inactive = zone->nr_scan_inactive;
 	if (nr_inactive >= sc->swap_cluster_max)
 		zone->nr_scan_inactive = 0;
@@ -1359,17 +1352,17 @@ shrink_zone(struct zone *zone, struct scan_control *sc)
 
 	while (nr_active || nr_inactive) {
 		if (nr_active) {
-			sc->nr_to_scan = min(nr_active,
+			nr_to_scan = min(nr_active,
 					(unsigned long)sc->swap_cluster_max);
-			nr_active -= sc->nr_to_scan;
-			refill_inactive_zone(zone, sc);
+			nr_active -= nr_to_scan;
+			refill_inactive_zone(nr_to_scan, zone, sc);
 		}
 
 		if (nr_inactive) {
-			sc->nr_to_scan = min(nr_inactive,
+			nr_to_scan = min(nr_inactive,
 					(unsigned long)sc->swap_cluster_max);
-			nr_inactive -= sc->nr_to_scan;
-			shrink_cache(zone, sc);
+			nr_inactive -= nr_to_scan;
+			shrink_cache(nr_to_scan, zone, sc);
 		}
 	}
 
@@ -1395,7 +1388,7 @@ shrink_zone(struct zone *zone, struct scan_control *sc)
  * scan then give up on it.
  */
 static void
-shrink_caches(struct zone **zones, struct scan_control *sc)
+shrink_caches(int priority, struct zone **zones, struct scan_control *sc)
 {
 	int i;
 
@@ -1408,14 +1401,14 @@ shrink_caches(struct zone **zones, struct scan_control *sc)
 		if (!cpuset_zone_allowed(zone, __GFP_HARDWALL))
 			continue;
 
-		zone->temp_priority = sc->priority;
-		if (zone->prev_priority > sc->priority)
-			zone->prev_priority = sc->priority;
+		zone->temp_priority = priority;
+		if (zone->prev_priority > priority)
+			zone->prev_priority = priority;
 
-		if (zone->all_unreclaimable && sc->priority != DEF_PRIORITY)
+		if (zone->all_unreclaimable && priority != DEF_PRIORITY)
 			continue;	/* Let kswapd poll it */
 
-		shrink_zone(zone, sc);
+		shrink_zone(priority, zone, sc);
 	}
 }
  
@@ -1462,11 +1455,10 @@ int try_to_free_pages(struct zone **zones, gfp_t gfp_mask)
 		sc.nr_mapped = read_page_state(nr_mapped);
 		sc.nr_scanned = 0;
 		sc.nr_reclaimed = 0;
-		sc.priority = priority;
 		sc.swap_cluster_max = SWAP_CLUSTER_MAX;
 		if (!priority)
 			disable_swap_token();
-		shrink_caches(zones, &sc);
+		shrink_caches(priority, zones, &sc);
 		shrink_slab(sc.nr_scanned, gfp_mask, lru_pages);
 		if (reclaim_state) {
 			sc.nr_reclaimed += reclaim_state->reclaimed_slab;
@@ -1629,9 +1621,8 @@ scan:
 				zone->prev_priority = priority;
 			sc.nr_scanned = 0;
 			sc.nr_reclaimed = 0;
-			sc.priority = priority;
 			sc.swap_cluster_max = nr_pages? nr_pages : SWAP_CLUSTER_MAX;
-			shrink_zone(zone, &sc);
+			shrink_zone(priority, zone, &sc);
 			reclaim_state->reclaimed_slab = 0;
 			nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL,
 						lru_pages);
@@ -1886,6 +1877,7 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
 	struct scan_control sc;
 	cpumask_t mask;
 	int node_id;
+	int priority;
 
 	if (time_before(jiffies,
 		zone->last_unsuccessful_zone_reclaim + zone_reclaim_interval))
@@ -1906,7 +1898,6 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
 	sc.may_swap = !!(zone_reclaim_mode & RECLAIM_SWAP);
 	sc.nr_scanned = 0;
 	sc.nr_reclaimed = 0;
-	sc.priority = ZONE_RECLAIM_PRIORITY + 1;
 	sc.nr_mapped = read_page_state(nr_mapped);
 	sc.gfp_mask = gfp_mask;
 
@@ -1932,11 +1923,11 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
 	 * Free memory by calling shrink zone with increasing priorities
 	 * until we have enough memory freed.
 	 */
+	priority = ZONE_RECLAIM_PRIORITY;
 	do {
-		sc.priority--;
-		shrink_zone(zone, &sc);
-
-	} while (sc.nr_reclaimed < nr_pages && sc.priority > 0);
+		shrink_zone(priority, zone, &sc);
+		priority--;
+	} while (priority >= 0 && sc.nr_reclaimed < nr_pages);
 
 	if (sc.nr_reclaimed < nr_pages && (zone_reclaim_mode & RECLAIM_SLAB)) {
 		/*
-- 
cgit v1.2.3


From 179e96395b1f01e95ebe1ff5ef306b810dbbd147 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Wed, 22 Mar 2006 00:08:18 -0800
Subject: [PATCH] vmscan: scan_control cleanup

Initialise as much of scan_control as possible at the declaration site.  This
tidies things up a bit and assures us that all unmentioned fields are zeroed
out.

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/vmscan.c | 108 ++++++++++++++++++++++++++++++++++--------------------------
 1 file changed, 62 insertions(+), 46 deletions(-)

(limited to 'mm')

diff --git a/mm/vmscan.c b/mm/vmscan.c
index f7c4f37c3b18..5feef4d4650e 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1431,13 +1431,14 @@ int try_to_free_pages(struct zone **zones, gfp_t gfp_mask)
 	int ret = 0;
 	int total_scanned = 0, total_reclaimed = 0;
 	struct reclaim_state *reclaim_state = current->reclaim_state;
-	struct scan_control sc;
 	unsigned long lru_pages = 0;
 	int i;
-
-	sc.gfp_mask = gfp_mask;
-	sc.may_writepage = !laptop_mode;
-	sc.may_swap = 1;
+	struct scan_control sc = {
+		.gfp_mask = gfp_mask,
+		.may_writepage = !laptop_mode,
+		.swap_cluster_max = SWAP_CLUSTER_MAX,
+		.may_swap = 1,
+	};
 
 	inc_page_state(allocstall);
 
@@ -1455,7 +1456,6 @@ int try_to_free_pages(struct zone **zones, gfp_t gfp_mask)
 		sc.nr_mapped = read_page_state(nr_mapped);
 		sc.nr_scanned = 0;
 		sc.nr_reclaimed = 0;
-		sc.swap_cluster_max = SWAP_CLUSTER_MAX;
 		if (!priority)
 			disable_swap_token();
 		shrink_caches(priority, zones, &sc);
@@ -1478,7 +1478,8 @@ int try_to_free_pages(struct zone **zones, gfp_t gfp_mask)
 		 * that's undesirable in laptop mode, where we *want* lumpy
 		 * writeout.  So in laptop mode, write out the whole world.
 		 */
-		if (total_scanned > sc.swap_cluster_max + sc.swap_cluster_max/2) {
+		if (total_scanned > sc.swap_cluster_max +
+					sc.swap_cluster_max / 2) {
 			wakeup_pdflush(laptop_mode ? 0 : total_scanned);
 			sc.may_writepage = 1;
 		}
@@ -1532,14 +1533,16 @@ static int balance_pgdat(pg_data_t *pgdat, int nr_pages, int order)
 	int i;
 	int total_scanned, total_reclaimed;
 	struct reclaim_state *reclaim_state = current->reclaim_state;
-	struct scan_control sc;
+	struct scan_control sc = {
+		.gfp_mask = GFP_KERNEL,
+		.may_swap = 1,
+		.swap_cluster_max = nr_pages ? nr_pages : SWAP_CLUSTER_MAX,
+	};
 
 loop_again:
 	total_scanned = 0;
 	total_reclaimed = 0;
-	sc.gfp_mask = GFP_KERNEL;
-	sc.may_writepage = !laptop_mode;
-	sc.may_swap = 1;
+	sc.may_writepage = !laptop_mode,
 	sc.nr_mapped = read_page_state(nr_mapped);
 
 	inc_page_state(pageoutrun);
@@ -1621,7 +1624,6 @@ scan:
 				zone->prev_priority = priority;
 			sc.nr_scanned = 0;
 			sc.nr_reclaimed = 0;
-			sc.swap_cluster_max = nr_pages? nr_pages : SWAP_CLUSTER_MAX;
 			shrink_zone(priority, zone, &sc);
 			reclaim_state->reclaimed_slab = 0;
 			nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL,
@@ -1869,46 +1871,21 @@ int zone_reclaim_interval __read_mostly = 30*HZ;
 /*
  * Try to free up some pages from this zone through reclaim.
  */
-int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
+static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
 {
-	int nr_pages;
+	const int nr_pages = 1 << order;
 	struct task_struct *p = current;
 	struct reclaim_state reclaim_state;
-	struct scan_control sc;
-	cpumask_t mask;
-	int node_id;
 	int priority;
-
-	if (time_before(jiffies,
-		zone->last_unsuccessful_zone_reclaim + zone_reclaim_interval))
-			return 0;
-
-	if (!(gfp_mask & __GFP_WAIT) ||
-		zone->all_unreclaimable ||
-		atomic_read(&zone->reclaim_in_progress) > 0 ||
-		(p->flags & PF_MEMALLOC))
-			return 0;
-
-	node_id = zone->zone_pgdat->node_id;
-	mask = node_to_cpumask(node_id);
-	if (!cpus_empty(mask) && node_id != numa_node_id())
-		return 0;
-
-	sc.may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE);
-	sc.may_swap = !!(zone_reclaim_mode & RECLAIM_SWAP);
-	sc.nr_scanned = 0;
-	sc.nr_reclaimed = 0;
-	sc.nr_mapped = read_page_state(nr_mapped);
-	sc.gfp_mask = gfp_mask;
+	struct scan_control sc = {
+		.may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE),
+		.may_swap = !!(zone_reclaim_mode & RECLAIM_SWAP),
+		.nr_mapped = read_page_state(nr_mapped),
+		.swap_cluster_max = max(nr_pages, SWAP_CLUSTER_MAX),
+		.gfp_mask = gfp_mask,
+	};
 
 	disable_swap_token();
-
-	nr_pages = 1 << order;
-	if (nr_pages > SWAP_CLUSTER_MAX)
-		sc.swap_cluster_max = nr_pages;
-	else
-		sc.swap_cluster_max = SWAP_CLUSTER_MAX;
-
 	cond_resched();
 	/*
 	 * We need to be able to allocate from the reserves for RECLAIM_SWAP
@@ -1949,5 +1926,44 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
 
 	return sc.nr_reclaimed >= nr_pages;
 }
+
+int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
+{
+	cpumask_t mask;
+	int node_id;
+
+	/*
+	 * Do not reclaim if there was a recent unsuccessful attempt at zone
+	 * reclaim.  In that case we let allocations go off node for the
+	 * zone_reclaim_interval.  Otherwise we would scan for each off-node
+	 * page allocation.
+	 */
+	if (time_before(jiffies,
+		zone->last_unsuccessful_zone_reclaim + zone_reclaim_interval))
+			return 0;
+
+	/*
+	 * Avoid concurrent zone reclaims, do not reclaim in a zone that does
+	 * not have reclaimable pages and if we should not delay the allocation
+	 * then do not scan.
+	 */
+	if (!(gfp_mask & __GFP_WAIT) ||
+		zone->all_unreclaimable ||
+		atomic_read(&zone->reclaim_in_progress) > 0 ||
+		(current->flags & PF_MEMALLOC))
+			return 0;
+
+	/*
+	 * Only run zone reclaim on the local zone or on zones that do not
+	 * have associated processors. This will favor the local processor
+	 * over remote processors and spread off node memory allocations
+	 * as wide as possible.
+	 */
+	node_id = zone->zone_pgdat->node_id;
+	mask = node_to_cpumask(node_id);
+	if (!cpus_empty(mask) && node_id != numa_node_id())
+		return 0;
+	return __zone_reclaim(zone, gfp_mask, order);
+}
 #endif
 
-- 
cgit v1.2.3


From 69e05944af39fc6c97b09380c8721e38433bd828 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Wed, 22 Mar 2006 00:08:19 -0800
Subject: [PATCH] vmscan: use unsigned longs

Turn basically everything in vmscan.c into `unsigned long'.  This is to avoid
the possibility that some piece of code in there might decide to operate upon
more than 4G (or even 2G) of pages in one hit.

This might be silly, but we'll need it one day.

Cc: Christoph Lameter <clameter@sgi.com>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/mm.h   |   2 +-
 include/linux/swap.h |   8 ++--
 mm/vmscan.c          | 104 +++++++++++++++++++++++++++++----------------------
 3 files changed, 64 insertions(+), 50 deletions(-)

(limited to 'mm')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index e67980654c49..1850cf8bad64 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1046,7 +1046,7 @@ int in_gate_area_no_task(unsigned long addr);
 
 int drop_caches_sysctl_handler(struct ctl_table *, int, struct file *,
 					void __user *, size_t *, loff_t *);
-int shrink_slab(unsigned long scanned, gfp_t gfp_mask,
+unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask,
 			unsigned long lru_pages);
 void drop_pagecache(void);
 void drop_slab(void);
diff --git a/include/linux/swap.h b/include/linux/swap.h
index d572b19afb7d..3dc6c89c49b8 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -172,8 +172,8 @@ extern int rotate_reclaimable_page(struct page *page);
 extern void swap_setup(void);
 
 /* linux/mm/vmscan.c */
-extern int try_to_free_pages(struct zone **, gfp_t);
-extern int shrink_all_memory(int);
+extern unsigned long try_to_free_pages(struct zone **, gfp_t);
+extern unsigned long shrink_all_memory(unsigned long nr_pages);
 extern int vm_swappiness;
 
 #ifdef CONFIG_NUMA
@@ -190,11 +190,11 @@ static inline int zone_reclaim(struct zone *z, gfp_t mask, unsigned int order)
 
 #ifdef CONFIG_MIGRATION
 extern int isolate_lru_page(struct page *p);
-extern int putback_lru_pages(struct list_head *l);
+extern unsigned long putback_lru_pages(struct list_head *l);
 extern int migrate_page(struct page *, struct page *);
 extern void migrate_page_copy(struct page *, struct page *);
 extern int migrate_page_remove_references(struct page *, struct page *, int);
-extern int migrate_pages(struct list_head *l, struct list_head *t,
+extern unsigned long migrate_pages(struct list_head *l, struct list_head *t,
 		struct list_head *moved, struct list_head *failed);
 extern int fail_migrate_page(struct page *, struct page *);
 #else
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 5feef4d4650e..62cd7cd257e3 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -177,10 +177,11 @@ EXPORT_SYMBOL(remove_shrinker);
  *
  * Returns the number of slab objects which we shrunk.
  */
-int shrink_slab(unsigned long scanned, gfp_t gfp_mask, unsigned long lru_pages)
+unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask,
+			unsigned long lru_pages)
 {
 	struct shrinker *shrinker;
-	int ret = 0;
+	unsigned long ret = 0;
 
 	if (scanned == 0)
 		scanned = SWAP_CLUSTER_MAX;
@@ -410,12 +411,13 @@ cannot_free:
 /*
  * shrink_list adds the number of reclaimed pages to sc->nr_reclaimed
  */
-static int shrink_list(struct list_head *page_list, struct scan_control *sc)
+static unsigned long shrink_list(struct list_head *page_list,
+				struct scan_control *sc)
 {
 	LIST_HEAD(ret_pages);
 	struct pagevec freed_pvec;
 	int pgactivate = 0;
-	int reclaimed = 0;
+	unsigned long reclaimed = 0;
 
 	cond_resched();
 
@@ -599,11 +601,11 @@ static inline void move_to_lru(struct page *page)
  *
  * returns the number of pages put back.
  */
-int putback_lru_pages(struct list_head *l)
+unsigned long putback_lru_pages(struct list_head *l)
 {
 	struct page *page;
 	struct page *page2;
-	int count = 0;
+	unsigned long count = 0;
 
 	list_for_each_entry_safe(page, page2, l, lru) {
 		move_to_lru(page);
@@ -848,11 +850,11 @@ EXPORT_SYMBOL(migrate_page);
  *
  * Return: Number of pages not migrated when "to" ran empty.
  */
-int migrate_pages(struct list_head *from, struct list_head *to,
+unsigned long migrate_pages(struct list_head *from, struct list_head *to,
 		  struct list_head *moved, struct list_head *failed)
 {
-	int retry;
-	int nr_failed = 0;
+	unsigned long retry;
+	unsigned long nr_failed = 0;
 	int pass = 0;
 	struct page *page;
 	struct page *page2;
@@ -1069,12 +1071,13 @@ int isolate_lru_page(struct page *page)
  *
  * returns how many pages were moved onto *@dst.
  */
-static int isolate_lru_pages(int nr_to_scan, struct list_head *src,
-			     struct list_head *dst, int *scanned)
+static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
+		struct list_head *src, struct list_head *dst,
+		unsigned long *scanned)
 {
-	int nr_taken = 0;
+	unsigned long nr_taken = 0;
 	struct page *page;
-	int scan = 0;
+	unsigned long scan = 0;
 
 	while (scan++ < nr_to_scan && !list_empty(src)) {
 		struct list_head *target;
@@ -1106,20 +1109,22 @@ static int isolate_lru_pages(int nr_to_scan, struct list_head *src,
 /*
  * shrink_cache() adds the number of pages reclaimed to sc->nr_reclaimed
  */
-static void shrink_cache(int max_scan, struct zone *zone, struct scan_control *sc)
+static void shrink_cache(unsigned long max_scan, struct zone *zone,
+			struct scan_control *sc)
 {
 	LIST_HEAD(page_list);
 	struct pagevec pvec;
+	unsigned long nr_scanned = 0;
 
 	pagevec_init(&pvec, 1);
 
 	lru_add_drain();
 	spin_lock_irq(&zone->lru_lock);
-	while (max_scan > 0) {
+	do {
 		struct page *page;
-		int nr_taken;
-		int nr_scan;
-		int nr_freed;
+		unsigned long nr_taken;
+		unsigned long nr_scan;
+		unsigned long nr_freed;
 
 		nr_taken = isolate_lru_pages(sc->swap_cluster_max,
 					     &zone->inactive_list,
@@ -1131,7 +1136,7 @@ static void shrink_cache(int max_scan, struct zone *zone, struct scan_control *s
 		if (nr_taken == 0)
 			goto done;
 
-		max_scan -= nr_scan;
+		nr_scanned += nr_scan;
 		nr_freed = shrink_list(&page_list, sc);
 
 		local_irq_disable();
@@ -1161,7 +1166,7 @@ static void shrink_cache(int max_scan, struct zone *zone, struct scan_control *s
 				spin_lock_irq(&zone->lru_lock);
 			}
 		}
-  	}
+  	} while (nr_scanned < max_scan);
 	spin_unlock_irq(&zone->lru_lock);
 done:
 	pagevec_release(&pvec);
@@ -1185,11 +1190,12 @@ done:
  * But we had to alter page->flags anyway.
  */
 static void
-refill_inactive_zone(int nr_pages, struct zone *zone, struct scan_control *sc)
+refill_inactive_zone(unsigned long nr_pages, struct zone *zone,
+			struct scan_control *sc)
 {
-	int pgmoved;
+	unsigned long pgmoved;
 	int pgdeactivate = 0;
-	int pgscanned;
+	unsigned long pgscanned;
 	LIST_HEAD(l_hold);	/* The pages which were snipped off */
 	LIST_HEAD(l_inactive);	/* Pages to go onto the inactive_list */
 	LIST_HEAD(l_active);	/* Pages to go onto the active_list */
@@ -1323,8 +1329,8 @@ refill_inactive_zone(int nr_pages, struct zone *zone, struct scan_control *sc)
 /*
  * This is a basic per-zone page freer.  Used by both kswapd and direct reclaim.
  */
-static void
-shrink_zone(int priority, struct zone *zone, struct scan_control *sc)
+static void shrink_zone(int priority, struct zone *zone,
+			struct scan_control *sc)
 {
 	unsigned long nr_active;
 	unsigned long nr_inactive;
@@ -1387,8 +1393,8 @@ shrink_zone(int priority, struct zone *zone, struct scan_control *sc)
  * If a zone is deemed to be full of pinned pages then just give it a light
  * scan then give up on it.
  */
-static void
-shrink_caches(int priority, struct zone **zones, struct scan_control *sc)
+static void shrink_caches(int priority, struct zone **zones,
+				struct scan_control *sc)
 {
 	int i;
 
@@ -1425,11 +1431,12 @@ shrink_caches(int priority, struct zone **zones, struct scan_control *sc)
  * holds filesystem locks which prevent writeout this might not work, and the
  * allocation attempt will fail.
  */
-int try_to_free_pages(struct zone **zones, gfp_t gfp_mask)
+unsigned long try_to_free_pages(struct zone **zones, gfp_t gfp_mask)
 {
 	int priority;
 	int ret = 0;
-	int total_scanned = 0, total_reclaimed = 0;
+	unsigned long total_scanned = 0;
+	unsigned long total_reclaimed = 0;
 	struct reclaim_state *reclaim_state = current->reclaim_state;
 	unsigned long lru_pages = 0;
 	int i;
@@ -1525,13 +1532,15 @@ out:
  * the page allocator fallback scheme to ensure that aging of pages is balanced
  * across the zones.
  */
-static int balance_pgdat(pg_data_t *pgdat, int nr_pages, int order)
+static unsigned long balance_pgdat(pg_data_t *pgdat, unsigned long nr_pages,
+				int order)
 {
-	int to_free = nr_pages;
+	unsigned long to_free = nr_pages;
 	int all_zones_ok;
 	int priority;
 	int i;
-	int total_scanned, total_reclaimed;
+	unsigned long total_scanned;
+	unsigned long total_reclaimed;
 	struct reclaim_state *reclaim_state = current->reclaim_state;
 	struct scan_control sc = {
 		.gfp_mask = GFP_KERNEL,
@@ -1776,22 +1785,23 @@ void wakeup_kswapd(struct zone *zone, int order)
  * Try to free `nr_pages' of memory, system-wide.  Returns the number of freed
  * pages.
  */
-int shrink_all_memory(int nr_pages)
+unsigned long shrink_all_memory(unsigned long nr_pages)
 {
 	pg_data_t *pgdat;
-	int nr_to_free = nr_pages;
-	int ret = 0;
+	unsigned long nr_to_free = nr_pages;
+	unsigned long ret = 0;
 	struct reclaim_state reclaim_state = {
 		.reclaimed_slab = 0,
 	};
 
 	current->reclaim_state = &reclaim_state;
 	for_each_pgdat(pgdat) {
-		int freed;
+		unsigned long freed;
+
 		freed = balance_pgdat(pgdat, nr_to_free, 0);
 		ret += freed;
 		nr_to_free -= freed;
-		if (nr_to_free <= 0)
+		if ((long)nr_to_free <= 0)
 			break;
 	}
 	current->reclaim_state = NULL;
@@ -1805,8 +1815,7 @@ int shrink_all_memory(int nr_pages)
    away, we get changed to run anywhere: as the first one comes back,
    restore their cpu bindings. */
 static int __devinit cpu_callback(struct notifier_block *nfb,
-				  unsigned long action,
-				  void *hcpu)
+				  unsigned long action, void *hcpu)
 {
 	pg_data_t *pgdat;
 	cpumask_t mask;
@@ -1826,10 +1835,15 @@ static int __devinit cpu_callback(struct notifier_block *nfb,
 static int __init kswapd_init(void)
 {
 	pg_data_t *pgdat;
+
 	swap_setup();
-	for_each_pgdat(pgdat)
-		pgdat->kswapd
-		= find_task_by_pid(kernel_thread(kswapd, pgdat, CLONE_KERNEL));
+	for_each_pgdat(pgdat) {
+		pid_t pid;
+
+		pid = kernel_thread(kswapd, pgdat, CLONE_KERNEL);
+		BUG_ON(pid < 0);
+		pgdat->kswapd = find_task_by_pid(pid);
+	}
 	total_memory = nr_free_pagecache_pages();
 	hotcpu_notifier(cpu_callback, 0);
 	return 0;
@@ -1873,7 +1887,7 @@ int zone_reclaim_interval __read_mostly = 30*HZ;
  */
 static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
 {
-	const int nr_pages = 1 << order;
+	const unsigned long nr_pages = 1 << order;
 	struct task_struct *p = current;
 	struct reclaim_state reclaim_state;
 	int priority;
@@ -1881,7 +1895,8 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
 		.may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE),
 		.may_swap = !!(zone_reclaim_mode & RECLAIM_SWAP),
 		.nr_mapped = read_page_state(nr_mapped),
-		.swap_cluster_max = max(nr_pages, SWAP_CLUSTER_MAX),
+		.swap_cluster_max = max_t(unsigned long, nr_pages,
+					SWAP_CLUSTER_MAX),
 		.gfp_mask = gfp_mask,
 	};
 
@@ -1966,4 +1981,3 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
 	return __zone_reclaim(zone, gfp_mask, order);
 }
 #endif
-
-- 
cgit v1.2.3


From 05ff51376f01fd8837946a4f8144a84f6cc71c19 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Wed, 22 Mar 2006 00:08:20 -0800
Subject: [PATCH] vmscan return nr_reclaimed

Change all the vmscan functions to retunr the number-of-reclaimed pages and
remove scan_conrtol.nr_reclaimed.

Saves ten-odd bytes of text and makes things clearer and more consistent.

The patch also changes the behaviour of zone_reclaim() when it falls back to slab shrinking.  Christoph says

  "Setting this to one means that we will rescan and shrink the slab for
  each allocation if we are out of zone memory and RECLAIM_SLAB is set.  Plus
  if we do an order 0 allocation we do not go off node as intended.

  "We better set this to zero.  This means the allocation will go offnode
  despite us having potentially freed lots of memory on the zone.  Future
  allocations can then again be done from this zone."

Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Cc: Christoph Lameter <christoph@lameter.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/vmscan.c | 77 ++++++++++++++++++++++++++++++-------------------------------
 1 file changed, 38 insertions(+), 39 deletions(-)

(limited to 'mm')

diff --git a/mm/vmscan.c b/mm/vmscan.c
index 62cd7cd257e3..8f6ad13d34f5 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -55,9 +55,6 @@ struct scan_control {
 	/* Incremented by the number of inactive pages that were scanned */
 	unsigned long nr_scanned;
 
-	/* Incremented by the number of pages reclaimed */
-	unsigned long nr_reclaimed;
-
 	unsigned long nr_mapped;	/* From page_state */
 
 	/* This context's GFP mask */
@@ -409,7 +406,7 @@ cannot_free:
 }
 
 /*
- * shrink_list adds the number of reclaimed pages to sc->nr_reclaimed
+ * shrink_list return the number of reclaimed pages
  */
 static unsigned long shrink_list(struct list_head *page_list,
 				struct scan_control *sc)
@@ -417,7 +414,7 @@ static unsigned long shrink_list(struct list_head *page_list,
 	LIST_HEAD(ret_pages);
 	struct pagevec freed_pvec;
 	int pgactivate = 0;
-	unsigned long reclaimed = 0;
+	unsigned long nr_reclaimed = 0;
 
 	cond_resched();
 
@@ -557,7 +554,7 @@ static unsigned long shrink_list(struct list_head *page_list,
 
 free_it:
 		unlock_page(page);
-		reclaimed++;
+		nr_reclaimed++;
 		if (!pagevec_add(&freed_pvec, page))
 			__pagevec_release_nonlru(&freed_pvec);
 		continue;
@@ -575,8 +572,7 @@ keep:
 	if (pagevec_count(&freed_pvec))
 		__pagevec_release_nonlru(&freed_pvec);
 	mod_page_state(pgactivate, pgactivate);
-	sc->nr_reclaimed += reclaimed;
-	return reclaimed;
+	return nr_reclaimed;
 }
 
 #ifdef CONFIG_MIGRATION
@@ -1107,14 +1103,15 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
 }
 
 /*
- * shrink_cache() adds the number of pages reclaimed to sc->nr_reclaimed
+ * shrink_cache() return the number of reclaimed pages
  */
-static void shrink_cache(unsigned long max_scan, struct zone *zone,
-			struct scan_control *sc)
+static unsigned long shrink_cache(unsigned long max_scan, struct zone *zone,
+				struct scan_control *sc)
 {
 	LIST_HEAD(page_list);
 	struct pagevec pvec;
 	unsigned long nr_scanned = 0;
+	unsigned long nr_reclaimed = 0;
 
 	pagevec_init(&pvec, 1);
 
@@ -1138,7 +1135,7 @@ static void shrink_cache(unsigned long max_scan, struct zone *zone,
 
 		nr_scanned += nr_scan;
 		nr_freed = shrink_list(&page_list, sc);
-
+		nr_reclaimed += nr_freed;
 		local_irq_disable();
 		if (current_is_kswapd()) {
 			__mod_page_state_zone(zone, pgscan_kswapd, nr_scan);
@@ -1170,6 +1167,7 @@ static void shrink_cache(unsigned long max_scan, struct zone *zone,
 	spin_unlock_irq(&zone->lru_lock);
 done:
 	pagevec_release(&pvec);
+	return nr_reclaimed;
 }
 
 /*
@@ -1329,12 +1327,13 @@ refill_inactive_zone(unsigned long nr_pages, struct zone *zone,
 /*
  * This is a basic per-zone page freer.  Used by both kswapd and direct reclaim.
  */
-static void shrink_zone(int priority, struct zone *zone,
-			struct scan_control *sc)
+static unsigned long shrink_zone(int priority, struct zone *zone,
+				struct scan_control *sc)
 {
 	unsigned long nr_active;
 	unsigned long nr_inactive;
 	unsigned long nr_to_scan;
+	unsigned long nr_reclaimed = 0;
 
 	atomic_inc(&zone->reclaim_in_progress);
 
@@ -1368,13 +1367,14 @@ static void shrink_zone(int priority, struct zone *zone,
 			nr_to_scan = min(nr_inactive,
 					(unsigned long)sc->swap_cluster_max);
 			nr_inactive -= nr_to_scan;
-			shrink_cache(nr_to_scan, zone, sc);
+			nr_reclaimed += shrink_cache(nr_to_scan, zone, sc);
 		}
 	}
 
 	throttle_vm_writeout();
 
 	atomic_dec(&zone->reclaim_in_progress);
+	return nr_reclaimed;
 }
 
 /*
@@ -1393,9 +1393,10 @@ static void shrink_zone(int priority, struct zone *zone,
  * If a zone is deemed to be full of pinned pages then just give it a light
  * scan then give up on it.
  */
-static void shrink_caches(int priority, struct zone **zones,
-				struct scan_control *sc)
+static unsigned long shrink_caches(int priority, struct zone **zones,
+					struct scan_control *sc)
 {
+	unsigned long nr_reclaimed = 0;
 	int i;
 
 	for (i = 0; zones[i] != NULL; i++) {
@@ -1414,8 +1415,9 @@ static void shrink_caches(int priority, struct zone **zones,
 		if (zone->all_unreclaimable && priority != DEF_PRIORITY)
 			continue;	/* Let kswapd poll it */
 
-		shrink_zone(priority, zone, sc);
+		nr_reclaimed += shrink_zone(priority, zone, sc);
 	}
+	return nr_reclaimed;
 }
  
 /*
@@ -1436,7 +1438,7 @@ unsigned long try_to_free_pages(struct zone **zones, gfp_t gfp_mask)
 	int priority;
 	int ret = 0;
 	unsigned long total_scanned = 0;
-	unsigned long total_reclaimed = 0;
+	unsigned long nr_reclaimed = 0;
 	struct reclaim_state *reclaim_state = current->reclaim_state;
 	unsigned long lru_pages = 0;
 	int i;
@@ -1462,18 +1464,16 @@ unsigned long try_to_free_pages(struct zone **zones, gfp_t gfp_mask)
 	for (priority = DEF_PRIORITY; priority >= 0; priority--) {
 		sc.nr_mapped = read_page_state(nr_mapped);
 		sc.nr_scanned = 0;
-		sc.nr_reclaimed = 0;
 		if (!priority)
 			disable_swap_token();
-		shrink_caches(priority, zones, &sc);
+		nr_reclaimed += shrink_caches(priority, zones, &sc);
 		shrink_slab(sc.nr_scanned, gfp_mask, lru_pages);
 		if (reclaim_state) {
-			sc.nr_reclaimed += reclaim_state->reclaimed_slab;
+			nr_reclaimed += reclaim_state->reclaimed_slab;
 			reclaim_state->reclaimed_slab = 0;
 		}
 		total_scanned += sc.nr_scanned;
-		total_reclaimed += sc.nr_reclaimed;
-		if (total_reclaimed >= sc.swap_cluster_max) {
+		if (nr_reclaimed >= sc.swap_cluster_max) {
 			ret = 1;
 			goto out;
 		}
@@ -1540,7 +1540,7 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, unsigned long nr_pages,
 	int priority;
 	int i;
 	unsigned long total_scanned;
-	unsigned long total_reclaimed;
+	unsigned long nr_reclaimed;
 	struct reclaim_state *reclaim_state = current->reclaim_state;
 	struct scan_control sc = {
 		.gfp_mask = GFP_KERNEL,
@@ -1550,7 +1550,7 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, unsigned long nr_pages,
 
 loop_again:
 	total_scanned = 0;
-	total_reclaimed = 0;
+	nr_reclaimed = 0;
 	sc.may_writepage = !laptop_mode,
 	sc.nr_mapped = read_page_state(nr_mapped);
 
@@ -1632,13 +1632,11 @@ scan:
 			if (zone->prev_priority > priority)
 				zone->prev_priority = priority;
 			sc.nr_scanned = 0;
-			sc.nr_reclaimed = 0;
-			shrink_zone(priority, zone, &sc);
+			nr_reclaimed += shrink_zone(priority, zone, &sc);
 			reclaim_state->reclaimed_slab = 0;
 			nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL,
 						lru_pages);
-			sc.nr_reclaimed += reclaim_state->reclaimed_slab;
-			total_reclaimed += sc.nr_reclaimed;
+			nr_reclaimed += reclaim_state->reclaimed_slab;
 			total_scanned += sc.nr_scanned;
 			if (zone->all_unreclaimable)
 				continue;
@@ -1651,10 +1649,10 @@ scan:
 			 * even in laptop mode
 			 */
 			if (total_scanned > SWAP_CLUSTER_MAX * 2 &&
-			    total_scanned > total_reclaimed+total_reclaimed/2)
+			    total_scanned > nr_reclaimed + nr_reclaimed / 2)
 				sc.may_writepage = 1;
 		}
-		if (nr_pages && to_free > total_reclaimed)
+		if (nr_pages && to_free > nr_reclaimed)
 			continue;	/* swsusp: need to do more work */
 		if (all_zones_ok)
 			break;		/* kswapd: all done */
@@ -1671,7 +1669,7 @@ scan:
 		 * matches the direct reclaim path behaviour in terms of impact
 		 * on zone->*_priority.
 		 */
-		if ((total_reclaimed >= SWAP_CLUSTER_MAX) && (!nr_pages))
+		if ((nr_reclaimed >= SWAP_CLUSTER_MAX) && !nr_pages)
 			break;
 	}
 out:
@@ -1685,7 +1683,7 @@ out:
 		goto loop_again;
 	}
 
-	return total_reclaimed;
+	return nr_reclaimed;
 }
 
 /*
@@ -1891,6 +1889,7 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
 	struct task_struct *p = current;
 	struct reclaim_state reclaim_state;
 	int priority;
+	unsigned long nr_reclaimed = 0;
 	struct scan_control sc = {
 		.may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE),
 		.may_swap = !!(zone_reclaim_mode & RECLAIM_SWAP),
@@ -1917,11 +1916,11 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
 	 */
 	priority = ZONE_RECLAIM_PRIORITY;
 	do {
-		shrink_zone(priority, zone, &sc);
+		nr_reclaimed += shrink_zone(priority, zone, &sc);
 		priority--;
-	} while (priority >= 0 && sc.nr_reclaimed < nr_pages);
+	} while (priority >= 0 && nr_reclaimed < nr_pages);
 
-	if (sc.nr_reclaimed < nr_pages && (zone_reclaim_mode & RECLAIM_SLAB)) {
+	if (nr_reclaimed < nr_pages && (zone_reclaim_mode & RECLAIM_SLAB)) {
 		/*
 		 * shrink_slab does not currently allow us to determine
 		 * how many pages were freed in the zone. So we just
@@ -1936,10 +1935,10 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
 	p->reclaim_state = NULL;
 	current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE);
 
-	if (sc.nr_reclaimed == 0)
+	if (nr_reclaimed == 0)
 		zone->last_unsuccessful_zone_reclaim = jiffies;
 
-	return sc.nr_reclaimed >= nr_pages;
+	return nr_reclaimed >= nr_pages;
 }
 
 int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
-- 
cgit v1.2.3


From 1742f19fa920cdd6905f0db5898524dde22ab2a4 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Wed, 22 Mar 2006 00:08:21 -0800
Subject: [PATCH] vmscan: rename functions

We have:

	try_to_free_pages
	->shrink_caches(struct zone **zones, ..)
	  ->shrink_zone(struct zone *, ...)
	    ->shrink_cache(struct zone *, ...)
	      ->shrink_list(struct list_head *, ...)
	    ->refill_inactive_list((struct zone *, ...)

which is fairly irrational.

Rename things so that we have

 	try_to_free_pages
 	->shrink_zones(struct zone **zones, ..)
 	  ->shrink_zone(struct zone *, ...)
 	    ->shrink_inactive_list(struct zone *, ...)
 	      ->shrink_page_list(struct list_head *, ...)
	    ->shrink_active_list(struct zone *, ...)

Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Cc: Christoph Lameter <christoph@lameter.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/vmscan.c | 32 +++++++++++++++++---------------
 1 file changed, 17 insertions(+), 15 deletions(-)

(limited to 'mm')

diff --git a/mm/vmscan.c b/mm/vmscan.c
index 8f6ad13d34f5..2d5d4864de88 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -298,7 +298,8 @@ static void handle_write_error(struct address_space *mapping,
 }
 
 /*
- * pageout is called by shrink_list() for each dirty page. Calls ->writepage().
+ * pageout is called by shrink_page_list() for each dirty page.
+ * Calls ->writepage().
  */
 static pageout_t pageout(struct page *page, struct address_space *mapping)
 {
@@ -406,10 +407,10 @@ cannot_free:
 }
 
 /*
- * shrink_list return the number of reclaimed pages
+ * shrink_page_list() returns the number of reclaimed pages
  */
-static unsigned long shrink_list(struct list_head *page_list,
-				struct scan_control *sc)
+static unsigned long shrink_page_list(struct list_head *page_list,
+					struct scan_control *sc)
 {
 	LIST_HEAD(ret_pages);
 	struct pagevec freed_pvec;
@@ -1103,10 +1104,11 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
 }
 
 /*
- * shrink_cache() return the number of reclaimed pages
+ * shrink_inactive_list() is a helper for shrink_zone().  It returns the number
+ * of reclaimed pages
  */
-static unsigned long shrink_cache(unsigned long max_scan, struct zone *zone,
-				struct scan_control *sc)
+static unsigned long shrink_inactive_list(unsigned long max_scan,
+				struct zone *zone, struct scan_control *sc)
 {
 	LIST_HEAD(page_list);
 	struct pagevec pvec;
@@ -1134,7 +1136,7 @@ static unsigned long shrink_cache(unsigned long max_scan, struct zone *zone,
 			goto done;
 
 		nr_scanned += nr_scan;
-		nr_freed = shrink_list(&page_list, sc);
+		nr_freed = shrink_page_list(&page_list, sc);
 		nr_reclaimed += nr_freed;
 		local_irq_disable();
 		if (current_is_kswapd()) {
@@ -1187,9 +1189,8 @@ done:
  * The downside is that we have to touch page->_count against each page.
  * But we had to alter page->flags anyway.
  */
-static void
-refill_inactive_zone(unsigned long nr_pages, struct zone *zone,
-			struct scan_control *sc)
+static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
+				struct scan_control *sc)
 {
 	unsigned long pgmoved;
 	int pgdeactivate = 0;
@@ -1360,14 +1361,15 @@ static unsigned long shrink_zone(int priority, struct zone *zone,
 			nr_to_scan = min(nr_active,
 					(unsigned long)sc->swap_cluster_max);
 			nr_active -= nr_to_scan;
-			refill_inactive_zone(nr_to_scan, zone, sc);
+			shrink_active_list(nr_to_scan, zone, sc);
 		}
 
 		if (nr_inactive) {
 			nr_to_scan = min(nr_inactive,
 					(unsigned long)sc->swap_cluster_max);
 			nr_inactive -= nr_to_scan;
-			nr_reclaimed += shrink_cache(nr_to_scan, zone, sc);
+			nr_reclaimed += shrink_inactive_list(nr_to_scan, zone,
+								sc);
 		}
 	}
 
@@ -1393,7 +1395,7 @@ static unsigned long shrink_zone(int priority, struct zone *zone,
  * If a zone is deemed to be full of pinned pages then just give it a light
  * scan then give up on it.
  */
-static unsigned long shrink_caches(int priority, struct zone **zones,
+static unsigned long shrink_zones(int priority, struct zone **zones,
 					struct scan_control *sc)
 {
 	unsigned long nr_reclaimed = 0;
@@ -1466,7 +1468,7 @@ unsigned long try_to_free_pages(struct zone **zones, gfp_t gfp_mask)
 		sc.nr_scanned = 0;
 		if (!priority)
 			disable_swap_token();
-		nr_reclaimed += shrink_caches(priority, zones, &sc);
+		nr_reclaimed += shrink_zones(priority, zones, &sc);
 		shrink_slab(sc.nr_scanned, gfp_mask, lru_pages);
 		if (reclaim_state) {
 			nr_reclaimed += reclaim_state->reclaimed_slab;
-- 
cgit v1.2.3


From 7fb2d46d396b2491818f8e43b01049b3234e6c07 Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@engr.sgi.com>
Date: Wed, 22 Mar 2006 00:08:22 -0800
Subject: [PATCH] zone_reclaim: additional comments and cleanup

Add some comments to explain how zone reclaim works.  And it fixes the
following issues:

- PF_SWAPWRITE needs to be set for RECLAIM_SWAP to be able to write
  out pages to swap. Currently RECLAIM_SWAP may not do that.

- remove setting nr_reclaimed pages after slab reclaim since the slab shrinking
  code does not use that and the nr_reclaimed pages is just right for the
  intended follow up action.

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/vmscan.c | 18 ++++++++++++++----
 1 file changed, 14 insertions(+), 4 deletions(-)

(limited to 'mm')

diff --git a/mm/vmscan.c b/mm/vmscan.c
index 2d5d4864de88..c712b946e4ff 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1887,6 +1887,7 @@ int zone_reclaim_interval __read_mostly = 30*HZ;
  */
 static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
 {
+	/* Minimum pages needed in order to stay on node */
 	const unsigned long nr_pages = 1 << order;
 	struct task_struct *p = current;
 	struct reclaim_state reclaim_state;
@@ -1924,9 +1925,12 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
 
 	if (nr_reclaimed < nr_pages && (zone_reclaim_mode & RECLAIM_SLAB)) {
 		/*
-		 * shrink_slab does not currently allow us to determine
-		 * how many pages were freed in the zone. So we just
-		 * shake the slab and then go offnode for a single allocation.
+		 * shrink_slab() does not currently allow us to determine how
+		 * many pages were freed in this zone. So we just shake the slab
+		 * a bit and then go off node for this particular allocation
+		 * despite possibly having freed enough memory to allocate in
+		 * this zone.  If we freed local memory then the next
+		 * allocations will be local again.
 		 *
 		 * shrink_slab will free memory on all zones and may take
 		 * a long time.
@@ -1937,8 +1941,14 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
 	p->reclaim_state = NULL;
 	current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE);
 
-	if (nr_reclaimed == 0)
+	if (nr_reclaimed == 0) {
+		/*
+		 * We were unable to reclaim enough pages to stay on node.  We
+		 * now allow off node accesses for a certain time period before
+		 * trying again to reclaim pages from the local zone.
+		 */
 		zone->last_unsuccessful_zone_reclaim = jiffies;
+	}
 
 	return nr_reclaimed >= nr_pages;
 }
-- 
cgit v1.2.3


From c9b02d970c385a253edb36c87643b0df706b50b4 Mon Sep 17 00:00:00 2001
From: Wu Fengguang <wfg@mail.ustc.edu.cn>
Date: Wed, 22 Mar 2006 00:08:23 -0800
Subject: [PATCH] mm: isolate_lru_pages() scan count fix

In isolate_lru_pages(), *scanned reports one more scan because the scan
counter is increased one more time on exit of the while-loop.

Change the while-loop to for-loop to fix it.

Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Wu Fengguang <wfg@mail.ustc.edu.cn>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/vmscan.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'mm')

diff --git a/mm/vmscan.c b/mm/vmscan.c
index c712b946e4ff..85e95f445022 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1074,9 +1074,9 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
 {
 	unsigned long nr_taken = 0;
 	struct page *page;
-	unsigned long scan = 0;
+	unsigned long scan;
 
-	while (scan++ < nr_to_scan && !list_empty(src)) {
+	for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) {
 		struct list_head *target;
 		page = lru_to_page(src);
 		prefetchw_prev_lru_page(page, src, flags);
-- 
cgit v1.2.3


From fb8d14e172a29ba5ac69a73b61196be86fdfc3e1 Mon Sep 17 00:00:00 2001
From: Wu Fengguang <wfg@mail.ustc.edu.cn>
Date: Wed, 22 Mar 2006 00:08:28 -0800
Subject: [PATCH] mm: shrink_inactive_lis() nr_scan accounting fix

In shrink_inactive_list(), nr_scan is not accounted when nr_taken is 0.
But 0 pages taken does not mean 0 pages scanned.

Move the goto statement below the accounting code to fix it.

Signed-off-by: Wu Fengguang <wfg@mail.ustc.edu.cn>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/vmscan.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

(limited to 'mm')

diff --git a/mm/vmscan.c b/mm/vmscan.c
index 85e95f445022..486184d2b50c 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1132,9 +1132,6 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
 		zone->pages_scanned += nr_scan;
 		spin_unlock_irq(&zone->lru_lock);
 
-		if (nr_taken == 0)
-			goto done;
-
 		nr_scanned += nr_scan;
 		nr_freed = shrink_page_list(&page_list, sc);
 		nr_reclaimed += nr_freed;
@@ -1146,6 +1143,9 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
 			__mod_page_state_zone(zone, pgscan_direct, nr_scan);
 		__mod_page_state_zone(zone, pgsteal, nr_freed);
 
+		if (nr_taken == 0)
+			goto done;
+
 		spin_lock(&zone->lru_lock);
 		/*
 		 * Put back any unfreeable pages.
@@ -1166,8 +1166,9 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
 			}
 		}
   	} while (nr_scanned < max_scan);
-	spin_unlock_irq(&zone->lru_lock);
+	spin_unlock(&zone->lru_lock);
 done:
+	local_irq_enable();
 	pagevec_release(&pvec);
 	return nr_reclaimed;
 }
-- 
cgit v1.2.3


From a6f563db09c54c80d80e9013182dc512a5e53d0f Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hugh@veritas.com>
Date: Wed, 22 Mar 2006 00:08:29 -0800
Subject: [PATCH] remove VM_DONTCOPY bogosities

Now that it's madvisable, remove two pieces of VM_DONTCOPY bogosity:

1. There was and is no logical reason why VM_DONTCOPY should be in the
   list of flags which forbid vma merging (and those drivers which set
   it are also setting VM_IO, which itself forbids the merge).

2. It's hard to understand the purpose of the VM_HUGETLB, VM_DONTCOPY
   block in vm_stat_account: but never mind, it's under CONFIG_HUGETLB,
   which (unlike CONFIG_HUGETLB_PAGE or CONFIG_HUGETLBFS) has never been
   defined.

Signed-off-by: Hugh Dickins <hugh@veritas.com>
Cc: William Lee Irwin III <wli@holomorphy.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/mmap.c | 10 +---------
 1 file changed, 1 insertion(+), 9 deletions(-)

(limited to 'mm')

diff --git a/mm/mmap.c b/mm/mmap.c
index 47556d2b3e90..0eb9894db6de 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -612,7 +612,7 @@ again:			remove_next = 1 + (end > next->vm_end);
  * If the vma has a ->close operation then the driver probably needs to release
  * per-vma resources, so we don't attempt to merge those.
  */
-#define VM_SPECIAL (VM_IO | VM_DONTCOPY | VM_DONTEXPAND | VM_RESERVED | VM_PFNMAP)
+#define VM_SPECIAL (VM_IO | VM_DONTEXPAND | VM_RESERVED | VM_PFNMAP)
 
 static inline int is_mergeable_vma(struct vm_area_struct *vma,
 			struct file *file, unsigned long vm_flags)
@@ -845,14 +845,6 @@ void vm_stat_account(struct mm_struct *mm, unsigned long flags,
 	const unsigned long stack_flags
 		= VM_STACK_FLAGS & (VM_GROWSUP|VM_GROWSDOWN);
 
-#ifdef CONFIG_HUGETLB
-	if (flags & VM_HUGETLB) {
-		if (!(flags & VM_DONTCOPY))
-			mm->shared_vm += pages;
-		return;
-	}
-#endif /* CONFIG_HUGETLB */
-
 	if (file) {
 		mm->shared_vm += pages;
 		if ((flags & (VM_EXEC|VM_WRITE)) == VM_EXEC)
-- 
cgit v1.2.3


From 0f8053a509ceba4a077a50ea7b77039b5559b428 Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@suse.de>
Date: Wed, 22 Mar 2006 00:08:33 -0800
Subject: [PATCH] mm: make __put_page internal

Remove __put_page from outside the core mm/.  It is dangerous because it does
not handle compound pages nicely, and misses 1->0 transitions.  If a user
later appears that really needs the extra speed we can reevaluate.

Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/mm.h |  1 -
 mm/filemap.c       |  2 ++
 mm/internal.h      | 11 +++++++++++
 mm/vmscan.c        |  2 ++
 4 files changed, 15 insertions(+), 1 deletion(-)

(limited to 'mm')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 1850cf8bad64..9b3cdfc8046d 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -308,7 +308,6 @@ static inline int get_page_unless_zero(struct page *page)
 }
 
 #define set_page_count(p,v) 	atomic_set(&(p)->_count, (v))
-#define __put_page(p)		atomic_dec(&(p)->_count)
 
 extern void FASTCALL(__page_cache_release(struct page *));
 
diff --git a/mm/filemap.c b/mm/filemap.c
index 44da3d476994..e8f58f7dd7a5 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -30,6 +30,8 @@
 #include <linux/security.h>
 #include <linux/syscalls.h>
 #include "filemap.h"
+#include "internal.h"
+
 /*
  * FIXME: remove all knowledge of the buffer layer from the core VM
  */
diff --git a/mm/internal.h b/mm/internal.h
index 17256bb2f4ef..e3042db2a2d6 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -8,6 +8,10 @@
  * as published by the Free Software Foundation; either version
  * 2 of the License, or (at your option) any later version.
  */
+#ifndef __MM_INTERNAL_H
+#define __MM_INTERNAL_H
+
+#include <linux/mm.h>
 
 static inline void set_page_refs(struct page *page, int order)
 {
@@ -26,5 +30,12 @@ static inline void set_page_refs(struct page *page, int order)
 #endif /* CONFIG_MMU */
 }
 
+static inline void __put_page(struct page *page)
+{
+	atomic_dec(&page->_count);
+}
+
 extern void fastcall __init __free_pages_bootmem(struct page *page,
 						unsigned int order);
+
+#endif
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 486184d2b50c..3914a94aa905 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -39,6 +39,8 @@
 
 #include <linux/swapops.h>
 
+#include "internal.h"
+
 /* possible outcome of pageout() */
 typedef enum {
 	/* failed to write page out, page is locked */
-- 
cgit v1.2.3


From 84097518d1ecd2330f9488e4c2d09953a3340e74 Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@suse.de>
Date: Wed, 22 Mar 2006 00:08:34 -0800
Subject: [PATCH] mm: nommu use compound pages

Now that compound page handling is properly fixed in the VM, move nommu
over to using compound pages rather than rolling their own refcounting.

nommu vm page refcounting is broken anyway, but there is no need to have
divergent code in the core VM now, nor when it gets fixed.

Signed-off-by: Nick Piggin <npiggin@suse.de>
Cc: David Howells <dhowells@redhat.com>

(Needs testing, please).
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/ramfs/file-nommu.c |  3 +--
 include/linux/mm.h    |  4 ----
 mm/internal.h         | 12 ------------
 mm/nommu.c            |  4 ++--
 mm/page_alloc.c       |  7 -------
 mm/slab.c             |  9 ++++++++-
 6 files changed, 11 insertions(+), 28 deletions(-)

(limited to 'mm')

diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c
index 3f810acd0bfa..b1ca234068f6 100644
--- a/fs/ramfs/file-nommu.c
+++ b/fs/ramfs/file-nommu.c
@@ -87,8 +87,7 @@ static int ramfs_nommu_expand_for_mapping(struct inode *inode, size_t newsize)
 	xpages = 1UL << order;
 	npages = (newsize + PAGE_SIZE - 1) >> PAGE_SHIFT;
 
-	for (loop = 0; loop < npages; loop++)
-		set_page_count(pages + loop, 1);
+	split_page(pages, order);
 
 	/* trim off any pages we don't actually require */
 	for (loop = npages; loop < xpages; loop++)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 9b3cdfc8046d..3d84b7a35e0d 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -327,11 +327,7 @@ static inline void get_page(struct page *page)
 
 void put_page(struct page *page);
 
-#ifdef CONFIG_MMU
 void split_page(struct page *page, unsigned int order);
-#else
-static inline void split_page(struct page *page, unsigned int order) {}
-#endif
 
 /*
  * Multiple processes may "see" the same page. E.g. for untouched
diff --git a/mm/internal.h b/mm/internal.h
index e3042db2a2d6..7bb339779818 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -15,19 +15,7 @@
 
 static inline void set_page_refs(struct page *page, int order)
 {
-#ifdef CONFIG_MMU
 	set_page_count(page, 1);
-#else
-	int i;
-
-	/*
-	 * We need to reference all the pages for this order, otherwise if
-	 * anyone accesses one of the pages with (get/put) it will be freed.
-	 * - eg: access_process_vm()
-	 */
-	for (i = 0; i < (1 << order); i++)
-		set_page_count(page + i, 1);
-#endif /* CONFIG_MMU */
 }
 
 static inline void __put_page(struct page *page)
diff --git a/mm/nommu.c b/mm/nommu.c
index 4951f4786f28..db45efac17cc 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -159,7 +159,7 @@ void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot)
 	/*
 	 * kmalloc doesn't like __GFP_HIGHMEM for some reason
 	 */
-	return kmalloc(size, gfp_mask & ~__GFP_HIGHMEM);
+	return kmalloc(size, (gfp_mask | __GFP_COMP) & ~__GFP_HIGHMEM);
 }
 
 struct page * vmalloc_to_page(void *addr)
@@ -623,7 +623,7 @@ static int do_mmap_private(struct vm_area_struct *vma, unsigned long len)
 	 * - note that this may not return a page-aligned address if the object
 	 *   we're allocating is smaller than a page
 	 */
-	base = kmalloc(len, GFP_KERNEL);
+	base = kmalloc(len, GFP_KERNEL|__GFP_COMP);
 	if (!base)
 		goto enomem;
 
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 7aa0181287e1..e197818a7cf6 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -422,11 +422,6 @@ static void __free_pages_ok(struct page *page, unsigned int order)
 		mutex_debug_check_no_locks_freed(page_address(page),
 						 PAGE_SIZE<<order);
 
-#ifndef CONFIG_MMU
-	for (i = 1 ; i < (1 << order) ; ++i)
-		__put_page(page + i);
-#endif
-
 	for (i = 0 ; i < (1 << order) ; ++i)
 		reserved += free_pages_check(page + i);
 	if (reserved)
@@ -746,7 +741,6 @@ static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags)
 		clear_highpage(page + i);
 }
 
-#ifdef CONFIG_MMU
 /*
  * split_page takes a non-compound higher-order page, and splits it into
  * n (1<<order) sub-pages: page[0..n]
@@ -766,7 +760,6 @@ void split_page(struct page *page, unsigned int order)
 		set_page_count(page + i, 1);
 	}
 }
-#endif
 
 /*
  * Really, prep_compound_page() should be called from __rmqueue_bulk().  But
diff --git a/mm/slab.c b/mm/slab.c
index f477acfb732f..ff0ab772f49d 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -590,6 +590,8 @@ static inline void page_set_cache(struct page *page, struct kmem_cache *cache)
 
 static inline struct kmem_cache *page_get_cache(struct page *page)
 {
+	if (unlikely(PageCompound(page)))
+		page = (struct page *)page_private(page);
 	return (struct kmem_cache *)page->lru.next;
 }
 
@@ -600,6 +602,8 @@ static inline void page_set_slab(struct page *page, struct slab *slab)
 
 static inline struct slab *page_get_slab(struct page *page)
 {
+	if (unlikely(PageCompound(page)))
+		page = (struct page *)page_private(page);
 	return (struct slab *)page->lru.prev;
 }
 
@@ -2412,8 +2416,11 @@ static void set_slab_attr(struct kmem_cache *cachep, struct slab *slabp,
 	struct page *page;
 
 	/* Nasty!!!!!! I hope this is OK. */
-	i = 1 << cachep->gfporder;
 	page = virt_to_page(objp);
+
+	i = 1;
+	if (likely(!PageCompound(page)))
+		i <<= cachep->gfporder;
 	do {
 		page_set_cache(page, cachep);
 		page_set_slab(page, slabp);
-- 
cgit v1.2.3


From 7835e98b2e3c66dba79cb0ff8ebb90a2fe030c29 Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@suse.de>
Date: Wed, 22 Mar 2006 00:08:40 -0800
Subject: [PATCH] remove set_page_count() outside mm/

set_page_count usage outside mm/ is limited to setting the refcount to 1.
Remove set_page_count from outside mm/, and replace those users with
init_page_count() and set_page_refcounted().

This allows more debug checking, and tighter control on how code is allowed
to play around with page->_count.

Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/alpha/mm/init.c                   |  2 +-
 arch/arm/mm/init.c                     |  2 +-
 arch/arm26/mm/init.c                   |  2 +-
 arch/cris/mm/init.c                    |  2 +-
 arch/frv/mm/init.c                     |  6 +++---
 arch/h8300/mm/init.c                   |  4 ++--
 arch/i386/mm/init.c                    |  6 +++---
 arch/ia64/mm/init.c                    |  6 +++---
 arch/m32r/mm/init.c                    |  4 ++--
 arch/m68k/mm/init.c                    |  2 +-
 arch/m68k/mm/memory.c                  |  2 +-
 arch/m68k/mm/motorola.c                |  2 +-
 arch/m68knommu/mm/init.c               |  4 ++--
 arch/mips/arc/memory.c                 |  2 +-
 arch/mips/dec/prom/memory.c            |  2 +-
 arch/mips/mips-boards/generic/memory.c |  2 +-
 arch/mips/mips-boards/sim/sim_mem.c    |  2 +-
 arch/mips/mm/init.c                    |  6 +++---
 arch/mips/sgi-ip27/ip27-memory.c       |  2 +-
 arch/parisc/mm/init.c                  |  4 ++--
 arch/powerpc/mm/init_32.c              |  4 ++--
 arch/powerpc/mm/init_64.c              |  4 ++--
 arch/powerpc/mm/mem.c                  |  4 ++--
 arch/powerpc/platforms/cell/setup.c    |  2 +-
 arch/ppc/mm/init.c                     |  6 +++---
 arch/s390/mm/init.c                    |  4 ++--
 arch/sh/mm/init.c                      |  4 ++--
 arch/sh64/mm/init.c                    |  4 ++--
 arch/sparc/kernel/sun4d_smp.c          |  6 +++---
 arch/sparc/kernel/sun4m_smp.c          |  6 +++---
 arch/sparc/mm/init.c                   |  6 +++---
 arch/sparc64/mm/init.c                 |  4 ++--
 arch/um/kernel/mem.c                   |  4 ++--
 arch/x86_64/mm/init.c                  |  6 +++---
 arch/xtensa/mm/init.c                  |  2 +-
 drivers/video/acornfb.c                |  2 +-
 include/linux/mm.h                     | 11 +++++++++--
 mm/hugetlb.c                           |  5 +++--
 mm/internal.h                          | 13 ++++++++++++-
 mm/page_alloc.c                        | 14 ++++++--------
 40 files changed, 96 insertions(+), 79 deletions(-)

(limited to 'mm')

diff --git a/arch/alpha/mm/init.c b/arch/alpha/mm/init.c
index 486d7945583d..544ac5dc09eb 100644
--- a/arch/alpha/mm/init.c
+++ b/arch/alpha/mm/init.c
@@ -357,7 +357,7 @@ free_reserved_mem(void *start, void *end)
 	void *__start = start;
 	for (; __start < end; __start += PAGE_SIZE) {
 		ClearPageReserved(virt_to_page(__start));
-		set_page_count(virt_to_page(__start), 1);
+		init_page_count(virt_to_page(__start));
 		free_page((long)__start);
 		totalram_pages++;
 	}
diff --git a/arch/arm/mm/init.c b/arch/arm/mm/init.c
index 8b276ee38acf..b0321e943b76 100644
--- a/arch/arm/mm/init.c
+++ b/arch/arm/mm/init.c
@@ -531,7 +531,7 @@ static inline void free_area(unsigned long addr, unsigned long end, char *s)
 	for (; addr < end; addr += PAGE_SIZE) {
 		struct page *page = virt_to_page(addr);
 		ClearPageReserved(page);
-		set_page_count(page, 1);
+		init_page_count(page);
 		free_page(addr);
 		totalram_pages++;
 	}
diff --git a/arch/arm26/mm/init.c b/arch/arm26/mm/init.c
index 1f09a9d0fb83..e3ecaa453747 100644
--- a/arch/arm26/mm/init.c
+++ b/arch/arm26/mm/init.c
@@ -324,7 +324,7 @@ static inline void free_area(unsigned long addr, unsigned long end, char *s)
 	for (; addr < end; addr += PAGE_SIZE) {
 		struct page *page = virt_to_page(addr);
 		ClearPageReserved(page);
-		set_page_count(page, 1);
+		init_page_count(page);
 		free_page(addr);
 		totalram_pages++;
 	}
diff --git a/arch/cris/mm/init.c b/arch/cris/mm/init.c
index 31a0018b525a..b7842ff213a6 100644
--- a/arch/cris/mm/init.c
+++ b/arch/cris/mm/init.c
@@ -216,7 +216,7 @@ free_initmem(void)
         addr = (unsigned long)(&__init_begin);
         for (; addr < (unsigned long)(&__init_end); addr += PAGE_SIZE) {
                 ClearPageReserved(virt_to_page(addr));
-                set_page_count(virt_to_page(addr), 1);
+                init_page_count(virt_to_page(addr));
                 free_page(addr);
                 totalram_pages++;
         }
diff --git a/arch/frv/mm/init.c b/arch/frv/mm/init.c
index 765088ea8a50..8899aa1a4f06 100644
--- a/arch/frv/mm/init.c
+++ b/arch/frv/mm/init.c
@@ -169,7 +169,7 @@ void __init mem_init(void)
 		struct page *page = &mem_map[pfn];
 
 		ClearPageReserved(page);
-		set_page_count(page, 1);
+		init_page_count(page);
 		__free_page(page);
 		totalram_pages++;
 	}
@@ -210,7 +210,7 @@ void __init free_initmem(void)
 	/* next to check that the page we free is not a partial page */
 	for (addr = start; addr < end; addr += PAGE_SIZE) {
 		ClearPageReserved(virt_to_page(addr));
-		set_page_count(virt_to_page(addr), 1);
+		init_page_count(virt_to_page(addr));
 		free_page(addr);
 		totalram_pages++;
 	}
@@ -230,7 +230,7 @@ void __init free_initrd_mem(unsigned long start, unsigned long end)
 	int pages = 0;
 	for (; start < end; start += PAGE_SIZE) {
 		ClearPageReserved(virt_to_page(start));
-		set_page_count(virt_to_page(start), 1);
+		init_page_count(virt_to_page(start));
 		free_page(start);
 		totalram_pages++;
 		pages++;
diff --git a/arch/h8300/mm/init.c b/arch/h8300/mm/init.c
index 1e0929ddc8c4..09efc4b1f038 100644
--- a/arch/h8300/mm/init.c
+++ b/arch/h8300/mm/init.c
@@ -196,7 +196,7 @@ void free_initrd_mem(unsigned long start, unsigned long end)
 	int pages = 0;
 	for (; start < end; start += PAGE_SIZE) {
 		ClearPageReserved(virt_to_page(start));
-		set_page_count(virt_to_page(start), 1);
+		init_page_count(virt_to_page(start));
 		free_page(start);
 		totalram_pages++;
 		pages++;
@@ -219,7 +219,7 @@ free_initmem()
 	/* next to check that the page we free is not a partial page */
 	for (; addr + PAGE_SIZE < (unsigned long)(&__init_end); addr +=PAGE_SIZE) {
 		ClearPageReserved(virt_to_page(addr));
-		set_page_count(virt_to_page(addr), 1);
+		init_page_count(virt_to_page(addr));
 		free_page(addr);
 		totalram_pages++;
 	}
diff --git a/arch/i386/mm/init.c b/arch/i386/mm/init.c
index 2700f01994ba..7ba55a6e2dbc 100644
--- a/arch/i386/mm/init.c
+++ b/arch/i386/mm/init.c
@@ -270,7 +270,7 @@ static void __init permanent_kmaps_init(pgd_t *pgd_base)
 
 static void __meminit free_new_highpage(struct page *page)
 {
-	set_page_count(page, 1);
+	init_page_count(page);
 	__free_page(page);
 	totalhigh_pages++;
 }
@@ -727,7 +727,7 @@ void free_initmem(void)
 	addr = (unsigned long)(&__init_begin);
 	for (; addr < (unsigned long)(&__init_end); addr += PAGE_SIZE) {
 		ClearPageReserved(virt_to_page(addr));
-		set_page_count(virt_to_page(addr), 1);
+		init_page_count(virt_to_page(addr));
 		memset((void *)addr, 0xcc, PAGE_SIZE);
 		free_page(addr);
 		totalram_pages++;
@@ -766,7 +766,7 @@ void free_initrd_mem(unsigned long start, unsigned long end)
 		printk (KERN_INFO "Freeing initrd memory: %ldk freed\n", (end - start) >> 10);
 	for (; start < end; start += PAGE_SIZE) {
 		ClearPageReserved(virt_to_page(start));
-		set_page_count(virt_to_page(start), 1);
+		init_page_count(virt_to_page(start));
 		free_page(start);
 		totalram_pages++;
 	}
diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c
index b38b6d213c15..08d94e6bfa18 100644
--- a/arch/ia64/mm/init.c
+++ b/arch/ia64/mm/init.c
@@ -197,7 +197,7 @@ free_initmem (void)
 	eaddr = (unsigned long) ia64_imva(__init_end);
 	while (addr < eaddr) {
 		ClearPageReserved(virt_to_page(addr));
-		set_page_count(virt_to_page(addr), 1);
+		init_page_count(virt_to_page(addr));
 		free_page(addr);
 		++totalram_pages;
 		addr += PAGE_SIZE;
@@ -252,7 +252,7 @@ free_initrd_mem (unsigned long start, unsigned long end)
 			continue;
 		page = virt_to_page(start);
 		ClearPageReserved(page);
-		set_page_count(page, 1);
+		init_page_count(page);
 		free_page(start);
 		++totalram_pages;
 	}
@@ -640,7 +640,7 @@ mem_init (void)
 void online_page(struct page *page)
 {
 	ClearPageReserved(page);
-	set_page_count(page, 1);
+	init_page_count(page);
 	__free_page(page);
 	totalram_pages++;
 	num_physpages++;
diff --git a/arch/m32r/mm/init.c b/arch/m32r/mm/init.c
index 6facf15b04f3..c9e7dad860b7 100644
--- a/arch/m32r/mm/init.c
+++ b/arch/m32r/mm/init.c
@@ -226,7 +226,7 @@ void free_initmem(void)
 	addr = (unsigned long)(&__init_begin);
 	for (; addr < (unsigned long)(&__init_end); addr += PAGE_SIZE) {
 		ClearPageReserved(virt_to_page(addr));
-		set_page_count(virt_to_page(addr), 1);
+		init_page_count(virt_to_page(addr));
 		free_page(addr);
 		totalram_pages++;
 	}
@@ -244,7 +244,7 @@ void free_initrd_mem(unsigned long start, unsigned long end)
 	unsigned long p;
 	for (p = start; p < end; p += PAGE_SIZE) {
 		ClearPageReserved(virt_to_page(p));
-		set_page_count(virt_to_page(p), 1);
+		init_page_count(virt_to_page(p));
 		free_page(p);
 		totalram_pages++;
 	}
diff --git a/arch/m68k/mm/init.c b/arch/m68k/mm/init.c
index c45beb955943..a190e39c907a 100644
--- a/arch/m68k/mm/init.c
+++ b/arch/m68k/mm/init.c
@@ -137,7 +137,7 @@ void free_initrd_mem(unsigned long start, unsigned long end)
 	int pages = 0;
 	for (; start < end; start += PAGE_SIZE) {
 		ClearPageReserved(virt_to_page(start));
-		set_page_count(virt_to_page(start), 1);
+		init_page_count(virt_to_page(start));
 		free_page(start);
 		totalram_pages++;
 		pages++;
diff --git a/arch/m68k/mm/memory.c b/arch/m68k/mm/memory.c
index 559942ce0e1e..d6d582a5abb0 100644
--- a/arch/m68k/mm/memory.c
+++ b/arch/m68k/mm/memory.c
@@ -54,7 +54,7 @@ void __init init_pointer_table(unsigned long ptable)
 
 	/* unreserve the page so it's possible to free that page */
 	PD_PAGE(dp)->flags &= ~(1 << PG_reserved);
-	set_page_count(PD_PAGE(dp), 1);
+	init_page_count(PD_PAGE(dp));
 
 	return;
 }
diff --git a/arch/m68k/mm/motorola.c b/arch/m68k/mm/motorola.c
index d855fec26317..afb57eeafdcb 100644
--- a/arch/m68k/mm/motorola.c
+++ b/arch/m68k/mm/motorola.c
@@ -276,7 +276,7 @@ void free_initmem(void)
 	addr = (unsigned long)&__init_begin;
 	for (; addr < (unsigned long)&__init_end; addr += PAGE_SIZE) {
 		virt_to_page(addr)->flags &= ~(1 << PG_reserved);
-		set_page_count(virt_to_page(addr), 1);
+		init_page_count(virt_to_page(addr));
 		free_page(addr);
 		totalram_pages++;
 	}
diff --git a/arch/m68knommu/mm/init.c b/arch/m68knommu/mm/init.c
index 89f0b554ffb7..d79503fe6e42 100644
--- a/arch/m68knommu/mm/init.c
+++ b/arch/m68knommu/mm/init.c
@@ -195,7 +195,7 @@ void free_initrd_mem(unsigned long start, unsigned long end)
 	int pages = 0;
 	for (; start < end; start += PAGE_SIZE) {
 		ClearPageReserved(virt_to_page(start));
-		set_page_count(virt_to_page(start), 1);
+		init_page_count(virt_to_page(start));
 		free_page(start);
 		totalram_pages++;
 		pages++;
@@ -218,7 +218,7 @@ free_initmem()
 	/* next to check that the page we free is not a partial page */
 	for (; addr + PAGE_SIZE < (unsigned long)(&__init_end); addr +=PAGE_SIZE) {
 		ClearPageReserved(virt_to_page(addr));
-		set_page_count(virt_to_page(addr), 1);
+		init_page_count(virt_to_page(addr));
 		free_page(addr);
 		totalram_pages++;
 	}
diff --git a/arch/mips/arc/memory.c b/arch/mips/arc/memory.c
index 958d2eb78862..8a9ef58cc399 100644
--- a/arch/mips/arc/memory.c
+++ b/arch/mips/arc/memory.c
@@ -158,7 +158,7 @@ unsigned long __init prom_free_prom_memory(void)
 		while (addr < boot_mem_map.map[i].addr
 			      + boot_mem_map.map[i].size) {
 			ClearPageReserved(virt_to_page(__va(addr)));
-			set_page_count(virt_to_page(__va(addr)), 1);
+			init_page_count(virt_to_page(__va(addr)));
 			free_page((unsigned long)__va(addr));
 			addr += PAGE_SIZE;
 			freed += PAGE_SIZE;
diff --git a/arch/mips/dec/prom/memory.c b/arch/mips/dec/prom/memory.c
index 81cb5a76cfb7..1edaf3074ee9 100644
--- a/arch/mips/dec/prom/memory.c
+++ b/arch/mips/dec/prom/memory.c
@@ -118,7 +118,7 @@ unsigned long __init prom_free_prom_memory(void)
 	addr = PAGE_SIZE;
 	while (addr < end) {
 		ClearPageReserved(virt_to_page(__va(addr)));
-		set_page_count(virt_to_page(__va(addr)), 1);
+		init_page_count(virt_to_page(__va(addr)));
 		free_page((unsigned long)__va(addr));
 		addr += PAGE_SIZE;
 	}
diff --git a/arch/mips/mips-boards/generic/memory.c b/arch/mips/mips-boards/generic/memory.c
index 2c8afd77a20b..ee5e70c95cf3 100644
--- a/arch/mips/mips-boards/generic/memory.c
+++ b/arch/mips/mips-boards/generic/memory.c
@@ -174,7 +174,7 @@ unsigned long __init prom_free_prom_memory(void)
 		while (addr < boot_mem_map.map[i].addr
 			      + boot_mem_map.map[i].size) {
 			ClearPageReserved(virt_to_page(__va(addr)));
-			set_page_count(virt_to_page(__va(addr)), 1);
+			init_page_count(virt_to_page(__va(addr)));
 			free_page((unsigned long)__va(addr));
 			addr += PAGE_SIZE;
 			freed += PAGE_SIZE;
diff --git a/arch/mips/mips-boards/sim/sim_mem.c b/arch/mips/mips-boards/sim/sim_mem.c
index 0dbd7435bb2a..1ec4e75656bd 100644
--- a/arch/mips/mips-boards/sim/sim_mem.c
+++ b/arch/mips/mips-boards/sim/sim_mem.c
@@ -117,7 +117,7 @@ unsigned long __init prom_free_prom_memory(void)
 		while (addr < boot_mem_map.map[i].addr
 			      + boot_mem_map.map[i].size) {
 			ClearPageReserved(virt_to_page(__va(addr)));
-			set_page_count(virt_to_page(__va(addr)), 1);
+			init_page_count(virt_to_page(__va(addr)));
 			free_page((unsigned long)__va(addr));
 			addr += PAGE_SIZE;
 			freed += PAGE_SIZE;
diff --git a/arch/mips/mm/init.c b/arch/mips/mm/init.c
index a140da9732db..52f7d59fe612 100644
--- a/arch/mips/mm/init.c
+++ b/arch/mips/mm/init.c
@@ -245,7 +245,7 @@ void __init mem_init(void)
 #ifdef CONFIG_LIMITED_DMA
 		set_page_address(page, lowmem_page_address(page));
 #endif
-		set_page_count(page, 1);
+		init_page_count(page);
 		__free_page(page);
 		totalhigh_pages++;
 	}
@@ -292,7 +292,7 @@ void free_initrd_mem(unsigned long start, unsigned long end)
 
 	for (; start < end; start += PAGE_SIZE) {
 		ClearPageReserved(virt_to_page(start));
-		set_page_count(virt_to_page(start), 1);
+		init_page_count(virt_to_page(start));
 		free_page(start);
 		totalram_pages++;
 	}
@@ -315,7 +315,7 @@ void free_initmem(void)
 		page = addr;
 #endif
 		ClearPageReserved(virt_to_page(page));
-		set_page_count(virt_to_page(page), 1);
+		init_page_count(virt_to_page(page));
 		free_page(page);
 		totalram_pages++;
 		freed += PAGE_SIZE;
diff --git a/arch/mips/sgi-ip27/ip27-memory.c b/arch/mips/sgi-ip27/ip27-memory.c
index ed93a9792959..e0d095daa5ed 100644
--- a/arch/mips/sgi-ip27/ip27-memory.c
+++ b/arch/mips/sgi-ip27/ip27-memory.c
@@ -559,7 +559,7 @@ void __init mem_init(void)
 				/* if (!page_is_ram(pgnr)) continue; */
 				/* commented out until page_is_ram works */
 				ClearPageReserved(p);
-				set_page_count(p, 1);
+				init_page_count(p);
 				__free_page(p);
 				totalram_pages++;
 			}
diff --git a/arch/parisc/mm/init.c b/arch/parisc/mm/init.c
index 7847ca13d6c2..852eda3953dc 100644
--- a/arch/parisc/mm/init.c
+++ b/arch/parisc/mm/init.c
@@ -398,7 +398,7 @@ void free_initmem(void)
 	addr = (unsigned long)(&__init_begin);
 	for (; addr < (unsigned long)(&__init_end); addr += PAGE_SIZE) {
 		ClearPageReserved(virt_to_page(addr));
-		set_page_count(virt_to_page(addr), 1);
+		init_page_count(virt_to_page(addr));
 		free_page(addr);
 		num_physpages++;
 		totalram_pages++;
@@ -1018,7 +1018,7 @@ void free_initrd_mem(unsigned long start, unsigned long end)
 		printk(KERN_INFO "Freeing initrd memory: %ldk freed\n", (end - start) >> 10);
 	for (; start < end; start += PAGE_SIZE) {
 		ClearPageReserved(virt_to_page(start));
-		set_page_count(virt_to_page(start), 1);
+		init_page_count(virt_to_page(start));
 		free_page(start);
 		num_physpages++;
 		totalram_pages++;
diff --git a/arch/powerpc/mm/init_32.c b/arch/powerpc/mm/init_32.c
index 7d0d75c11848..b57fb3a2b7bb 100644
--- a/arch/powerpc/mm/init_32.c
+++ b/arch/powerpc/mm/init_32.c
@@ -216,7 +216,7 @@ static void free_sec(unsigned long start, unsigned long end, const char *name)
 
 	while (start < end) {
 		ClearPageReserved(virt_to_page(start));
-		set_page_count(virt_to_page(start), 1);
+		init_page_count(virt_to_page(start));
 		free_page(start);
 		cnt++;
 		start += PAGE_SIZE;
@@ -248,7 +248,7 @@ void free_initrd_mem(unsigned long start, unsigned long end)
 		printk ("Freeing initrd memory: %ldk freed\n", (end - start) >> 10);
 	for (; start < end; start += PAGE_SIZE) {
 		ClearPageReserved(virt_to_page(start));
-		set_page_count(virt_to_page(start), 1);
+		init_page_count(virt_to_page(start));
 		free_page(start);
 		totalram_pages++;
 	}
diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c
index 81cfb0c2ec58..bacb71c89811 100644
--- a/arch/powerpc/mm/init_64.c
+++ b/arch/powerpc/mm/init_64.c
@@ -140,7 +140,7 @@ void free_initmem(void)
 	for (; addr < (unsigned long)__init_end; addr += PAGE_SIZE) {
 		memset((void *)addr, 0xcc, PAGE_SIZE);
 		ClearPageReserved(virt_to_page(addr));
-		set_page_count(virt_to_page(addr), 1);
+		init_page_count(virt_to_page(addr));
 		free_page(addr);
 		totalram_pages++;
 	}
@@ -155,7 +155,7 @@ void free_initrd_mem(unsigned long start, unsigned long end)
 		printk ("Freeing initrd memory: %ldk freed\n", (end - start) >> 10);
 	for (; start < end; start += PAGE_SIZE) {
 		ClearPageReserved(virt_to_page(start));
-		set_page_count(virt_to_page(start), 1);
+		init_page_count(virt_to_page(start));
 		free_page(start);
 		totalram_pages++;
 	}
diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
index 6ae5c130d0db..454cac01d8cc 100644
--- a/arch/powerpc/mm/mem.c
+++ b/arch/powerpc/mm/mem.c
@@ -108,7 +108,7 @@ EXPORT_SYMBOL(phys_mem_access_prot);
 void online_page(struct page *page)
 {
 	ClearPageReserved(page);
-	set_page_count(page, 1);
+	init_page_count(page);
 	__free_page(page);
 	totalram_pages++;
 	num_physpages++;
@@ -376,7 +376,7 @@ void __init mem_init(void)
 			struct page *page = pfn_to_page(pfn);
 
 			ClearPageReserved(page);
-			set_page_count(page, 1);
+			init_page_count(page);
 			__free_page(page);
 			totalhigh_pages++;
 		}
diff --git a/arch/powerpc/platforms/cell/setup.c b/arch/powerpc/platforms/cell/setup.c
index b33a4443f5a9..fec8e65b36ea 100644
--- a/arch/powerpc/platforms/cell/setup.c
+++ b/arch/powerpc/platforms/cell/setup.c
@@ -115,7 +115,7 @@ static void __init cell_spuprop_present(struct device_node *spe,
 		for (pfn = start_pfn; pfn < end_pfn; pfn++) {
 			struct page *page = pfn_to_page(pfn);
 			set_page_links(page, ZONE_DMA, node_id, pfn);
-			set_page_count(page, 1);
+			init_page_count(page);
 			reset_page_mapcount(page);
 			SetPageReserved(page);
 			INIT_LIST_HEAD(&page->lru);
diff --git a/arch/ppc/mm/init.c b/arch/ppc/mm/init.c
index 134db5c04203..cb1c294fb932 100644
--- a/arch/ppc/mm/init.c
+++ b/arch/ppc/mm/init.c
@@ -140,7 +140,7 @@ static void free_sec(unsigned long start, unsigned long end, const char *name)
 
 	while (start < end) {
 		ClearPageReserved(virt_to_page(start));
-		set_page_count(virt_to_page(start), 1);
+		init_page_count(virt_to_page(start));
 		free_page(start);
 		cnt++;
 		start += PAGE_SIZE;
@@ -172,7 +172,7 @@ void free_initrd_mem(unsigned long start, unsigned long end)
 
 	for (; start < end; start += PAGE_SIZE) {
 		ClearPageReserved(virt_to_page(start));
-		set_page_count(virt_to_page(start), 1);
+		init_page_count(virt_to_page(start));
 		free_page(start);
 		totalram_pages++;
 	}
@@ -441,7 +441,7 @@ void __init mem_init(void)
 			struct page *page = mem_map + pfn;
 
 			ClearPageReserved(page);
-			set_page_count(page, 1);
+			init_page_count(page);
 			__free_page(page);
 			totalhigh_pages++;
 		}
diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c
index df953383724d..a055894f3bd8 100644
--- a/arch/s390/mm/init.c
+++ b/arch/s390/mm/init.c
@@ -292,7 +292,7 @@ void free_initmem(void)
         addr = (unsigned long)(&__init_begin);
         for (; addr < (unsigned long)(&__init_end); addr += PAGE_SIZE) {
 		ClearPageReserved(virt_to_page(addr));
-		set_page_count(virt_to_page(addr), 1);
+		init_page_count(virt_to_page(addr));
 		free_page(addr);
 		totalram_pages++;
         }
@@ -307,7 +307,7 @@ void free_initrd_mem(unsigned long start, unsigned long end)
                 printk ("Freeing initrd memory: %ldk freed\n", (end - start) >> 10);
         for (; start < end; start += PAGE_SIZE) {
                 ClearPageReserved(virt_to_page(start));
-                set_page_count(virt_to_page(start), 1);
+                init_page_count(virt_to_page(start));
                 free_page(start);
                 totalram_pages++;
         }
diff --git a/arch/sh/mm/init.c b/arch/sh/mm/init.c
index e342565f75fb..77b4a838fe10 100644
--- a/arch/sh/mm/init.c
+++ b/arch/sh/mm/init.c
@@ -273,7 +273,7 @@ void free_initmem(void)
 	addr = (unsigned long)(&__init_begin);
 	for (; addr < (unsigned long)(&__init_end); addr += PAGE_SIZE) {
 		ClearPageReserved(virt_to_page(addr));
-		set_page_count(virt_to_page(addr), 1);
+		init_page_count(virt_to_page(addr));
 		free_page(addr);
 		totalram_pages++;
 	}
@@ -286,7 +286,7 @@ void free_initrd_mem(unsigned long start, unsigned long end)
 	unsigned long p;
 	for (p = start; p < end; p += PAGE_SIZE) {
 		ClearPageReserved(virt_to_page(p));
-		set_page_count(virt_to_page(p), 1);
+		init_page_count(virt_to_page(p));
 		free_page(p);
 		totalram_pages++;
 	}
diff --git a/arch/sh64/mm/init.c b/arch/sh64/mm/init.c
index a65e8bb2c3cc..1169757fb38b 100644
--- a/arch/sh64/mm/init.c
+++ b/arch/sh64/mm/init.c
@@ -173,7 +173,7 @@ void free_initmem(void)
 	addr = (unsigned long)(&__init_begin);
 	for (; addr < (unsigned long)(&__init_end); addr += PAGE_SIZE) {
 		ClearPageReserved(virt_to_page(addr));
-		set_page_count(virt_to_page(addr), 1);
+		init_page_count(virt_to_page(addr));
 		free_page(addr);
 		totalram_pages++;
 	}
@@ -186,7 +186,7 @@ void free_initrd_mem(unsigned long start, unsigned long end)
 	unsigned long p;
 	for (p = start; p < end; p += PAGE_SIZE) {
 		ClearPageReserved(virt_to_page(p));
-		set_page_count(virt_to_page(p), 1);
+		init_page_count(virt_to_page(p));
 		free_page(p);
 		totalram_pages++;
 	}
diff --git a/arch/sparc/kernel/sun4d_smp.c b/arch/sparc/kernel/sun4d_smp.c
index 40d426cce824..4219dd2ce3a2 100644
--- a/arch/sparc/kernel/sun4d_smp.c
+++ b/arch/sparc/kernel/sun4d_smp.c
@@ -266,19 +266,19 @@ void __init smp4d_boot_cpus(void)
 
 	/* Free unneeded trap tables */
 	ClearPageReserved(virt_to_page(trapbase_cpu1));
-	set_page_count(virt_to_page(trapbase_cpu1), 1);
+	init_page_count(virt_to_page(trapbase_cpu1));
 	free_page((unsigned long)trapbase_cpu1);
 	totalram_pages++;
 	num_physpages++;
 
 	ClearPageReserved(virt_to_page(trapbase_cpu2));
-	set_page_count(virt_to_page(trapbase_cpu2), 1);
+	init_page_count(virt_to_page(trapbase_cpu2));
 	free_page((unsigned long)trapbase_cpu2);
 	totalram_pages++;
 	num_physpages++;
 
 	ClearPageReserved(virt_to_page(trapbase_cpu3));
-	set_page_count(virt_to_page(trapbase_cpu3), 1);
+	init_page_count(virt_to_page(trapbase_cpu3));
 	free_page((unsigned long)trapbase_cpu3);
 	totalram_pages++;
 	num_physpages++;
diff --git a/arch/sparc/kernel/sun4m_smp.c b/arch/sparc/kernel/sun4m_smp.c
index a21f27d10e55..fbbd8a474c4c 100644
--- a/arch/sparc/kernel/sun4m_smp.c
+++ b/arch/sparc/kernel/sun4m_smp.c
@@ -233,21 +233,21 @@ void __init smp4m_boot_cpus(void)
 	/* Free unneeded trap tables */
 	if (!cpu_isset(i, cpu_present_map)) {
 		ClearPageReserved(virt_to_page(trapbase_cpu1));
-		set_page_count(virt_to_page(trapbase_cpu1), 1);
+		init_page_count(virt_to_page(trapbase_cpu1));
 		free_page((unsigned long)trapbase_cpu1);
 		totalram_pages++;
 		num_physpages++;
 	}
 	if (!cpu_isset(2, cpu_present_map)) {
 		ClearPageReserved(virt_to_page(trapbase_cpu2));
-		set_page_count(virt_to_page(trapbase_cpu2), 1);
+		init_page_count(virt_to_page(trapbase_cpu2));
 		free_page((unsigned long)trapbase_cpu2);
 		totalram_pages++;
 		num_physpages++;
 	}
 	if (!cpu_isset(3, cpu_present_map)) {
 		ClearPageReserved(virt_to_page(trapbase_cpu3));
-		set_page_count(virt_to_page(trapbase_cpu3), 1);
+		init_page_count(virt_to_page(trapbase_cpu3));
 		free_page((unsigned long)trapbase_cpu3);
 		totalram_pages++;
 		num_physpages++;
diff --git a/arch/sparc/mm/init.c b/arch/sparc/mm/init.c
index c03babaa0498..898669732466 100644
--- a/arch/sparc/mm/init.c
+++ b/arch/sparc/mm/init.c
@@ -383,7 +383,7 @@ void map_high_region(unsigned long start_pfn, unsigned long end_pfn)
 		struct page *page = pfn_to_page(tmp);
 
 		ClearPageReserved(page);
-		set_page_count(page, 1);
+		init_page_count(page);
 		__free_page(page);
 		totalhigh_pages++;
 	}
@@ -480,7 +480,7 @@ void free_initmem (void)
 		p = virt_to_page(addr);
 
 		ClearPageReserved(p);
-		set_page_count(p, 1);
+		init_page_count(p);
 		__free_page(p);
 		totalram_pages++;
 		num_physpages++;
@@ -497,7 +497,7 @@ void free_initrd_mem(unsigned long start, unsigned long end)
 		struct page *p = virt_to_page(start);
 
 		ClearPageReserved(p);
-		set_page_count(p, 1);
+		init_page_count(p);
 		__free_page(p);
 		num_physpages++;
 	}
diff --git a/arch/sparc64/mm/init.c b/arch/sparc64/mm/init.c
index c2b556106fc1..2ae143ba50d8 100644
--- a/arch/sparc64/mm/init.c
+++ b/arch/sparc64/mm/init.c
@@ -1461,7 +1461,7 @@ void free_initmem(void)
 		p = virt_to_page(page);
 
 		ClearPageReserved(p);
-		set_page_count(p, 1);
+		init_page_count(p);
 		__free_page(p);
 		num_physpages++;
 		totalram_pages++;
@@ -1477,7 +1477,7 @@ void free_initrd_mem(unsigned long start, unsigned long end)
 		struct page *p = virt_to_page(start);
 
 		ClearPageReserved(p);
-		set_page_count(p, 1);
+		init_page_count(p);
 		__free_page(p);
 		num_physpages++;
 		totalram_pages++;
diff --git a/arch/um/kernel/mem.c b/arch/um/kernel/mem.c
index fa4f915be5c5..92cce96b5e24 100644
--- a/arch/um/kernel/mem.c
+++ b/arch/um/kernel/mem.c
@@ -57,7 +57,7 @@ static void setup_highmem(unsigned long highmem_start,
 	for(i = 0; i < highmem_len >> PAGE_SHIFT; i++){
 		page = &mem_map[highmem_pfn + i];
 		ClearPageReserved(page);
-		set_page_count(page, 1);
+		init_page_count(page);
 		__free_page(page);
 	}
 }
@@ -296,7 +296,7 @@ void free_initrd_mem(unsigned long start, unsigned long end)
 			(end - start) >> 10);
 	for (; start < end; start += PAGE_SIZE) {
 		ClearPageReserved(virt_to_page(start));
-		set_page_count(virt_to_page(start), 1);
+		init_page_count(virt_to_page(start));
 		free_page(start);
 		totalram_pages++;
 	}
diff --git a/arch/x86_64/mm/init.c b/arch/x86_64/mm/init.c
index 7af1742aa958..40ed13d263cd 100644
--- a/arch/x86_64/mm/init.c
+++ b/arch/x86_64/mm/init.c
@@ -486,7 +486,7 @@ void __init clear_kernel_mapping(unsigned long address, unsigned long size)
 void online_page(struct page *page)
 {
 	ClearPageReserved(page);
-	set_page_count(page, 1);
+	init_page_count(page);
 	__free_page(page);
 	totalram_pages++;
 	num_physpages++;
@@ -592,7 +592,7 @@ void free_initmem(void)
 	addr = (unsigned long)(&__init_begin);
 	for (; addr < (unsigned long)(&__init_end); addr += PAGE_SIZE) {
 		ClearPageReserved(virt_to_page(addr));
-		set_page_count(virt_to_page(addr), 1);
+		init_page_count(virt_to_page(addr));
 		memset((void *)(addr & ~(PAGE_SIZE-1)), 0xcc, PAGE_SIZE); 
 		free_page(addr);
 		totalram_pages++;
@@ -632,7 +632,7 @@ void free_initrd_mem(unsigned long start, unsigned long end)
 	printk ("Freeing initrd memory: %ldk freed\n", (end - start) >> 10);
 	for (; start < end; start += PAGE_SIZE) {
 		ClearPageReserved(virt_to_page(start));
-		set_page_count(virt_to_page(start), 1);
+		init_page_count(virt_to_page(start));
 		free_page(start);
 		totalram_pages++;
 	}
diff --git a/arch/xtensa/mm/init.c b/arch/xtensa/mm/init.c
index 5a91d6c9e66d..e1be4235f367 100644
--- a/arch/xtensa/mm/init.c
+++ b/arch/xtensa/mm/init.c
@@ -272,7 +272,7 @@ free_reserved_mem(void *start, void *end)
 {
 	for (; start < end; start += PAGE_SIZE) {
 		ClearPageReserved(virt_to_page(start));
-		set_page_count(virt_to_page(start), 1);
+		init_page_count(virt_to_page(start));
 		free_page((unsigned long)start);
 		totalram_pages++;
 	}
diff --git a/drivers/video/acornfb.c b/drivers/video/acornfb.c
index b058273527bb..76448d6ae896 100644
--- a/drivers/video/acornfb.c
+++ b/drivers/video/acornfb.c
@@ -1269,7 +1269,7 @@ free_unused_pages(unsigned int virtual_start, unsigned int virtual_end)
 		 */
 		page = virt_to_page(virtual_start);
 		ClearPageReserved(page);
-		set_page_count(page, 1);
+		init_page_count(page);
 		free_page(virtual_start);
 
 		virtual_start += PAGE_SIZE;
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 3d84b7a35e0d..7d8c127daad7 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -307,8 +307,6 @@ static inline int get_page_unless_zero(struct page *page)
 	return atomic_inc_not_zero(&page->_count);
 }
 
-#define set_page_count(p,v) 	atomic_set(&(p)->_count, (v))
-
 extern void FASTCALL(__page_cache_release(struct page *));
 
 static inline int page_count(struct page *page)
@@ -325,6 +323,15 @@ static inline void get_page(struct page *page)
 	atomic_inc(&page->_count);
 }
 
+/*
+ * Setup the page count before being freed into the page allocator for
+ * the first time (boot or memory hotplug)
+ */
+static inline void init_page_count(struct page *page)
+{
+	atomic_set(&page->_count, 1);
+}
+
 void put_page(struct page *page);
 
 void split_page(struct page *page, unsigned int order);
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 39d49ecea8e8..20117a4b8ab6 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -18,6 +18,7 @@
 #include <asm/pgtable.h>
 
 #include <linux/hugetlb.h>
+#include "internal.h"
 
 const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL;
 static unsigned long nr_huge_pages, free_huge_pages;
@@ -106,7 +107,7 @@ struct page *alloc_huge_page(struct vm_area_struct *vma, unsigned long addr)
 		return NULL;
 	}
 	spin_unlock(&hugetlb_lock);
-	set_page_count(page, 1);
+	set_page_refcounted(page);
 	for (i = 0; i < (HPAGE_SIZE/PAGE_SIZE); ++i)
 		clear_user_highpage(&page[i], addr);
 	return page;
@@ -152,7 +153,7 @@ static void update_and_free_page(struct page *page)
 				1 << PG_private | 1<< PG_writeback);
 	}
 	page[1].lru.next = NULL;
-	set_page_count(page, 1);
+	set_page_refcounted(page);
 	__free_pages(page, HUGETLB_PAGE_ORDER);
 }
 
diff --git a/mm/internal.h b/mm/internal.h
index 7bb339779818..d20e3cc4aef0 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -13,8 +13,19 @@
 
 #include <linux/mm.h>
 
-static inline void set_page_refs(struct page *page, int order)
+static inline void set_page_count(struct page *page, int v)
 {
+	atomic_set(&page->_count, v);
+}
+
+/*
+ * Turn a non-refcounted page (->_count == 0) into refcounted with
+ * a count of one.
+ */
+static inline void set_page_refcounted(struct page *page)
+{
+	BUG_ON(PageCompound(page) && page_private(page) != (unsigned long)page);
+	BUG_ON(atomic_read(&page->_count));
 	set_page_count(page, 1);
 }
 
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index e197818a7cf6..7f65b5a63bb3 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -442,7 +442,7 @@ void fastcall __init __free_pages_bootmem(struct page *page, unsigned int order)
 	if (order == 0) {
 		__ClearPageReserved(page);
 		set_page_count(page, 0);
-		set_page_refs(page, 0);
+		set_page_refcounted(page);
 		__free_page(page);
 	} else {
 		int loop;
@@ -457,7 +457,7 @@ void fastcall __init __free_pages_bootmem(struct page *page, unsigned int order)
 			set_page_count(p, 0);
 		}
 
-		set_page_refs(page, order);
+		set_page_refcounted(page);
 		__free_pages(page, order);
 	}
 }
@@ -525,7 +525,7 @@ static int prep_new_page(struct page *page, int order)
 			1 << PG_referenced | 1 << PG_arch_1 |
 			1 << PG_checked | 1 << PG_mappedtodisk);
 	set_page_private(page, 0);
-	set_page_refs(page, order);
+	set_page_refcounted(page);
 	kernel_map_pages(page, 1 << order, 1);
 	return 0;
 }
@@ -755,10 +755,8 @@ void split_page(struct page *page, unsigned int order)
 
 	BUG_ON(PageCompound(page));
 	BUG_ON(!page_count(page));
-	for (i = 1; i < (1 << order); i++) {
-		BUG_ON(page_count(page + i));
-		set_page_count(page + i, 1);
-	}
+	for (i = 1; i < (1 << order); i++)
+		set_page_refcounted(page + i);
 }
 
 /*
@@ -1771,7 +1769,7 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
 			continue;
 		page = pfn_to_page(pfn);
 		set_page_links(page, zone, nid, pfn);
-		set_page_count(page, 1);
+		init_page_count(page);
 		reset_page_mapcount(page);
 		SetPageReserved(page);
 		INIT_LIST_HEAD(&page->lru);
-- 
cgit v1.2.3


From 17cf44064ae744f081309108fa67f0e942b10167 Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@suse.de>
Date: Wed, 22 Mar 2006 00:08:41 -0800
Subject: [PATCH] mm: cleanup prep_ stuff

Move the prep_ stuff into prep_new_page.

Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/page_alloc.c | 35 ++++++++++++++++++-----------------
 1 file changed, 18 insertions(+), 17 deletions(-)

(limited to 'mm')

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 7f65b5a63bb3..bdff85899638 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -212,6 +212,15 @@ static void destroy_compound_page(struct page *page, unsigned long order)
 	}
 }
 
+static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags)
+{
+	int i;
+
+	BUG_ON((gfp_flags & (__GFP_WAIT | __GFP_HIGHMEM)) == __GFP_HIGHMEM);
+	for (i = 0; i < (1 << order); i++)
+		clear_highpage(page + i);
+}
+
 /*
  * function for dealing with page's order in buddy system.
  * zone->lock is already acquired when we use these.
@@ -496,7 +505,7 @@ static inline void expand(struct zone *zone, struct page *page,
 /*
  * This page is about to be returned from the page allocator
  */
-static int prep_new_page(struct page *page, int order)
+static int prep_new_page(struct page *page, int order, gfp_t gfp_flags)
 {
 	if (unlikely(page_mapcount(page) |
 		(page->mapping != NULL)  |
@@ -527,6 +536,13 @@ static int prep_new_page(struct page *page, int order)
 	set_page_private(page, 0);
 	set_page_refcounted(page);
 	kernel_map_pages(page, 1 << order, 1);
+
+	if (gfp_flags & __GFP_ZERO)
+		prep_zero_page(page, order, gfp_flags);
+
+	if (order && (gfp_flags & __GFP_COMP))
+		prep_compound_page(page, order);
+
 	return 0;
 }
 
@@ -732,15 +748,6 @@ void fastcall free_cold_page(struct page *page)
 	free_hot_cold_page(page, 1);
 }
 
-static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags)
-{
-	int i;
-
-	BUG_ON((gfp_flags & (__GFP_WAIT | __GFP_HIGHMEM)) == __GFP_HIGHMEM);
-	for(i = 0; i < (1 << order); i++)
-		clear_highpage(page + i);
-}
-
 /*
  * split_page takes a non-compound higher-order page, and splits it into
  * n (1<<order) sub-pages: page[0..n]
@@ -802,14 +809,8 @@ again:
 	put_cpu();
 
 	BUG_ON(bad_range(zone, page));
-	if (prep_new_page(page, order))
+	if (prep_new_page(page, order, gfp_flags))
 		goto again;
-
-	if (gfp_flags & __GFP_ZERO)
-		prep_zero_page(page, order, gfp_flags);
-
-	if (order && (gfp_flags & __GFP_COMP))
-		prep_compound_page(page, order);
 	return page;
 
 failed:
-- 
cgit v1.2.3


From 6626c5d53bc45c59798628677ba5606f02e371f3 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Wed, 22 Mar 2006 00:08:42 -0800
Subject: [PATCH] mm: prep_zero_page() in irq is a bug

prep_zero_page() uses KM_USER0 and hence may not be used from IRQ context, at
least for highmem pages.

Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Cc: Christoph Lameter <christoph@lameter.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/page_alloc.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'mm')

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index bdff85899638..ed91684cb1f5 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -217,6 +217,11 @@ static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags)
 	int i;
 
 	BUG_ON((gfp_flags & (__GFP_WAIT | __GFP_HIGHMEM)) == __GFP_HIGHMEM);
+	/*
+	 * clear_highpage() will use KM_USER0, so it's a bug to use __GFP_ZERO
+	 * and __GFP_HIGHMEM from hard or soft interrupt context.
+	 */
+	BUG_ON((gfp_flags & __GFP_HIGHMEM) && in_interrupt());
 	for (i = 0; i < (1 << order); i++)
 		clear_highpage(page + i);
 }
-- 
cgit v1.2.3


From b7ab795b7bec9997d4fde39f249d52823d36d98d Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@suse.de>
Date: Wed, 22 Mar 2006 00:08:42 -0800
Subject: [PATCH] mm: more CONFIG_DEBUG_VM

Put a few more checks under CONFIG_DEBUG_VM

Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/memory.c | 12 +++---------
 mm/rmap.c   |  9 ++++-----
 2 files changed, 7 insertions(+), 14 deletions(-)

(limited to 'mm')

diff --git a/mm/memory.c b/mm/memory.c
index 6af555c1c42a..71bc664efed5 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -388,7 +388,7 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, pte_
 {
 	unsigned long pfn = pte_pfn(pte);
 
-	if (vma->vm_flags & VM_PFNMAP) {
+	if (unlikely(vma->vm_flags & VM_PFNMAP)) {
 		unsigned long off = (addr - vma->vm_start) >> PAGE_SHIFT;
 		if (pfn == vma->vm_pgoff + off)
 			return NULL;
@@ -396,18 +396,12 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, pte_
 			return NULL;
 	}
 
-	/*
-	 * Add some anal sanity checks for now. Eventually,
-	 * we should just do "return pfn_to_page(pfn)", but
-	 * in the meantime we check that we get a valid pfn,
-	 * and that the resulting page looks ok.
-	 *
-	 * Remove this test eventually!
-	 */
+#ifdef CONFIG_DEBUG_VM
 	if (unlikely(!pfn_valid(pfn))) {
 		print_bad_pte(vma, pte, addr);
 		return NULL;
 	}
+#endif
 
 	/*
 	 * NOTE! We still have PageReserved() pages in the page 
diff --git a/mm/rmap.c b/mm/rmap.c
index 134aef9d66c5..1963e269314d 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -56,13 +56,11 @@
 
 #include <asm/tlbflush.h>
 
-//#define RMAP_DEBUG /* can be enabled only for debugging */
-
 struct kmem_cache *anon_vma_cachep;
 
 static inline void validate_anon_vma(struct vm_area_struct *find_vma)
 {
-#ifdef RMAP_DEBUG
+#ifdef CONFIG_DEBUG_VM
 	struct anon_vma *anon_vma = find_vma->anon_vma;
 	struct vm_area_struct *vma;
 	unsigned int mapcount = 0;
@@ -551,13 +549,14 @@ void page_add_file_rmap(struct page *page)
 void page_remove_rmap(struct page *page)
 {
 	if (atomic_add_negative(-1, &page->_mapcount)) {
-		if (page_mapcount(page) < 0) {
+#ifdef CONFIG_DEBUG_VM
+		if (unlikely(page_mapcount(page) < 0)) {
 			printk (KERN_EMERG "Eeek! page_mapcount(page) went negative! (%d)\n", page_mapcount(page));
 			printk (KERN_EMERG "  page->flags = %lx\n", page->flags);
 			printk (KERN_EMERG "  page->count = %x\n", page_count(page));
 			printk (KERN_EMERG "  page->mapping = %p\n", page->mapping);
 		}
-
+#endif
 		BUG_ON(page_mapcount(page) < 0);
 		/*
 		 * It would be tidy to reset the PageAnon mapping here,
-- 
cgit v1.2.3


From 6e5ef1a96e6e3b123da56292bc35017c8c401491 Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@engr.sgi.com>
Date: Wed, 22 Mar 2006 00:08:45 -0800
Subject: [PATCH] vmscan: emove obsolete checks from shrink_list() and fix
 unlikely in refill_inactive_zone()

As suggested by Marcelo:

1. The optimization introduced recently for not calling
   page_referenced() during zone reclaim makes two additional checks in
   shrink_list unnecessary.

2. The if (unlikely(sc->may_swap)) in refill_inactive_zone is optimized
   for the zone_reclaim case.  However, most peoples system only does swap.
   Undo that.

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Cc: Marcelo Tosatti <marcelo.tosatti@cyclades.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/vmscan.c | 13 ++-----------
 1 file changed, 2 insertions(+), 11 deletions(-)

(limited to 'mm')

diff --git a/mm/vmscan.c b/mm/vmscan.c
index 3914a94aa905..f713e9f6ac73 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -460,12 +460,9 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 		 * Anonymous process memory has backing store?
 		 * Try to allocate it some swap space here.
 		 */
-		if (PageAnon(page) && !PageSwapCache(page)) {
-			if (!sc->may_swap)
-				goto keep_locked;
+		if (PageAnon(page) && !PageSwapCache(page))
 			if (!add_to_swap(page, GFP_ATOMIC))
 				goto activate_locked;
-		}
 #endif /* CONFIG_SWAP */
 
 		mapping = page_mapping(page);
@@ -477,12 +474,6 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 		 * processes. Try to unmap it here.
 		 */
 		if (page_mapped(page) && mapping) {
-			/*
-			 * No unmapping if we do not swap
-			 */
-			if (!sc->may_swap)
-				goto keep_locked;
-
 			switch (try_to_unmap(page, 0)) {
 			case SWAP_FAIL:
 				goto activate_locked;
@@ -1205,7 +1196,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
 	struct pagevec pvec;
 	int reclaim_mapped = 0;
 
-	if (unlikely(sc->may_swap)) {
+	if (sc->may_swap) {
 		long mapped_ratio;
 		long distress;
 		long swap_tendency;
-- 
cgit v1.2.3


From d15c023b44e5d323f1f4130b85d29f08e43433b1 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hugh@veritas.com>
Date: Wed, 22 Mar 2006 00:08:46 -0800
Subject: [PATCH] shmem: inline to avoid warning

shmem.c was named and shamed in Jesper's "Building 100 kernels" warnings:
shmem_parse_mpol is only used when CONFIG_TMPFS parses mount options; and
only called from that one site, so mark it inline like its non-NUMA stub.

Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/shmem.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'mm')

diff --git a/mm/shmem.c b/mm/shmem.c
index f523a1533ce1..37eaf42ed2c6 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -875,7 +875,7 @@ redirty:
 }
 
 #ifdef CONFIG_NUMA
-static int shmem_parse_mpol(char *value, int *policy, nodemask_t *policy_nodes)
+static inline int shmem_parse_mpol(char *value, int *policy, nodemask_t *policy_nodes)
 {
 	char *nodelist = strchr(value, ':');
 	int err = 1;
-- 
cgit v1.2.3


From a564da3964db3256069190c2ae95069143ac37fb Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@tv-sign.ru>
Date: Wed, 22 Mar 2006 00:08:47 -0800
Subject: [PATCH] readahead: ->prev_page can overrun the ahead window

If get_next_ra_size() does not grow fast enough, ->prev_page can overrun
the ahead window.  This means the caller will read the pages from
->ahead_start + ->ahead_size to ->prev_page synchronously.

Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Steven Pratt <slpratt@austin.ibm.com>
Cc: Ram Pai <linuxram@us.ibm.com>
Cc: Trond Myklebust <trond.myklebust@fys.uio.no>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/readahead.c | 26 ++++++++++++++++++++------
 1 file changed, 20 insertions(+), 6 deletions(-)

(limited to 'mm')

diff --git a/mm/readahead.c b/mm/readahead.c
index 8d6eeaaa6296..57557e294987 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -52,13 +52,24 @@ static inline unsigned long get_min_readahead(struct file_ra_state *ra)
 	return (VM_MIN_READAHEAD * 1024) / PAGE_CACHE_SIZE;
 }
 
+static inline void reset_ahead_window(struct file_ra_state *ra)
+{
+	/*
+	 * ... but preserve ahead_start + ahead_size value,
+	 * see 'recheck:' label in page_cache_readahead().
+	 * Note: We never use ->ahead_size as rvalue without
+	 * checking ->ahead_start != 0 first.
+	 */
+	ra->ahead_size += ra->ahead_start;
+	ra->ahead_start = 0;
+}
+
 static inline void ra_off(struct file_ra_state *ra)
 {
 	ra->start = 0;
 	ra->flags = 0;
 	ra->size = 0;
-	ra->ahead_start = 0;
-	ra->ahead_size = 0;
+	reset_ahead_window(ra);
 	return;
 }
 
@@ -426,8 +437,7 @@ static int make_ahead_window(struct address_space *mapping, struct file *filp,
 		 * congestion.  The ahead window will any way be closed
 		 * in case we failed due to excessive page cache hits.
 		 */
-		ra->ahead_start = 0;
-		ra->ahead_size = 0;
+		reset_ahead_window(ra);
 	}
 
 	return ret;
@@ -520,11 +530,11 @@ page_cache_readahead(struct address_space *mapping, struct file_ra_state *ra,
 	 * If we get here we are doing sequential IO and this was not the first
 	 * occurence (ie we have an existing window)
 	 */
-
 	if (ra->ahead_start == 0) {	 /* no ahead window yet */
 		if (!make_ahead_window(mapping, filp, ra, 0))
-			goto out;
+			goto recheck;
 	}
+
 	/*
 	 * Already have an ahead window, check if we crossed into it.
 	 * If so, shift windows and issue a new ahead window.
@@ -536,6 +546,10 @@ page_cache_readahead(struct address_space *mapping, struct file_ra_state *ra,
 		ra->start = ra->ahead_start;
 		ra->size = ra->ahead_size;
 		make_ahead_window(mapping, filp, ra, 0);
+recheck:
+		/* prev_page shouldn't overrun the ahead window */
+		ra->prev_page = min(ra->prev_page,
+			ra->ahead_start + ra->ahead_size - 1);
 	}
 
 out:
-- 
cgit v1.2.3


From aed75ff3caafce404d9be7f0c088716375be5279 Mon Sep 17 00:00:00 2001
From: Steven Pratt <slpratt@austin.ibm.com>
Date: Wed, 22 Mar 2006 00:08:48 -0800
Subject: [PATCH] readahead: fix initial window size calculation

The current current get_init_ra_size is not optimal across different IO
sizes and max_readahead values.  Here is a quick summary of sizes computed
under current design and under the attached patch.  All of these assume 1st
IO at offset 0, or 1st detected sequential IO.

	32k max, 4k request

	old         new
	-----------------
	 8k        8k
	16k       16k
	32k       32k

	128k max, 4k request
	old         new
	-----------------
	32k         16k
	64k         32k
	128k        64k
	128k       128k

	128k max, 32k request
	old         new
	-----------------
	32k         64k    <-----
	64k        128k
	128k       128k

	512k max, 4k request
	old         new
	-----------------
	4k         32k     <----
	16k        64k
	64k       128k
	128k      256k
	512k      512k

Cc: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Steven Pratt <slpratt@austin.ibm.com>
Cc: Ram Pai <linuxram@us.ibm.com>
Cc: Trond Myklebust <trond.myklebust@fys.uio.no>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/readahead.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'mm')

diff --git a/mm/readahead.c b/mm/readahead.c
index 57557e294987..301b36c4a0ce 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -83,10 +83,10 @@ static unsigned long get_init_ra_size(unsigned long size, unsigned long max)
 {
 	unsigned long newsize = roundup_pow_of_two(size);
 
-	if (newsize <= max / 64)
-		newsize = newsize * newsize;
+	if (newsize <= max / 32)
+		newsize = newsize * 4;
 	else if (newsize <= max / 4)
-		newsize = max / 4;
+		newsize = newsize * 2;
 	else
 		newsize = max;
 	return newsize;
-- 
cgit v1.2.3


From 8f860591ffb29738cf5539b6fbf27f50dcdeb380 Mon Sep 17 00:00:00 2001
From: "Zhang, Yanmin" <yanmin_zhang@linux.intel.com>
Date: Wed, 22 Mar 2006 00:08:50 -0800
Subject: [PATCH] Enable mprotect on huge pages

2.6.16-rc3 uses hugetlb on-demand paging, but it doesn_t support hugetlb
mprotect.

From: David Gibson <david@gibson.dropbear.id.au>

  Remove a test from the mprotect() path which checks that the mprotect()ed
  range on a hugepage VMA is hugepage aligned (yes, really, the sense of
  is_aligned_hugepage_range() is the opposite of what you'd guess :-/).

  In fact, we don't need this test.  If the given addresses match the
  beginning/end of a hugepage VMA they must already be suitably aligned.  If
  they don't, then mprotect_fixup() will attempt to split the VMA.  The very
  first test in split_vma() will check for a badly aligned address on a
  hugepage VMA and return -EINVAL if necessary.

From: "Chen, Kenneth W" <kenneth.w.chen@intel.com>

  On i386 and x86-64, pte flag _PAGE_PSE collides with _PAGE_PROTNONE.  The
  identify of hugetlb pte is lost when changing page protection via mprotect.
  A page fault occurs later will trigger a bug check in huge_pte_alloc().

  The fix is to always make new pte a hugetlb pte and also to clean up
  legacy code where _PAGE_PRESENT is forced on in the pre-faulting day.

Signed-off-by: Zhang Yanmin <yanmin.zhang@intel.com>
Cc: David Gibson <david@gibson.dropbear.id.au>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: William Lee Irwin III <wli@holomorphy.com>
Signed-off-by: Ken Chen <kenneth.w.chen@intel.com>
Signed-off-by: Nishanth Aravamudan <nacc@us.ibm.com>
Cc: Andi Kleen <ak@muc.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/asm-i386/pgtable.h   |  5 ++---
 include/asm-ia64/pgtable.h   |  2 +-
 include/asm-x86_64/pgtable.h |  4 ++--
 include/linux/hugetlb.h      |  4 ++++
 mm/hugetlb.c                 | 29 +++++++++++++++++++++++++++++
 mm/mprotect.c                | 12 +++++-------
 6 files changed, 43 insertions(+), 13 deletions(-)

(limited to 'mm')

diff --git a/include/asm-i386/pgtable.h b/include/asm-i386/pgtable.h
index 088a945bf26b..ee056c41a9fb 100644
--- a/include/asm-i386/pgtable.h
+++ b/include/asm-i386/pgtable.h
@@ -219,13 +219,12 @@ extern unsigned long pg0[];
  * The following only work if pte_present() is true.
  * Undefined behaviour if not..
  */
-#define __LARGE_PTE (_PAGE_PSE | _PAGE_PRESENT)
 static inline int pte_user(pte_t pte)		{ return (pte).pte_low & _PAGE_USER; }
 static inline int pte_read(pte_t pte)		{ return (pte).pte_low & _PAGE_USER; }
 static inline int pte_dirty(pte_t pte)		{ return (pte).pte_low & _PAGE_DIRTY; }
 static inline int pte_young(pte_t pte)		{ return (pte).pte_low & _PAGE_ACCESSED; }
 static inline int pte_write(pte_t pte)		{ return (pte).pte_low & _PAGE_RW; }
-static inline int pte_huge(pte_t pte)		{ return ((pte).pte_low & __LARGE_PTE) == __LARGE_PTE; }
+static inline int pte_huge(pte_t pte)		{ return (pte).pte_low & _PAGE_PSE; }
 
 /*
  * The following only works if pte_present() is not true.
@@ -242,7 +241,7 @@ static inline pte_t pte_mkexec(pte_t pte)	{ (pte).pte_low |= _PAGE_USER; return
 static inline pte_t pte_mkdirty(pte_t pte)	{ (pte).pte_low |= _PAGE_DIRTY; return pte; }
 static inline pte_t pte_mkyoung(pte_t pte)	{ (pte).pte_low |= _PAGE_ACCESSED; return pte; }
 static inline pte_t pte_mkwrite(pte_t pte)	{ (pte).pte_low |= _PAGE_RW; return pte; }
-static inline pte_t pte_mkhuge(pte_t pte)	{ (pte).pte_low |= __LARGE_PTE; return pte; }
+static inline pte_t pte_mkhuge(pte_t pte)	{ (pte).pte_low |= _PAGE_PSE; return pte; }
 
 #ifdef CONFIG_X86_PAE
 # include <asm/pgtable-3level.h>
diff --git a/include/asm-ia64/pgtable.h b/include/asm-ia64/pgtable.h
index e2560c58384b..5890972a69bf 100644
--- a/include/asm-ia64/pgtable.h
+++ b/include/asm-ia64/pgtable.h
@@ -314,7 +314,7 @@ ia64_phys_addr_valid (unsigned long addr)
 #define pte_mkyoung(pte)	(__pte(pte_val(pte) | _PAGE_A))
 #define pte_mkclean(pte)	(__pte(pte_val(pte) & ~_PAGE_D))
 #define pte_mkdirty(pte)	(__pte(pte_val(pte) | _PAGE_D))
-#define pte_mkhuge(pte)		(__pte(pte_val(pte) | _PAGE_P))
+#define pte_mkhuge(pte)		(__pte(pte_val(pte)))
 
 /*
  * Macro to a page protection value as "uncacheable".  Note that "protection" is really a
diff --git a/include/asm-x86_64/pgtable.h b/include/asm-x86_64/pgtable.h
index 715fd94cf577..a617d364d08d 100644
--- a/include/asm-x86_64/pgtable.h
+++ b/include/asm-x86_64/pgtable.h
@@ -273,7 +273,7 @@ static inline int pte_dirty(pte_t pte)		{ return pte_val(pte) & _PAGE_DIRTY; }
 static inline int pte_young(pte_t pte)		{ return pte_val(pte) & _PAGE_ACCESSED; }
 static inline int pte_write(pte_t pte)		{ return pte_val(pte) & _PAGE_RW; }
 static inline int pte_file(pte_t pte)		{ return pte_val(pte) & _PAGE_FILE; }
-static inline int pte_huge(pte_t pte)		{ return (pte_val(pte) & __LARGE_PTE) == __LARGE_PTE; }
+static inline int pte_huge(pte_t pte)		{ return pte_val(pte) & _PAGE_PSE; }
 
 static inline pte_t pte_rdprotect(pte_t pte)	{ set_pte(&pte, __pte(pte_val(pte) & ~_PAGE_USER)); return pte; }
 static inline pte_t pte_exprotect(pte_t pte)	{ set_pte(&pte, __pte(pte_val(pte) & ~_PAGE_USER)); return pte; }
@@ -285,7 +285,7 @@ static inline pte_t pte_mkexec(pte_t pte)	{ set_pte(&pte, __pte(pte_val(pte) | _
 static inline pte_t pte_mkdirty(pte_t pte)	{ set_pte(&pte, __pte(pte_val(pte) | _PAGE_DIRTY)); return pte; }
 static inline pte_t pte_mkyoung(pte_t pte)	{ set_pte(&pte, __pte(pte_val(pte) | _PAGE_ACCESSED)); return pte; }
 static inline pte_t pte_mkwrite(pte_t pte)	{ set_pte(&pte, __pte(pte_val(pte) | _PAGE_RW)); return pte; }
-static inline pte_t pte_mkhuge(pte_t pte)	{ set_pte(&pte, __pte(pte_val(pte) | __LARGE_PTE)); return pte; }
+static inline pte_t pte_mkhuge(pte_t pte)	{ set_pte(&pte, __pte(pte_val(pte) | _PAGE_PSE)); return pte; }
 
 struct vm_area_struct;
 
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 68d82ad6b17c..fa83836b63d2 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -41,6 +41,8 @@ struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address,
 				pmd_t *pmd, int write);
 int is_aligned_hugepage_range(unsigned long addr, unsigned long len);
 int pmd_huge(pmd_t pmd);
+void hugetlb_change_protection(struct vm_area_struct *vma,
+		unsigned long address, unsigned long end, pgprot_t newprot);
 
 #ifndef ARCH_HAS_HUGEPAGE_ONLY_RANGE
 #define is_hugepage_only_range(mm, addr, len)	0
@@ -101,6 +103,8 @@ static inline unsigned long hugetlb_total_pages(void)
 #define free_huge_page(p)			({ (void)(p); BUG(); })
 #define hugetlb_fault(mm, vma, addr, write)	({ BUG(); 0; })
 
+#define hugetlb_change_protection(vma, address, end, newprot)
+
 #ifndef HPAGE_MASK
 #define HPAGE_MASK	PAGE_MASK		/* Keep the compiler happy */
 #define HPAGE_SIZE	PAGE_SIZE
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 20117a4b8ab6..783098f6cf8e 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -565,3 +565,32 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
 
 	return i;
 }
+
+void hugetlb_change_protection(struct vm_area_struct *vma,
+		unsigned long address, unsigned long end, pgprot_t newprot)
+{
+	struct mm_struct *mm = vma->vm_mm;
+	unsigned long start = address;
+	pte_t *ptep;
+	pte_t pte;
+
+	BUG_ON(address >= end);
+	flush_cache_range(vma, address, end);
+
+	spin_lock(&mm->page_table_lock);
+	for (; address < end; address += HPAGE_SIZE) {
+		ptep = huge_pte_offset(mm, address);
+		if (!ptep)
+			continue;
+		if (!pte_none(*ptep)) {
+			pte = huge_ptep_get_and_clear(mm, address, ptep);
+			pte = pte_mkhuge(pte_modify(pte, newprot));
+			set_huge_pte_at(mm, address, ptep, pte);
+			lazy_mmu_prot_update(pte);
+		}
+	}
+	spin_unlock(&mm->page_table_lock);
+
+	flush_tlb_range(vma, start, end);
+}
+
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 653b8571c1ed..4c14d4289b61 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -124,7 +124,7 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
 	 * a MAP_NORESERVE private mapping to writable will now reserve.
 	 */
 	if (newflags & VM_WRITE) {
-		if (!(oldflags & (VM_ACCOUNT|VM_WRITE|VM_SHARED|VM_HUGETLB))) {
+		if (!(oldflags & (VM_ACCOUNT|VM_WRITE|VM_SHARED))) {
 			charged = nrpages;
 			if (security_vm_enough_memory(charged))
 				return -ENOMEM;
@@ -166,7 +166,10 @@ success:
 	 */
 	vma->vm_flags = newflags;
 	vma->vm_page_prot = newprot;
-	change_protection(vma, start, end, newprot);
+	if (is_vm_hugetlb_page(vma))
+		hugetlb_change_protection(vma, start, end, newprot);
+	else
+		change_protection(vma, start, end, newprot);
 	vm_stat_account(mm, oldflags, vma->vm_file, -nrpages);
 	vm_stat_account(mm, newflags, vma->vm_file, nrpages);
 	return 0;
@@ -240,11 +243,6 @@ sys_mprotect(unsigned long start, size_t len, unsigned long prot)
 
 		/* Here we know that  vma->vm_start <= nstart < vma->vm_end. */
 
-		if (is_vm_hugetlb_page(vma)) {
-			error = -EACCES;
-			goto out;
-		}
-
 		newflags = vm_flags | (vma->vm_flags & ~(VM_READ | VM_WRITE | VM_EXEC));
 
 		/* newflags >> 4 shift VM_MAY% in place of VM_% */
-- 
cgit v1.2.3


From 79ac6ba40eb8d70f0d204e98ae9b63280ad1018c Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Wed, 22 Mar 2006 00:08:51 -0800
Subject: [PATCH] hugepage: Small fixes to hugepage clear/copy path

Move the loops used in mm/hugetlb.c to clear and copy hugepages to their
own functions for clarity.  As we do so, we add some checks of need_resched
- we are, after all copying megabytes of memory here.  We also add
might_sleep() accordingly.  We generally dropped locks around the clear and
copy, already but not everyone has PREEMPT enabled, so we should still be
checking explicitly.

For this to work, we need to remove the clear_huge_page() from
alloc_huge_page(), which is called with the page_table_lock held in the COW
path.  We move the clear_huge_page() to just after the alloc_huge_page() in
the hugepage no-page path.  In the COW path, the new page is about to be
copied over, so clearing it was just a waste of time anyway.  So as a side
effect we also fix the fact that we held the page_table_lock for far too
long in this path by calling alloc_huge_page() under it.

It causes no regressions on the libhugetlbfs testsuite (ppc64, POWER5).

Signed-off-by: David Gibson <dwg@au1.ibm.com>
Cc: William Lee Irwin III <wli@holomorphy.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/hugetlb.c | 33 ++++++++++++++++++++++++++-------
 1 file changed, 26 insertions(+), 7 deletions(-)

(limited to 'mm')

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 783098f6cf8e..41b1038f76da 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -27,6 +27,29 @@ static struct list_head hugepage_freelists[MAX_NUMNODES];
 static unsigned int nr_huge_pages_node[MAX_NUMNODES];
 static unsigned int free_huge_pages_node[MAX_NUMNODES];
 
+static void clear_huge_page(struct page *page, unsigned long addr)
+{
+	int i;
+
+	might_sleep();
+	for (i = 0; i < (HPAGE_SIZE/PAGE_SIZE); i++) {
+		cond_resched();
+		clear_user_highpage(page + i, addr);
+	}
+}
+
+static void copy_huge_page(struct page *dst, struct page *src,
+			   unsigned long addr)
+{
+	int i;
+
+	might_sleep();
+	for (i = 0; i < HPAGE_SIZE/PAGE_SIZE; i++) {
+		cond_resched();
+		copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE);
+	}
+}
+
 /*
  * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages
  */
@@ -98,7 +121,6 @@ void free_huge_page(struct page *page)
 struct page *alloc_huge_page(struct vm_area_struct *vma, unsigned long addr)
 {
 	struct page *page;
-	int i;
 
 	spin_lock(&hugetlb_lock);
 	page = dequeue_huge_page(vma, addr);
@@ -108,8 +130,6 @@ struct page *alloc_huge_page(struct vm_area_struct *vma, unsigned long addr)
 	}
 	spin_unlock(&hugetlb_lock);
 	set_page_refcounted(page);
-	for (i = 0; i < (HPAGE_SIZE/PAGE_SIZE); ++i)
-		clear_user_highpage(&page[i], addr);
 	return page;
 }
 
@@ -367,7 +387,7 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
 			unsigned long address, pte_t *ptep, pte_t pte)
 {
 	struct page *old_page, *new_page;
-	int i, avoidcopy;
+	int avoidcopy;
 
 	old_page = pte_page(pte);
 
@@ -388,9 +408,7 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
 	}
 
 	spin_unlock(&mm->page_table_lock);
-	for (i = 0; i < HPAGE_SIZE/PAGE_SIZE; i++)
-		copy_user_highpage(new_page + i, old_page + i,
-				   address + i*PAGE_SIZE);
+	copy_huge_page(new_page, old_page, address);
 	spin_lock(&mm->page_table_lock);
 
 	ptep = huge_pte_offset(mm, address & HPAGE_MASK);
@@ -435,6 +453,7 @@ retry:
 			ret = VM_FAULT_OOM;
 			goto out;
 		}
+		clear_huge_page(page, address);
 
 		if (vma->vm_flags & VM_SHARED) {
 			int err;
-- 
cgit v1.2.3


From 3935baa9bcda3ccaee4f7849f5157d316e34412e Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Wed, 22 Mar 2006 00:08:53 -0800
Subject: [PATCH] hugepage: serialize hugepage allocation and instantiation

Currently, no lock or mutex is held between allocating a hugepage and
inserting it into the pagetables / page cache.  When we do go to insert the
page into pagetables or page cache, we recheck and may free the newly
allocated hugepage.  However, since the number of hugepages in the system
is strictly limited, and it's usualy to want to use all of them, this can
still lead to spurious allocation failures.

For example, suppose two processes are both mapping (MAP_SHARED) the same
hugepage file, large enough to consume the entire available hugepage pool.
If they race instantiating the last page in the mapping, they will both
attempt to allocate the last available hugepage.  One will fail, of course,
returning OOM from the fault and thus causing the process to be killed,
despite the fact that the entire mapping can, in fact, be instantiated.

The patch fixes this race by the simple method of adding a (sleeping) mutex
to serialize the hugepage fault path between allocation and insertion into
pagetables and/or page cache.  It would be possible to avoid the
serialization by catching the allocation failures, waiting on some
condition, then rechecking to see if someone else has instantiated the page
for us.  Given the likely frequency of hugepage instantiations, it seems
very doubtful it's worth the extra complexity.

This patch causes no regression on the libhugetlbfs testsuite, and one
test, which can trigger this race now passes where it previously failed.

Actually, the test still sometimes fails, though less often and only as a
shmat() failure, rather processes getting OOM killed by the VM.  The dodgy
heuristic tests in fs/hugetlbfs/inode.c for whether there's enough hugepage
space aren't protected by the new mutex, and would be ugly to do so, so
there's still a race there.  Another patch to replace those tests with
something saner for this reason as well as others coming...

Signed-off-by: David Gibson <dwg@au1.ibm.com>
Cc: William Lee Irwin III <wli@holomorphy.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/hugetlb.c | 25 ++++++++++++++++++-------
 1 file changed, 18 insertions(+), 7 deletions(-)

(limited to 'mm')

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 41b1038f76da..d5987a87bbe5 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -13,6 +13,7 @@
 #include <linux/pagemap.h>
 #include <linux/mempolicy.h>
 #include <linux/cpuset.h>
+#include <linux/mutex.h>
 
 #include <asm/page.h>
 #include <asm/pgtable.h>
@@ -26,6 +27,10 @@ unsigned long max_huge_pages;
 static struct list_head hugepage_freelists[MAX_NUMNODES];
 static unsigned int nr_huge_pages_node[MAX_NUMNODES];
 static unsigned int free_huge_pages_node[MAX_NUMNODES];
+/*
+ * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages
+ */
+static DEFINE_SPINLOCK(hugetlb_lock);
 
 static void clear_huge_page(struct page *page, unsigned long addr)
 {
@@ -50,11 +55,6 @@ static void copy_huge_page(struct page *dst, struct page *src,
 	}
 }
 
-/*
- * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages
- */
-static DEFINE_SPINLOCK(hugetlb_lock);
-
 static void enqueue_huge_page(struct page *page)
 {
 	int nid = page_to_nid(page);
@@ -508,14 +508,24 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	pte_t *ptep;
 	pte_t entry;
 	int ret;
+	static DEFINE_MUTEX(hugetlb_instantiation_mutex);
 
 	ptep = huge_pte_alloc(mm, address);
 	if (!ptep)
 		return VM_FAULT_OOM;
 
+	/*
+	 * Serialize hugepage allocation and instantiation, so that we don't
+	 * get spurious allocation failures if two CPUs race to instantiate
+	 * the same page in the page cache.
+	 */
+	mutex_lock(&hugetlb_instantiation_mutex);
 	entry = *ptep;
-	if (pte_none(entry))
-		return hugetlb_no_page(mm, vma, address, ptep, write_access);
+	if (pte_none(entry)) {
+		ret = hugetlb_no_page(mm, vma, address, ptep, write_access);
+		mutex_unlock(&hugetlb_instantiation_mutex);
+		return ret;
+	}
 
 	ret = VM_FAULT_MINOR;
 
@@ -525,6 +535,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 		if (write_access && !pte_write(entry))
 			ret = hugetlb_cow(mm, vma, address, ptep, entry);
 	spin_unlock(&mm->page_table_lock);
+	mutex_unlock(&hugetlb_instantiation_mutex);
 
 	return ret;
 }
-- 
cgit v1.2.3


From b45b5bd65f668a665db40d093e4e1fe563533608 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Wed, 22 Mar 2006 00:08:55 -0800
Subject: [PATCH] hugepage: Strict page reservation for hugepage inodes

These days, hugepages are demand-allocated at first fault time.  There's a
somewhat dubious (and racy) heuristic when making a new mmap() to check if
there are enough available hugepages to fully satisfy that mapping.

A particularly obvious case where the heuristic breaks down is where a
process maps its hugepages not as a single chunk, but as a bunch of
individually mmap()ed (or shmat()ed) blocks without touching and
instantiating the pages in between allocations.  In this case the size of
each block is compared against the total number of available hugepages.
It's thus easy for the process to become overcommitted, because each block
mapping will succeed, although the total number of hugepages required by
all blocks exceeds the number available.  In particular, this defeats such
a program which will detect a mapping failure and adjust its hugepage usage
downward accordingly.

The patch below addresses this problem, by strictly reserving a number of
physical hugepages for hugepage inodes which have been mapped, but not
instatiated.  MAP_SHARED mappings are thus "safe" - they will fail on
mmap(), not later with an OOM SIGKILL.  MAP_PRIVATE mappings can still
trigger an OOM.  (Actually SHARED mappings can technically still OOM, but
only if the sysadmin explicitly reduces the hugepage pool between mapping
and instantiation)

This patch appears to address the problem at hand - it allows DB2 to start
correctly, for instance, which previously suffered the failure described
above.

This patch causes no regressions on the libhugetblfs testsuite, and makes a
test (designed to catch this problem) pass which previously failed (ppc64,
POWER5).

Signed-off-by: David Gibson <dwg@au1.ibm.com>
Cc: William Lee Irwin III <wli@holomorphy.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/hugetlbfs/inode.c    |  74 ++++++++------------------
 include/linux/hugetlb.h |   8 ++-
 mm/hugetlb.c            | 136 ++++++++++++++++++++++++++++++++++++++++++++----
 3 files changed, 154 insertions(+), 64 deletions(-)

(limited to 'mm')

diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index b35195289945..1a1c2fcb7823 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -56,48 +56,10 @@ static void huge_pagevec_release(struct pagevec *pvec)
 	pagevec_reinit(pvec);
 }
 
-/*
- * huge_pages_needed tries to determine the number of new huge pages that
- * will be required to fully populate this VMA.  This will be equal to
- * the size of the VMA in huge pages minus the number of huge pages
- * (covered by this VMA) that are found in the page cache.
- *
- * Result is in bytes to be compatible with is_hugepage_mem_enough()
- */
-static unsigned long
-huge_pages_needed(struct address_space *mapping, struct vm_area_struct *vma)
-{
-	int i;
-	struct pagevec pvec;
-	unsigned long start = vma->vm_start;
-	unsigned long end = vma->vm_end;
-	unsigned long hugepages = (end - start) >> HPAGE_SHIFT;
-	pgoff_t next = vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT);
-	pgoff_t endpg = next + hugepages;
-
-	pagevec_init(&pvec, 0);
-	while (next < endpg) {
-		if (!pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE))
-			break;
-		for (i = 0; i < pagevec_count(&pvec); i++) {
-			struct page *page = pvec.pages[i];
-			if (page->index > next)
-				next = page->index;
-			if (page->index >= endpg)
-				break;
-			next++;
-			hugepages--;
-		}
-		huge_pagevec_release(&pvec);
-	}
-	return hugepages << HPAGE_SHIFT;
-}
-
 static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
 {
 	struct inode *inode = file->f_dentry->d_inode;
-	struct address_space *mapping = inode->i_mapping;
-	unsigned long bytes;
+	struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode);
 	loff_t len, vma_len;
 	int ret;
 
@@ -113,10 +75,6 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
 	if (vma->vm_end - vma->vm_start < HPAGE_SIZE)
 		return -EINVAL;
 
-	bytes = huge_pages_needed(mapping, vma);
-	if (!is_hugepage_mem_enough(bytes))
-		return -ENOMEM;
-
 	vma_len = (loff_t)(vma->vm_end - vma->vm_start);
 
 	mutex_lock(&inode->i_mutex);
@@ -129,6 +87,10 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
 	if (!(vma->vm_flags & VM_WRITE) && len > inode->i_size)
 		goto out;
 
+	if (vma->vm_flags & VM_MAYSHARE)
+		if (hugetlb_extend_reservation(info, len >> HPAGE_SHIFT) != 0)
+			goto out;
+
 	ret = 0;
 	hugetlb_prefault_arch_hook(vma->vm_mm);
 	if (inode->i_size < len)
@@ -227,13 +189,18 @@ static void truncate_huge_page(struct page *page)
 	put_page(page);
 }
 
-static void truncate_hugepages(struct address_space *mapping, loff_t lstart)
+static void truncate_hugepages(struct inode *inode, loff_t lstart)
 {
+	struct address_space *mapping = &inode->i_data;
 	const pgoff_t start = lstart >> HPAGE_SHIFT;
 	struct pagevec pvec;
 	pgoff_t next;
 	int i;
 
+	hugetlb_truncate_reservation(HUGETLBFS_I(inode),
+				     lstart >> HPAGE_SHIFT);
+	if (!mapping->nrpages)
+		return;
 	pagevec_init(&pvec, 0);
 	next = start;
 	while (1) {
@@ -262,8 +229,7 @@ static void truncate_hugepages(struct address_space *mapping, loff_t lstart)
 
 static void hugetlbfs_delete_inode(struct inode *inode)
 {
-	if (inode->i_data.nrpages)
-		truncate_hugepages(&inode->i_data, 0);
+	truncate_hugepages(inode, 0);
 	clear_inode(inode);
 }
 
@@ -296,8 +262,7 @@ static void hugetlbfs_forget_inode(struct inode *inode)
 	inode->i_state |= I_FREEING;
 	inodes_stat.nr_inodes--;
 	spin_unlock(&inode_lock);
-	if (inode->i_data.nrpages)
-		truncate_hugepages(&inode->i_data, 0);
+	truncate_hugepages(inode, 0);
 	clear_inode(inode);
 	destroy_inode(inode);
 }
@@ -356,7 +321,7 @@ static int hugetlb_vmtruncate(struct inode *inode, loff_t offset)
 	if (!prio_tree_empty(&mapping->i_mmap))
 		hugetlb_vmtruncate_list(&mapping->i_mmap, pgoff);
 	spin_unlock(&mapping->i_mmap_lock);
-	truncate_hugepages(mapping, offset);
+	truncate_hugepages(inode, offset);
 	return 0;
 }
 
@@ -573,6 +538,7 @@ static struct inode *hugetlbfs_alloc_inode(struct super_block *sb)
 		hugetlbfs_inc_free_inodes(sbinfo);
 		return NULL;
 	}
+	p->prereserved_hpages = 0;
 	return &p->vfs_inode;
 }
 
@@ -805,9 +771,6 @@ struct file *hugetlb_zero_setup(size_t size)
 	if (!can_do_hugetlb_shm())
 		return ERR_PTR(-EPERM);
 
-	if (!is_hugepage_mem_enough(size))
-		return ERR_PTR(-ENOMEM);
-
 	if (!user_shm_lock(size, current->user))
 		return ERR_PTR(-ENOMEM);
 
@@ -831,6 +794,11 @@ struct file *hugetlb_zero_setup(size_t size)
 	if (!inode)
 		goto out_file;
 
+	error = -ENOMEM;
+	if (hugetlb_extend_reservation(HUGETLBFS_I(inode),
+				       size >> HPAGE_SHIFT) != 0)
+		goto out_inode;
+
 	d_instantiate(dentry, inode);
 	inode->i_size = size;
 	inode->i_nlink = 0;
@@ -841,6 +809,8 @@ struct file *hugetlb_zero_setup(size_t size)
 	file->f_mode = FMODE_WRITE | FMODE_READ;
 	return file;
 
+out_inode:
+	iput(inode);
 out_file:
 	put_filp(file);
 out_dentry:
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index fa83836b63d2..cafe73eecb05 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -20,7 +20,6 @@ void unmap_hugepage_range(struct vm_area_struct *, unsigned long, unsigned long)
 int hugetlb_prefault(struct address_space *, struct vm_area_struct *);
 int hugetlb_report_meminfo(char *);
 int hugetlb_report_node_meminfo(int, char *);
-int is_hugepage_mem_enough(size_t);
 unsigned long hugetlb_total_pages(void);
 struct page *alloc_huge_page(struct vm_area_struct *, unsigned long);
 void free_huge_page(struct page *);
@@ -89,7 +88,6 @@ static inline unsigned long hugetlb_total_pages(void)
 #define copy_hugetlb_page_range(src, dst, vma)	({ BUG(); 0; })
 #define hugetlb_prefault(mapping, vma)		({ BUG(); 0; })
 #define unmap_hugepage_range(vma, start, end)	BUG()
-#define is_hugepage_mem_enough(size)		0
 #define hugetlb_report_meminfo(buf)		0
 #define hugetlb_report_node_meminfo(n, buf)	0
 #define follow_huge_pmd(mm, addr, pmd, write)	NULL
@@ -132,6 +130,8 @@ struct hugetlbfs_sb_info {
 
 struct hugetlbfs_inode_info {
 	struct shared_policy policy;
+	/* Protected by the (global) hugetlb_lock */
+	unsigned long prereserved_hpages;
 	struct inode vfs_inode;
 };
 
@@ -148,6 +148,10 @@ static inline struct hugetlbfs_sb_info *HUGETLBFS_SB(struct super_block *sb)
 extern struct file_operations hugetlbfs_file_operations;
 extern struct vm_operations_struct hugetlb_vm_ops;
 struct file *hugetlb_zero_setup(size_t);
+int hugetlb_extend_reservation(struct hugetlbfs_inode_info *info,
+			       unsigned long atleast_hpages);
+void hugetlb_truncate_reservation(struct hugetlbfs_inode_info *info,
+				  unsigned long atmost_hpages);
 int hugetlb_get_quota(struct address_space *mapping);
 void hugetlb_put_quota(struct address_space *mapping);
 
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index d5987a87bbe5..27fad5d9bcf6 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -22,7 +22,7 @@
 #include "internal.h"
 
 const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL;
-static unsigned long nr_huge_pages, free_huge_pages;
+static unsigned long nr_huge_pages, free_huge_pages, reserved_huge_pages;
 unsigned long max_huge_pages;
 static struct list_head hugepage_freelists[MAX_NUMNODES];
 static unsigned int nr_huge_pages_node[MAX_NUMNODES];
@@ -120,17 +120,136 @@ void free_huge_page(struct page *page)
 
 struct page *alloc_huge_page(struct vm_area_struct *vma, unsigned long addr)
 {
+	struct inode *inode = vma->vm_file->f_dentry->d_inode;
 	struct page *page;
+	int use_reserve = 0;
+	unsigned long idx;
 
 	spin_lock(&hugetlb_lock);
-	page = dequeue_huge_page(vma, addr);
-	if (!page) {
-		spin_unlock(&hugetlb_lock);
-		return NULL;
+
+	if (vma->vm_flags & VM_MAYSHARE) {
+
+		/* idx = radix tree index, i.e. offset into file in
+		 * HPAGE_SIZE units */
+		idx = ((addr - vma->vm_start) >> HPAGE_SHIFT)
+			+ (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));
+
+		/* The hugetlbfs specific inode info stores the number
+		 * of "guaranteed available" (huge) pages.  That is,
+		 * the first 'prereserved_hpages' pages of the inode
+		 * are either already instantiated, or have been
+		 * pre-reserved (by hugetlb_reserve_for_inode()). Here
+		 * we're in the process of instantiating the page, so
+		 * we use this to determine whether to draw from the
+		 * pre-reserved pool or the truly free pool. */
+		if (idx < HUGETLBFS_I(inode)->prereserved_hpages)
+			use_reserve = 1;
+	}
+
+	if (!use_reserve) {
+		if (free_huge_pages <= reserved_huge_pages)
+			goto fail;
+	} else {
+		BUG_ON(reserved_huge_pages == 0);
+		reserved_huge_pages--;
 	}
+
+	page = dequeue_huge_page(vma, addr);
+	if (!page)
+		goto fail;
+
 	spin_unlock(&hugetlb_lock);
 	set_page_refcounted(page);
 	return page;
+
+ fail:
+	WARN_ON(use_reserve); /* reserved allocations shouldn't fail */
+	spin_unlock(&hugetlb_lock);
+	return NULL;
+}
+
+/* hugetlb_extend_reservation()
+ *
+ * Ensure that at least 'atleast' hugepages are, and will remain,
+ * available to instantiate the first 'atleast' pages of the given
+ * inode.  If the inode doesn't already have this many pages reserved
+ * or instantiated, set aside some hugepages in the reserved pool to
+ * satisfy later faults (or fail now if there aren't enough, rather
+ * than getting the SIGBUS later).
+ */
+int hugetlb_extend_reservation(struct hugetlbfs_inode_info *info,
+			       unsigned long atleast)
+{
+	struct inode *inode = &info->vfs_inode;
+	unsigned long change_in_reserve = 0;
+	int ret = 0;
+
+	spin_lock(&hugetlb_lock);
+	read_lock_irq(&inode->i_mapping->tree_lock);
+
+	if (info->prereserved_hpages >= atleast)
+		goto out;
+
+	/* Because we always call this on shared mappings, none of the
+	 * pages beyond info->prereserved_hpages can have been
+	 * instantiated, so we need to reserve all of them now. */
+	change_in_reserve = atleast - info->prereserved_hpages;
+
+	if ((reserved_huge_pages + change_in_reserve) > free_huge_pages) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	reserved_huge_pages += change_in_reserve;
+	info->prereserved_hpages = atleast;
+
+ out:
+	read_unlock_irq(&inode->i_mapping->tree_lock);
+	spin_unlock(&hugetlb_lock);
+
+	return ret;
+}
+
+/* hugetlb_truncate_reservation()
+ *
+ * This returns pages reserved for the given inode to the general free
+ * hugepage pool.  If the inode has any pages prereserved, but not
+ * instantiated, beyond offset (atmost << HPAGE_SIZE), then release
+ * them.
+ */
+void hugetlb_truncate_reservation(struct hugetlbfs_inode_info *info,
+				  unsigned long atmost)
+{
+	struct inode *inode = &info->vfs_inode;
+	struct address_space *mapping = inode->i_mapping;
+	unsigned long idx;
+	unsigned long change_in_reserve = 0;
+	struct page *page;
+
+	spin_lock(&hugetlb_lock);
+	read_lock_irq(&inode->i_mapping->tree_lock);
+
+	if (info->prereserved_hpages <= atmost)
+		goto out;
+
+	/* Count pages which were reserved, but not instantiated, and
+	 * which we can now release. */
+	for (idx = atmost; idx < info->prereserved_hpages; idx++) {
+		page = radix_tree_lookup(&mapping->page_tree, idx);
+		if (!page)
+			/* Pages which are already instantiated can't
+			 * be unreserved (and in fact have already
+			 * been removed from the reserved pool) */
+			change_in_reserve++;
+	}
+
+	BUG_ON(reserved_huge_pages < change_in_reserve);
+	reserved_huge_pages -= change_in_reserve;
+	info->prereserved_hpages = atmost;
+
+ out:
+	read_unlock_irq(&inode->i_mapping->tree_lock);
+	spin_unlock(&hugetlb_lock);
 }
 
 static int __init hugetlb_init(void)
@@ -238,9 +357,11 @@ int hugetlb_report_meminfo(char *buf)
 	return sprintf(buf,
 			"HugePages_Total: %5lu\n"
 			"HugePages_Free:  %5lu\n"
+		        "HugePages_Rsvd:  %5lu\n"
 			"Hugepagesize:    %5lu kB\n",
 			nr_huge_pages,
 			free_huge_pages,
+		        reserved_huge_pages,
 			HPAGE_SIZE/1024);
 }
 
@@ -253,11 +374,6 @@ int hugetlb_report_node_meminfo(int nid, char *buf)
 		nid, free_huge_pages_node[nid]);
 }
 
-int is_hugepage_mem_enough(size_t size)
-{
-	return (size + ~HPAGE_MASK)/HPAGE_SIZE <= free_huge_pages;
-}
-
 /* Return the number pages of memory we physically have, in PAGE_SIZE units. */
 unsigned long hugetlb_total_pages(void)
 {
-- 
cgit v1.2.3


From 27a85ef1b81300cfff06b4c8037e9914dfb09acc Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Wed, 22 Mar 2006 00:08:56 -0800
Subject: [PATCH] hugepage: Make {alloc,free}_huge_page() local

Originally, mm/hugetlb.c just handled the hugepage physical allocation path
and its {alloc,free}_huge_page() functions were used from the arch specific
hugepage code.  These days those functions are only used with mm/hugetlb.c
itself.  Therefore, this patch makes them static and removes their
prototypes from hugetlb.h.  This requires a small rearrangement of code in
mm/hugetlb.c to avoid a forward declaration.

This patch causes no regressions on the libhugetlbfs testsuite (ppc64,
POWER5).

Signed-off-by: David Gibson <dwg@au1.ibm.com>
Cc: William Lee Irwin III <wli@holomorphy.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/hugetlb.h |  4 ----
 mm/hugetlb.c            | 25 +++++++++++++------------
 2 files changed, 13 insertions(+), 16 deletions(-)

(limited to 'mm')

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index cafe73eecb05..5d84c368ffe4 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -21,8 +21,6 @@ int hugetlb_prefault(struct address_space *, struct vm_area_struct *);
 int hugetlb_report_meminfo(char *);
 int hugetlb_report_node_meminfo(int, char *);
 unsigned long hugetlb_total_pages(void);
-struct page *alloc_huge_page(struct vm_area_struct *, unsigned long);
-void free_huge_page(struct page *);
 int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 			unsigned long address, int write_access);
 
@@ -97,8 +95,6 @@ static inline unsigned long hugetlb_total_pages(void)
 #define is_hugepage_only_range(mm, addr, len)	0
 #define hugetlb_free_pgd_range(tlb, addr, end, floor, ceiling) \
 						do { } while (0)
-#define alloc_huge_page(vma, addr)		({ NULL; })
-#define free_huge_page(p)			({ (void)(p); BUG(); })
 #define hugetlb_fault(mm, vma, addr, write)	({ BUG(); 0; })
 
 #define hugetlb_change_protection(vma, address, end, newprot)
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 27fad5d9bcf6..075877b1cbc0 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -88,6 +88,17 @@ static struct page *dequeue_huge_page(struct vm_area_struct *vma,
 	return page;
 }
 
+static void free_huge_page(struct page *page)
+{
+	BUG_ON(page_count(page));
+
+	INIT_LIST_HEAD(&page->lru);
+
+	spin_lock(&hugetlb_lock);
+	enqueue_huge_page(page);
+	spin_unlock(&hugetlb_lock);
+}
+
 static int alloc_fresh_huge_page(void)
 {
 	static int nid = 0;
@@ -107,18 +118,8 @@ static int alloc_fresh_huge_page(void)
 	return 0;
 }
 
-void free_huge_page(struct page *page)
-{
-	BUG_ON(page_count(page));
-
-	INIT_LIST_HEAD(&page->lru);
-
-	spin_lock(&hugetlb_lock);
-	enqueue_huge_page(page);
-	spin_unlock(&hugetlb_lock);
-}
-
-struct page *alloc_huge_page(struct vm_area_struct *vma, unsigned long addr)
+static struct page *alloc_huge_page(struct vm_area_struct *vma,
+				    unsigned long addr)
 {
 	struct inode *inode = vma->vm_file->f_dentry->d_inode;
 	struct page *page;
-- 
cgit v1.2.3


From 9da61aef0fd5b17dd4bf4baf33db12c470def774 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Wed, 22 Mar 2006 00:08:57 -0800
Subject: [PATCH] hugepage: Fix hugepage logic in free_pgtables()

free_pgtables() has special logic to call hugetlb_free_pgd_range() instead
of the normal free_pgd_range() on hugepage VMAs.  However, the test it uses
to do so is incorrect: it calls is_hugepage_only_range on a hugepage sized
range at the start of the vma.  is_hugepage_only_range() will return true
if the given range has any intersection with a hugepage address region, and
in this case the given region need not be hugepage aligned.  So, for
example, this test can return true if called on, say, a 4k VMA immediately
preceding a (nicely aligned) hugepage VMA.

At present we get away with this because the powerpc version of
hugetlb_free_pgd_range() is just a call to free_pgd_range().  On ia64 (the
only other arch with a non-trivial is_hugepage_only_range()) we get away
with it for a different reason; the hugepage area is not contiguous with
the rest of the user address space, and VMAs are not permitted in between,
so the test can't return a false positive there.

Nonetheless this should be fixed.  We do that in the patch below by
replacing the is_hugepage_only_range() test with an explicit test of the
VMA using is_vm_hugetlb_page().

This in turn changes behaviour for platforms where is_hugepage_only_range()
returns false always (everything except powerpc and ia64).  We address this
by ensuring that hugetlb_free_pgd_range() is defined to be identical to
free_pgd_range() (instead of a no-op) on everything except ia64.  Even so,
it will prevent some otherwise possible coalescing of calls down to
free_pgd_range().  Since this only happens for hugepage VMAs, removing this
small optimization seems unlikely to cause any trouble.

This patch causes no regressions on the libhugetlbfs testsuite - ppc64
POWER5 (8-way), ppc64 G5 (2-way) and i386 Pentium M (UP).

Signed-off-by: David Gibson <dwg@au1.ibm.com>
Cc: William Lee Irwin III <wli@holomorphy.com>
Acked-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/asm-ia64/page.h       | 1 +
 include/asm-powerpc/pgtable.h | 5 -----
 include/linux/hugetlb.h       | 9 +++++----
 mm/memory.c                   | 5 ++---
 4 files changed, 8 insertions(+), 12 deletions(-)

(limited to 'mm')

diff --git a/include/asm-ia64/page.h b/include/asm-ia64/page.h
index 5e6362a786b7..732cf3086741 100644
--- a/include/asm-ia64/page.h
+++ b/include/asm-ia64/page.h
@@ -57,6 +57,7 @@
 
 # define HAVE_ARCH_HUGETLB_UNMAPPED_AREA
 # define ARCH_HAS_HUGEPAGE_ONLY_RANGE
+# define ARCH_HAS_HUGETLB_FREE_PGD_RANGE
 #endif /* CONFIG_HUGETLB_PAGE */
 
 #ifdef __ASSEMBLY__
diff --git a/include/asm-powerpc/pgtable.h b/include/asm-powerpc/pgtable.h
index e38931379a72..185ee15963a1 100644
--- a/include/asm-powerpc/pgtable.h
+++ b/include/asm-powerpc/pgtable.h
@@ -468,11 +468,6 @@ extern pgd_t swapper_pg_dir[];
 
 extern void paging_init(void);
 
-#ifdef CONFIG_HUGETLB_PAGE
-#define hugetlb_free_pgd_range(tlb, addr, end, floor, ceiling) \
-	free_pgd_range(tlb, addr, end, floor, ceiling)
-#endif
-
 /*
  * This gets called at the end of handling a page fault, when
  * the kernel has put a new PTE into the page table for the process.
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 5d84c368ffe4..e465fbf1ef5f 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -43,8 +43,10 @@ void hugetlb_change_protection(struct vm_area_struct *vma,
 
 #ifndef ARCH_HAS_HUGEPAGE_ONLY_RANGE
 #define is_hugepage_only_range(mm, addr, len)	0
-#define hugetlb_free_pgd_range(tlb, addr, end, floor, ceiling) \
-						do { } while (0)
+#endif
+
+#ifndef ARCH_HAS_HUGETLB_FREE_PGD_RANGE
+#define hugetlb_free_pgd_range	free_pgd_range
 #endif
 
 #ifndef ARCH_HAS_PREPARE_HUGEPAGE_RANGE
@@ -93,8 +95,7 @@ static inline unsigned long hugetlb_total_pages(void)
 #define prepare_hugepage_range(addr, len)	(-EINVAL)
 #define pmd_huge(x)	0
 #define is_hugepage_only_range(mm, addr, len)	0
-#define hugetlb_free_pgd_range(tlb, addr, end, floor, ceiling) \
-						do { } while (0)
+#define hugetlb_free_pgd_range(tlb, addr, end, floor, ceiling) ({BUG(); 0; })
 #define hugetlb_fault(mm, vma, addr, write)	({ BUG(); 0; })
 
 #define hugetlb_change_protection(vma, address, end, newprot)
diff --git a/mm/memory.c b/mm/memory.c
index 71bc664efed5..f6e3be9cbf5a 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -277,7 +277,7 @@ void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *vma,
 		anon_vma_unlink(vma);
 		unlink_file_vma(vma);
 
-		if (is_hugepage_only_range(vma->vm_mm, addr, HPAGE_SIZE)) {
+		if (is_vm_hugetlb_page(vma)) {
 			hugetlb_free_pgd_range(tlb, addr, vma->vm_end,
 				floor, next? next->vm_start: ceiling);
 		} else {
@@ -285,8 +285,7 @@ void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *vma,
 			 * Optimization: gather nearby vmas into one call down
 			 */
 			while (next && next->vm_start <= vma->vm_end + PMD_SIZE
-			  && !is_hugepage_only_range(vma->vm_mm, next->vm_start,
-							HPAGE_SIZE)) {
+			       && !is_vm_hugetlb_page(vma)) {
 				vma = next;
 				next = vma->vm_next;
 				anon_vma_unlink(vma);
-- 
cgit v1.2.3


From 4866920b93fd7d5b520278c3c76e6f4d5a352d81 Mon Sep 17 00:00:00 2001
From: David Gibson <david@gibson.dropbear.id.au>
Date: Wed, 22 Mar 2006 00:08:58 -0800
Subject: [PATCH] hugepage: Fix hugepage logic in free_pgtables() harder

Turns out the hugepage logic in free_pgtables() was doubly broken.  The
loop coalescing multiple normal page VMAs into one call to free_pgd_range()
had an off by one error, which could mean it would coalesce one hugepage
VMA into the same bundle (checking 'vma' not 'next' in the loop).  I
transferred this bug into the new is_vm_hugetlb_page() based version.
Here's the fix.

This one didn't bite on powerpc previously for the same reason the
is_hugepage_only_range() problem didn't: powerpc's hugetlb_free_pgd_range()
is identical to free_pgd_range().  It didn't bite on ia64 because the
hugepage region is distant enough from any other region that the separated
PMD_SIZE distance test would always prevent coalescing the two together.

No libhugetlbfs testsuite regressions (ppc64, POWER5).

Signed-off-by: David Gibson <dwg@au1.ibm.com>
Cc: William Lee Irwin III <wli@holomorphy.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/memory.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'mm')

diff --git a/mm/memory.c b/mm/memory.c
index f6e3be9cbf5a..80c3fb370f91 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -285,7 +285,7 @@ void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *vma,
 			 * Optimization: gather nearby vmas into one call down
 			 */
 			while (next && next->vm_start <= vma->vm_end + PMD_SIZE
-			       && !is_vm_hugetlb_page(vma)) {
+			       && !is_vm_hugetlb_page(next)) {
 				vma = next;
 				next = vma->vm_next;
 				anon_vma_unlink(vma);
-- 
cgit v1.2.3


From d5d4b0aa4e1430d73050babba999365593bdb9d2 Mon Sep 17 00:00:00 2001
From: "Chen, Kenneth W" <kenneth.w.chen@intel.com>
Date: Wed, 22 Mar 2006 00:09:03 -0800
Subject: [PATCH] optimize follow_hugetlb_page

follow_hugetlb_page() walks a range of user virtual address and then fills
in list of struct page * into an array that is passed from the argument
list.  It also gets a reference count via get_page().  For compound page,
get_page() actually traverse back to head page via page_private() macro and
then adds a reference count to the head page.  Since we are doing a virt to
pte look up, kernel already has a struct page pointer into the head page.
So instead of traverse into the small unit page struct and then follow a
link back to the head page, optimize that with incrementing the reference
count directly on the head page.

The benefit is that we don't take a cache miss on accessing page struct for
the corresponding user address and more importantly, not to pollute the
cache with a "not very useful" round trip of pointer chasing.  This adds a
moderate performance gain on an I/O intensive database transaction
workload.

Signed-off-by: Ken Chen <kenneth.w.chen@intel.com>
Cc: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/hugetlb.c | 25 +++++++++++++++++--------
 1 file changed, 17 insertions(+), 8 deletions(-)

(limited to 'mm')

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 075877b1cbc0..06699d871a8e 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -661,10 +661,10 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
 			struct page **pages, struct vm_area_struct **vmas,
 			unsigned long *position, int *length, int i)
 {
-	unsigned long vpfn, vaddr = *position;
+	unsigned long pfn_offset;
+	unsigned long vaddr = *position;
 	int remainder = *length;
 
-	vpfn = vaddr/PAGE_SIZE;
 	spin_lock(&mm->page_table_lock);
 	while (vaddr < vma->vm_end && remainder) {
 		pte_t *pte;
@@ -692,19 +692,28 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
 			break;
 		}
 
-		if (pages) {
-			page = &pte_page(*pte)[vpfn % (HPAGE_SIZE/PAGE_SIZE)];
-			get_page(page);
-			pages[i] = page;
-		}
+		pfn_offset = (vaddr & ~HPAGE_MASK) >> PAGE_SHIFT;
+		page = pte_page(*pte);
+same_page:
+		get_page(page);
+		if (pages)
+			pages[i] = page + pfn_offset;
 
 		if (vmas)
 			vmas[i] = vma;
 
 		vaddr += PAGE_SIZE;
-		++vpfn;
+		++pfn_offset;
 		--remainder;
 		++i;
+		if (vaddr < vma->vm_end && remainder &&
+				pfn_offset < HPAGE_SIZE/PAGE_SIZE) {
+			/*
+			 * We use pfn_offset to avoid touching the pageframes
+			 * of this compound page.
+			 */
+			goto same_page;
+		}
 	}
 	spin_unlock(&mm->page_table_lock);
 	*length = remainder;
-- 
cgit v1.2.3


From 248a0301e703cbf781aa02a91bcfc6da75870dd7 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Wed, 22 Mar 2006 00:09:04 -0800
Subject: [PATCH] mm: make shrink_all_memory try harder

Make shrink_all_memory() repeat the attempts to free more memory if there
seems to be no pages to free.

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Cc: Pavel Machek <pavel@ucw.cz>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/vmscan.c | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'mm')

diff --git a/mm/vmscan.c b/mm/vmscan.c
index f713e9f6ac73..548e023c193b 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -33,6 +33,7 @@
 #include <linux/cpuset.h>
 #include <linux/notifier.h>
 #include <linux/rwsem.h>
+#include <linux/delay.h>
 
 #include <asm/tlbflush.h>
 #include <asm/div64.h>
@@ -1784,11 +1785,13 @@ unsigned long shrink_all_memory(unsigned long nr_pages)
 	pg_data_t *pgdat;
 	unsigned long nr_to_free = nr_pages;
 	unsigned long ret = 0;
+	unsigned retry = 2;
 	struct reclaim_state reclaim_state = {
 		.reclaimed_slab = 0,
 	};
 
 	current->reclaim_state = &reclaim_state;
+repeat:
 	for_each_pgdat(pgdat) {
 		unsigned long freed;
 
@@ -1798,6 +1801,10 @@ unsigned long shrink_all_memory(unsigned long nr_pages)
 		if ((long)nr_to_free <= 0)
 			break;
 	}
+	if (retry-- && ret < nr_pages) {
+		blk_congestion_wait(WRITE, HZ/5);
+		goto repeat;
+	}
 	current->reclaim_state = NULL;
 	return ret;
 }
-- 
cgit v1.2.3


From 35386e3b0f876bf194982f48f027af0c216499ce Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@engr.sgi.com>
Date: Wed, 22 Mar 2006 00:09:05 -0800
Subject: [PATCH] slab: cache_reap(): further reduction in interrupt holdoff

cache_reap takes the l3->list_lock (disabling interrupts) unconditionally
and then does a few checks and maybe does some cleanup.  This patch makes
cache_reap() only take the lock if there is work to do and then the lock is
taken and released for each cleaning action.

The checking of when to do the next reaping is done without any locking and
becomes racy.  Should not matter since reaping can also be skipped if the
slab mutex cannot be acquired.

The same is true for the touched processing.  If we get this wrong once in
awhile then we will mistakenly clean or not clean the shared cache.  This
will impact performance slightly.

Note that the additional drain_array() function introduced here will fall
out in a subsequent patch since array cleaning will now be very similar
from all callers.

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Cc: Pekka Enberg <penberg@cs.helsinki.fi>
Cc: Manfred Spraul <manfred@colorfullife.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/slab.c | 57 +++++++++++++++++++++++++++++++++++++++++++--------------
 1 file changed, 43 insertions(+), 14 deletions(-)

(limited to 'mm')

diff --git a/mm/slab.c b/mm/slab.c
index ff0ab772f49d..1845c0127394 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -292,13 +292,13 @@ struct kmem_list3 {
 	struct list_head slabs_full;
 	struct list_head slabs_free;
 	unsigned long free_objects;
-	unsigned long next_reap;
-	int free_touched;
 	unsigned int free_limit;
 	unsigned int colour_next;	/* Per-node cache coloring */
 	spinlock_t list_lock;
 	struct array_cache *shared;	/* shared per node */
 	struct array_cache **alien;	/* on other nodes */
+	unsigned long next_reap;	/* updated without locking */
+	int free_touched;		/* updated without locking */
 };
 
 /*
@@ -3539,6 +3539,22 @@ static void drain_array_locked(struct kmem_cache *cachep,
 	}
 }
 
+
+/*
+ * Drain an array if it contains any elements taking the l3 lock only if
+ * necessary.
+ */
+static void drain_array(struct kmem_cache *searchp, struct kmem_list3 *l3,
+					 struct array_cache *ac)
+{
+	if (ac && ac->avail) {
+		spin_lock_irq(&l3->list_lock);
+		drain_array_locked(searchp, ac, 0,
+				   numa_node_id());
+		spin_unlock_irq(&l3->list_lock);
+	}
+}
+
 /**
  * cache_reap - Reclaim memory from caches.
  * @unused: unused parameter
@@ -3572,33 +3588,48 @@ static void cache_reap(void *unused)
 		searchp = list_entry(walk, struct kmem_cache, next);
 		check_irq_on();
 
+		/*
+		 * We only take the l3 lock if absolutely necessary and we
+		 * have established with reasonable certainty that
+		 * we can do some work if the lock was obtained.
+		 */
 		l3 = searchp->nodelists[numa_node_id()];
+
 		reap_alien(searchp, l3);
-		spin_lock_irq(&l3->list_lock);
 
-		drain_array_locked(searchp, cpu_cache_get(searchp), 0,
-				   numa_node_id());
+		drain_array(searchp, l3, cpu_cache_get(searchp));
 
+		/*
+		 * These are racy checks but it does not matter
+		 * if we skip one check or scan twice.
+		 */
 		if (time_after(l3->next_reap, jiffies))
-			goto next_unlock;
+			goto next;
 
 		l3->next_reap = jiffies + REAPTIMEOUT_LIST3;
 
-		if (l3->shared)
-			drain_array_locked(searchp, l3->shared, 0,
-					   numa_node_id());
+		drain_array(searchp, l3, l3->shared);
 
 		if (l3->free_touched) {
 			l3->free_touched = 0;
-			goto next_unlock;
+			goto next;
 		}
 
 		tofree = (l3->free_limit + 5 * searchp->num - 1) /
 				(5 * searchp->num);
 		do {
+			/*
+			 * Do not lock if there are no free blocks.
+			 */
+			if (list_empty(&l3->slabs_free))
+				break;
+
+			spin_lock_irq(&l3->list_lock);
 			p = l3->slabs_free.next;
-			if (p == &(l3->slabs_free))
+			if (p == &(l3->slabs_free)) {
+				spin_unlock_irq(&l3->list_lock);
 				break;
+			}
 
 			slabp = list_entry(p, struct slab, list);
 			BUG_ON(slabp->inuse);
@@ -3613,10 +3644,8 @@ static void cache_reap(void *unused)
 			l3->free_objects -= searchp->num;
 			spin_unlock_irq(&l3->list_lock);
 			slab_destroy(searchp, slabp);
-			spin_lock_irq(&l3->list_lock);
 		} while (--tofree > 0);
-next_unlock:
-		spin_unlock_irq(&l3->list_lock);
+next:
 		cond_resched();
 	}
 	check_irq_on();
-- 
cgit v1.2.3


From aab2207cf8d9c343b6b5f0e4d27e1732f8618d14 Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@engr.sgi.com>
Date: Wed, 22 Mar 2006 00:09:06 -0800
Subject: [PATCH] slab: make drain_array more universal by adding more
 parameters

And a parameter to drain_array to control the freeing of all objects and
then use drain_array() to replace instances of drain_array_locked with
drain_array.  Doing so will avoid taking locks in those locations if the
arrays are empty.

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/slab.c | 20 +++++++++++---------
 1 file changed, 11 insertions(+), 9 deletions(-)

(limited to 'mm')

diff --git a/mm/slab.c b/mm/slab.c
index 1845c0127394..d73b38e7d7e8 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -2126,6 +2126,10 @@ static void check_spinlock_acquired_node(struct kmem_cache *cachep, int node)
 static void drain_array_locked(struct kmem_cache *cachep,
 			struct array_cache *ac, int force, int node);
 
+static void drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3,
+			struct array_cache *ac,
+			int force, int node);
+
 static void do_drain(void *arg)
 {
 	struct kmem_cache *cachep = arg;
@@ -2150,9 +2154,7 @@ static void drain_cpu_caches(struct kmem_cache *cachep)
 	for_each_online_node(node) {
 		l3 = cachep->nodelists[node];
 		if (l3) {
-			spin_lock_irq(&l3->list_lock);
-			drain_array_locked(cachep, l3->shared, 1, node);
-			spin_unlock_irq(&l3->list_lock);
+			drain_array(cachep, l3, l3->shared, 1, node);
 			if (l3->alien)
 				drain_alien_cache(cachep, l3->alien);
 		}
@@ -3545,12 +3547,11 @@ static void drain_array_locked(struct kmem_cache *cachep,
  * necessary.
  */
 static void drain_array(struct kmem_cache *searchp, struct kmem_list3 *l3,
-					 struct array_cache *ac)
+			 struct array_cache *ac, int force, int node)
 {
 	if (ac && ac->avail) {
 		spin_lock_irq(&l3->list_lock);
-		drain_array_locked(searchp, ac, 0,
-				   numa_node_id());
+		drain_array_locked(searchp, ac, force, node);
 		spin_unlock_irq(&l3->list_lock);
 	}
 }
@@ -3571,6 +3572,7 @@ static void cache_reap(void *unused)
 {
 	struct list_head *walk;
 	struct kmem_list3 *l3;
+	int node = numa_node_id();
 
 	if (!mutex_trylock(&cache_chain_mutex)) {
 		/* Give up. Setup the next iteration. */
@@ -3593,11 +3595,11 @@ static void cache_reap(void *unused)
 		 * have established with reasonable certainty that
 		 * we can do some work if the lock was obtained.
 		 */
-		l3 = searchp->nodelists[numa_node_id()];
+		l3 = searchp->nodelists[node];
 
 		reap_alien(searchp, l3);
 
-		drain_array(searchp, l3, cpu_cache_get(searchp));
+		drain_array(searchp, l3, cpu_cache_get(searchp), 0, node);
 
 		/*
 		 * These are racy checks but it does not matter
@@ -3608,7 +3610,7 @@ static void cache_reap(void *unused)
 
 		l3->next_reap = jiffies + REAPTIMEOUT_LIST3;
 
-		drain_array(searchp, l3, l3->shared);
+		drain_array(searchp, l3, l3->shared, 0, node);
 
 		if (l3->free_touched) {
 			l3->free_touched = 0;
-- 
cgit v1.2.3


From 1b55253a7f95adc82eb20937b57b3e3e32ba65df Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@engr.sgi.com>
Date: Wed, 22 Mar 2006 00:09:07 -0800
Subject: [PATCH] slab: remove drain_array_locked

Remove drain_array_locked and use that opportunity to limit the time the l3
lock is taken further.

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/slab.c | 31 ++++++++++---------------------
 1 file changed, 10 insertions(+), 21 deletions(-)

(limited to 'mm')

diff --git a/mm/slab.c b/mm/slab.c
index d73b38e7d7e8..3274144c0d16 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -2123,9 +2123,6 @@ static void check_spinlock_acquired_node(struct kmem_cache *cachep, int node)
 #define check_spinlock_acquired_node(x, y) do { } while(0)
 #endif
 
-static void drain_array_locked(struct kmem_cache *cachep,
-			struct array_cache *ac, int force, int node);
-
 static void drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3,
 			struct array_cache *ac,
 			int force, int node);
@@ -3522,40 +3519,32 @@ static void enable_cpucache(struct kmem_cache *cachep)
 		       cachep->name, -err);
 }
 
-static void drain_array_locked(struct kmem_cache *cachep,
-				struct array_cache *ac, int force, int node)
+/*
+ * Drain an array if it contains any elements taking the l3 lock only if
+ * necessary.
+ */
+void drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3,
+			 struct array_cache *ac, int force, int node)
 {
 	int tofree;
 
-	check_spinlock_acquired_node(cachep, node);
+	if (!ac || !ac->avail)
+		return;
 	if (ac->touched && !force) {
 		ac->touched = 0;
 	} else if (ac->avail) {
 		tofree = force ? ac->avail : (ac->limit + 4) / 5;
 		if (tofree > ac->avail)
 			tofree = (ac->avail + 1) / 2;
+		spin_lock_irq(&l3->list_lock);
 		free_block(cachep, ac->entry, tofree, node);
+		spin_unlock_irq(&l3->list_lock);
 		ac->avail -= tofree;
 		memmove(ac->entry, &(ac->entry[tofree]),
 			sizeof(void *) * ac->avail);
 	}
 }
 
-
-/*
- * Drain an array if it contains any elements taking the l3 lock only if
- * necessary.
- */
-static void drain_array(struct kmem_cache *searchp, struct kmem_list3 *l3,
-			 struct array_cache *ac, int force, int node)
-{
-	if (ac && ac->avail) {
-		spin_lock_irq(&l3->list_lock);
-		drain_array_locked(searchp, ac, force, node);
-		spin_unlock_irq(&l3->list_lock);
-	}
-}
-
 /**
  * cache_reap - Reclaim memory from caches.
  * @unused: unused parameter
-- 
cgit v1.2.3


From b18e7e654d7af741d2bf34a90dc34128d0217fea Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@engr.sgi.com>
Date: Wed, 22 Mar 2006 00:09:07 -0800
Subject: [PATCH] slab: fix drain_array() so that it works correctly with the
 shared_array

The list_lock also protects the shared array and we call drain_array() with
the shared array.  Therefore we cannot go as far as I wanted to but have to
take the lock in a way so that it also protects the array_cache in
drain_pages.

(Note: maybe we should make the array_cache locking more consistent?  I.e.
always take the array cache lock for shared arrays and disable interrupts
for the per cpu arrays?)

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/slab.c | 21 ++++++++++++---------
 1 file changed, 12 insertions(+), 9 deletions(-)

(limited to 'mm')

diff --git a/mm/slab.c b/mm/slab.c
index 3274144c0d16..6b691ecbac44 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -3521,7 +3521,8 @@ static void enable_cpucache(struct kmem_cache *cachep)
 
 /*
  * Drain an array if it contains any elements taking the l3 lock only if
- * necessary.
+ * necessary. Note that the l3 listlock also protects the array_cache
+ * if drain_array() is used on the shared array.
  */
 void drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3,
 			 struct array_cache *ac, int force, int node)
@@ -3532,16 +3533,18 @@ void drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3,
 		return;
 	if (ac->touched && !force) {
 		ac->touched = 0;
-	} else if (ac->avail) {
-		tofree = force ? ac->avail : (ac->limit + 4) / 5;
-		if (tofree > ac->avail)
-			tofree = (ac->avail + 1) / 2;
+	} else {
 		spin_lock_irq(&l3->list_lock);
-		free_block(cachep, ac->entry, tofree, node);
+		if (ac->avail) {
+			tofree = force ? ac->avail : (ac->limit + 4) / 5;
+			if (tofree > ac->avail)
+				tofree = (ac->avail + 1) / 2;
+			free_block(cachep, ac->entry, tofree, node);
+			ac->avail -= tofree;
+			memmove(ac->entry, &(ac->entry[tofree]),
+				sizeof(void *) * ac->avail);
+		}
 		spin_unlock_irq(&l3->list_lock);
-		ac->avail -= tofree;
-		memmove(ac->entry, &(ac->entry[tofree]),
-			sizeof(void *) * ac->avail);
 	}
 }
 
-- 
cgit v1.2.3


From 879336c3930ae9273ea1c45214cb8adae0ce494a Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@sgi.com>
Date: Wed, 22 Mar 2006 00:09:08 -0800
Subject: [PATCH] drain_node_pages: interrupt latency reduction / optimization

1. Only disable interrupts if there is actually something to free

2. Only dirty the pcp cacheline if we actually freed something.

3. Disable interrupts for each single pcp and not for cleaning
  all the pcps in all zones of a node.

drain_node_pages is called every 2 seconds from cache_reap. This
fix should avoid most disabling of interrupts.

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/page_alloc.c | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

(limited to 'mm')

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index ed91684cb1f5..b7f14a4799a5 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -603,13 +603,14 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
 /*
  * Called from the slab reaper to drain pagesets on a particular node that
  * belong to the currently executing processor.
+ * Note that this function must be called with the thread pinned to
+ * a single processor.
  */
 void drain_node_pages(int nodeid)
 {
 	int i, z;
 	unsigned long flags;
 
-	local_irq_save(flags);
 	for (z = 0; z < MAX_NR_ZONES; z++) {
 		struct zone *zone = NODE_DATA(nodeid)->node_zones + z;
 		struct per_cpu_pageset *pset;
@@ -619,11 +620,14 @@ void drain_node_pages(int nodeid)
 			struct per_cpu_pages *pcp;
 
 			pcp = &pset->pcp[i];
-			free_pages_bulk(zone, pcp->count, &pcp->list, 0);
-			pcp->count = 0;
+			if (pcp->count) {
+				local_irq_save(flags);
+				free_pages_bulk(zone, pcp->count, &pcp->list, 0);
+				pcp->count = 0;
+				local_irq_restore(flags);
+			}
 		}
 	}
-	local_irq_restore(flags);
 }
 #endif
 
-- 
cgit v1.2.3


From 9b65ef59d42a56fa1358958ede77aaa5bac385a8 Mon Sep 17 00:00:00 2001
From: Akinobu Mita <mita@miraclelinux.com>
Date: Wed, 22 Mar 2006 00:09:09 -0800
Subject: [PATCH] fix swap cluster offset

When we've allocated SWAPFILE_CLUSTER pages, ->cluster_next should be the
first index of swap cluster.  But current code probably sets it wrong offset.

Signed-off-by: Akinobu Mita <mita@miraclelinux.com>
Acked-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/swapfile.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'mm')

diff --git a/mm/swapfile.c b/mm/swapfile.c
index 1f9cf0d073b8..365ed6ff182d 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -116,7 +116,7 @@ static inline unsigned long scan_swap_map(struct swap_info_struct *si)
 				last_in_cluster = offset + SWAPFILE_CLUSTER;
 			else if (offset == last_in_cluster) {
 				spin_lock(&swap_lock);
-				si->cluster_next = offset-SWAPFILE_CLUSTER-1;
+				si->cluster_next = offset-SWAPFILE_CLUSTER+1;
 				goto cluster;
 			}
 			if (unlikely(--latency_ration < 0)) {
-- 
cgit v1.2.3


From fdb7cc59084ba7eef935e4e40aaaf538ee34c625 Mon Sep 17 00:00:00 2001
From: Paul Jackson <pj@sgi.com>
Date: Wed, 22 Mar 2006 00:09:10 -0800
Subject: [PATCH] mm: hugetlb alloc_fresh_huge_page bogus node loop fix

Fix bogus node loop in hugetlb.c alloc_fresh_huge_page(), which was
assuming that nodes are numbered contiguously from 0 to num_online_nodes().
Once the hotplug folks get this far, that will be false.

Signed-off-by: Paul Jackson <pj@sgi.com>
Acked-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/hugetlb.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'mm')

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 06699d871a8e..ebad6bbb3501 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -105,7 +105,9 @@ static int alloc_fresh_huge_page(void)
 	struct page *page;
 	page = alloc_pages_node(nid, GFP_HIGHUSER|__GFP_COMP|__GFP_NOWARN,
 					HUGETLB_PAGE_ORDER);
-	nid = (nid + 1) % num_online_nodes();
+	nid = next_node(nid, node_online_map);
+	if (nid == MAX_NUMNODES)
+		nid = first_node(node_online_map);
 	if (page) {
 		page[1].lru.next = (void *)free_huge_page;	/* dtor */
 		spin_lock(&hugetlb_lock);
-- 
cgit v1.2.3


From 442295c94bf650221af3ef20fc68fa3e93876818 Mon Sep 17 00:00:00 2001
From: Paul Jackson <pj@sgi.com>
Date: Wed, 22 Mar 2006 00:09:11 -0800
Subject: [PATCH] mm: slab cache interleave rotor fix

The alien cache rotor in mm/slab.c assumes that the first online node is
node 0.  Eventually for some archs, especially with hotplug, this will no
longer be true.

Fix the interleave rotor to handle the general case of node numbering.

Signed-off-by: Paul Jackson <pj@sgi.com>
Acked-by: Christoph Lameter <clameter@engr.sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/slab.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'mm')

diff --git a/mm/slab.c b/mm/slab.c
index 6b691ecbac44..1c8f5ee230d5 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -830,7 +830,7 @@ static void init_reap_node(int cpu)
 
 	node = next_node(cpu_to_node(cpu), node_online_map);
 	if (node == MAX_NUMNODES)
-		node = 0;
+		node = first_node(node_online_map);
 
 	__get_cpu_var(reap_node) = node;
 }
-- 
cgit v1.2.3


From b20a35035f983f4ac7e29c4a68f30e43510007e0 Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@sgi.com>
Date: Wed, 22 Mar 2006 00:09:12 -0800
Subject: [PATCH] page migration reorg

Centralize the page migration functions in anticipation of additional
tinkering.  Creates a new file mm/migrate.c

1. Extract buffer_migrate_page() from fs/buffer.c

2. Extract central migration code from vmscan.c

3. Extract some components from mempolicy.c

4. Export pageout() and remove_from_swap() from vmscan.c

5. Make it possible to configure NUMA systems without page migration
   and non-NUMA systems with page migration.

I had to so some #ifdeffing in mempolicy.c that may need a cleanup.

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/buffer.c                |  62 -----
 fs/xfs/linux-2.6/xfs_buf.c |   1 +
 include/linux/migrate.h    |  36 +++
 include/linux/swap.h       |  34 ++-
 mm/Kconfig                 |   6 +
 mm/Makefile                |   2 +
 mm/mempolicy.c             | 113 ++------
 mm/migrate.c               | 655 +++++++++++++++++++++++++++++++++++++++++++++
 mm/swap_state.c            |   1 +
 mm/vmscan.c                | 491 +--------------------------------
 10 files changed, 741 insertions(+), 660 deletions(-)
 create mode 100644 include/linux/migrate.h
 create mode 100644 mm/migrate.c

(limited to 'mm')

diff --git a/fs/buffer.c b/fs/buffer.c
index a9b399402007..1d3683d496f8 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -3050,68 +3050,6 @@ asmlinkage long sys_bdflush(int func, long data)
 	return 0;
 }
 
-/*
- * Migration function for pages with buffers. This function can only be used
- * if the underlying filesystem guarantees that no other references to "page"
- * exist.
- */
-#ifdef CONFIG_MIGRATION
-int buffer_migrate_page(struct page *newpage, struct page *page)
-{
-	struct address_space *mapping = page->mapping;
-	struct buffer_head *bh, *head;
-	int rc;
-
-	if (!mapping)
-		return -EAGAIN;
-
-	if (!page_has_buffers(page))
-		return migrate_page(newpage, page);
-
-	head = page_buffers(page);
-
-	rc = migrate_page_remove_references(newpage, page, 3);
-	if (rc)
-		return rc;
-
-	bh = head;
-	do {
-		get_bh(bh);
-		lock_buffer(bh);
-		bh = bh->b_this_page;
-
-	} while (bh != head);
-
-	ClearPagePrivate(page);
-	set_page_private(newpage, page_private(page));
-	set_page_private(page, 0);
-	put_page(page);
-	get_page(newpage);
-
-	bh = head;
-	do {
-		set_bh_page(bh, newpage, bh_offset(bh));
-		bh = bh->b_this_page;
-
-	} while (bh != head);
-
-	SetPagePrivate(newpage);
-
-	migrate_page_copy(newpage, page);
-
-	bh = head;
-	do {
-		unlock_buffer(bh);
- 		put_bh(bh);
-		bh = bh->b_this_page;
-
-	} while (bh != head);
-
-	return 0;
-}
-EXPORT_SYMBOL(buffer_migrate_page);
-#endif
-
 /*
  * Buffer-head allocation
  */
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index bfb4f2917bb6..8cdfa4151659 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -29,6 +29,7 @@
 #include <linux/blkdev.h>
 #include <linux/hash.h>
 #include <linux/kthread.h>
+#include <linux/migrate.h>
 #include "xfs_linux.h"
 
 STATIC kmem_zone_t *xfs_buf_zone;
diff --git a/include/linux/migrate.h b/include/linux/migrate.h
new file mode 100644
index 000000000000..7d09962c3c0b
--- /dev/null
+++ b/include/linux/migrate.h
@@ -0,0 +1,36 @@
+#ifndef _LINUX_MIGRATE_H
+#define _LINUX_MIGRATE_H
+
+#include <linux/config.h>
+#include <linux/mm.h>
+
+#ifdef CONFIG_MIGRATION
+extern int isolate_lru_page(struct page *p, struct list_head *pagelist);
+extern int putback_lru_pages(struct list_head *l);
+extern int migrate_page(struct page *, struct page *);
+extern void migrate_page_copy(struct page *, struct page *);
+extern int migrate_page_remove_references(struct page *, struct page *, int);
+extern int migrate_pages(struct list_head *l, struct list_head *t,
+		struct list_head *moved, struct list_head *failed);
+int migrate_pages_to(struct list_head *pagelist,
+			struct vm_area_struct *vma, int dest);
+extern int fail_migrate_page(struct page *, struct page *);
+
+extern int migrate_prep(void);
+
+#else
+
+static inline int isolate_lru_page(struct page *p, struct list_head *list)
+					{ return -ENOSYS; }
+static inline int putback_lru_pages(struct list_head *l) { return 0; }
+static inline int migrate_pages(struct list_head *l, struct list_head *t,
+	struct list_head *moved, struct list_head *failed) { return -ENOSYS; }
+
+static inline int migrate_prep(void) { return -ENOSYS; }
+
+/* Possible settings for the migrate_page() method in address_operations */
+#define migrate_page NULL
+#define fail_migrate_page NULL
+
+#endif /* CONFIG_MIGRATION */
+#endif /* _LINUX_MIGRATE_H */
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 3dc6c89c49b8..12415dd94451 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -175,6 +175,21 @@ extern void swap_setup(void);
 extern unsigned long try_to_free_pages(struct zone **, gfp_t);
 extern unsigned long shrink_all_memory(unsigned long nr_pages);
 extern int vm_swappiness;
+extern int remove_mapping(struct address_space *mapping, struct page *page);
+
+/* possible outcome of pageout() */
+typedef enum {
+	/* failed to write page out, page is locked */
+	PAGE_KEEP,
+	/* move page to the active list, page is locked */
+	PAGE_ACTIVATE,
+	/* page has been sent to the disk successfully, page is unlocked */
+	PAGE_SUCCESS,
+	/* page is clean and locked */
+	PAGE_CLEAN,
+} pageout_t;
+
+extern pageout_t pageout(struct page *page, struct address_space *mapping);
 
 #ifdef CONFIG_NUMA
 extern int zone_reclaim_mode;
@@ -188,25 +203,6 @@ static inline int zone_reclaim(struct zone *z, gfp_t mask, unsigned int order)
 }
 #endif
 
-#ifdef CONFIG_MIGRATION
-extern int isolate_lru_page(struct page *p);
-extern unsigned long putback_lru_pages(struct list_head *l);
-extern int migrate_page(struct page *, struct page *);
-extern void migrate_page_copy(struct page *, struct page *);
-extern int migrate_page_remove_references(struct page *, struct page *, int);
-extern unsigned long migrate_pages(struct list_head *l, struct list_head *t,
-		struct list_head *moved, struct list_head *failed);
-extern int fail_migrate_page(struct page *, struct page *);
-#else
-static inline int isolate_lru_page(struct page *p) { return -ENOSYS; }
-static inline int putback_lru_pages(struct list_head *l) { return 0; }
-static inline int migrate_pages(struct list_head *l, struct list_head *t,
-	struct list_head *moved, struct list_head *failed) { return -ENOSYS; }
-/* Possible settings for the migrate_page() method in address_operations */
-#define migrate_page NULL
-#define fail_migrate_page NULL
-#endif
-
 #ifdef CONFIG_MMU
 /* linux/mm/shmem.c */
 extern int shmem_unuse(swp_entry_t entry, struct page *page);
diff --git a/mm/Kconfig b/mm/Kconfig
index a9cb80ae6409..bd80460360db 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -137,5 +137,11 @@ config SPLIT_PTLOCK_CPUS
 # support for page migration
 #
 config MIGRATION
+	bool "Page migration"
 	def_bool y if NUMA || SPARSEMEM || DISCONTIGMEM
 	depends on SWAP
+	help
+	  Allows the migration of the physical location of pages of processes
+	  while the virtual addresses are not changed. This is useful for
+	  example on NUMA systems to put pages nearer to the processors accessing
+	  the page.
diff --git a/mm/Makefile b/mm/Makefile
index 9aa03fa1dcc3..f10c753dce6d 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -22,3 +22,5 @@ obj-$(CONFIG_SLOB) += slob.o
 obj-$(CONFIG_SLAB) += slab.o
 obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o
 obj-$(CONFIG_FS_XIP) += filemap_xip.o
+obj-$(CONFIG_MIGRATION) += migrate.o
+
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 96195dcb62e1..e93cc740c22b 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -86,6 +86,7 @@
 #include <linux/swap.h>
 #include <linux/seq_file.h>
 #include <linux/proc_fs.h>
+#include <linux/migrate.h>
 
 #include <asm/tlbflush.h>
 #include <asm/uaccess.h>
@@ -95,9 +96,6 @@
 #define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)		/* Invert check for nodemask */
 #define MPOL_MF_STATS (MPOL_MF_INTERNAL << 2)		/* Gather statistics */
 
-/* The number of pages to migrate per call to migrate_pages() */
-#define MIGRATE_CHUNK_SIZE 256
-
 static struct kmem_cache *policy_cache;
 static struct kmem_cache *sn_cache;
 
@@ -331,17 +329,10 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
 	struct vm_area_struct *first, *vma, *prev;
 
 	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
-		/* Must have swap device for migration */
-		if (nr_swap_pages <= 0)
-			return ERR_PTR(-ENODEV);
 
-		/*
-		 * Clear the LRU lists so pages can be isolated.
-		 * Note that pages may be moved off the LRU after we have
-		 * drained them. Those pages will fail to migrate like other
-		 * pages that may be busy.
-		 */
-		lru_add_drain_all();
+		err = migrate_prep();
+		if (err)
+			return ERR_PTR(err);
 	}
 
 	first = find_vma(mm, start);
@@ -550,92 +541,18 @@ long do_get_mempolicy(int *policy, nodemask_t *nmask,
 	return err;
 }
 
+#ifdef CONFIG_MIGRATION
 /*
  * page migration
  */
-
 static void migrate_page_add(struct page *page, struct list_head *pagelist,
 				unsigned long flags)
 {
 	/*
 	 * Avoid migrating a page that is shared with others.
 	 */
-	if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) {
-		if (isolate_lru_page(page))
-			list_add_tail(&page->lru, pagelist);
-	}
-}
-
-/*
- * Migrate the list 'pagelist' of pages to a certain destination.
- *
- * Specify destination with either non-NULL vma or dest_node >= 0
- * Return the number of pages not migrated or error code
- */
-static int migrate_pages_to(struct list_head *pagelist,
-			struct vm_area_struct *vma, int dest)
-{
-	LIST_HEAD(newlist);
-	LIST_HEAD(moved);
-	LIST_HEAD(failed);
-	int err = 0;
-	unsigned long offset = 0;
-	int nr_pages;
-	struct page *page;
-	struct list_head *p;
-
-redo:
-	nr_pages = 0;
-	list_for_each(p, pagelist) {
-		if (vma) {
-			/*
-			 * The address passed to alloc_page_vma is used to
-			 * generate the proper interleave behavior. We fake
-			 * the address here by an increasing offset in order
-			 * to get the proper distribution of pages.
-			 *
-			 * No decision has been made as to which page
-			 * a certain old page is moved to so we cannot
-			 * specify the correct address.
-			 */
-			page = alloc_page_vma(GFP_HIGHUSER, vma,
-					offset + vma->vm_start);
-			offset += PAGE_SIZE;
-		}
-		else
-			page = alloc_pages_node(dest, GFP_HIGHUSER, 0);
-
-		if (!page) {
-			err = -ENOMEM;
-			goto out;
-		}
-		list_add_tail(&page->lru, &newlist);
-		nr_pages++;
-		if (nr_pages > MIGRATE_CHUNK_SIZE)
-			break;
-	}
-	err = migrate_pages(pagelist, &newlist, &moved, &failed);
-
-	putback_lru_pages(&moved);	/* Call release pages instead ?? */
-
-	if (err >= 0 && list_empty(&newlist) && !list_empty(pagelist))
-		goto redo;
-out:
-	/* Return leftover allocated pages */
-	while (!list_empty(&newlist)) {
-		page = list_entry(newlist.next, struct page, lru);
-		list_del(&page->lru);
-		__free_page(page);
-	}
-	list_splice(&failed, pagelist);
-	if (err < 0)
-		return err;
-
-	/* Calculate number of leftover pages */
-	nr_pages = 0;
-	list_for_each(p, pagelist)
-		nr_pages++;
-	return nr_pages;
+	if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1)
+		isolate_lru_page(page, pagelist);
 }
 
 /*
@@ -742,8 +659,23 @@ int do_migrate_pages(struct mm_struct *mm,
 	if (err < 0)
 		return err;
 	return busy;
+
 }
 
+#else
+
+static void migrate_page_add(struct page *page, struct list_head *pagelist,
+				unsigned long flags)
+{
+}
+
+int do_migrate_pages(struct mm_struct *mm,
+	const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
+{
+	return -ENOSYS;
+}
+#endif
+
 long do_mbind(unsigned long start, unsigned long len,
 		unsigned long mode, nodemask_t *nmask, unsigned long flags)
 {
@@ -808,6 +740,7 @@ long do_mbind(unsigned long start, unsigned long len,
 		if (!err && nr_failed && (flags & MPOL_MF_STRICT))
 			err = -EIO;
 	}
+
 	if (!list_empty(&pagelist))
 		putback_lru_pages(&pagelist);
 
diff --git a/mm/migrate.c b/mm/migrate.c
new file mode 100644
index 000000000000..09f6e4aa87fc
--- /dev/null
+++ b/mm/migrate.c
@@ -0,0 +1,655 @@
+/*
+ * Memory Migration functionality - linux/mm/migration.c
+ *
+ * Copyright (C) 2006 Silicon Graphics, Inc., Christoph Lameter
+ *
+ * Page migration was first developed in the context of the memory hotplug
+ * project. The main authors of the migration code are:
+ *
+ * IWAMOTO Toshihiro <iwamoto@valinux.co.jp>
+ * Hirokazu Takahashi <taka@valinux.co.jp>
+ * Dave Hansen <haveblue@us.ibm.com>
+ * Christoph Lameter <clameter@sgi.com>
+ */
+
+#include <linux/migrate.h>
+#include <linux/module.h>
+#include <linux/swap.h>
+#include <linux/pagemap.h>
+#include <linux/buffer_head.h>	/* for try_to_release_page(),
+					buffer_heads_over_limit */
+#include <linux/mm_inline.h>
+#include <linux/pagevec.h>
+#include <linux/rmap.h>
+#include <linux/topology.h>
+#include <linux/cpu.h>
+#include <linux/cpuset.h>
+#include <linux/swapops.h>
+
+#include "internal.h"
+
+#include "internal.h"
+
+/* The maximum number of pages to take off the LRU for migration */
+#define MIGRATE_CHUNK_SIZE 256
+
+#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))
+
+/*
+ * Isolate one page from the LRU lists. If successful put it onto
+ * the indicated list with elevated page count.
+ *
+ * Result:
+ *  -EBUSY: page not on LRU list
+ *  0: page removed from LRU list and added to the specified list.
+ */
+int isolate_lru_page(struct page *page, struct list_head *pagelist)
+{
+	int ret = -EBUSY;
+
+	if (PageLRU(page)) {
+		struct zone *zone = page_zone(page);
+
+		spin_lock_irq(&zone->lru_lock);
+		if (PageLRU(page)) {
+			ret = 0;
+			get_page(page);
+			ClearPageLRU(page);
+			if (PageActive(page))
+				del_page_from_active_list(zone, page);
+			else
+				del_page_from_inactive_list(zone, page);
+			list_add_tail(&page->lru, pagelist);
+		}
+		spin_unlock_irq(&zone->lru_lock);
+	}
+	return ret;
+}
+
+/*
+ * migrate_prep() needs to be called after we have compiled the list of pages
+ * to be migrated using isolate_lru_page() but before we begin a series of calls
+ * to migrate_pages().
+ */
+int migrate_prep(void)
+{
+	/* Must have swap device for migration */
+	if (nr_swap_pages <= 0)
+		return -ENODEV;
+
+	/*
+	 * Clear the LRU lists so pages can be isolated.
+	 * Note that pages may be moved off the LRU after we have
+	 * drained them. Those pages will fail to migrate like other
+	 * pages that may be busy.
+	 */
+	lru_add_drain_all();
+
+	return 0;
+}
+
+static inline void move_to_lru(struct page *page)
+{
+	list_del(&page->lru);
+	if (PageActive(page)) {
+		/*
+		 * lru_cache_add_active checks that
+		 * the PG_active bit is off.
+		 */
+		ClearPageActive(page);
+		lru_cache_add_active(page);
+	} else {
+		lru_cache_add(page);
+	}
+	put_page(page);
+}
+
+/*
+ * Add isolated pages on the list back to the LRU.
+ *
+ * returns the number of pages put back.
+ */
+int putback_lru_pages(struct list_head *l)
+{
+	struct page *page;
+	struct page *page2;
+	int count = 0;
+
+	list_for_each_entry_safe(page, page2, l, lru) {
+		move_to_lru(page);
+		count++;
+	}
+	return count;
+}
+
+/*
+ * Non migratable page
+ */
+int fail_migrate_page(struct page *newpage, struct page *page)
+{
+	return -EIO;
+}
+EXPORT_SYMBOL(fail_migrate_page);
+
+/*
+ * swapout a single page
+ * page is locked upon entry, unlocked on exit
+ */
+static int swap_page(struct page *page)
+{
+	struct address_space *mapping = page_mapping(page);
+
+	if (page_mapped(page) && mapping)
+		if (try_to_unmap(page, 1) != SWAP_SUCCESS)
+			goto unlock_retry;
+
+	if (PageDirty(page)) {
+		/* Page is dirty, try to write it out here */
+		switch(pageout(page, mapping)) {
+		case PAGE_KEEP:
+		case PAGE_ACTIVATE:
+			goto unlock_retry;
+
+		case PAGE_SUCCESS:
+			goto retry;
+
+		case PAGE_CLEAN:
+			; /* try to free the page below */
+		}
+	}
+
+	if (PagePrivate(page)) {
+		if (!try_to_release_page(page, GFP_KERNEL) ||
+		    (!mapping && page_count(page) == 1))
+			goto unlock_retry;
+	}
+
+	if (remove_mapping(mapping, page)) {
+		/* Success */
+		unlock_page(page);
+		return 0;
+	}
+
+unlock_retry:
+	unlock_page(page);
+
+retry:
+	return -EAGAIN;
+}
+EXPORT_SYMBOL(swap_page);
+
+/*
+ * Remove references for a page and establish the new page with the correct
+ * basic settings to be able to stop accesses to the page.
+ */
+int migrate_page_remove_references(struct page *newpage,
+				struct page *page, int nr_refs)
+{
+	struct address_space *mapping = page_mapping(page);
+	struct page **radix_pointer;
+
+	/*
+	 * Avoid doing any of the following work if the page count
+	 * indicates that the page is in use or truncate has removed
+	 * the page.
+	 */
+	if (!mapping || page_mapcount(page) + nr_refs != page_count(page))
+		return -EAGAIN;
+
+	/*
+	 * Establish swap ptes for anonymous pages or destroy pte
+	 * maps for files.
+	 *
+	 * In order to reestablish file backed mappings the fault handlers
+	 * will take the radix tree_lock which may then be used to stop
+  	 * processses from accessing this page until the new page is ready.
+	 *
+	 * A process accessing via a swap pte (an anonymous page) will take a
+	 * page_lock on the old page which will block the process until the
+	 * migration attempt is complete. At that time the PageSwapCache bit
+	 * will be examined. If the page was migrated then the PageSwapCache
+	 * bit will be clear and the operation to retrieve the page will be
+	 * retried which will find the new page in the radix tree. Then a new
+	 * direct mapping may be generated based on the radix tree contents.
+	 *
+	 * If the page was not migrated then the PageSwapCache bit
+	 * is still set and the operation may continue.
+	 */
+	if (try_to_unmap(page, 1) == SWAP_FAIL)
+		/* A vma has VM_LOCKED set -> permanent failure */
+		return -EPERM;
+
+	/*
+	 * Give up if we were unable to remove all mappings.
+	 */
+	if (page_mapcount(page))
+		return -EAGAIN;
+
+	write_lock_irq(&mapping->tree_lock);
+
+	radix_pointer = (struct page **)radix_tree_lookup_slot(
+						&mapping->page_tree,
+						page_index(page));
+
+	if (!page_mapping(page) || page_count(page) != nr_refs ||
+			*radix_pointer != page) {
+		write_unlock_irq(&mapping->tree_lock);
+		return 1;
+	}
+
+	/*
+	 * Now we know that no one else is looking at the page.
+	 *
+	 * Certain minimal information about a page must be available
+	 * in order for other subsystems to properly handle the page if they
+	 * find it through the radix tree update before we are finished
+	 * copying the page.
+	 */
+	get_page(newpage);
+	newpage->index = page->index;
+	newpage->mapping = page->mapping;
+	if (PageSwapCache(page)) {
+		SetPageSwapCache(newpage);
+		set_page_private(newpage, page_private(page));
+	}
+
+	*radix_pointer = newpage;
+	__put_page(page);
+	write_unlock_irq(&mapping->tree_lock);
+
+	return 0;
+}
+EXPORT_SYMBOL(migrate_page_remove_references);
+
+/*
+ * Copy the page to its new location
+ */
+void migrate_page_copy(struct page *newpage, struct page *page)
+{
+	copy_highpage(newpage, page);
+
+	if (PageError(page))
+		SetPageError(newpage);
+	if (PageReferenced(page))
+		SetPageReferenced(newpage);
+	if (PageUptodate(page))
+		SetPageUptodate(newpage);
+	if (PageActive(page))
+		SetPageActive(newpage);
+	if (PageChecked(page))
+		SetPageChecked(newpage);
+	if (PageMappedToDisk(page))
+		SetPageMappedToDisk(newpage);
+
+	if (PageDirty(page)) {
+		clear_page_dirty_for_io(page);
+		set_page_dirty(newpage);
+ 	}
+
+	ClearPageSwapCache(page);
+	ClearPageActive(page);
+	ClearPagePrivate(page);
+	set_page_private(page, 0);
+	page->mapping = NULL;
+
+	/*
+	 * If any waiters have accumulated on the new page then
+	 * wake them up.
+	 */
+	if (PageWriteback(newpage))
+		end_page_writeback(newpage);
+}
+EXPORT_SYMBOL(migrate_page_copy);
+
+/*
+ * Common logic to directly migrate a single page suitable for
+ * pages that do not use PagePrivate.
+ *
+ * Pages are locked upon entry and exit.
+ */
+int migrate_page(struct page *newpage, struct page *page)
+{
+	int rc;
+
+	BUG_ON(PageWriteback(page));	/* Writeback must be complete */
+
+	rc = migrate_page_remove_references(newpage, page, 2);
+
+	if (rc)
+		return rc;
+
+	migrate_page_copy(newpage, page);
+
+	/*
+	 * Remove auxiliary swap entries and replace
+	 * them with real ptes.
+	 *
+	 * Note that a real pte entry will allow processes that are not
+	 * waiting on the page lock to use the new page via the page tables
+	 * before the new page is unlocked.
+	 */
+	remove_from_swap(newpage);
+	return 0;
+}
+EXPORT_SYMBOL(migrate_page);
+
+/*
+ * migrate_pages
+ *
+ * Two lists are passed to this function. The first list
+ * contains the pages isolated from the LRU to be migrated.
+ * The second list contains new pages that the pages isolated
+ * can be moved to. If the second list is NULL then all
+ * pages are swapped out.
+ *
+ * The function returns after 10 attempts or if no pages
+ * are movable anymore because to has become empty
+ * or no retryable pages exist anymore.
+ *
+ * Return: Number of pages not migrated when "to" ran empty.
+ */
+int migrate_pages(struct list_head *from, struct list_head *to,
+		  struct list_head *moved, struct list_head *failed)
+{
+	int retry;
+	int nr_failed = 0;
+	int pass = 0;
+	struct page *page;
+	struct page *page2;
+	int swapwrite = current->flags & PF_SWAPWRITE;
+	int rc;
+
+	if (!swapwrite)
+		current->flags |= PF_SWAPWRITE;
+
+redo:
+	retry = 0;
+
+	list_for_each_entry_safe(page, page2, from, lru) {
+		struct page *newpage = NULL;
+		struct address_space *mapping;
+
+		cond_resched();
+
+		rc = 0;
+		if (page_count(page) == 1)
+			/* page was freed from under us. So we are done. */
+			goto next;
+
+		if (to && list_empty(to))
+			break;
+
+		/*
+		 * Skip locked pages during the first two passes to give the
+		 * functions holding the lock time to release the page. Later we
+		 * use lock_page() to have a higher chance of acquiring the
+		 * lock.
+		 */
+		rc = -EAGAIN;
+		if (pass > 2)
+			lock_page(page);
+		else
+			if (TestSetPageLocked(page))
+				goto next;
+
+		/*
+		 * Only wait on writeback if we have already done a pass where
+		 * we we may have triggered writeouts for lots of pages.
+		 */
+		if (pass > 0) {
+			wait_on_page_writeback(page);
+		} else {
+			if (PageWriteback(page))
+				goto unlock_page;
+		}
+
+		/*
+		 * Anonymous pages must have swap cache references otherwise
+		 * the information contained in the page maps cannot be
+		 * preserved.
+		 */
+		if (PageAnon(page) && !PageSwapCache(page)) {
+			if (!add_to_swap(page, GFP_KERNEL)) {
+				rc = -ENOMEM;
+				goto unlock_page;
+			}
+		}
+
+		if (!to) {
+			rc = swap_page(page);
+			goto next;
+		}
+
+		newpage = lru_to_page(to);
+		lock_page(newpage);
+
+		/*
+		 * Pages are properly locked and writeback is complete.
+		 * Try to migrate the page.
+		 */
+		mapping = page_mapping(page);
+		if (!mapping)
+			goto unlock_both;
+
+		if (mapping->a_ops->migratepage) {
+			/*
+			 * Most pages have a mapping and most filesystems
+			 * should provide a migration function. Anonymous
+			 * pages are part of swap space which also has its
+			 * own migration function. This is the most common
+			 * path for page migration.
+			 */
+			rc = mapping->a_ops->migratepage(newpage, page);
+			goto unlock_both;
+                }
+
+		/*
+		 * Default handling if a filesystem does not provide
+		 * a migration function. We can only migrate clean
+		 * pages so try to write out any dirty pages first.
+		 */
+		if (PageDirty(page)) {
+			switch (pageout(page, mapping)) {
+			case PAGE_KEEP:
+			case PAGE_ACTIVATE:
+				goto unlock_both;
+
+			case PAGE_SUCCESS:
+				unlock_page(newpage);
+				goto next;
+
+			case PAGE_CLEAN:
+				; /* try to migrate the page below */
+			}
+                }
+
+		/*
+		 * Buffers are managed in a filesystem specific way.
+		 * We must have no buffers or drop them.
+		 */
+		if (!page_has_buffers(page) ||
+		    try_to_release_page(page, GFP_KERNEL)) {
+			rc = migrate_page(newpage, page);
+			goto unlock_both;
+		}
+
+		/*
+		 * On early passes with mapped pages simply
+		 * retry. There may be a lock held for some
+		 * buffers that may go away. Later
+		 * swap them out.
+		 */
+		if (pass > 4) {
+			/*
+			 * Persistently unable to drop buffers..... As a
+			 * measure of last resort we fall back to
+			 * swap_page().
+			 */
+			unlock_page(newpage);
+			newpage = NULL;
+			rc = swap_page(page);
+			goto next;
+		}
+
+unlock_both:
+		unlock_page(newpage);
+
+unlock_page:
+		unlock_page(page);
+
+next:
+		if (rc == -EAGAIN) {
+			retry++;
+		} else if (rc) {
+			/* Permanent failure */
+			list_move(&page->lru, failed);
+			nr_failed++;
+		} else {
+			if (newpage) {
+				/* Successful migration. Return page to LRU */
+				move_to_lru(newpage);
+			}
+			list_move(&page->lru, moved);
+		}
+	}
+	if (retry && pass++ < 10)
+		goto redo;
+
+	if (!swapwrite)
+		current->flags &= ~PF_SWAPWRITE;
+
+	return nr_failed + retry;
+}
+
+/*
+ * Migration function for pages with buffers. This function can only be used
+ * if the underlying filesystem guarantees that no other references to "page"
+ * exist.
+ */
+int buffer_migrate_page(struct page *newpage, struct page *page)
+{
+	struct address_space *mapping = page->mapping;
+	struct buffer_head *bh, *head;
+	int rc;
+
+	if (!mapping)
+		return -EAGAIN;
+
+	if (!page_has_buffers(page))
+		return migrate_page(newpage, page);
+
+	head = page_buffers(page);
+
+	rc = migrate_page_remove_references(newpage, page, 3);
+
+	if (rc)
+		return rc;
+
+	bh = head;
+	do {
+		get_bh(bh);
+		lock_buffer(bh);
+		bh = bh->b_this_page;
+
+	} while (bh != head);
+
+	ClearPagePrivate(page);
+	set_page_private(newpage, page_private(page));
+	set_page_private(page, 0);
+	put_page(page);
+	get_page(newpage);
+
+	bh = head;
+	do {
+		set_bh_page(bh, newpage, bh_offset(bh));
+		bh = bh->b_this_page;
+
+	} while (bh != head);
+
+	SetPagePrivate(newpage);
+
+	migrate_page_copy(newpage, page);
+
+	bh = head;
+	do {
+		unlock_buffer(bh);
+ 		put_bh(bh);
+		bh = bh->b_this_page;
+
+	} while (bh != head);
+
+	return 0;
+}
+EXPORT_SYMBOL(buffer_migrate_page);
+
+/*
+ * Migrate the list 'pagelist' of pages to a certain destination.
+ *
+ * Specify destination with either non-NULL vma or dest_node >= 0
+ * Return the number of pages not migrated or error code
+ */
+int migrate_pages_to(struct list_head *pagelist,
+			struct vm_area_struct *vma, int dest)
+{
+	LIST_HEAD(newlist);
+	LIST_HEAD(moved);
+	LIST_HEAD(failed);
+	int err = 0;
+	unsigned long offset = 0;
+	int nr_pages;
+	struct page *page;
+	struct list_head *p;
+
+redo:
+	nr_pages = 0;
+	list_for_each(p, pagelist) {
+		if (vma) {
+			/*
+			 * The address passed to alloc_page_vma is used to
+			 * generate the proper interleave behavior. We fake
+			 * the address here by an increasing offset in order
+			 * to get the proper distribution of pages.
+			 *
+			 * No decision has been made as to which page
+			 * a certain old page is moved to so we cannot
+			 * specify the correct address.
+			 */
+			page = alloc_page_vma(GFP_HIGHUSER, vma,
+					offset + vma->vm_start);
+			offset += PAGE_SIZE;
+		}
+		else
+			page = alloc_pages_node(dest, GFP_HIGHUSER, 0);
+
+		if (!page) {
+			err = -ENOMEM;
+			goto out;
+		}
+		list_add_tail(&page->lru, &newlist);
+		nr_pages++;
+		if (nr_pages > MIGRATE_CHUNK_SIZE)
+			break;
+	}
+	err = migrate_pages(pagelist, &newlist, &moved, &failed);
+
+	putback_lru_pages(&moved);	/* Call release pages instead ?? */
+
+	if (err >= 0 && list_empty(&newlist) && !list_empty(pagelist))
+		goto redo;
+out:
+	/* Return leftover allocated pages */
+	while (!list_empty(&newlist)) {
+		page = list_entry(newlist.next, struct page, lru);
+		list_del(&page->lru);
+		__free_page(page);
+	}
+	list_splice(&failed, pagelist);
+	if (err < 0)
+		return err;
+
+	/* Calculate number of leftover pages */
+	nr_pages = 0;
+	list_for_each(p, pagelist)
+		nr_pages++;
+	return nr_pages;
+}
diff --git a/mm/swap_state.c b/mm/swap_state.c
index db8a3d3e1636..d7af296833fc 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -15,6 +15,7 @@
 #include <linux/buffer_head.h>
 #include <linux/backing-dev.h>
 #include <linux/pagevec.h>
+#include <linux/migrate.h>
 
 #include <asm/pgtable.h>
 
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 548e023c193b..fd572bbdc9f5 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -42,18 +42,6 @@
 
 #include "internal.h"
 
-/* possible outcome of pageout() */
-typedef enum {
-	/* failed to write page out, page is locked */
-	PAGE_KEEP,
-	/* move page to the active list, page is locked */
-	PAGE_ACTIVATE,
-	/* page has been sent to the disk successfully, page is unlocked */
-	PAGE_SUCCESS,
-	/* page is clean and locked */
-	PAGE_CLEAN,
-} pageout_t;
-
 struct scan_control {
 	/* Incremented by the number of inactive pages that were scanned */
 	unsigned long nr_scanned;
@@ -304,7 +292,7 @@ static void handle_write_error(struct address_space *mapping,
  * pageout is called by shrink_page_list() for each dirty page.
  * Calls ->writepage().
  */
-static pageout_t pageout(struct page *page, struct address_space *mapping)
+pageout_t pageout(struct page *page, struct address_space *mapping)
 {
 	/*
 	 * If the page is dirty, only perform writeback if that write
@@ -372,7 +360,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping)
 	return PAGE_CLEAN;
 }
 
-static int remove_mapping(struct address_space *mapping, struct page *page)
+int remove_mapping(struct address_space *mapping, struct page *page)
 {
 	if (!mapping)
 		return 0;		/* truncate got there first */
@@ -570,481 +558,6 @@ keep:
 	return nr_reclaimed;
 }
 
-#ifdef CONFIG_MIGRATION
-static inline void move_to_lru(struct page *page)
-{
-	list_del(&page->lru);
-	if (PageActive(page)) {
-		/*
-		 * lru_cache_add_active checks that
-		 * the PG_active bit is off.
-		 */
-		ClearPageActive(page);
-		lru_cache_add_active(page);
-	} else {
-		lru_cache_add(page);
-	}
-	put_page(page);
-}
-
-/*
- * Add isolated pages on the list back to the LRU.
- *
- * returns the number of pages put back.
- */
-unsigned long putback_lru_pages(struct list_head *l)
-{
-	struct page *page;
-	struct page *page2;
-	unsigned long count = 0;
-
-	list_for_each_entry_safe(page, page2, l, lru) {
-		move_to_lru(page);
-		count++;
-	}
-	return count;
-}
-
-/*
- * Non migratable page
- */
-int fail_migrate_page(struct page *newpage, struct page *page)
-{
-	return -EIO;
-}
-EXPORT_SYMBOL(fail_migrate_page);
-
-/*
- * swapout a single page
- * page is locked upon entry, unlocked on exit
- */
-static int swap_page(struct page *page)
-{
-	struct address_space *mapping = page_mapping(page);
-
-	if (page_mapped(page) && mapping)
-		if (try_to_unmap(page, 1) != SWAP_SUCCESS)
-			goto unlock_retry;
-
-	if (PageDirty(page)) {
-		/* Page is dirty, try to write it out here */
-		switch(pageout(page, mapping)) {
-		case PAGE_KEEP:
-		case PAGE_ACTIVATE:
-			goto unlock_retry;
-
-		case PAGE_SUCCESS:
-			goto retry;
-
-		case PAGE_CLEAN:
-			; /* try to free the page below */
-		}
-	}
-
-	if (PagePrivate(page)) {
-		if (!try_to_release_page(page, GFP_KERNEL) ||
-		    (!mapping && page_count(page) == 1))
-			goto unlock_retry;
-	}
-
-	if (remove_mapping(mapping, page)) {
-		/* Success */
-		unlock_page(page);
-		return 0;
-	}
-
-unlock_retry:
-	unlock_page(page);
-
-retry:
-	return -EAGAIN;
-}
-EXPORT_SYMBOL(swap_page);
-
-/*
- * Page migration was first developed in the context of the memory hotplug
- * project. The main authors of the migration code are:
- *
- * IWAMOTO Toshihiro <iwamoto@valinux.co.jp>
- * Hirokazu Takahashi <taka@valinux.co.jp>
- * Dave Hansen <haveblue@us.ibm.com>
- * Christoph Lameter <clameter@sgi.com>
- */
-
-/*
- * Remove references for a page and establish the new page with the correct
- * basic settings to be able to stop accesses to the page.
- */
-int migrate_page_remove_references(struct page *newpage,
-				struct page *page, int nr_refs)
-{
-	struct address_space *mapping = page_mapping(page);
-	struct page **radix_pointer;
-
-	/*
-	 * Avoid doing any of the following work if the page count
-	 * indicates that the page is in use or truncate has removed
-	 * the page.
-	 */
-	if (!mapping || page_mapcount(page) + nr_refs != page_count(page))
-		return -EAGAIN;
-
-	/*
-	 * Establish swap ptes for anonymous pages or destroy pte
-	 * maps for files.
-	 *
-	 * In order to reestablish file backed mappings the fault handlers
-	 * will take the radix tree_lock which may then be used to stop
-  	 * processses from accessing this page until the new page is ready.
-	 *
-	 * A process accessing via a swap pte (an anonymous page) will take a
-	 * page_lock on the old page which will block the process until the
-	 * migration attempt is complete. At that time the PageSwapCache bit
-	 * will be examined. If the page was migrated then the PageSwapCache
-	 * bit will be clear and the operation to retrieve the page will be
-	 * retried which will find the new page in the radix tree. Then a new
-	 * direct mapping may be generated based on the radix tree contents.
-	 *
-	 * If the page was not migrated then the PageSwapCache bit
-	 * is still set and the operation may continue.
-	 */
-	if (try_to_unmap(page, 1) == SWAP_FAIL)
-		/* A vma has VM_LOCKED set -> Permanent failure */
-		return -EPERM;
-
-	/*
-	 * Give up if we were unable to remove all mappings.
-	 */
-	if (page_mapcount(page))
-		return -EAGAIN;
-
-	write_lock_irq(&mapping->tree_lock);
-
-	radix_pointer = (struct page **)radix_tree_lookup_slot(
-						&mapping->page_tree,
-						page_index(page));
-
-	if (!page_mapping(page) || page_count(page) != nr_refs ||
-			*radix_pointer != page) {
-		write_unlock_irq(&mapping->tree_lock);
-		return -EAGAIN;
-	}
-
-	/*
-	 * Now we know that no one else is looking at the page.
-	 *
-	 * Certain minimal information about a page must be available
-	 * in order for other subsystems to properly handle the page if they
-	 * find it through the radix tree update before we are finished
-	 * copying the page.
-	 */
-	get_page(newpage);
-	newpage->index = page->index;
-	newpage->mapping = page->mapping;
-	if (PageSwapCache(page)) {
-		SetPageSwapCache(newpage);
-		set_page_private(newpage, page_private(page));
-	}
-
-	*radix_pointer = newpage;
-	__put_page(page);
-	write_unlock_irq(&mapping->tree_lock);
-
-	return 0;
-}
-EXPORT_SYMBOL(migrate_page_remove_references);
-
-/*
- * Copy the page to its new location
- */
-void migrate_page_copy(struct page *newpage, struct page *page)
-{
-	copy_highpage(newpage, page);
-
-	if (PageError(page))
-		SetPageError(newpage);
-	if (PageReferenced(page))
-		SetPageReferenced(newpage);
-	if (PageUptodate(page))
-		SetPageUptodate(newpage);
-	if (PageActive(page))
-		SetPageActive(newpage);
-	if (PageChecked(page))
-		SetPageChecked(newpage);
-	if (PageMappedToDisk(page))
-		SetPageMappedToDisk(newpage);
-
-	if (PageDirty(page)) {
-		clear_page_dirty_for_io(page);
-		set_page_dirty(newpage);
- 	}
-
-	ClearPageSwapCache(page);
-	ClearPageActive(page);
-	ClearPagePrivate(page);
-	set_page_private(page, 0);
-	page->mapping = NULL;
-
-	/*
-	 * If any waiters have accumulated on the new page then
-	 * wake them up.
-	 */
-	if (PageWriteback(newpage))
-		end_page_writeback(newpage);
-}
-EXPORT_SYMBOL(migrate_page_copy);
-
-/*
- * Common logic to directly migrate a single page suitable for
- * pages that do not use PagePrivate.
- *
- * Pages are locked upon entry and exit.
- */
-int migrate_page(struct page *newpage, struct page *page)
-{
-	int rc;
-
-	BUG_ON(PageWriteback(page));	/* Writeback must be complete */
-
-	rc = migrate_page_remove_references(newpage, page, 2);
-
-	if (rc)
-		return rc;
-
-	migrate_page_copy(newpage, page);
-
-	/*
-	 * Remove auxiliary swap entries and replace
-	 * them with real ptes.
-	 *
-	 * Note that a real pte entry will allow processes that are not
-	 * waiting on the page lock to use the new page via the page tables
-	 * before the new page is unlocked.
-	 */
-	remove_from_swap(newpage);
-	return 0;
-}
-EXPORT_SYMBOL(migrate_page);
-
-/*
- * migrate_pages
- *
- * Two lists are passed to this function. The first list
- * contains the pages isolated from the LRU to be migrated.
- * The second list contains new pages that the pages isolated
- * can be moved to. If the second list is NULL then all
- * pages are swapped out.
- *
- * The function returns after 10 attempts or if no pages
- * are movable anymore because to has become empty
- * or no retryable pages exist anymore.
- *
- * Return: Number of pages not migrated when "to" ran empty.
- */
-unsigned long migrate_pages(struct list_head *from, struct list_head *to,
-		  struct list_head *moved, struct list_head *failed)
-{
-	unsigned long retry;
-	unsigned long nr_failed = 0;
-	int pass = 0;
-	struct page *page;
-	struct page *page2;
-	int swapwrite = current->flags & PF_SWAPWRITE;
-	int rc;
-
-	if (!swapwrite)
-		current->flags |= PF_SWAPWRITE;
-
-redo:
-	retry = 0;
-
-	list_for_each_entry_safe(page, page2, from, lru) {
-		struct page *newpage = NULL;
-		struct address_space *mapping;
-
-		cond_resched();
-
-		rc = 0;
-		if (page_count(page) == 1)
-			/* page was freed from under us. So we are done. */
-			goto next;
-
-		if (to && list_empty(to))
-			break;
-
-		/*
-		 * Skip locked pages during the first two passes to give the
-		 * functions holding the lock time to release the page. Later we
-		 * use lock_page() to have a higher chance of acquiring the
-		 * lock.
-		 */
-		rc = -EAGAIN;
-		if (pass > 2)
-			lock_page(page);
-		else
-			if (TestSetPageLocked(page))
-				goto next;
-
-		/*
-		 * Only wait on writeback if we have already done a pass where
-		 * we we may have triggered writeouts for lots of pages.
-		 */
-		if (pass > 0) {
-			wait_on_page_writeback(page);
-		} else {
-			if (PageWriteback(page))
-				goto unlock_page;
-		}
-
-		/*
-		 * Anonymous pages must have swap cache references otherwise
-		 * the information contained in the page maps cannot be
-		 * preserved.
-		 */
-		if (PageAnon(page) && !PageSwapCache(page)) {
-			if (!add_to_swap(page, GFP_KERNEL)) {
-				rc = -ENOMEM;
-				goto unlock_page;
-			}
-		}
-
-		if (!to) {
-			rc = swap_page(page);
-			goto next;
-		}
-
-		newpage = lru_to_page(to);
-		lock_page(newpage);
-
-		/*
-		 * Pages are properly locked and writeback is complete.
-		 * Try to migrate the page.
-		 */
-		mapping = page_mapping(page);
-		if (!mapping)
-			goto unlock_both;
-
-		if (mapping->a_ops->migratepage) {
-			/*
-			 * Most pages have a mapping and most filesystems
-			 * should provide a migration function. Anonymous
-			 * pages are part of swap space which also has its
-			 * own migration function. This is the most common
-			 * path for page migration.
-			 */
-			rc = mapping->a_ops->migratepage(newpage, page);
-			goto unlock_both;
-                }
-
-		/*
-		 * Default handling if a filesystem does not provide
-		 * a migration function. We can only migrate clean
-		 * pages so try to write out any dirty pages first.
-		 */
-		if (PageDirty(page)) {
-			switch (pageout(page, mapping)) {
-			case PAGE_KEEP:
-			case PAGE_ACTIVATE:
-				goto unlock_both;
-
-			case PAGE_SUCCESS:
-				unlock_page(newpage);
-				goto next;
-
-			case PAGE_CLEAN:
-				; /* try to migrate the page below */
-			}
-                }
-
-		/*
-		 * Buffers are managed in a filesystem specific way.
-		 * We must have no buffers or drop them.
-		 */
-		if (!page_has_buffers(page) ||
-		    try_to_release_page(page, GFP_KERNEL)) {
-			rc = migrate_page(newpage, page);
-			goto unlock_both;
-		}
-
-		/*
-		 * On early passes with mapped pages simply
-		 * retry. There may be a lock held for some
-		 * buffers that may go away. Later
-		 * swap them out.
-		 */
-		if (pass > 4) {
-			/*
-			 * Persistently unable to drop buffers..... As a
-			 * measure of last resort we fall back to
-			 * swap_page().
-			 */
-			unlock_page(newpage);
-			newpage = NULL;
-			rc = swap_page(page);
-			goto next;
-		}
-
-unlock_both:
-		unlock_page(newpage);
-
-unlock_page:
-		unlock_page(page);
-
-next:
-		if (rc == -EAGAIN) {
-			retry++;
-		} else if (rc) {
-			/* Permanent failure */
-			list_move(&page->lru, failed);
-			nr_failed++;
-		} else {
-			if (newpage) {
-				/* Successful migration. Return page to LRU */
-				move_to_lru(newpage);
-			}
-			list_move(&page->lru, moved);
-		}
-	}
-	if (retry && pass++ < 10)
-		goto redo;
-
-	if (!swapwrite)
-		current->flags &= ~PF_SWAPWRITE;
-
-	return nr_failed + retry;
-}
-
-/*
- * Isolate one page from the LRU lists and put it on the
- * indicated list with elevated refcount.
- *
- * Result:
- *  0 = page not on LRU list
- *  1 = page removed from LRU list and added to the specified list.
- */
-int isolate_lru_page(struct page *page)
-{
-	int ret = 0;
-
-	if (PageLRU(page)) {
-		struct zone *zone = page_zone(page);
-		spin_lock_irq(&zone->lru_lock);
-		if (PageLRU(page)) {
-			ret = 1;
-			get_page(page);
-			ClearPageLRU(page);
-			if (PageActive(page))
-				del_page_from_active_list(zone, page);
-			else
-				del_page_from_inactive_list(zone, page);
-		}
-		spin_unlock_irq(&zone->lru_lock);
-	}
-
-	return ret;
-}
-#endif
-
 /*
  * zone->lru_lock is heavily contended.  Some of the functions that
  * shrink the lists perform better by taking out a batch of pages
-- 
cgit v1.2.3


From f577eb30afdc68233f25d4d82b04102129262365 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Thu, 23 Mar 2006 02:59:59 -0800
Subject: [PATCH] swsusp: low level interface

Introduce the low level interface that can be used for handling the
snapshot of the system memory by the in-kernel swap-writing/reading code of
swsusp and the userland interface code (to be introduced shortly).

Also change the way in which swsusp records the allocated swap pages and,
consequently, simplifies the in-kernel swap-writing/reading code (this is
necessary for the userland interface too).  To this end, it introduces two
helper functions in mm/swapfile.c, so that the swsusp code does not refer
directly to the swap internals.

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Acked-by: Pavel Machek <pavel@ucw.cz>
Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/swap.h    |   5 +-
 kernel/power/disk.c     |  12 +-
 kernel/power/power.h    |  26 +-
 kernel/power/snapshot.c | 326 +++++++++++++++++++++-
 kernel/power/swsusp.c   | 723 +++++++++++++++++-------------------------------
 mm/swapfile.c           |  55 +++-
 6 files changed, 656 insertions(+), 491 deletions(-)

(limited to 'mm')

diff --git a/include/linux/swap.h b/include/linux/swap.h
index 12415dd94451..54eac8a39a4c 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -234,14 +234,15 @@ extern struct page * read_swap_cache_async(swp_entry_t, struct vm_area_struct *v
 /* linux/mm/swapfile.c */
 extern long total_swap_pages;
 extern unsigned int nr_swapfiles;
-extern struct swap_info_struct swap_info[];
 extern void si_swapinfo(struct sysinfo *);
 extern swp_entry_t get_swap_page(void);
-extern swp_entry_t get_swap_page_of_type(int type);
+extern swp_entry_t get_swap_page_of_type(int);
 extern int swap_duplicate(swp_entry_t);
 extern int valid_swaphandles(swp_entry_t, unsigned long *);
 extern void swap_free(swp_entry_t);
 extern void free_swap_and_cache(swp_entry_t);
+extern int swap_type_of(dev_t);
+extern unsigned int count_swap_pages(int, int);
 extern sector_t map_swap_page(struct swap_info_struct *, pgoff_t);
 extern struct swap_info_struct *get_swap_info_struct(unsigned);
 extern int can_share_swap_page(struct page *);
diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index 0b43847dc980..4eb464b71347 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -26,9 +26,9 @@ extern suspend_disk_method_t pm_disk_mode;
 
 extern int swsusp_shrink_memory(void);
 extern int swsusp_suspend(void);
-extern int swsusp_write(struct pbe *pblist, unsigned int nr_pages);
+extern int swsusp_write(void);
 extern int swsusp_check(void);
-extern int swsusp_read(struct pbe **pblist_ptr);
+extern int swsusp_read(void);
 extern void swsusp_close(void);
 extern int swsusp_resume(void);
 
@@ -70,10 +70,6 @@ static void power_down(suspend_disk_method_t mode)
 	while(1);
 }
 
-
-static int in_suspend __nosavedata = 0;
-
-
 static inline void platform_finish(void)
 {
 	if (pm_disk_mode == PM_DISK_PLATFORM) {
@@ -145,7 +141,7 @@ int pm_suspend_disk(void)
 	if (in_suspend) {
 		device_resume();
 		pr_debug("PM: writing image.\n");
-		error = swsusp_write(pagedir_nosave, nr_copy_pages);
+		error = swsusp_write();
 		if (!error)
 			power_down(pm_disk_mode);
 		else {
@@ -216,7 +212,7 @@ static int software_resume(void)
 
 	pr_debug("PM: Reading swsusp image.\n");
 
-	if ((error = swsusp_read(&pagedir_nosave))) {
+	if ((error = swsusp_read())) {
 		swsusp_free();
 		goto Thaw;
 	}
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 388dba680841..ea7132ed029b 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -37,21 +37,31 @@ extern struct subsystem power_subsys;
 /* References to section boundaries */
 extern const void __nosave_begin, __nosave_end;
 
-extern unsigned int nr_copy_pages;
 extern struct pbe *pagedir_nosave;
 
 /* Preferred image size in bytes (default 500 MB) */
 extern unsigned long image_size;
 
+extern int in_suspend;
+
 extern asmlinkage int swsusp_arch_suspend(void);
 extern asmlinkage int swsusp_arch_resume(void);
 
 extern unsigned int count_data_pages(void);
-extern void free_pagedir(struct pbe *pblist);
-extern void release_eaten_pages(void);
-extern struct pbe *alloc_pagedir(unsigned nr_pages, gfp_t gfp_mask, int safe_needed);
 extern void swsusp_free(void);
-extern int alloc_data_pages(struct pbe *pblist, gfp_t gfp_mask, int safe_needed);
-extern unsigned int snapshot_nr_pages(void);
-extern struct pbe *snapshot_pblist(void);
-extern void snapshot_pblist_set(struct pbe *pblist);
+
+struct snapshot_handle {
+	loff_t		offset;
+	unsigned int	page;
+	unsigned int	page_offset;
+	unsigned int	prev;
+	struct pbe	*pbe;
+	void		*buffer;
+	unsigned int	buf_offset;
+};
+
+#define data_of(handle)	((handle).buffer + (handle).buf_offset)
+
+extern int snapshot_read_next(struct snapshot_handle *handle, size_t count);
+extern int snapshot_write_next(struct snapshot_handle *handle, size_t count);
+int snapshot_image_loaded(struct snapshot_handle *handle);
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 8d5a5986d621..cc349437fb72 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -10,6 +10,7 @@
  */
 
 
+#include <linux/version.h>
 #include <linux/module.h>
 #include <linux/mm.h>
 #include <linux/suspend.h>
@@ -34,7 +35,8 @@
 #include "power.h"
 
 struct pbe *pagedir_nosave;
-unsigned int nr_copy_pages;
+static unsigned int nr_copy_pages;
+static unsigned int nr_meta_pages;
 
 #ifdef CONFIG_HIGHMEM
 unsigned int count_highmem_pages(void)
@@ -235,7 +237,7 @@ static void copy_data_pages(struct pbe *pblist)
  *	free_pagedir - free pages allocated with alloc_pagedir()
  */
 
-void free_pagedir(struct pbe *pblist)
+static void free_pagedir(struct pbe *pblist)
 {
 	struct pbe *pbe;
 
@@ -301,7 +303,7 @@ struct eaten_page {
 
 static struct eaten_page *eaten_pages = NULL;
 
-void release_eaten_pages(void)
+static void release_eaten_pages(void)
 {
 	struct eaten_page *p, *q;
 
@@ -376,7 +378,6 @@ struct pbe *alloc_pagedir(unsigned int nr_pages, gfp_t gfp_mask, int safe_needed
 	if (!nr_pages)
 		return NULL;
 
-	pr_debug("alloc_pagedir(): nr_pages = %d\n", nr_pages);
 	pblist = alloc_image_page(gfp_mask, safe_needed);
 	/* FIXME: rewrite this ugly loop */
 	for (pbe = pblist, num = PBES_PER_PAGE; pbe && num < nr_pages;
@@ -414,6 +415,9 @@ void swsusp_free(void)
 				}
 			}
 	}
+	nr_copy_pages = 0;
+	nr_meta_pages = 0;
+	pagedir_nosave = NULL;
 }
 
 
@@ -437,7 +441,7 @@ static int enough_free_mem(unsigned int nr_pages)
 		(nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE);
 }
 
-int alloc_data_pages(struct pbe *pblist, gfp_t gfp_mask, int safe_needed)
+static int alloc_data_pages(struct pbe *pblist, gfp_t gfp_mask, int safe_needed)
 {
 	struct pbe *p;
 
@@ -504,7 +508,319 @@ asmlinkage int swsusp_save(void)
 	 */
 
 	nr_copy_pages = nr_pages;
+	nr_meta_pages = (nr_pages * sizeof(long) + PAGE_SIZE - 1) >> PAGE_SHIFT;
 
 	printk("swsusp: critical section/: done (%d pages copied)\n", nr_pages);
 	return 0;
 }
+
+static void init_header(struct swsusp_info *info)
+{
+	memset(info, 0, sizeof(struct swsusp_info));
+	info->version_code = LINUX_VERSION_CODE;
+	info->num_physpages = num_physpages;
+	memcpy(&info->uts, &system_utsname, sizeof(system_utsname));
+	info->cpus = num_online_cpus();
+	info->image_pages = nr_copy_pages;
+	info->pages = nr_copy_pages + nr_meta_pages + 1;
+}
+
+/**
+ *	pack_orig_addresses - the .orig_address fields of the PBEs from the
+ *	list starting at @pbe are stored in the array @buf[] (1 page)
+ */
+
+static inline struct pbe *pack_orig_addresses(unsigned long *buf, struct pbe *pbe)
+{
+	int j;
+
+	for (j = 0; j < PAGE_SIZE / sizeof(long) && pbe; j++) {
+		buf[j] = pbe->orig_address;
+		pbe = pbe->next;
+	}
+	if (!pbe)
+		for (; j < PAGE_SIZE / sizeof(long); j++)
+			buf[j] = 0;
+	return pbe;
+}
+
+/**
+ *	snapshot_read_next - used for reading the system memory snapshot.
+ *
+ *	On the first call to it @handle should point to a zeroed
+ *	snapshot_handle structure.  The structure gets updated and a pointer
+ *	to it should be passed to this function every next time.
+ *
+ *	The @count parameter should contain the number of bytes the caller
+ *	wants to read from the snapshot.  It must not be zero.
+ *
+ *	On success the function returns a positive number.  Then, the caller
+ *	is allowed to read up to the returned number of bytes from the memory
+ *	location computed by the data_of() macro.  The number returned
+ *	may be smaller than @count, but this only happens if the read would
+ *	cross a page boundary otherwise.
+ *
+ *	The function returns 0 to indicate the end of data stream condition,
+ *	and a negative number is returned on error.  In such cases the
+ *	structure pointed to by @handle is not updated and should not be used
+ *	any more.
+ */
+
+int snapshot_read_next(struct snapshot_handle *handle, size_t count)
+{
+	static unsigned long *buffer;
+
+	if (handle->page > nr_meta_pages + nr_copy_pages)
+		return 0;
+	if (!buffer) {
+		/* This makes the buffer be freed by swsusp_free() */
+		buffer = alloc_image_page(GFP_ATOMIC, 0);
+		if (!buffer)
+			return -ENOMEM;
+	}
+	if (!handle->offset) {
+		init_header((struct swsusp_info *)buffer);
+		handle->buffer = buffer;
+		handle->pbe = pagedir_nosave;
+	}
+	if (handle->prev < handle->page) {
+		if (handle->page <= nr_meta_pages) {
+			handle->pbe = pack_orig_addresses(buffer, handle->pbe);
+			if (!handle->pbe)
+				handle->pbe = pagedir_nosave;
+		} else {
+			handle->buffer = (void *)handle->pbe->address;
+			handle->pbe = handle->pbe->next;
+		}
+		handle->prev = handle->page;
+	}
+	handle->buf_offset = handle->page_offset;
+	if (handle->page_offset + count >= PAGE_SIZE) {
+		count = PAGE_SIZE - handle->page_offset;
+		handle->page_offset = 0;
+		handle->page++;
+	} else {
+		handle->page_offset += count;
+	}
+	handle->offset += count;
+	return count;
+}
+
+/**
+ *	mark_unsafe_pages - mark the pages that cannot be used for storing
+ *	the image during resume, because they conflict with the pages that
+ *	had been used before suspend
+ */
+
+static int mark_unsafe_pages(struct pbe *pblist)
+{
+	struct zone *zone;
+	unsigned long zone_pfn;
+	struct pbe *p;
+
+	if (!pblist) /* a sanity check */
+		return -EINVAL;
+
+	/* Clear page flags */
+	for_each_zone (zone) {
+		for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn)
+			if (pfn_valid(zone_pfn + zone->zone_start_pfn))
+				ClearPageNosaveFree(pfn_to_page(zone_pfn +
+					zone->zone_start_pfn));
+	}
+
+	/* Mark orig addresses */
+	for_each_pbe (p, pblist) {
+		if (virt_addr_valid(p->orig_address))
+			SetPageNosaveFree(virt_to_page(p->orig_address));
+		else
+			return -EFAULT;
+	}
+
+	return 0;
+}
+
+static void copy_page_backup_list(struct pbe *dst, struct pbe *src)
+{
+	/* We assume both lists contain the same number of elements */
+	while (src) {
+		dst->orig_address = src->orig_address;
+		dst = dst->next;
+		src = src->next;
+	}
+}
+
+static int check_header(struct swsusp_info *info)
+{
+	char *reason = NULL;
+
+	if (info->version_code != LINUX_VERSION_CODE)
+		reason = "kernel version";
+	if (info->num_physpages != num_physpages)
+		reason = "memory size";
+	if (strcmp(info->uts.sysname,system_utsname.sysname))
+		reason = "system type";
+	if (strcmp(info->uts.release,system_utsname.release))
+		reason = "kernel release";
+	if (strcmp(info->uts.version,system_utsname.version))
+		reason = "version";
+	if (strcmp(info->uts.machine,system_utsname.machine))
+		reason = "machine";
+	if (reason) {
+		printk(KERN_ERR "swsusp: Resume mismatch: %s\n", reason);
+		return -EPERM;
+	}
+	return 0;
+}
+
+/**
+ *	load header - check the image header and copy data from it
+ */
+
+static int load_header(struct snapshot_handle *handle,
+                              struct swsusp_info *info)
+{
+	int error;
+	struct pbe *pblist;
+
+	error = check_header(info);
+	if (!error) {
+		pblist = alloc_pagedir(info->image_pages, GFP_ATOMIC, 0);
+		if (!pblist)
+			return -ENOMEM;
+		pagedir_nosave = pblist;
+		handle->pbe = pblist;
+		nr_copy_pages = info->image_pages;
+		nr_meta_pages = info->pages - info->image_pages - 1;
+	}
+	return error;
+}
+
+/**
+ *	unpack_orig_addresses - copy the elements of @buf[] (1 page) to
+ *	the PBEs in the list starting at @pbe
+ */
+
+static inline struct pbe *unpack_orig_addresses(unsigned long *buf,
+                                                struct pbe *pbe)
+{
+	int j;
+
+	for (j = 0; j < PAGE_SIZE / sizeof(long) && pbe; j++) {
+		pbe->orig_address = buf[j];
+		pbe = pbe->next;
+	}
+	return pbe;
+}
+
+/**
+ *	create_image - use metadata contained in the PBE list
+ *	pointed to by pagedir_nosave to mark the pages that will
+ *	be overwritten in the process of restoring the system
+ *	memory state from the image and allocate memory for
+ *	the image avoiding these pages
+ */
+
+static int create_image(struct snapshot_handle *handle)
+{
+	int error = 0;
+	struct pbe *p, *pblist;
+
+	p = pagedir_nosave;
+	error = mark_unsafe_pages(p);
+	if (!error) {
+		pblist = alloc_pagedir(nr_copy_pages, GFP_ATOMIC, 1);
+		if (pblist)
+			copy_page_backup_list(pblist, p);
+		free_pagedir(p);
+		if (!pblist)
+			error = -ENOMEM;
+	}
+	if (!error)
+		error = alloc_data_pages(pblist, GFP_ATOMIC, 1);
+	if (!error) {
+		release_eaten_pages();
+		pagedir_nosave = pblist;
+	} else {
+		pagedir_nosave = NULL;
+		handle->pbe = NULL;
+		nr_copy_pages = 0;
+		nr_meta_pages = 0;
+	}
+	return error;
+}
+
+/**
+ *	snapshot_write_next - used for writing the system memory snapshot.
+ *
+ *	On the first call to it @handle should point to a zeroed
+ *	snapshot_handle structure.  The structure gets updated and a pointer
+ *	to it should be passed to this function every next time.
+ *
+ *	The @count parameter should contain the number of bytes the caller
+ *	wants to write to the image.  It must not be zero.
+ *
+ *	On success the function returns a positive number.  Then, the caller
+ *	is allowed to write up to the returned number of bytes to the memory
+ *	location computed by the data_of() macro.  The number returned
+ *	may be smaller than @count, but this only happens if the write would
+ *	cross a page boundary otherwise.
+ *
+ *	The function returns 0 to indicate the "end of file" condition,
+ *	and a negative number is returned on error.  In such cases the
+ *	structure pointed to by @handle is not updated and should not be used
+ *	any more.
+ */
+
+int snapshot_write_next(struct snapshot_handle *handle, size_t count)
+{
+	static unsigned long *buffer;
+	int error = 0;
+
+	if (handle->prev && handle->page > nr_meta_pages + nr_copy_pages)
+		return 0;
+	if (!buffer) {
+		/* This makes the buffer be freed by swsusp_free() */
+		buffer = alloc_image_page(GFP_ATOMIC, 0);
+		if (!buffer)
+			return -ENOMEM;
+	}
+	if (!handle->offset)
+		handle->buffer = buffer;
+	if (handle->prev < handle->page) {
+		if (!handle->prev) {
+			error = load_header(handle, (struct swsusp_info *)buffer);
+			if (error)
+				return error;
+		} else if (handle->prev <= nr_meta_pages) {
+			handle->pbe = unpack_orig_addresses(buffer, handle->pbe);
+			if (!handle->pbe) {
+				error = create_image(handle);
+				if (error)
+					return error;
+				handle->pbe = pagedir_nosave;
+				handle->buffer = (void *)handle->pbe->address;
+			}
+		} else {
+			handle->pbe = handle->pbe->next;
+			handle->buffer = (void *)handle->pbe->address;
+		}
+		handle->prev = handle->page;
+	}
+	handle->buf_offset = handle->page_offset;
+	if (handle->page_offset + count >= PAGE_SIZE) {
+		count = PAGE_SIZE - handle->page_offset;
+		handle->page_offset = 0;
+		handle->page++;
+	} else {
+		handle->page_offset += count;
+	}
+	handle->offset += count;
+	return count;
+}
+
+int snapshot_image_loaded(struct snapshot_handle *handle)
+{
+	return !(!handle->pbe || handle->pbe->next || !nr_copy_pages ||
+		handle->page <= nr_meta_pages + nr_copy_pages);
+}
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
index 4e90905f0e87..457084f50010 100644
--- a/kernel/power/swsusp.c
+++ b/kernel/power/swsusp.c
@@ -77,6 +77,8 @@
  */
 unsigned long image_size = 500 * 1024 * 1024;
 
+int in_suspend __nosavedata = 0;
+
 #ifdef CONFIG_HIGHMEM
 unsigned int count_highmem_pages(void);
 int save_highmem(void);
@@ -98,8 +100,6 @@ static struct swsusp_header {
 	char	sig[10];
 } __attribute__((packed, aligned(PAGE_SIZE))) swsusp_header;
 
-static struct swsusp_info swsusp_info;
-
 /*
  * Saving part...
  */
@@ -129,255 +129,261 @@ static int mark_swapfiles(swp_entry_t start)
 	return error;
 }
 
-/*
- * Check whether the swap device is the specified resume
- * device, irrespective of whether they are specified by
- * identical names.
- *
- * (Thus, device inode aliasing is allowed.  You can say /dev/hda4
- * instead of /dev/ide/host0/bus0/target0/lun0/part4 [if using devfs]
- * and they'll be considered the same device.  This is *necessary* for
- * devfs, since the resume code can only recognize the form /dev/hda4,
- * but the suspend code would see the long name.)
+/**
+ *	swsusp_swap_check - check if the resume device is a swap device
+ *	and get its index (if so)
  */
-static inline int is_resume_device(const struct swap_info_struct *swap_info)
-{
-	struct file *file = swap_info->swap_file;
-	struct inode *inode = file->f_dentry->d_inode;
-
-	return S_ISBLK(inode->i_mode) &&
-		swsusp_resume_device == MKDEV(imajor(inode), iminor(inode));
-}
 
 static int swsusp_swap_check(void) /* This is called before saving image */
 {
-	int i;
-
-	if (!swsusp_resume_device)
-		return -ENODEV;
-	spin_lock(&swap_lock);
-	for (i = 0; i < MAX_SWAPFILES; i++) {
-		if (!(swap_info[i].flags & SWP_WRITEOK))
-			continue;
-		if (is_resume_device(swap_info + i)) {
-			spin_unlock(&swap_lock);
-			root_swap = i;
-			return 0;
-		}
+	int res = swap_type_of(swsusp_resume_device);
+
+	if (res >= 0) {
+		root_swap = res;
+		return 0;
 	}
-	spin_unlock(&swap_lock);
-	return -ENODEV;
+	return res;
 }
 
 /**
- *	write_page - Write one page to a fresh swap location.
- *	@addr:	Address we're writing.
- *	@loc:	Place to store the entry we used.
+ *	The bitmap is used for tracing allocated swap pages
  *
- *	Allocate a new swap entry and 'sync' it. Note we discard -EIO
- *	errors. That is an artifact left over from swsusp. It did not
- *	check the return of rw_swap_page_sync() at all, since most pages
- *	written back to swap would return -EIO.
- *	This is a partial improvement, since we will at least return other
- *	errors, though we need to eventually fix the damn code.
+ *	The entire bitmap consists of a number of bitmap_page
+ *	structures linked with the help of the .next member.
+ *	Thus each page can be allocated individually, so we only
+ *	need to make 0-order memory allocations to create
+ *	the bitmap.
  */
-static int write_page(unsigned long addr, swp_entry_t *loc)
-{
-	swp_entry_t entry;
-	int error = -ENOSPC;
 
-	entry = get_swap_page_of_type(root_swap);
-	if (swp_offset(entry)) {
-		error = rw_swap_page_sync(WRITE, entry, virt_to_page(addr));
-		if (!error || error == -EIO)
-			*loc = entry;
-	}
-	return error;
-}
+#define BITMAP_PAGE_SIZE	(PAGE_SIZE - sizeof(void *))
+#define BITMAP_PAGE_CHUNKS	(BITMAP_PAGE_SIZE / sizeof(long))
+#define BITS_PER_CHUNK		(sizeof(long) * 8)
+#define BITMAP_PAGE_BITS	(BITMAP_PAGE_CHUNKS * BITS_PER_CHUNK)
+
+struct bitmap_page {
+	unsigned long		chunks[BITMAP_PAGE_CHUNKS];
+	struct bitmap_page	*next;
+};
 
 /**
- *	Swap map-handling functions
+ *	The following functions are used for tracing the allocated
+ *	swap pages, so that they can be freed in case of an error.
  *
- *	The swap map is a data structure used for keeping track of each page
- *	written to the swap.  It consists of many swap_map_page structures
- *	that contain each an array of MAP_PAGE_SIZE swap entries.
- *	These structures are linked together with the help of either the
- *	.next (in memory) or the .next_swap (in swap) member.
- *
- *	The swap map is created during suspend.  At that time we need to keep
- *	it in memory, because we have to free all of the allocated swap
- *	entries if an error occurs.  The memory needed is preallocated
- *	so that we know in advance if there's enough of it.
- *
- *	The first swap_map_page structure is filled with the swap entries that
- *	correspond to the first MAP_PAGE_SIZE data pages written to swap and
- *	so on.  After the all of the data pages have been written, the order
- *	of the swap_map_page structures in the map is reversed so that they
- *	can be read from swap in the original order.  This causes the data
- *	pages to be loaded in exactly the same order in which they have been
- *	saved.
- *
- *	During resume we only need to use one swap_map_page structure
- *	at a time, which means that we only need to use two memory pages for
- *	reading the image - one for reading the swap_map_page structures
- *	and the second for reading the data pages from swap.
+ *	The functions operate on a linked bitmap structure defined
+ *	above
  */
 
-#define MAP_PAGE_SIZE	((PAGE_SIZE - sizeof(swp_entry_t) - sizeof(void *)) \
-			/ sizeof(swp_entry_t))
-
-struct swap_map_page {
-	swp_entry_t		entries[MAP_PAGE_SIZE];
-	swp_entry_t		next_swap;
-	struct swap_map_page	*next;
-};
-
-static inline void free_swap_map(struct swap_map_page *swap_map)
+static void free_bitmap(struct bitmap_page *bitmap)
 {
-	struct swap_map_page *swp;
+	struct bitmap_page *bp;
 
-	while (swap_map) {
-		swp = swap_map->next;
-		free_page((unsigned long)swap_map);
-		swap_map = swp;
+	while (bitmap) {
+		bp = bitmap->next;
+		free_page((unsigned long)bitmap);
+		bitmap = bp;
 	}
 }
 
-static struct swap_map_page *alloc_swap_map(unsigned int nr_pages)
+static struct bitmap_page *alloc_bitmap(unsigned int nr_bits)
 {
-	struct swap_map_page *swap_map, *swp;
-	unsigned n = 0;
+	struct bitmap_page *bitmap, *bp;
+	unsigned int n;
 
-	if (!nr_pages)
+	if (!nr_bits)
 		return NULL;
 
-	pr_debug("alloc_swap_map(): nr_pages = %d\n", nr_pages);
-	swap_map = (struct swap_map_page *)get_zeroed_page(GFP_ATOMIC);
-	swp = swap_map;
-	for (n = MAP_PAGE_SIZE; n < nr_pages; n += MAP_PAGE_SIZE) {
-		swp->next = (struct swap_map_page *)get_zeroed_page(GFP_ATOMIC);
-		swp = swp->next;
-		if (!swp) {
-			free_swap_map(swap_map);
+	bitmap = (struct bitmap_page *)get_zeroed_page(GFP_KERNEL);
+	bp = bitmap;
+	for (n = BITMAP_PAGE_BITS; n < nr_bits; n += BITMAP_PAGE_BITS) {
+		bp->next = (struct bitmap_page *)get_zeroed_page(GFP_KERNEL);
+		bp = bp->next;
+		if (!bp) {
+			free_bitmap(bitmap);
 			return NULL;
 		}
 	}
-	return swap_map;
+	return bitmap;
 }
 
-/**
- *	reverse_swap_map - reverse the order of pages in the swap map
- *	@swap_map
- */
-
-static inline struct swap_map_page *reverse_swap_map(struct swap_map_page *swap_map)
+static int bitmap_set(struct bitmap_page *bitmap, unsigned long bit)
 {
-	struct swap_map_page *prev, *next;
-
-	prev = NULL;
-	while (swap_map) {
-		next = swap_map->next;
-		swap_map->next = prev;
-		prev = swap_map;
-		swap_map = next;
+	unsigned int n;
+
+	n = BITMAP_PAGE_BITS;
+	while (bitmap && n <= bit) {
+		n += BITMAP_PAGE_BITS;
+		bitmap = bitmap->next;
 	}
-	return prev;
+	if (!bitmap)
+		return -EINVAL;
+	n -= BITMAP_PAGE_BITS;
+	bit -= n;
+	n = 0;
+	while (bit >= BITS_PER_CHUNK) {
+		bit -= BITS_PER_CHUNK;
+		n++;
+	}
+	bitmap->chunks[n] |= (1UL << bit);
+	return 0;
 }
 
-/**
- *	free_swap_map_entries - free the swap entries allocated to store
- *	the swap map @swap_map (this is only called in case of an error)
- */
-static inline void free_swap_map_entries(struct swap_map_page *swap_map)
+static unsigned long alloc_swap_page(int swap, struct bitmap_page *bitmap)
 {
-	while (swap_map) {
-		if (swap_map->next_swap.val)
-			swap_free(swap_map->next_swap);
-		swap_map = swap_map->next;
+	unsigned long offset;
+
+	offset = swp_offset(get_swap_page_of_type(swap));
+	if (offset) {
+		if (bitmap_set(bitmap, offset)) {
+			swap_free(swp_entry(swap, offset));
+			offset = 0;
+		}
 	}
+	return offset;
 }
 
-/**
- *	save_swap_map - save the swap map used for tracing the data pages
- *	stored in the swap
- */
-
-static int save_swap_map(struct swap_map_page *swap_map, swp_entry_t *start)
+static void free_all_swap_pages(int swap, struct bitmap_page *bitmap)
 {
-	swp_entry_t entry = (swp_entry_t){0};
-	int error;
+	unsigned int bit, n;
+	unsigned long test;
 
-	while (swap_map) {
-		swap_map->next_swap = entry;
-		if ((error = write_page((unsigned long)swap_map, &entry)))
-			return error;
-		swap_map = swap_map->next;
+	bit = 0;
+	while (bitmap) {
+		for (n = 0; n < BITMAP_PAGE_CHUNKS; n++)
+			for (test = 1UL; test; test <<= 1) {
+				if (bitmap->chunks[n] & test)
+					swap_free(swp_entry(swap, bit));
+				bit++;
+			}
+		bitmap = bitmap->next;
 	}
-	*start = entry;
-	return 0;
 }
 
 /**
- *	free_image_entries - free the swap entries allocated to store
- *	the image data pages (this is only called in case of an error)
+ *	write_page - Write one page to given swap location.
+ *	@buf:		Address we're writing.
+ *	@offset:	Offset of the swap page we're writing to.
  */
 
-static inline void free_image_entries(struct swap_map_page *swp)
+static int write_page(void *buf, unsigned long offset)
 {
-	unsigned k;
+	swp_entry_t entry;
+	int error = -ENOSPC;
 
-	while (swp) {
-		for (k = 0; k < MAP_PAGE_SIZE; k++)
-			if (swp->entries[k].val)
-				swap_free(swp->entries[k]);
-		swp = swp->next;
+	if (offset) {
+		entry = swp_entry(root_swap, offset);
+		error = rw_swap_page_sync(WRITE, entry, virt_to_page(buf));
 	}
+	return error;
 }
 
+/*
+ *	The swap map is a data structure used for keeping track of each page
+ *	written to a swap partition.  It consists of many swap_map_page
+ *	structures that contain each an array of MAP_PAGE_SIZE swap entries.
+ *	These structures are stored on the swap and linked together with the
+ *	help of the .next_swap member.
+ *
+ *	The swap map is created during suspend.  The swap map pages are
+ *	allocated and populated one at a time, so we only need one memory
+ *	page to set up the entire structure.
+ *
+ *	During resume we also only need to use one swap_map_page structure
+ *	at a time.
+ */
+
+#define MAP_PAGE_ENTRIES	(PAGE_SIZE / sizeof(long) - 1)
+
+struct swap_map_page {
+	unsigned long		entries[MAP_PAGE_ENTRIES];
+	unsigned long		next_swap;
+};
+
 /**
- *	The swap_map_handle structure is used for handling the swap map in
+ *	The swap_map_handle structure is used for handling swap in
  *	a file-alike way
  */
 
 struct swap_map_handle {
 	struct swap_map_page *cur;
+	unsigned long cur_swap;
+	struct bitmap_page *bitmap;
 	unsigned int k;
 };
 
-static inline void init_swap_map_handle(struct swap_map_handle *handle,
-                                        struct swap_map_page *map)
+static void release_swap_writer(struct swap_map_handle *handle)
 {
-	handle->cur = map;
+	if (handle->cur)
+		free_page((unsigned long)handle->cur);
+	handle->cur = NULL;
+	if (handle->bitmap)
+		free_bitmap(handle->bitmap);
+	handle->bitmap = NULL;
+}
+
+static int get_swap_writer(struct swap_map_handle *handle)
+{
+	handle->cur = (struct swap_map_page *)get_zeroed_page(GFP_KERNEL);
+	if (!handle->cur)
+		return -ENOMEM;
+	handle->bitmap = alloc_bitmap(count_swap_pages(root_swap, 0));
+	if (!handle->bitmap) {
+		release_swap_writer(handle);
+		return -ENOMEM;
+	}
+	handle->cur_swap = alloc_swap_page(root_swap, handle->bitmap);
+	if (!handle->cur_swap) {
+		release_swap_writer(handle);
+		return -ENOSPC;
+	}
 	handle->k = 0;
+	return 0;
 }
 
-static inline int swap_map_write_page(struct swap_map_handle *handle,
-                                      unsigned long addr)
+static int swap_write_page(struct swap_map_handle *handle, void *buf)
 {
 	int error;
+	unsigned long offset;
 
-	error = write_page(addr, handle->cur->entries + handle->k);
+	if (!handle->cur)
+		return -EINVAL;
+	offset = alloc_swap_page(root_swap, handle->bitmap);
+	error = write_page(buf, offset);
 	if (error)
 		return error;
-	if (++handle->k >= MAP_PAGE_SIZE) {
-		handle->cur = handle->cur->next;
+	handle->cur->entries[handle->k++] = offset;
+	if (handle->k >= MAP_PAGE_ENTRIES) {
+		offset = alloc_swap_page(root_swap, handle->bitmap);
+		if (!offset)
+			return -ENOSPC;
+		handle->cur->next_swap = offset;
+		error = write_page(handle->cur, handle->cur_swap);
+		if (error)
+			return error;
+		memset(handle->cur, 0, PAGE_SIZE);
+		handle->cur_swap = offset;
 		handle->k = 0;
 	}
 	return 0;
 }
 
+static int flush_swap_writer(struct swap_map_handle *handle)
+{
+	if (handle->cur && handle->cur_swap)
+		return write_page(handle->cur, handle->cur_swap);
+	else
+		return -EINVAL;
+}
+
 /**
- *	save_image_data - save the data pages pointed to by the PBEs
- *	from the list @pblist using the swap map handle @handle
- *	(assume there are @nr_pages data pages to save)
+ *	save_image - save the suspend image data
  */
 
-static int save_image_data(struct pbe *pblist,
-                           struct swap_map_handle *handle,
-                           unsigned int nr_pages)
+static int save_image(struct swap_map_handle *handle,
+                      struct snapshot_handle *snapshot,
+                      unsigned int nr_pages)
 {
 	unsigned int m;
-	struct pbe *p;
+	int ret;
 	int error = 0;
 
 	printk("Saving image data pages (%u pages) ...     ", nr_pages);
@@ -385,98 +391,22 @@ static int save_image_data(struct pbe *pblist,
 	if (!m)
 		m = 1;
 	nr_pages = 0;
-	for_each_pbe (p, pblist) {
-		error = swap_map_write_page(handle, p->address);
-		if (error)
-			break;
-		if (!(nr_pages % m))
-			printk("\b\b\b\b%3d%%", nr_pages / m);
-		nr_pages++;
-	}
+	do {
+		ret = snapshot_read_next(snapshot, PAGE_SIZE);
+		if (ret > 0) {
+			error = swap_write_page(handle, data_of(*snapshot));
+			if (error)
+				break;
+			if (!(nr_pages % m))
+				printk("\b\b\b\b%3d%%", nr_pages / m);
+			nr_pages++;
+		}
+	} while (ret > 0);
 	if (!error)
 		printk("\b\b\b\bdone\n");
 	return error;
 }
 
-static void dump_info(void)
-{
-	pr_debug(" swsusp: Version: %u\n",swsusp_info.version_code);
-	pr_debug(" swsusp: Num Pages: %ld\n",swsusp_info.num_physpages);
-	pr_debug(" swsusp: UTS Sys: %s\n",swsusp_info.uts.sysname);
-	pr_debug(" swsusp: UTS Node: %s\n",swsusp_info.uts.nodename);
-	pr_debug(" swsusp: UTS Release: %s\n",swsusp_info.uts.release);
-	pr_debug(" swsusp: UTS Version: %s\n",swsusp_info.uts.version);
-	pr_debug(" swsusp: UTS Machine: %s\n",swsusp_info.uts.machine);
-	pr_debug(" swsusp: UTS Domain: %s\n",swsusp_info.uts.domainname);
-	pr_debug(" swsusp: CPUs: %d\n",swsusp_info.cpus);
-	pr_debug(" swsusp: Image: %ld Pages\n",swsusp_info.image_pages);
-	pr_debug(" swsusp: Total: %ld Pages\n", swsusp_info.pages);
-}
-
-static void init_header(unsigned int nr_pages)
-{
-	memset(&swsusp_info, 0, sizeof(swsusp_info));
-	swsusp_info.version_code = LINUX_VERSION_CODE;
-	swsusp_info.num_physpages = num_physpages;
-	memcpy(&swsusp_info.uts, &system_utsname, sizeof(system_utsname));
-
-	swsusp_info.cpus = num_online_cpus();
-	swsusp_info.image_pages = nr_pages;
-	swsusp_info.pages = nr_pages +
-		((nr_pages * sizeof(long) + PAGE_SIZE - 1) >> PAGE_SHIFT) + 1;
-}
-
-/**
- *	pack_orig_addresses - the .orig_address fields of the PBEs from the
- *	list starting at @pbe are stored in the array @buf[] (1 page)
- */
-
-static inline struct pbe *pack_orig_addresses(unsigned long *buf,
-                                              struct pbe *pbe)
-{
-	int j;
-
-	for (j = 0; j < PAGE_SIZE / sizeof(long) && pbe; j++) {
-		buf[j] = pbe->orig_address;
-		pbe = pbe->next;
-	}
-	if (!pbe)
-		for (; j < PAGE_SIZE / sizeof(long); j++)
-			buf[j] = 0;
-	return pbe;
-}
-
-/**
- *	save_image_metadata - save the .orig_address fields of the PBEs
- *	from the list @pblist using the swap map handle @handle
- */
-
-static int save_image_metadata(struct pbe *pblist,
-                               struct swap_map_handle *handle)
-{
-	unsigned long *buf;
-	unsigned int n = 0;
-	struct pbe *p;
-	int error = 0;
-
-	printk("Saving image metadata ... ");
-	buf = (unsigned long *)get_zeroed_page(GFP_ATOMIC);
-	if (!buf)
-		return -ENOMEM;
-	p = pblist;
-	while (p) {
-		p = pack_orig_addresses(buf, p);
-		error = swap_map_write_page(handle, (unsigned long)buf);
-		if (error)
-			break;
-		n++;
-	}
-	free_page((unsigned long)buf);
-	if (!error)
-		printk("done (%u pages saved)\n", n);
-	return error;
-}
-
 /**
  *	enough_swap - Make sure we have enough swap to save the image.
  *
@@ -486,8 +416,7 @@ static int save_image_metadata(struct pbe *pblist,
 
 static int enough_swap(unsigned int nr_pages)
 {
-	unsigned int free_swap = swap_info[root_swap].pages -
-		swap_info[root_swap].inuse_pages;
+	unsigned int free_swap = count_swap_pages(root_swap, 1);
 
 	pr_debug("swsusp: free swap pages: %u\n", free_swap);
 	return free_swap > (nr_pages + PAGES_FOR_IO +
@@ -503,57 +432,44 @@ static int enough_swap(unsigned int nr_pages)
  *	correctly, we'll mark system clean, anyway.)
  */
 
-int swsusp_write(struct pbe *pblist, unsigned int nr_pages)
+int swsusp_write(void)
 {
-	struct swap_map_page *swap_map;
 	struct swap_map_handle handle;
-	swp_entry_t start;
+	struct snapshot_handle snapshot;
+	struct swsusp_info *header;
+	unsigned long start;
 	int error;
 
 	if ((error = swsusp_swap_check())) {
 		printk(KERN_ERR "swsusp: Cannot find swap device, try swapon -a.\n");
 		return error;
 	}
-	if (!enough_swap(nr_pages)) {
+	memset(&snapshot, 0, sizeof(struct snapshot_handle));
+	error = snapshot_read_next(&snapshot, PAGE_SIZE);
+	if (error < PAGE_SIZE)
+		return error < 0 ? error : -EFAULT;
+	header = (struct swsusp_info *)data_of(snapshot);
+	if (!enough_swap(header->pages)) {
 		printk(KERN_ERR "swsusp: Not enough free swap\n");
 		return -ENOSPC;
 	}
-
-	init_header(nr_pages);
-	swap_map = alloc_swap_map(swsusp_info.pages);
-	if (!swap_map)
-		return -ENOMEM;
-	init_swap_map_handle(&handle, swap_map);
-
-	error = swap_map_write_page(&handle, (unsigned long)&swsusp_info);
-	if (!error)
-		error = save_image_metadata(pblist, &handle);
+	error = get_swap_writer(&handle);
+	if (!error) {
+		start = handle.cur_swap;
+		error = swap_write_page(&handle, header);
+	}
 	if (!error)
-		error = save_image_data(pblist, &handle, nr_pages);
-	if (error)
-		goto Free_image_entries;
-
-	swap_map = reverse_swap_map(swap_map);
-	error = save_swap_map(swap_map, &start);
-	if (error)
-		goto Free_map_entries;
-
-	dump_info();
-	printk( "S" );
-	error = mark_swapfiles(start);
-	printk( "|\n" );
+		error = save_image(&handle, &snapshot, header->pages - 1);
+	if (!error) {
+		flush_swap_writer(&handle);
+		printk("S");
+		error = mark_swapfiles(swp_entry(root_swap, start));
+		printk("|\n");
+	}
 	if (error)
-		goto Free_map_entries;
-
-Free_swap_map:
-	free_swap_map(swap_map);
+		free_all_swap_pages(root_swap, handle.bitmap);
+	release_swap_writer(&handle);
 	return error;
-
-Free_map_entries:
-	free_swap_map_entries(swap_map);
-Free_image_entries:
-	free_image_entries(swap_map);
-	goto Free_swap_map;
 }
 
 /**
@@ -663,45 +579,6 @@ int swsusp_resume(void)
 	return error;
 }
 
-/**
- *	mark_unsafe_pages - mark the pages that cannot be used for storing
- *	the image during resume, because they conflict with the pages that
- *	had been used before suspend
- */
-
-static void mark_unsafe_pages(struct pbe *pblist)
-{
-	struct zone *zone;
-	unsigned long zone_pfn;
-	struct pbe *p;
-
-	if (!pblist) /* a sanity check */
-		return;
-
-	/* Clear page flags */
-	for_each_zone (zone) {
-		for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn)
-			if (pfn_valid(zone_pfn + zone->zone_start_pfn))
-				ClearPageNosaveFree(pfn_to_page(zone_pfn +
-					zone->zone_start_pfn));
-	}
-
-	/* Mark orig addresses */
-	for_each_pbe (p, pblist)
-		SetPageNosaveFree(virt_to_page(p->orig_address));
-
-}
-
-static void copy_page_backup_list(struct pbe *dst, struct pbe *src)
-{
-	/* We assume both lists contain the same number of elements */
-	while (src) {
-		dst->orig_address = src->orig_address;
-		dst = dst->next;
-		src = src->next;
-	}
-}
-
 /*
  *	Using bio to read from swap.
  *	This code requires a bit more work than just using buffer heads
@@ -779,14 +656,14 @@ static int bio_write_page(pgoff_t page_off, void *page)
  *	in a file-alike way
  */
 
-static inline void release_swap_map_reader(struct swap_map_handle *handle)
+static void release_swap_reader(struct swap_map_handle *handle)
 {
 	if (handle->cur)
 		free_page((unsigned long)handle->cur);
 	handle->cur = NULL;
 }
 
-static inline int get_swap_map_reader(struct swap_map_handle *handle,
+static int get_swap_reader(struct swap_map_handle *handle,
                                       swp_entry_t start)
 {
 	int error;
@@ -798,149 +675,80 @@ static inline int get_swap_map_reader(struct swap_map_handle *handle,
 		return -ENOMEM;
 	error = bio_read_page(swp_offset(start), handle->cur);
 	if (error) {
-		release_swap_map_reader(handle);
+		release_swap_reader(handle);
 		return error;
 	}
 	handle->k = 0;
 	return 0;
 }
 
-static inline int swap_map_read_page(struct swap_map_handle *handle, void *buf)
+static int swap_read_page(struct swap_map_handle *handle, void *buf)
 {
 	unsigned long offset;
 	int error;
 
 	if (!handle->cur)
 		return -EINVAL;
-	offset = swp_offset(handle->cur->entries[handle->k]);
+	offset = handle->cur->entries[handle->k];
 	if (!offset)
-		return -EINVAL;
+		return -EFAULT;
 	error = bio_read_page(offset, buf);
 	if (error)
 		return error;
-	if (++handle->k >= MAP_PAGE_SIZE) {
+	if (++handle->k >= MAP_PAGE_ENTRIES) {
 		handle->k = 0;
-		offset = swp_offset(handle->cur->next_swap);
+		offset = handle->cur->next_swap;
 		if (!offset)
-			release_swap_map_reader(handle);
+			release_swap_reader(handle);
 		else
 			error = bio_read_page(offset, handle->cur);
 	}
 	return error;
 }
 
-static int check_header(void)
-{
-	char *reason = NULL;
-
-	dump_info();
-	if (swsusp_info.version_code != LINUX_VERSION_CODE)
-		reason = "kernel version";
-	if (swsusp_info.num_physpages != num_physpages)
-		reason = "memory size";
-	if (strcmp(swsusp_info.uts.sysname,system_utsname.sysname))
-		reason = "system type";
-	if (strcmp(swsusp_info.uts.release,system_utsname.release))
-		reason = "kernel release";
-	if (strcmp(swsusp_info.uts.version,system_utsname.version))
-		reason = "version";
-	if (strcmp(swsusp_info.uts.machine,system_utsname.machine))
-		reason = "machine";
-	if (reason) {
-		printk(KERN_ERR "swsusp: Resume mismatch: %s\n", reason);
-		return -EPERM;
-	}
-	return 0;
-}
-
 /**
- *	load_image_data - load the image data using the swap map handle
- *	@handle and store them using the page backup list @pblist
+ *	load_image - load the image using the swap map handle
+ *	@handle and the snapshot handle @snapshot
  *	(assume there are @nr_pages pages to load)
  */
 
-static int load_image_data(struct pbe *pblist,
-                           struct swap_map_handle *handle,
-                           unsigned int nr_pages)
+static int load_image(struct swap_map_handle *handle,
+                      struct snapshot_handle *snapshot,
+                      unsigned int nr_pages)
 {
-	int error;
 	unsigned int m;
-	struct pbe *p;
+	int ret;
+	int error = 0;
 
-	if (!pblist)
-		return -EINVAL;
 	printk("Loading image data pages (%u pages) ...     ", nr_pages);
 	m = nr_pages / 100;
 	if (!m)
 		m = 1;
 	nr_pages = 0;
-	p = pblist;
-	while (p) {
-		error = swap_map_read_page(handle, (void *)p->address);
-		if (error)
-			break;
-		p = p->next;
-		if (!(nr_pages % m))
-			printk("\b\b\b\b%3d%%", nr_pages / m);
-		nr_pages++;
-	}
+	do {
+		ret = snapshot_write_next(snapshot, PAGE_SIZE);
+		if (ret > 0) {
+			error = swap_read_page(handle, data_of(*snapshot));
+			if (error)
+				break;
+			if (!(nr_pages % m))
+				printk("\b\b\b\b%3d%%", nr_pages / m);
+			nr_pages++;
+		}
+	} while (ret > 0);
 	if (!error)
 		printk("\b\b\b\bdone\n");
+	if (!snapshot_image_loaded(snapshot))
+		error = -ENODATA;
 	return error;
 }
 
-/**
- *	unpack_orig_addresses - copy the elements of @buf[] (1 page) to
- *	the PBEs in the list starting at @pbe
- */
-
-static inline struct pbe *unpack_orig_addresses(unsigned long *buf,
-                                                struct pbe *pbe)
-{
-	int j;
-
-	for (j = 0; j < PAGE_SIZE / sizeof(long) && pbe; j++) {
-		pbe->orig_address = buf[j];
-		pbe = pbe->next;
-	}
-	return pbe;
-}
-
-/**
- *	load_image_metadata - load the image metadata using the swap map
- *	handle @handle and put them into the PBEs in the list @pblist
- */
-
-static int load_image_metadata(struct pbe *pblist, struct swap_map_handle *handle)
-{
-	struct pbe *p;
-	unsigned long *buf;
-	unsigned int n = 0;
-	int error = 0;
-
-	printk("Loading image metadata ... ");
-	buf = (unsigned long *)get_zeroed_page(GFP_ATOMIC);
-	if (!buf)
-		return -ENOMEM;
-	p = pblist;
-	while (p) {
-		error = swap_map_read_page(handle, buf);
-		if (error)
-			break;
-		p = unpack_orig_addresses(buf, p);
-		n++;
-	}
-	free_page((unsigned long)buf);
-	if (!error)
-		printk("done (%u pages loaded)\n", n);
-	return error;
-}
-
-int swsusp_read(struct pbe **pblist_ptr)
+int swsusp_read(void)
 {
 	int error;
-	struct pbe *p, *pblist;
 	struct swap_map_handle handle;
+	struct snapshot_handle snapshot;
+	struct swsusp_info *header;
 	unsigned int nr_pages;
 
 	if (IS_ERR(resume_bdev)) {
@@ -948,38 +756,19 @@ int swsusp_read(struct pbe **pblist_ptr)
 		return PTR_ERR(resume_bdev);
 	}
 
-	error = get_swap_map_reader(&handle, swsusp_header.image);
+	memset(&snapshot, 0, sizeof(struct snapshot_handle));
+	error = snapshot_write_next(&snapshot, PAGE_SIZE);
+	if (error < PAGE_SIZE)
+		return error < 0 ? error : -EFAULT;
+	header = (struct swsusp_info *)data_of(snapshot);
+	error = get_swap_reader(&handle, swsusp_header.image);
 	if (!error)
-		error = swap_map_read_page(&handle, &swsusp_info);
-	if (!error)
-		error = check_header();
-	if (error)
-		return error;
-	nr_pages = swsusp_info.image_pages;
-	p = alloc_pagedir(nr_pages, GFP_ATOMIC, 0);
-	if (!p)
-		return -ENOMEM;
-	error = load_image_metadata(p, &handle);
+		error = swap_read_page(&handle, header);
 	if (!error) {
-		mark_unsafe_pages(p);
-		pblist = alloc_pagedir(nr_pages, GFP_ATOMIC, 1);
-		if (pblist)
-			copy_page_backup_list(pblist, p);
-		free_pagedir(p);
-		if (!pblist)
-			error = -ENOMEM;
-
-		/* Allocate memory for the image and read the data from swap */
-		if (!error)
-			error = alloc_data_pages(pblist, GFP_ATOMIC, 1);
-		if (!error) {
-			release_eaten_pages();
-			error = load_image_data(pblist, &handle, nr_pages);
-		}
-		if (!error)
-			*pblist_ptr = pblist;
+		nr_pages = header->image_pages;
+		error = load_image(&handle, &snapshot, nr_pages);
 	}
-	release_swap_map_reader(&handle);
+	release_swap_reader(&handle);
 
 	blkdev_put(resume_bdev);
 
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 365ed6ff182d..4d11f9d84666 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -45,7 +45,7 @@ static const char Unused_offset[] = "Unused swap offset entry ";
 
 struct swap_list_t swap_list = {-1, -1};
 
-struct swap_info_struct swap_info[MAX_SWAPFILES];
+static struct swap_info_struct swap_info[MAX_SWAPFILES];
 
 static DEFINE_MUTEX(swapon_mutex);
 
@@ -417,6 +417,59 @@ void free_swap_and_cache(swp_entry_t entry)
 	}
 }
 
+#ifdef CONFIG_SOFTWARE_SUSPEND
+/*
+ * Find the swap type that corresponds to given device (if any)
+ *
+ * This is needed for software suspend and is done in such a way that inode
+ * aliasing is allowed.
+ */
+int swap_type_of(dev_t device)
+{
+	int i;
+
+	if (!device)
+		return -EINVAL;
+	spin_lock(&swap_lock);
+	for (i = 0; i < nr_swapfiles; i++) {
+		struct inode *inode;
+
+		if (!(swap_info[i].flags & SWP_WRITEOK))
+			continue;
+		inode = swap_info->swap_file->f_dentry->d_inode;
+		if (S_ISBLK(inode->i_mode) &&
+		    device == MKDEV(imajor(inode), iminor(inode))) {
+			spin_unlock(&swap_lock);
+			return i;
+		}
+	}
+	spin_unlock(&swap_lock);
+	return -ENODEV;
+}
+
+/*
+ * Return either the total number of swap pages of given type, or the number
+ * of free pages of that type (depending on @free)
+ *
+ * This is needed for software suspend
+ */
+unsigned int count_swap_pages(int type, int free)
+{
+	unsigned int n = 0;
+
+	if (type < nr_swapfiles) {
+		spin_lock(&swap_lock);
+		if (swap_info[type].flags & SWP_WRITEOK) {
+			n = swap_info[type].pages;
+			if (free)
+				n -= swap_info[type].inuse_pages;
+		}
+		spin_unlock(&swap_lock);
+	}
+	return n;
+}
+#endif
+
 /*
  * No need to decide whether this PTE shares the swap entry with others,
  * just let do_wp_page work it out if a write is requested later - to
-- 
cgit v1.2.3


From 6e1819d615f24ce0726a7d0bd3dd0152d7b21654 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Thu, 23 Mar 2006 03:00:03 -0800
Subject: [PATCH] swsusp: userland interface
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This patch introduces a user space interface for swsusp.

The interface is based on a special character device, called the snapshot
device, that allows user space processes to perform suspend and resume-related
operations with the help of some ioctls and the read()/write() functions.
 Additionally it allows these processes to allocate free swap pages from a
selected swap partition, called the resume partition, so that they know which
sectors of the resume partition are available to them.

The interface uses the same low-level system memory snapshot-handling
functions that are used by the built-it swap-writing/reading code of swsusp.

The interface documentation is included in the patch.

The patch assumes that the major and minor numbers of the snapshot device will
be 10 (ie.  misc device) and 231, the registration of which has already been
requested.

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Acked-by: Pavel Machek <pavel@ucw.cz>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 Documentation/power/userland-swsusp.txt | 149 ++++++++++++++++
 init/do_mounts_initrd.c                 |   1 +
 kernel/power/Makefile                   |   2 +-
 kernel/power/power.h                    |  14 ++
 kernel/power/snapshot.c                 |   9 +-
 kernel/power/user.c                     | 301 ++++++++++++++++++++++++++++++++
 mm/swapfile.c                           |   6 +-
 7 files changed, 475 insertions(+), 7 deletions(-)
 create mode 100644 Documentation/power/userland-swsusp.txt
 create mode 100644 kernel/power/user.c

(limited to 'mm')

diff --git a/Documentation/power/userland-swsusp.txt b/Documentation/power/userland-swsusp.txt
new file mode 100644
index 000000000000..94058220aaf0
--- /dev/null
+++ b/Documentation/power/userland-swsusp.txt
@@ -0,0 +1,149 @@
+Documentation for userland software suspend interface
+	(C) 2006 Rafael J. Wysocki <rjw@sisk.pl>
+
+First, the warnings at the beginning of swsusp.txt still apply.
+
+Second, you should read the FAQ in swsusp.txt _now_ if you have not
+done it already.
+
+Now, to use the userland interface for software suspend you need special
+utilities that will read/write the system memory snapshot from/to the
+kernel.  Such utilities are available, for example, from
+<http://www.sisk.pl/kernel/utilities/suspend>.  You may want to have
+a look at them if you are going to develop your own suspend/resume
+utilities.
+
+The interface consists of a character device providing the open(),
+release(), read(), and write() operations as well as several ioctl()
+commands defined in kernel/power/power.h.  The major and minor
+numbers of the device are, respectively, 10 and 231, and they can
+be read from /sys/class/misc/snapshot/dev.
+
+The device can be open either for reading or for writing.  If open for
+reading, it is considered to be in the suspend mode.  Otherwise it is
+assumed to be in the resume mode.  The device cannot be open for reading
+and writing.  It is also impossible to have the device open more than once
+at a time.
+
+The ioctl() commands recognized by the device are:
+
+SNAPSHOT_FREEZE - freeze user space processes (the current process is
+	not frozen); this is required for SNAPSHOT_ATOMIC_SNAPSHOT
+	and SNAPSHOT_ATOMIC_RESTORE to succeed
+
+SNAPSHOT_UNFREEZE - thaw user space processes frozen by SNAPSHOT_FREEZE
+
+SNAPSHOT_ATOMIC_SNAPSHOT - create a snapshot of the system memory; the
+	last argument of ioctl() should be a pointer to an int variable,
+	the value of which will indicate whether the call returned after
+	creating the snapshot (1) or after restoring the system memory state
+	from it (0) (after resume the system finds itself finishing the
+	SNAPSHOT_ATOMIC_SNAPSHOT ioctl() again); after the snapshot
+	has been created the read() operation can be used to transfer
+	it out of the kernel
+
+SNAPSHOT_ATOMIC_RESTORE - restore the system memory state from the
+	uploaded snapshot image; before calling it you should transfer
+	the system memory snapshot back to the kernel using the write()
+	operation; this call will not succeed if the snapshot
+	image is not available to the kernel
+
+SNAPSHOT_FREE - free memory allocated for the snapshot image
+
+SNAPSHOT_SET_IMAGE_SIZE - set the preferred maximum size of the image
+	(the kernel will do its best to ensure the image size will not exceed
+	this number, but if it turns out to be impossible, the kernel will
+	create the smallest image possible)
+
+SNAPSHOT_AVAIL_SWAP - return the amount of available swap in bytes (the last
+	argument should be a pointer to an unsigned int variable that will
+	contain the result if the call is successful).
+
+SNAPSHOT_GET_SWAP_PAGE - allocate a swap page from the resume partition
+	(the last argument should be a pointer to a loff_t variable that
+	will contain the swap page offset if the call is successful)
+
+SNAPSHOT_FREE_SWAP_PAGES - free all swap pages allocated with
+	SNAPSHOT_GET_SWAP_PAGE
+
+SNAPSHOT_SET_SWAP_FILE - set the resume partition (the last ioctl() argument
+	should specify the device's major and minor numbers in the old
+	two-byte format, as returned by the stat() function in the .st_rdev
+	member of the stat structure); it is recommended to always use this
+	call, because the code to set the resume partition could be removed from
+	future kernels
+
+The device's read() operation can be used to transfer the snapshot image from
+the kernel.  It has the following limitations:
+- you cannot read() more than one virtual memory page at a time
+- read()s accross page boundaries are impossible (ie. if ypu read() 1/2 of
+	a page in the previous call, you will only be able to read()
+	_at_ _most_ 1/2 of the page in the next call)
+
+The device's write() operation is used for uploading the system memory snapshot
+into the kernel.  It has the same limitations as the read() operation.
+
+The release() operation frees all memory allocated for the snapshot image
+and all swap pages allocated with SNAPSHOT_GET_SWAP_PAGE (if any).
+Thus it is not necessary to use either SNAPSHOT_FREE or
+SNAPSHOT_FREE_SWAP_PAGES before closing the device (in fact it will also
+unfreeze user space processes frozen by SNAPSHOT_UNFREEZE if they are
+still frozen when the device is being closed).
+
+Currently it is assumed that the userland utilities reading/writing the
+snapshot image from/to the kernel will use a swap parition, called the resume
+partition, as storage space.  However, this is not really required, as they
+can use, for example, a special (blank) suspend partition or a file on a partition
+that is unmounted before SNAPSHOT_ATOMIC_SNAPSHOT and mounted afterwards.
+
+These utilities SHOULD NOT make any assumptions regarding the ordering of
+data within the snapshot image, except for the image header that MAY be
+assumed to start with an swsusp_info structure, as specified in
+kernel/power/power.h.  This structure MAY be used by the userland utilities
+to obtain some information about the snapshot image, such as the size
+of the snapshot image, including the metadata and the header itself,
+contained in the .size member of swsusp_info.
+
+The snapshot image MUST be written to the kernel unaltered (ie. all of the image
+data, metadata and header MUST be written in _exactly_ the same amount, form
+and order in which they have been read).  Otherwise, the behavior of the
+resumed system may be totally unpredictable.
+
+While executing SNAPSHOT_ATOMIC_RESTORE the kernel checks if the
+structure of the snapshot image is consistent with the information stored
+in the image header.  If any inconsistencies are detected,
+SNAPSHOT_ATOMIC_RESTORE will not succeed.  Still, this is not a fool-proof
+mechanism and the userland utilities using the interface SHOULD use additional
+means, such as checksums, to ensure the integrity of the snapshot image.
+
+The suspending and resuming utilities MUST lock themselves in memory,
+preferrably using mlockall(), before calling SNAPSHOT_FREEZE.
+
+The suspending utility MUST check the value stored by SNAPSHOT_ATOMIC_SNAPSHOT
+in the memory location pointed to by the last argument of ioctl() and proceed
+in accordance with it:
+1. 	If the value is 1 (ie. the system memory snapshot has just been
+	created and the system is ready for saving it):
+	(a)	The suspending utility MUST NOT close the snapshot device
+		_unless_ the whole suspend procedure is to be cancelled, in
+		which case, if the snapshot image has already been saved, the
+		suspending utility SHOULD destroy it, preferrably by zapping
+		its header.  If the suspend is not to be cancelled, the
+		system MUST be powered off or rebooted after the snapshot
+		image has been saved.
+	(b)	The suspending utility SHOULD NOT attempt to perform any
+		file system operations (including reads) on the file systems
+		that were mounted before SNAPSHOT_ATOMIC_SNAPSHOT has been
+		called.  However, it MAY mount a file system that was not
+		mounted at that time and perform some operations on it (eg.
+		use it for saving the image).
+2.	If the value is 0 (ie. the system state has just been restored from
+	the snapshot image), the suspending utility MUST close the snapshot
+	device.  Afterwards it will be treated as a regular userland process,
+	so it need not exit.
+
+The resuming utility SHOULD NOT attempt to mount any file systems that could
+be mounted before suspend and SHOULD NOT attempt to perform any operations
+involving such file systems.
+
+For details, please refer to the source code.
diff --git a/init/do_mounts_initrd.c b/init/do_mounts_initrd.c
index a05cabd0fd10..405f9031af87 100644
--- a/init/do_mounts_initrd.c
+++ b/init/do_mounts_initrd.c
@@ -56,6 +56,7 @@ static void __init handle_initrd(void)
 	sys_chroot(".");
 	mount_devfs_fs ();
 
+	current->flags |= PF_NOFREEZE;
 	pid = kernel_thread(do_linuxrc, "/linuxrc", SIGCHLD);
 	if (pid > 0) {
 		while (pid != sys_wait4(-1, NULL, 0, NULL))
diff --git a/kernel/power/Makefile b/kernel/power/Makefile
index bb91a0615303..8d0af3d37a4b 100644
--- a/kernel/power/Makefile
+++ b/kernel/power/Makefile
@@ -5,7 +5,7 @@ endif
 
 obj-y				:= main.o process.o console.o
 obj-$(CONFIG_PM_LEGACY)		+= pm.o
-obj-$(CONFIG_SOFTWARE_SUSPEND)	+= swsusp.o disk.o snapshot.o swap.o
+obj-$(CONFIG_SOFTWARE_SUSPEND)	+= swsusp.o disk.o snapshot.o swap.o user.o
 
 obj-$(CONFIG_SUSPEND_SMP)	+= smp.o
 
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 5d1abffbb9ce..42c431c8bdde 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -8,6 +8,7 @@ struct swsusp_info {
 	int			cpus;
 	unsigned long		image_pages;
 	unsigned long		pages;
+	unsigned long		size;
 } __attribute__((aligned(PAGE_SIZE)));
 
 
@@ -65,6 +66,19 @@ extern int snapshot_read_next(struct snapshot_handle *handle, size_t count);
 extern int snapshot_write_next(struct snapshot_handle *handle, size_t count);
 int snapshot_image_loaded(struct snapshot_handle *handle);
 
+#define SNAPSHOT_IOC_MAGIC	'3'
+#define SNAPSHOT_FREEZE			_IO(SNAPSHOT_IOC_MAGIC, 1)
+#define SNAPSHOT_UNFREEZE		_IO(SNAPSHOT_IOC_MAGIC, 2)
+#define SNAPSHOT_ATOMIC_SNAPSHOT	_IOW(SNAPSHOT_IOC_MAGIC, 3, void *)
+#define SNAPSHOT_ATOMIC_RESTORE		_IO(SNAPSHOT_IOC_MAGIC, 4)
+#define SNAPSHOT_FREE			_IO(SNAPSHOT_IOC_MAGIC, 5)
+#define SNAPSHOT_SET_IMAGE_SIZE		_IOW(SNAPSHOT_IOC_MAGIC, 6, unsigned long)
+#define SNAPSHOT_AVAIL_SWAP		_IOR(SNAPSHOT_IOC_MAGIC, 7, void *)
+#define SNAPSHOT_GET_SWAP_PAGE		_IOR(SNAPSHOT_IOC_MAGIC, 8, void *)
+#define SNAPSHOT_FREE_SWAP_PAGES	_IO(SNAPSHOT_IOC_MAGIC, 9)
+#define SNAPSHOT_SET_SWAP_FILE		_IOW(SNAPSHOT_IOC_MAGIC, 10, unsigned int)
+#define SNAPSHOT_IOC_MAXNR	10
+
 /**
  *	The bitmap is used for tracing allocated swap pages
  *
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index cc349437fb72..0036955357e0 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -37,6 +37,7 @@
 struct pbe *pagedir_nosave;
 static unsigned int nr_copy_pages;
 static unsigned int nr_meta_pages;
+static unsigned long *buffer;
 
 #ifdef CONFIG_HIGHMEM
 unsigned int count_highmem_pages(void)
@@ -389,7 +390,7 @@ struct pbe *alloc_pagedir(unsigned int nr_pages, gfp_t gfp_mask, int safe_needed
 		free_pagedir(pblist);
 		pblist = NULL;
         } else
-        	create_pbe_list(pblist, nr_pages);
+		create_pbe_list(pblist, nr_pages);
 	return pblist;
 }
 
@@ -418,6 +419,7 @@ void swsusp_free(void)
 	nr_copy_pages = 0;
 	nr_meta_pages = 0;
 	pagedir_nosave = NULL;
+	buffer = NULL;
 }
 
 
@@ -523,6 +525,8 @@ static void init_header(struct swsusp_info *info)
 	info->cpus = num_online_cpus();
 	info->image_pages = nr_copy_pages;
 	info->pages = nr_copy_pages + nr_meta_pages + 1;
+	info->size = info->pages;
+	info->size <<= PAGE_SHIFT;
 }
 
 /**
@@ -568,8 +572,6 @@ static inline struct pbe *pack_orig_addresses(unsigned long *buf, struct pbe *pb
 
 int snapshot_read_next(struct snapshot_handle *handle, size_t count)
 {
-	static unsigned long *buffer;
-
 	if (handle->page > nr_meta_pages + nr_copy_pages)
 		return 0;
 	if (!buffer) {
@@ -774,7 +776,6 @@ static int create_image(struct snapshot_handle *handle)
 
 int snapshot_write_next(struct snapshot_handle *handle, size_t count)
 {
-	static unsigned long *buffer;
 	int error = 0;
 
 	if (handle->prev && handle->page > nr_meta_pages + nr_copy_pages)
diff --git a/kernel/power/user.c b/kernel/power/user.c
new file mode 100644
index 000000000000..8cabc405ca10
--- /dev/null
+++ b/kernel/power/user.c
@@ -0,0 +1,301 @@
+/*
+ * linux/kernel/power/user.c
+ *
+ * This file provides the user space interface for software suspend/resume.
+ *
+ * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl>
+ *
+ * This file is released under the GPLv2.
+ *
+ */
+
+#include <linux/suspend.h>
+#include <linux/syscalls.h>
+#include <linux/string.h>
+#include <linux/device.h>
+#include <linux/miscdevice.h>
+#include <linux/mm.h>
+#include <linux/swap.h>
+#include <linux/swapops.h>
+#include <linux/pm.h>
+#include <linux/fs.h>
+
+#include <asm/uaccess.h>
+
+#include "power.h"
+
+#define SNAPSHOT_MINOR	231
+
+static struct snapshot_data {
+	struct snapshot_handle handle;
+	int swap;
+	struct bitmap_page *bitmap;
+	int mode;
+	char frozen;
+	char ready;
+} snapshot_state;
+
+static atomic_t device_available = ATOMIC_INIT(1);
+
+static int snapshot_open(struct inode *inode, struct file *filp)
+{
+	struct snapshot_data *data;
+
+	if (!atomic_add_unless(&device_available, -1, 0))
+		return -EBUSY;
+
+	if ((filp->f_flags & O_ACCMODE) == O_RDWR)
+		return -ENOSYS;
+
+	nonseekable_open(inode, filp);
+	data = &snapshot_state;
+	filp->private_data = data;
+	memset(&data->handle, 0, sizeof(struct snapshot_handle));
+	if ((filp->f_flags & O_ACCMODE) == O_RDONLY) {
+		data->swap = swsusp_resume_device ? swap_type_of(swsusp_resume_device) : -1;
+		data->mode = O_RDONLY;
+	} else {
+		data->swap = -1;
+		data->mode = O_WRONLY;
+	}
+	data->bitmap = NULL;
+	data->frozen = 0;
+	data->ready = 0;
+
+	return 0;
+}
+
+static int snapshot_release(struct inode *inode, struct file *filp)
+{
+	struct snapshot_data *data;
+
+	swsusp_free();
+	data = filp->private_data;
+	free_all_swap_pages(data->swap, data->bitmap);
+	free_bitmap(data->bitmap);
+	if (data->frozen) {
+		down(&pm_sem);
+		thaw_processes();
+		enable_nonboot_cpus();
+		up(&pm_sem);
+	}
+	atomic_inc(&device_available);
+	return 0;
+}
+
+static ssize_t snapshot_read(struct file *filp, char __user *buf,
+                             size_t count, loff_t *offp)
+{
+	struct snapshot_data *data;
+	ssize_t res;
+
+	data = filp->private_data;
+	res = snapshot_read_next(&data->handle, count);
+	if (res > 0) {
+		if (copy_to_user(buf, data_of(data->handle), res))
+			res = -EFAULT;
+		else
+			*offp = data->handle.offset;
+	}
+	return res;
+}
+
+static ssize_t snapshot_write(struct file *filp, const char __user *buf,
+                              size_t count, loff_t *offp)
+{
+	struct snapshot_data *data;
+	ssize_t res;
+
+	data = filp->private_data;
+	res = snapshot_write_next(&data->handle, count);
+	if (res > 0) {
+		if (copy_from_user(data_of(data->handle), buf, res))
+			res = -EFAULT;
+		else
+			*offp = data->handle.offset;
+	}
+	return res;
+}
+
+static int snapshot_ioctl(struct inode *inode, struct file *filp,
+                          unsigned int cmd, unsigned long arg)
+{
+	int error = 0;
+	struct snapshot_data *data;
+	loff_t offset, avail;
+
+	if (_IOC_TYPE(cmd) != SNAPSHOT_IOC_MAGIC)
+		return -ENOTTY;
+	if (_IOC_NR(cmd) > SNAPSHOT_IOC_MAXNR)
+		return -ENOTTY;
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	data = filp->private_data;
+
+	switch (cmd) {
+
+	case SNAPSHOT_FREEZE:
+		if (data->frozen)
+			break;
+		sys_sync();
+		down(&pm_sem);
+		pm_prepare_console();
+		disable_nonboot_cpus();
+		if (freeze_processes()) {
+			thaw_processes();
+			enable_nonboot_cpus();
+			pm_restore_console();
+			error = -EBUSY;
+		}
+		up(&pm_sem);
+		if (!error)
+			data->frozen = 1;
+		break;
+
+	case SNAPSHOT_UNFREEZE:
+		if (!data->frozen)
+			break;
+		down(&pm_sem);
+		thaw_processes();
+		enable_nonboot_cpus();
+		pm_restore_console();
+		up(&pm_sem);
+		data->frozen = 0;
+		break;
+
+	case SNAPSHOT_ATOMIC_SNAPSHOT:
+		if (data->mode != O_RDONLY || !data->frozen  || data->ready) {
+			error = -EPERM;
+			break;
+		}
+		down(&pm_sem);
+		/* Free memory before shutting down devices. */
+		error = swsusp_shrink_memory();
+		if (!error) {
+			error = device_suspend(PMSG_FREEZE);
+			if (!error) {
+				in_suspend = 1;
+				error = swsusp_suspend();
+				device_resume();
+			}
+		}
+		up(&pm_sem);
+		if (!error)
+			error = put_user(in_suspend, (unsigned int __user *)arg);
+		if (!error)
+			data->ready = 1;
+		break;
+
+	case SNAPSHOT_ATOMIC_RESTORE:
+		if (data->mode != O_WRONLY || !data->frozen ||
+		    !snapshot_image_loaded(&data->handle)) {
+			error = -EPERM;
+			break;
+		}
+		down(&pm_sem);
+		pm_prepare_console();
+		error = device_suspend(PMSG_FREEZE);
+		if (!error) {
+			error = swsusp_resume();
+			device_resume();
+		}
+		pm_restore_console();
+		up(&pm_sem);
+		break;
+
+	case SNAPSHOT_FREE:
+		swsusp_free();
+		memset(&data->handle, 0, sizeof(struct snapshot_handle));
+		data->ready = 0;
+		break;
+
+	case SNAPSHOT_SET_IMAGE_SIZE:
+		image_size = arg;
+		break;
+
+	case SNAPSHOT_AVAIL_SWAP:
+		avail = count_swap_pages(data->swap, 1);
+		avail <<= PAGE_SHIFT;
+		error = put_user(avail, (loff_t __user *)arg);
+		break;
+
+	case SNAPSHOT_GET_SWAP_PAGE:
+		if (data->swap < 0 || data->swap >= MAX_SWAPFILES) {
+			error = -ENODEV;
+			break;
+		}
+		if (!data->bitmap) {
+			data->bitmap = alloc_bitmap(count_swap_pages(data->swap, 0));
+			if (!data->bitmap) {
+				error = -ENOMEM;
+				break;
+			}
+		}
+		offset = alloc_swap_page(data->swap, data->bitmap);
+		if (offset) {
+			offset <<= PAGE_SHIFT;
+			error = put_user(offset, (loff_t __user *)arg);
+		} else {
+			error = -ENOSPC;
+		}
+		break;
+
+	case SNAPSHOT_FREE_SWAP_PAGES:
+		if (data->swap < 0 || data->swap >= MAX_SWAPFILES) {
+			error = -ENODEV;
+			break;
+		}
+		free_all_swap_pages(data->swap, data->bitmap);
+		free_bitmap(data->bitmap);
+		data->bitmap = NULL;
+		break;
+
+	case SNAPSHOT_SET_SWAP_FILE:
+		if (!data->bitmap) {
+			/*
+			 * User space encodes device types as two-byte values,
+			 * so we need to recode them
+			 */
+			if (old_decode_dev(arg)) {
+				data->swap = swap_type_of(old_decode_dev(arg));
+				if (data->swap < 0)
+					error = -ENODEV;
+			} else {
+				data->swap = -1;
+				error = -EINVAL;
+			}
+		} else {
+			error = -EPERM;
+		}
+		break;
+
+	default:
+		error = -ENOTTY;
+
+	}
+
+	return error;
+}
+
+static struct file_operations snapshot_fops = {
+	.open = snapshot_open,
+	.release = snapshot_release,
+	.read = snapshot_read,
+	.write = snapshot_write,
+	.llseek = no_llseek,
+	.ioctl = snapshot_ioctl,
+};
+
+static struct miscdevice snapshot_device = {
+	.minor = SNAPSHOT_MINOR,
+	.name = "snapshot",
+	.fops = &snapshot_fops,
+};
+
+static int __init snapshot_device_init(void)
+{
+	return misc_register(&snapshot_device);
+};
+
+device_initcall(snapshot_device_init);
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 4d11f9d84666..39aa9d129612 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -428,14 +428,16 @@ int swap_type_of(dev_t device)
 {
 	int i;
 
-	if (!device)
-		return -EINVAL;
 	spin_lock(&swap_lock);
 	for (i = 0; i < nr_swapfiles; i++) {
 		struct inode *inode;
 
 		if (!(swap_info[i].flags & SWP_WRITEOK))
 			continue;
+		if (!device) {
+			spin_unlock(&swap_lock);
+			return i;
+		}
 		inode = swap_info->swap_file->f_dentry->d_inode;
 		if (S_ISBLK(inode->i_mode) &&
 		    device == MKDEV(imajor(inode), iminor(inode))) {
-- 
cgit v1.2.3


From d8733c2956968a01394a4d2a9e97a8b431a78776 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Thu, 23 Mar 2006 03:00:11 -0800
Subject: [PATCH] ext3_readdir: use generic readahead

Linus points out that ext3_readdir's readahead only cuts in when
ext3_readdir() is operating at the very start of the directory.  So for large
directories we end up performing no readahead at all and we suck.

So take it all out and use the core VM's page_cache_readahead().  This means
that ext3 directory reads will use all of readahead's dynamic sizing goop.

Note that we're using the directory's filp->f_ra to hold the readahead state,
but readahead is actually being performed against the underlying blockdev's
address_space.  Fortunately the readahead code is all set up to handle this.

Tested with printk.  It works.  I was struggling to find a real workload which
actually cared.

(The patch also exports page_cache_readahead() to GPL modules)

Cc: "Stephen C. Tweedie" <sct@redhat.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/ext3/dir.c           | 52 +++++++++++++++++++++++--------------------------
 fs/ext3/inode.c         |  2 +-
 include/linux/ext3_fs.h |  9 ++++++---
 mm/readahead.c          |  1 +
 4 files changed, 32 insertions(+), 32 deletions(-)

(limited to 'mm')

diff --git a/fs/ext3/dir.c b/fs/ext3/dir.c
index 832867aef3dc..773459164bb2 100644
--- a/fs/ext3/dir.c
+++ b/fs/ext3/dir.c
@@ -95,11 +95,10 @@ static int ext3_readdir(struct file * filp,
 			 void * dirent, filldir_t filldir)
 {
 	int error = 0;
-	unsigned long offset, blk;
-	int i, num, stored;
-	struct buffer_head * bh, * tmp, * bha[16];
-	struct ext3_dir_entry_2 * de;
-	struct super_block * sb;
+	unsigned long offset;
+	int i, stored;
+	struct ext3_dir_entry_2 *de;
+	struct super_block *sb;
 	int err;
 	struct inode *inode = filp->f_dentry->d_inode;
 	int ret = 0;
@@ -124,12 +123,29 @@ static int ext3_readdir(struct file * filp,
 	}
 #endif
 	stored = 0;
-	bh = NULL;
 	offset = filp->f_pos & (sb->s_blocksize - 1);
 
 	while (!error && !stored && filp->f_pos < inode->i_size) {
-		blk = (filp->f_pos) >> EXT3_BLOCK_SIZE_BITS(sb);
-		bh = ext3_bread(NULL, inode, blk, 0, &err);
+		unsigned long blk = filp->f_pos >> EXT3_BLOCK_SIZE_BITS(sb);
+		struct buffer_head map_bh;
+		struct buffer_head *bh = NULL;
+
+		map_bh.b_state = 0;
+		err = ext3_get_block_handle(NULL, inode, blk, &map_bh, 0, 0);
+		if (!err) {
+			page_cache_readahead(sb->s_bdev->bd_inode->i_mapping,
+				&filp->f_ra,
+				filp,
+				map_bh.b_blocknr >>
+					(PAGE_CACHE_SHIFT - inode->i_blkbits),
+				1);
+			bh = ext3_bread(NULL, inode, blk, 0, &err);
+		}
+
+		/*
+		 * We ignore I/O errors on directories so users have a chance
+		 * of recovering data when there's a bad sector
+		 */
 		if (!bh) {
 			ext3_error (sb, "ext3_readdir",
 				"directory #%lu contains a hole at offset %lu",
@@ -138,26 +154,6 @@ static int ext3_readdir(struct file * filp,
 			continue;
 		}
 
-		/*
-		 * Do the readahead
-		 */
-		if (!offset) {
-			for (i = 16 >> (EXT3_BLOCK_SIZE_BITS(sb) - 9), num = 0;
-			     i > 0; i--) {
-				tmp = ext3_getblk (NULL, inode, ++blk, 0, &err);
-				if (tmp && !buffer_uptodate(tmp) &&
-						!buffer_locked(tmp))
-					bha[num++] = tmp;
-				else
-					brelse (tmp);
-			}
-			if (num) {
-				ll_rw_block (READA, num, bha);
-				for (i = 0; i < num; i++)
-					brelse (bha[i]);
-			}
-		}
-
 revalidate:
 		/* If the dir block has changed since the last call to
 		 * readdir(2), then we might be pointing to an invalid
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index 0384e539b88f..d59d5a667b0b 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -671,7 +671,7 @@ err_out:
  * The BKL may not be held on entry here.  Be sure to take it early.
  */
 
-static int
+int
 ext3_get_block_handle(handle_t *handle, struct inode *inode, sector_t iblock,
 		struct buffer_head *bh_result, int create, int extend_disksize)
 {
diff --git a/include/linux/ext3_fs.h b/include/linux/ext3_fs.h
index c0272d73ab20..e7239f2f97a1 100644
--- a/include/linux/ext3_fs.h
+++ b/include/linux/ext3_fs.h
@@ -772,9 +772,12 @@ extern unsigned long ext3_count_free (struct buffer_head *, unsigned);
 
 
 /* inode.c */
-extern int ext3_forget(handle_t *, int, struct inode *, struct buffer_head *, int);
-extern struct buffer_head * ext3_getblk (handle_t *, struct inode *, long, int, int *);
-extern struct buffer_head * ext3_bread (handle_t *, struct inode *, int, int, int *);
+int ext3_forget(handle_t *, int, struct inode *, struct buffer_head *, int);
+struct buffer_head * ext3_getblk (handle_t *, struct inode *, long, int, int *);
+struct buffer_head * ext3_bread (handle_t *, struct inode *, int, int, int *);
+int ext3_get_block_handle(handle_t *handle, struct inode *inode,
+	sector_t iblock, struct buffer_head *bh_result, int create,
+	int extend_disksize);
 
 extern void ext3_read_inode (struct inode *);
 extern int  ext3_write_inode (struct inode *, int);
diff --git a/mm/readahead.c b/mm/readahead.c
index 301b36c4a0ce..0f142a40984b 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -555,6 +555,7 @@ recheck:
 out:
 	return ra->prev_page + 1;
 }
+EXPORT_SYMBOL_GPL(page_cache_readahead);
 
 /*
  * handle_ra_miss() is called when it is known that a page which should have
-- 
cgit v1.2.3


From 2056a782f8e7e65fd4bfd027506b4ce1c5e9ccd4 Mon Sep 17 00:00:00 2001
From: Jens Axboe <axboe@suse.de>
Date: Thu, 23 Mar 2006 20:00:26 +0100
Subject: [PATCH] Block queue IO tracing support (blktrace) as of 2006-03-23

Signed-off-by: Jens Axboe <axboe@suse.de>
---
 block/Kconfig                |  12 +
 block/Makefile               |   2 +
 block/blktrace.c             | 538 +++++++++++++++++++++++++++++++++++++++++++
 block/elevator.c             |   4 +
 block/ioctl.c                |   6 +
 block/ll_rw_blk.c            |  44 +++-
 drivers/block/cciss.c        |   2 +
 drivers/md/dm.c              |  13 +-
 fs/bio.c                     |   4 +
 fs/compat_ioctl.c            |   1 +
 include/linux/blkdev.h       |   3 +
 include/linux/blktrace_api.h | 277 ++++++++++++++++++++++
 include/linux/compat_ioctl.h |   4 +
 include/linux/fs.h           |   4 +
 include/linux/sched.h        |   1 +
 kernel/fork.c                |   1 +
 mm/highmem.c                 |   3 +
 17 files changed, 916 insertions(+), 3 deletions(-)
 create mode 100644 block/blktrace.c
 create mode 100644 include/linux/blktrace_api.h

(limited to 'mm')

diff --git a/block/Kconfig b/block/Kconfig
index 377f6dd20e17..96783645092d 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -11,4 +11,16 @@ config LBD
 	  your machine, or if you want to have a raid or loopback device
 	  bigger than 2TB.  Otherwise say N.
 
+config BLK_DEV_IO_TRACE
+	bool "Support for tracing block io actions"
+	select RELAY
+	select DEBUG_FS
+	help
+	  Say Y here, if you want to be able to trace the block layer actions
+	  on a given queue. Tracing allows you to see any traffic happening
+	  on a block device queue. For more information (and the user space
+	  support tools needed), fetch the blktrace app from:
+
+	  git://brick.kernel.dk/data/git/blktrace.git
+
 source block/Kconfig.iosched
diff --git a/block/Makefile b/block/Makefile
index 7e4f93e2b44e..c05de0e0037f 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -8,3 +8,5 @@ obj-$(CONFIG_IOSCHED_NOOP)	+= noop-iosched.o
 obj-$(CONFIG_IOSCHED_AS)	+= as-iosched.o
 obj-$(CONFIG_IOSCHED_DEADLINE)	+= deadline-iosched.o
 obj-$(CONFIG_IOSCHED_CFQ)	+= cfq-iosched.o
+
+obj-$(CONFIG_BLK_DEV_IO_TRACE)	+= blktrace.o
diff --git a/block/blktrace.c b/block/blktrace.c
new file mode 100644
index 000000000000..36f3a172275f
--- /dev/null
+++ b/block/blktrace.c
@@ -0,0 +1,538 @@
+/*
+ * Copyright (C) 2006 Jens Axboe <axboe@suse.de>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+#include <linux/config.h>
+#include <linux/kernel.h>
+#include <linux/blkdev.h>
+#include <linux/blktrace_api.h>
+#include <linux/percpu.h>
+#include <linux/init.h>
+#include <linux/mutex.h>
+#include <linux/debugfs.h>
+#include <asm/uaccess.h>
+
+static DEFINE_PER_CPU(unsigned long long, blk_trace_cpu_offset) = { 0, };
+static unsigned int blktrace_seq __read_mostly = 1;
+
+/*
+ * Send out a notify for this process, if we haven't done so since a trace
+ * started
+ */
+static void trace_note_tsk(struct blk_trace *bt, struct task_struct *tsk)
+{
+	struct blk_io_trace *t;
+
+	t = relay_reserve(bt->rchan, sizeof(*t) + sizeof(tsk->comm));
+	if (t) {
+		t->magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION;
+		t->device = bt->dev;
+		t->action = BLK_TC_ACT(BLK_TC_NOTIFY);
+		t->pid = tsk->pid;
+		t->cpu = smp_processor_id();
+		t->pdu_len = sizeof(tsk->comm);
+		memcpy((void *) t + sizeof(*t), tsk->comm, t->pdu_len);
+		tsk->btrace_seq = blktrace_seq;
+	}
+}
+
+static int act_log_check(struct blk_trace *bt, u32 what, sector_t sector,
+			 pid_t pid)
+{
+	if (((bt->act_mask << BLK_TC_SHIFT) & what) == 0)
+		return 1;
+	if (sector < bt->start_lba || sector > bt->end_lba)
+		return 1;
+	if (bt->pid && pid != bt->pid)
+		return 1;
+
+	return 0;
+}
+
+/*
+ * Data direction bit lookup
+ */
+static u32 ddir_act[2] __read_mostly = { BLK_TC_ACT(BLK_TC_READ), BLK_TC_ACT(BLK_TC_WRITE) };
+
+/*
+ * Bio action bits of interest
+ */
+static u32 bio_act[3] __read_mostly = { 0, BLK_TC_ACT(BLK_TC_BARRIER), BLK_TC_ACT(BLK_TC_SYNC) };
+
+/*
+ * More could be added as needed, taking care to increment the decrementer
+ * to get correct indexing
+ */
+#define trace_barrier_bit(rw)	\
+	(((rw) & (1 << BIO_RW_BARRIER)) >> (BIO_RW_BARRIER - 0))
+#define trace_sync_bit(rw)	\
+	(((rw) & (1 << BIO_RW_SYNC)) >> (BIO_RW_SYNC - 1))
+
+/*
+ * The worker for the various blk_add_trace*() types. Fills out a
+ * blk_io_trace structure and places it in a per-cpu subbuffer.
+ */
+void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
+		     int rw, u32 what, int error, int pdu_len, void *pdu_data)
+{
+	struct task_struct *tsk = current;
+	struct blk_io_trace *t;
+	unsigned long flags;
+	unsigned long *sequence;
+	pid_t pid;
+	int cpu;
+
+	if (unlikely(bt->trace_state != Blktrace_running))
+		return;
+
+	what |= ddir_act[rw & WRITE];
+	what |= bio_act[trace_barrier_bit(rw)];
+	what |= bio_act[trace_sync_bit(rw)];
+
+	pid = tsk->pid;
+	if (unlikely(act_log_check(bt, what, sector, pid)))
+		return;
+
+	/*
+	 * A word about the locking here - we disable interrupts to reserve
+	 * some space in the relay per-cpu buffer, to prevent an irq
+	 * from coming in and stepping on our toes. Once reserved, it's
+	 * enough to get preemption disabled to prevent read of this data
+	 * before we are through filling it. get_cpu()/put_cpu() does this
+	 * for us
+	 */
+	local_irq_save(flags);
+
+	if (unlikely(tsk->btrace_seq != blktrace_seq))
+		trace_note_tsk(bt, tsk);
+
+	t = relay_reserve(bt->rchan, sizeof(*t) + pdu_len);
+	if (t) {
+		cpu = smp_processor_id();
+		sequence = per_cpu_ptr(bt->sequence, cpu);
+
+		t->magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION;
+		t->sequence = ++(*sequence);
+		t->time = sched_clock() - per_cpu(blk_trace_cpu_offset, cpu);
+		t->sector = sector;
+		t->bytes = bytes;
+		t->action = what;
+		t->pid = pid;
+		t->device = bt->dev;
+		t->cpu = cpu;
+		t->error = error;
+		t->pdu_len = pdu_len;
+
+		if (pdu_len)
+			memcpy((void *) t + sizeof(*t), pdu_data, pdu_len);
+	}
+
+	local_irq_restore(flags);
+}
+
+EXPORT_SYMBOL_GPL(__blk_add_trace);
+
+static struct dentry *blk_tree_root;
+static struct mutex blk_tree_mutex;
+static unsigned int root_users;
+
+static inline void blk_remove_root(void)
+{
+	if (blk_tree_root) {
+		debugfs_remove(blk_tree_root);
+		blk_tree_root = NULL;
+	}
+}
+
+static void blk_remove_tree(struct dentry *dir)
+{
+	mutex_lock(&blk_tree_mutex);
+	debugfs_remove(dir);
+	if (--root_users == 0)
+		blk_remove_root();
+	mutex_unlock(&blk_tree_mutex);
+}
+
+static struct dentry *blk_create_tree(const char *blk_name)
+{
+	struct dentry *dir = NULL;
+
+	mutex_lock(&blk_tree_mutex);
+
+	if (!blk_tree_root) {
+		blk_tree_root = debugfs_create_dir("block", NULL);
+		if (!blk_tree_root)
+			goto err;
+	}
+
+	dir = debugfs_create_dir(blk_name, blk_tree_root);
+	if (dir)
+		root_users++;
+	else
+		blk_remove_root();
+
+err:
+	mutex_unlock(&blk_tree_mutex);
+	return dir;
+}
+
+static void blk_trace_cleanup(struct blk_trace *bt)
+{
+	relay_close(bt->rchan);
+	debugfs_remove(bt->dropped_file);
+	blk_remove_tree(bt->dir);
+	free_percpu(bt->sequence);
+	kfree(bt);
+}
+
+static int blk_trace_remove(request_queue_t *q)
+{
+	struct blk_trace *bt;
+
+	bt = xchg(&q->blk_trace, NULL);
+	if (!bt)
+		return -EINVAL;
+
+	if (bt->trace_state == Blktrace_setup ||
+	    bt->trace_state == Blktrace_stopped)
+		blk_trace_cleanup(bt);
+
+	return 0;
+}
+
+static int blk_dropped_open(struct inode *inode, struct file *filp)
+{
+	filp->private_data = inode->u.generic_ip;
+
+	return 0;
+}
+
+static ssize_t blk_dropped_read(struct file *filp, char __user *buffer,
+				size_t count, loff_t *ppos)
+{
+	struct blk_trace *bt = filp->private_data;
+	char buf[16];
+
+	snprintf(buf, sizeof(buf), "%u\n", atomic_read(&bt->dropped));
+
+	return simple_read_from_buffer(buffer, count, ppos, buf, strlen(buf));
+}
+
+static struct file_operations blk_dropped_fops = {
+	.owner =	THIS_MODULE,
+	.open =		blk_dropped_open,
+	.read =		blk_dropped_read,
+};
+
+/*
+ * Keep track of how many times we encountered a full subbuffer, to aid
+ * the user space app in telling how many lost events there were.
+ */
+static int blk_subbuf_start_callback(struct rchan_buf *buf, void *subbuf,
+				     void *prev_subbuf, size_t prev_padding)
+{
+	struct blk_trace *bt;
+
+	if (!relay_buf_full(buf))
+		return 1;
+
+	bt = buf->chan->private_data;
+	atomic_inc(&bt->dropped);
+	return 0;
+}
+
+static int blk_remove_buf_file_callback(struct dentry *dentry)
+{
+	debugfs_remove(dentry);
+	return 0;
+}
+
+static struct dentry *blk_create_buf_file_callback(const char *filename,
+						   struct dentry *parent,
+						   int mode,
+						   struct rchan_buf *buf,
+						   int *is_global)
+{
+	return debugfs_create_file(filename, mode, parent, buf,
+					&relay_file_operations);
+}
+
+static struct rchan_callbacks blk_relay_callbacks = {
+	.subbuf_start		= blk_subbuf_start_callback,
+	.create_buf_file	= blk_create_buf_file_callback,
+	.remove_buf_file	= blk_remove_buf_file_callback,
+};
+
+/*
+ * Setup everything required to start tracing
+ */
+static int blk_trace_setup(request_queue_t *q, struct block_device *bdev,
+			   char __user *arg)
+{
+	struct blk_user_trace_setup buts;
+	struct blk_trace *old_bt, *bt = NULL;
+	struct dentry *dir = NULL;
+	char b[BDEVNAME_SIZE];
+	int ret, i;
+
+	if (copy_from_user(&buts, arg, sizeof(buts)))
+		return -EFAULT;
+
+	if (!buts.buf_size || !buts.buf_nr)
+		return -EINVAL;
+
+	strcpy(buts.name, bdevname(bdev, b));
+
+	/*
+	 * some device names have larger paths - convert the slashes
+	 * to underscores for this to work as expected
+	 */
+	for (i = 0; i < strlen(buts.name); i++)
+		if (buts.name[i] == '/')
+			buts.name[i] = '_';
+
+	if (copy_to_user(arg, &buts, sizeof(buts)))
+		return -EFAULT;
+
+	ret = -ENOMEM;
+	bt = kzalloc(sizeof(*bt), GFP_KERNEL);
+	if (!bt)
+		goto err;
+
+	bt->sequence = alloc_percpu(unsigned long);
+	if (!bt->sequence)
+		goto err;
+
+	ret = -ENOENT;
+	dir = blk_create_tree(buts.name);
+	if (!dir)
+		goto err;
+
+	bt->dir = dir;
+	bt->dev = bdev->bd_dev;
+	atomic_set(&bt->dropped, 0);
+
+	ret = -EIO;
+	bt->dropped_file = debugfs_create_file("dropped", 0444, dir, bt, &blk_dropped_fops);
+	if (!bt->dropped_file)
+		goto err;
+
+	bt->rchan = relay_open("trace", dir, buts.buf_size, buts.buf_nr, &blk_relay_callbacks);
+	if (!bt->rchan)
+		goto err;
+	bt->rchan->private_data = bt;
+
+	bt->act_mask = buts.act_mask;
+	if (!bt->act_mask)
+		bt->act_mask = (u16) -1;
+
+	bt->start_lba = buts.start_lba;
+	bt->end_lba = buts.end_lba;
+	if (!bt->end_lba)
+		bt->end_lba = -1ULL;
+
+	bt->pid = buts.pid;
+	bt->trace_state = Blktrace_setup;
+
+	ret = -EBUSY;
+	old_bt = xchg(&q->blk_trace, bt);
+	if (old_bt) {
+		(void) xchg(&q->blk_trace, old_bt);
+		goto err;
+	}
+
+	return 0;
+err:
+	if (dir)
+		blk_remove_tree(dir);
+	if (bt) {
+		if (bt->dropped_file)
+			debugfs_remove(bt->dropped_file);
+		if (bt->sequence)
+			free_percpu(bt->sequence);
+		if (bt->rchan)
+			relay_close(bt->rchan);
+		kfree(bt);
+	}
+	return ret;
+}
+
+static int blk_trace_startstop(request_queue_t *q, int start)
+{
+	struct blk_trace *bt;
+	int ret;
+
+	if ((bt = q->blk_trace) == NULL)
+		return -EINVAL;
+
+	/*
+	 * For starting a trace, we can transition from a setup or stopped
+	 * trace. For stopping a trace, the state must be running
+	 */
+	ret = -EINVAL;
+	if (start) {
+		if (bt->trace_state == Blktrace_setup ||
+		    bt->trace_state == Blktrace_stopped) {
+			blktrace_seq++;
+			smp_mb();
+			bt->trace_state = Blktrace_running;
+			ret = 0;
+		}
+	} else {
+		if (bt->trace_state == Blktrace_running) {
+			bt->trace_state = Blktrace_stopped;
+			relay_flush(bt->rchan);
+			ret = 0;
+		}
+	}
+
+	return ret;
+}
+
+/**
+ * blk_trace_ioctl: - handle the ioctls associated with tracing
+ * @bdev:	the block device
+ * @cmd: 	the ioctl cmd
+ * @arg:	the argument data, if any
+ *
+ **/
+int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg)
+{
+	request_queue_t *q;
+	int ret, start = 0;
+
+	q = bdev_get_queue(bdev);
+	if (!q)
+		return -ENXIO;
+
+	mutex_lock(&bdev->bd_mutex);
+
+	switch (cmd) {
+	case BLKTRACESETUP:
+		ret = blk_trace_setup(q, bdev, arg);
+		break;
+	case BLKTRACESTART:
+		start = 1;
+	case BLKTRACESTOP:
+		ret = blk_trace_startstop(q, start);
+		break;
+	case BLKTRACETEARDOWN:
+		ret = blk_trace_remove(q);
+		break;
+	default:
+		ret = -ENOTTY;
+		break;
+	}
+
+	mutex_unlock(&bdev->bd_mutex);
+	return ret;
+}
+
+/**
+ * blk_trace_shutdown: - stop and cleanup trace structures
+ * @q:    the request queue associated with the device
+ *
+ **/
+void blk_trace_shutdown(request_queue_t *q)
+{
+	blk_trace_startstop(q, 0);
+	blk_trace_remove(q);
+}
+
+/*
+ * Average offset over two calls to sched_clock() with a gettimeofday()
+ * in the middle
+ */
+static void blk_check_time(unsigned long long *t)
+{
+	unsigned long long a, b;
+	struct timeval tv;
+
+	a = sched_clock();
+	do_gettimeofday(&tv);
+	b = sched_clock();
+
+	*t = tv.tv_sec * 1000000000 + tv.tv_usec * 1000;
+	*t -= (a + b) / 2;
+}
+
+static void blk_trace_check_cpu_time(void *data)
+{
+	unsigned long long *t;
+	int cpu = get_cpu();
+
+	t = &per_cpu(blk_trace_cpu_offset, cpu);
+
+	/*
+	 * Just call it twice, hopefully the second call will be cache hot
+	 * and a little more precise
+	 */
+	blk_check_time(t);
+	blk_check_time(t);
+
+	put_cpu();
+}
+
+/*
+ * Call blk_trace_check_cpu_time() on each CPU to calibrate our inter-CPU
+ * timings
+ */
+static void blk_trace_calibrate_offsets(void)
+{
+	unsigned long flags;
+
+	smp_call_function(blk_trace_check_cpu_time, NULL, 1, 1);
+	local_irq_save(flags);
+	blk_trace_check_cpu_time(NULL);
+	local_irq_restore(flags);
+}
+
+static void blk_trace_set_ht_offsets(void)
+{
+#if defined(CONFIG_SCHED_SMT)
+	int cpu, i;
+
+	/*
+	 * now make sure HT siblings have the same time offset
+	 */
+	preempt_disable();
+	for_each_online_cpu(cpu) {
+		unsigned long long *cpu_off, *sibling_off;
+
+		for_each_cpu_mask(i, cpu_sibling_map[cpu]) {
+			if (i == cpu)
+				continue;
+
+			cpu_off = &per_cpu(blk_trace_cpu_offset, cpu);
+			sibling_off = &per_cpu(blk_trace_cpu_offset, i);
+			*sibling_off = *cpu_off;
+		}
+	}
+	preempt_enable();
+#endif
+}
+
+static __init int blk_trace_init(void)
+{
+	mutex_init(&blk_tree_mutex);
+	blk_trace_calibrate_offsets();
+	blk_trace_set_ht_offsets();
+
+	return 0;
+}
+
+module_init(blk_trace_init);
+
diff --git a/block/elevator.c b/block/elevator.c
index db3d0d8296a0..5e558c4689a4 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -33,6 +33,7 @@
 #include <linux/init.h>
 #include <linux/compiler.h>
 #include <linux/delay.h>
+#include <linux/blktrace_api.h>
 
 #include <asm/uaccess.h>
 
@@ -333,6 +334,8 @@ void elv_insert(request_queue_t *q, struct request *rq, int where)
 	struct list_head *pos;
 	unsigned ordseq;
 
+	blk_add_trace_rq(q, rq, BLK_TA_INSERT);
+
 	rq->q = q;
 
 	switch (where) {
@@ -499,6 +502,7 @@ struct request *elv_next_request(request_queue_t *q)
 			 * not be passed by new incoming requests
 			 */
 			rq->flags |= REQ_STARTED;
+			blk_add_trace_rq(q, rq, BLK_TA_ISSUE);
 		}
 
 		if (!q->boundary_rq || q->boundary_rq == rq) {
diff --git a/block/ioctl.c b/block/ioctl.c
index 35fdb7dc6512..9cfa2e1ecb24 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -5,6 +5,7 @@
 #include <linux/backing-dev.h>
 #include <linux/buffer_head.h>
 #include <linux/smp_lock.h>
+#include <linux/blktrace_api.h>
 #include <asm/uaccess.h>
 
 static int blkpg_ioctl(struct block_device *bdev, struct blkpg_ioctl_arg __user *arg)
@@ -189,6 +190,11 @@ static int blkdev_locked_ioctl(struct file *file, struct block_device *bdev,
 		return put_ulong(arg, bdev->bd_inode->i_size >> 9);
 	case BLKGETSIZE64:
 		return put_u64(arg, bdev->bd_inode->i_size);
+	case BLKTRACESTART:
+	case BLKTRACESTOP:
+	case BLKTRACESETUP:
+	case BLKTRACETEARDOWN:
+		return blk_trace_ioctl(bdev, cmd, (char __user *) arg);
 	}
 	return -ENOIOCTLCMD;
 }
diff --git a/block/ll_rw_blk.c b/block/ll_rw_blk.c
index 6c793b196aa9..062067fa7ead 100644
--- a/block/ll_rw_blk.c
+++ b/block/ll_rw_blk.c
@@ -28,6 +28,7 @@
 #include <linux/writeback.h>
 #include <linux/interrupt.h>
 #include <linux/cpu.h>
+#include <linux/blktrace_api.h>
 
 /*
  * for max sense size
@@ -1556,8 +1557,10 @@ void blk_plug_device(request_queue_t *q)
 	if (test_bit(QUEUE_FLAG_STOPPED, &q->queue_flags))
 		return;
 
-	if (!test_and_set_bit(QUEUE_FLAG_PLUGGED, &q->queue_flags))
+	if (!test_and_set_bit(QUEUE_FLAG_PLUGGED, &q->queue_flags)) {
 		mod_timer(&q->unplug_timer, jiffies + q->unplug_delay);
+		blk_add_trace_generic(q, NULL, 0, BLK_TA_PLUG);
+	}
 }
 
 EXPORT_SYMBOL(blk_plug_device);
@@ -1621,14 +1624,21 @@ static void blk_backing_dev_unplug(struct backing_dev_info *bdi,
 	/*
 	 * devices don't necessarily have an ->unplug_fn defined
 	 */
-	if (q->unplug_fn)
+	if (q->unplug_fn) {
+		blk_add_trace_pdu_int(q, BLK_TA_UNPLUG_IO, NULL,
+					q->rq.count[READ] + q->rq.count[WRITE]);
+
 		q->unplug_fn(q);
+	}
 }
 
 static void blk_unplug_work(void *data)
 {
 	request_queue_t *q = data;
 
+	blk_add_trace_pdu_int(q, BLK_TA_UNPLUG_IO, NULL,
+				q->rq.count[READ] + q->rq.count[WRITE]);
+
 	q->unplug_fn(q);
 }
 
@@ -1636,6 +1646,9 @@ static void blk_unplug_timeout(unsigned long data)
 {
 	request_queue_t *q = (request_queue_t *)data;
 
+	blk_add_trace_pdu_int(q, BLK_TA_UNPLUG_TIMER, NULL,
+				q->rq.count[READ] + q->rq.count[WRITE]);
+
 	kblockd_schedule_work(&q->unplug_work);
 }
 
@@ -1753,6 +1766,9 @@ static void blk_release_queue(struct kobject *kobj)
 	if (q->queue_tags)
 		__blk_queue_free_tags(q);
 
+	if (q->blk_trace)
+		blk_trace_shutdown(q);
+
 	kmem_cache_free(requestq_cachep, q);
 }
 
@@ -2129,6 +2145,8 @@ rq_starved:
 	
 	rq_init(q, rq);
 	rq->rl = rl;
+
+	blk_add_trace_generic(q, bio, rw, BLK_TA_GETRQ);
 out:
 	return rq;
 }
@@ -2157,6 +2175,8 @@ static struct request *get_request_wait(request_queue_t *q, int rw,
 		if (!rq) {
 			struct io_context *ioc;
 
+			blk_add_trace_generic(q, bio, rw, BLK_TA_SLEEPRQ);
+
 			__generic_unplug_device(q);
 			spin_unlock_irq(q->queue_lock);
 			io_schedule();
@@ -2210,6 +2230,8 @@ EXPORT_SYMBOL(blk_get_request);
  */
 void blk_requeue_request(request_queue_t *q, struct request *rq)
 {
+	blk_add_trace_rq(q, rq, BLK_TA_REQUEUE);
+
 	if (blk_rq_tagged(rq))
 		blk_queue_end_tag(q, rq);
 
@@ -2844,6 +2866,8 @@ static int __make_request(request_queue_t *q, struct bio *bio)
 			if (!q->back_merge_fn(q, req, bio))
 				break;
 
+			blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE);
+
 			req->biotail->bi_next = bio;
 			req->biotail = bio;
 			req->nr_sectors = req->hard_nr_sectors += nr_sectors;
@@ -2859,6 +2883,8 @@ static int __make_request(request_queue_t *q, struct bio *bio)
 			if (!q->front_merge_fn(q, req, bio))
 				break;
 
+			blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE);
+
 			bio->bi_next = req->bio;
 			req->bio = bio;
 
@@ -2976,6 +3002,7 @@ void generic_make_request(struct bio *bio)
 	request_queue_t *q;
 	sector_t maxsector;
 	int ret, nr_sectors = bio_sectors(bio);
+	dev_t old_dev;
 
 	might_sleep();
 	/* Test device or partition size, when known. */
@@ -3002,6 +3029,8 @@ void generic_make_request(struct bio *bio)
 	 * NOTE: we don't repeat the blk_size check for each new device.
 	 * Stacking drivers are expected to know what they are doing.
 	 */
+	maxsector = -1;
+	old_dev = 0;
 	do {
 		char b[BDEVNAME_SIZE];
 
@@ -3034,6 +3063,15 @@ end_io:
 		 */
 		blk_partition_remap(bio);
 
+		if (maxsector != -1)
+			blk_add_trace_remap(q, bio, old_dev, bio->bi_sector, 
+					    maxsector);
+
+		blk_add_trace_bio(q, bio, BLK_TA_QUEUE);
+
+		maxsector = bio->bi_sector;
+		old_dev = bio->bi_bdev->bd_dev;
+
 		ret = q->make_request_fn(q, bio);
 	} while (ret);
 }
@@ -3153,6 +3191,8 @@ static int __end_that_request_first(struct request *req, int uptodate,
 	int total_bytes, bio_nbytes, error, next_idx = 0;
 	struct bio *bio;
 
+	blk_add_trace_rq(req->q, req, BLK_TA_COMPLETE);
+
 	/*
 	 * extend uptodate bool to allow < 0 value to be direct io error
 	 */
diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c
index e29b8926f80e..1f2890989b56 100644
--- a/drivers/block/cciss.c
+++ b/drivers/block/cciss.c
@@ -38,6 +38,7 @@
 #include <linux/hdreg.h>
 #include <linux/spinlock.h>
 #include <linux/compat.h>
+#include <linux/blktrace_api.h>
 #include <asm/uaccess.h>
 #include <asm/io.h>
 
@@ -2331,6 +2332,7 @@ static inline void complete_command( ctlr_info_t *h, CommandList_struct *cmd,
 
 	cmd->rq->completion_data = cmd;
 	cmd->rq->errors = status;
+	blk_add_trace_rq(cmd->rq->q, cmd->rq, BLK_TA_COMPLETE);
 	blk_complete_request(cmd->rq);
 }
 
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 26b08ee425c7..8c82373f7ff3 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -17,6 +17,7 @@
 #include <linux/mempool.h>
 #include <linux/slab.h>
 #include <linux/idr.h>
+#include <linux/blktrace_api.h>
 
 static const char *_name = DM_NAME;
 
@@ -334,6 +335,8 @@ static void dec_pending(struct dm_io *io, int error)
 			/* nudge anyone waiting on suspend queue */
 			wake_up(&io->md->wait);
 
+		blk_add_trace_bio(io->md->queue, io->bio, BLK_TA_COMPLETE);
+
 		bio_endio(io->bio, io->bio->bi_size, io->error);
 		free_io(io->md, io);
 	}
@@ -392,6 +395,7 @@ static void __map_bio(struct dm_target *ti, struct bio *clone,
 		      struct target_io *tio)
 {
 	int r;
+	sector_t sector;
 
 	/*
 	 * Sanity checks.
@@ -407,10 +411,17 @@ static void __map_bio(struct dm_target *ti, struct bio *clone,
 	 * this io.
 	 */
 	atomic_inc(&tio->io->io_count);
+	sector = clone->bi_sector;
 	r = ti->type->map(ti, clone, &tio->info);
-	if (r > 0)
+	if (r > 0) {
 		/* the bio has been remapped so dispatch it */
+
+		blk_add_trace_remap(bdev_get_queue(clone->bi_bdev), clone, 
+				    tio->io->bio->bi_bdev->bd_dev, sector, 
+				    clone->bi_sector);
+
 		generic_make_request(clone);
+	}
 
 	else if (r < 0) {
 		/* error the io and bail out */
diff --git a/fs/bio.c b/fs/bio.c
index 8f1d2e815c96..0a8c59cb68f5 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -25,6 +25,7 @@
 #include <linux/module.h>
 #include <linux/mempool.h>
 #include <linux/workqueue.h>
+#include <linux/blktrace_api.h>
 #include <scsi/sg.h>		/* for struct sg_iovec */
 
 #define BIO_POOL_SIZE 256
@@ -1095,6 +1096,9 @@ struct bio_pair *bio_split(struct bio *bi, mempool_t *pool, int first_sectors)
 	if (!bp)
 		return bp;
 
+	blk_add_trace_pdu_int(bdev_get_queue(bi->bi_bdev), BLK_TA_SPLIT, bi,
+				bi->bi_sector + first_sectors);
+
 	BUG_ON(bi->bi_vcnt != 1);
 	BUG_ON(bi->bi_idx != 0);
 	atomic_set(&bp->cnt, 3);
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index c666769a875d..7c031f00fd79 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -72,6 +72,7 @@
 #include <linux/i2c-dev.h>
 #include <linux/wireless.h>
 #include <linux/atalk.h>
+#include <linux/blktrace_api.h>
 
 #include <net/sock.h>          /* siocdevprivate_ioctl */
 #include <net/bluetooth/bluetooth.h>
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 56bb6a4e15f3..c179966f1a2f 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -22,6 +22,7 @@ typedef struct request_queue request_queue_t;
 struct elevator_queue;
 typedef struct elevator_queue elevator_t;
 struct request_pm_state;
+struct blk_trace;
 
 #define BLKDEV_MIN_RQ	4
 #define BLKDEV_MAX_RQ	128	/* Default maximum */
@@ -416,6 +417,8 @@ struct request_queue
 	unsigned int		sg_reserved_size;
 	int			node;
 
+	struct blk_trace	*blk_trace;
+
 	/*
 	 * reserved for flush operations
 	 */
diff --git a/include/linux/blktrace_api.h b/include/linux/blktrace_api.h
new file mode 100644
index 000000000000..b34d3e73d5ea
--- /dev/null
+++ b/include/linux/blktrace_api.h
@@ -0,0 +1,277 @@
+#ifndef BLKTRACE_H
+#define BLKTRACE_H
+
+#include <linux/config.h>
+#include <linux/blkdev.h>
+#include <linux/relay.h>
+
+/*
+ * Trace categories
+ */
+enum blktrace_cat {
+	BLK_TC_READ	= 1 << 0,	/* reads */
+	BLK_TC_WRITE	= 1 << 1,	/* writes */
+	BLK_TC_BARRIER	= 1 << 2,	/* barrier */
+	BLK_TC_SYNC	= 1 << 3,	/* barrier */
+	BLK_TC_QUEUE	= 1 << 4,	/* queueing/merging */
+	BLK_TC_REQUEUE	= 1 << 5,	/* requeueing */
+	BLK_TC_ISSUE	= 1 << 6,	/* issue */
+	BLK_TC_COMPLETE	= 1 << 7,	/* completions */
+	BLK_TC_FS	= 1 << 8,	/* fs requests */
+	BLK_TC_PC	= 1 << 9,	/* pc requests */
+	BLK_TC_NOTIFY	= 1 << 10,	/* special message */
+
+	BLK_TC_END	= 1 << 15,	/* only 16-bits, reminder */
+};
+
+#define BLK_TC_SHIFT		(16)
+#define BLK_TC_ACT(act)		((act) << BLK_TC_SHIFT)
+
+/*
+ * Basic trace actions
+ */
+enum blktrace_act {
+	__BLK_TA_QUEUE = 1,		/* queued */
+	__BLK_TA_BACKMERGE,		/* back merged to existing rq */
+	__BLK_TA_FRONTMERGE,		/* front merge to existing rq */
+	__BLK_TA_GETRQ,			/* allocated new request */
+	__BLK_TA_SLEEPRQ,		/* sleeping on rq allocation */
+	__BLK_TA_REQUEUE,		/* request requeued */
+	__BLK_TA_ISSUE,			/* sent to driver */
+	__BLK_TA_COMPLETE,		/* completed by driver */
+	__BLK_TA_PLUG,			/* queue was plugged */
+	__BLK_TA_UNPLUG_IO,		/* queue was unplugged by io */
+	__BLK_TA_UNPLUG_TIMER,		/* queue was unplugged by timer */
+	__BLK_TA_INSERT,		/* insert request */
+	__BLK_TA_SPLIT,			/* bio was split */
+	__BLK_TA_BOUNCE,		/* bio was bounced */
+	__BLK_TA_REMAP,			/* bio was remapped */
+};
+
+/*
+ * Trace actions in full. Additionally, read or write is masked
+ */
+#define BLK_TA_QUEUE		(__BLK_TA_QUEUE | BLK_TC_ACT(BLK_TC_QUEUE))
+#define BLK_TA_BACKMERGE	(__BLK_TA_BACKMERGE | BLK_TC_ACT(BLK_TC_QUEUE))
+#define BLK_TA_FRONTMERGE	(__BLK_TA_FRONTMERGE | BLK_TC_ACT(BLK_TC_QUEUE))
+#define	BLK_TA_GETRQ		(__BLK_TA_GETRQ | BLK_TC_ACT(BLK_TC_QUEUE))
+#define	BLK_TA_SLEEPRQ		(__BLK_TA_SLEEPRQ | BLK_TC_ACT(BLK_TC_QUEUE))
+#define	BLK_TA_REQUEUE		(__BLK_TA_REQUEUE | BLK_TC_ACT(BLK_TC_REQUEUE))
+#define BLK_TA_ISSUE		(__BLK_TA_ISSUE | BLK_TC_ACT(BLK_TC_ISSUE))
+#define BLK_TA_COMPLETE		(__BLK_TA_COMPLETE| BLK_TC_ACT(BLK_TC_COMPLETE))
+#define BLK_TA_PLUG		(__BLK_TA_PLUG | BLK_TC_ACT(BLK_TC_QUEUE))
+#define BLK_TA_UNPLUG_IO	(__BLK_TA_UNPLUG_IO | BLK_TC_ACT(BLK_TC_QUEUE))
+#define BLK_TA_UNPLUG_TIMER	(__BLK_TA_UNPLUG_TIMER | BLK_TC_ACT(BLK_TC_QUEUE))
+#define BLK_TA_INSERT		(__BLK_TA_INSERT | BLK_TC_ACT(BLK_TC_QUEUE))
+#define BLK_TA_SPLIT		(__BLK_TA_SPLIT)
+#define BLK_TA_BOUNCE		(__BLK_TA_BOUNCE)
+#define BLK_TA_REMAP		(__BLK_TA_REMAP | BLK_TC_ACT(BLK_TC_QUEUE))
+
+#define BLK_IO_TRACE_MAGIC	0x65617400
+#define BLK_IO_TRACE_VERSION	0x07
+
+/*
+ * The trace itself
+ */
+struct blk_io_trace {
+	u32 magic;		/* MAGIC << 8 | version */
+	u32 sequence;		/* event number */
+	u64 time;		/* in microseconds */
+	u64 sector;		/* disk offset */
+	u32 bytes;		/* transfer length */
+	u32 action;		/* what happened */
+	u32 pid;		/* who did it */
+	u32 device;		/* device number */
+	u32 cpu;		/* on what cpu did it happen */
+	u16 error;		/* completion error */
+	u16 pdu_len;		/* length of data after this trace */
+};
+
+/*
+ * The remap event
+ */
+struct blk_io_trace_remap {
+	u32 device;
+	u32 __pad;
+	u64 sector;
+};
+
+enum {
+	Blktrace_setup = 1,
+	Blktrace_running,
+	Blktrace_stopped,
+};
+
+struct blk_trace {
+	int trace_state;
+	struct rchan *rchan;
+	unsigned long *sequence;
+	u16 act_mask;
+	u64 start_lba;
+	u64 end_lba;
+	u32 pid;
+	u32 dev;
+	struct dentry *dir;
+	struct dentry *dropped_file;
+	atomic_t dropped;
+};
+
+/*
+ * User setup structure passed with BLKTRACESTART
+ */
+struct blk_user_trace_setup {
+	char name[BDEVNAME_SIZE];	/* output */
+	u16 act_mask;			/* input */
+	u32 buf_size;			/* input */
+	u32 buf_nr;			/* input */
+	u64 start_lba;
+	u64 end_lba;
+	u32 pid;
+};
+
+#if defined(CONFIG_BLK_DEV_IO_TRACE)
+extern int blk_trace_ioctl(struct block_device *, unsigned, char __user *);
+extern void blk_trace_shutdown(request_queue_t *);
+extern void __blk_add_trace(struct blk_trace *, sector_t, int, int, u32, int, int, void *);
+
+/**
+ * blk_add_trace_rq - Add a trace for a request oriented action
+ * @q:		queue the io is for
+ * @rq:		the source request
+ * @what:	the action
+ *
+ * Description:
+ *     Records an action against a request. Will log the bio offset + size.
+ *
+ **/
+static inline void blk_add_trace_rq(struct request_queue *q, struct request *rq,
+				    u32 what)
+{
+	struct blk_trace *bt = q->blk_trace;
+	int rw = rq->flags & 0x07;
+
+	if (likely(!bt))
+		return;
+
+	if (blk_pc_request(rq)) {
+		what |= BLK_TC_ACT(BLK_TC_PC);
+		__blk_add_trace(bt, 0, rq->data_len, rw, what, rq->errors, sizeof(rq->cmd), rq->cmd);
+	} else  {
+		what |= BLK_TC_ACT(BLK_TC_FS);
+		__blk_add_trace(bt, rq->hard_sector, rq->hard_nr_sectors << 9, rw, what, rq->errors, 0, NULL);
+	}
+}
+
+/**
+ * blk_add_trace_bio - Add a trace for a bio oriented action
+ * @q:		queue the io is for
+ * @bio:	the source bio
+ * @what:	the action
+ *
+ * Description:
+ *     Records an action against a bio. Will log the bio offset + size.
+ *
+ **/
+static inline void blk_add_trace_bio(struct request_queue *q, struct bio *bio,
+				     u32 what)
+{
+	struct blk_trace *bt = q->blk_trace;
+
+	if (likely(!bt))
+		return;
+
+	__blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw, what, !bio_flagged(bio, BIO_UPTODATE), 0, NULL);
+}
+
+/**
+ * blk_add_trace_generic - Add a trace for a generic action
+ * @q:		queue the io is for
+ * @bio:	the source bio
+ * @rw:		the data direction
+ * @what:	the action
+ *
+ * Description:
+ *     Records a simple trace
+ *
+ **/
+static inline void blk_add_trace_generic(struct request_queue *q,
+					 struct bio *bio, int rw, u32 what)
+{
+	struct blk_trace *bt = q->blk_trace;
+
+	if (likely(!bt))
+		return;
+
+	if (bio)
+		blk_add_trace_bio(q, bio, what);
+	else
+		__blk_add_trace(bt, 0, 0, rw, what, 0, 0, NULL);
+}
+
+/**
+ * blk_add_trace_pdu_int - Add a trace for a bio with an integer payload
+ * @q:		queue the io is for
+ * @what:	the action
+ * @bio:	the source bio
+ * @pdu:	the integer payload
+ *
+ * Description:
+ *     Adds a trace with some integer payload. This might be an unplug
+ *     option given as the action, with the depth at unplug time given
+ *     as the payload
+ *
+ **/
+static inline void blk_add_trace_pdu_int(struct request_queue *q, u32 what,
+					 struct bio *bio, unsigned int pdu)
+{
+	struct blk_trace *bt = q->blk_trace;
+	u64 rpdu = cpu_to_be64(pdu);
+
+	if (likely(!bt))
+		return;
+
+	if (bio)
+		__blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw, what, !bio_flagged(bio, BIO_UPTODATE), sizeof(rpdu), &rpdu);
+	else
+		__blk_add_trace(bt, 0, 0, 0, what, 0, sizeof(rpdu), &rpdu);
+}
+
+/**
+ * blk_add_trace_remap - Add a trace for a remap operation
+ * @q:		queue the io is for
+ * @bio:	the source bio
+ * @dev:	target device
+ * @from:	source sector
+ * @to:		target sector
+ *
+ * Description:
+ *     Device mapper or raid target sometimes need to split a bio because
+ *     it spans a stripe (or similar). Add a trace for that action.
+ *
+ **/
+static inline void blk_add_trace_remap(struct request_queue *q, struct bio *bio,
+				       dev_t dev, sector_t from, sector_t to)
+{
+	struct blk_trace *bt = q->blk_trace;
+	struct blk_io_trace_remap r;
+
+	if (likely(!bt))
+		return;
+
+	r.device = cpu_to_be32(dev);
+	r.sector = cpu_to_be64(to);
+
+	__blk_add_trace(bt, from, bio->bi_size, bio->bi_rw, BLK_TA_REMAP, !bio_flagged(bio, BIO_UPTODATE), sizeof(r), &r);
+}
+
+#else /* !CONFIG_BLK_DEV_IO_TRACE */
+#define blk_trace_ioctl(bdev, cmd, arg)		(-ENOTTY)
+#define blk_trace_shutdown(q)			do { } while (0)
+#define blk_add_trace_rq(q, rq, what)		do { } while (0)
+#define blk_add_trace_bio(q, rq, what)		do { } while (0)
+#define blk_add_trace_generic(q, rq, rw, what)	do { } while (0)
+#define blk_add_trace_pdu_int(q, what, bio, pdu)	do { } while (0)
+#define blk_add_trace_remap(q, bio, dev, f, t)	do {} while (0)
+#endif /* CONFIG_BLK_DEV_IO_TRACE */
+
+#endif
diff --git a/include/linux/compat_ioctl.h b/include/linux/compat_ioctl.h
index ae7dfb790df3..efb518f16bb3 100644
--- a/include/linux/compat_ioctl.h
+++ b/include/linux/compat_ioctl.h
@@ -97,6 +97,10 @@ COMPATIBLE_IOCTL(BLKRRPART)
 COMPATIBLE_IOCTL(BLKFLSBUF)
 COMPATIBLE_IOCTL(BLKSECTSET)
 COMPATIBLE_IOCTL(BLKSSZGET)
+COMPATIBLE_IOCTL(BLKTRACESTART)
+COMPATIBLE_IOCTL(BLKTRACESTOP)
+COMPATIBLE_IOCTL(BLKTRACESETUP)
+COMPATIBLE_IOCTL(BLKTRACETEARDOWN)
 ULONG_IOCTL(BLKRASET)
 ULONG_IOCTL(BLKFRASET)
 /* RAID */
diff --git a/include/linux/fs.h b/include/linux/fs.h
index f9c9dea636d0..9b34a1b03455 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -197,6 +197,10 @@ extern int dir_notify_enable;
 #define BLKBSZGET  _IOR(0x12,112,size_t)
 #define BLKBSZSET  _IOW(0x12,113,size_t)
 #define BLKGETSIZE64 _IOR(0x12,114,size_t)	/* return device size in bytes (u64 *arg) */
+#define BLKTRACESETUP _IOWR(0x12,115,struct blk_user_trace_setup)
+#define BLKTRACESTART _IO(0x12,116)
+#define BLKTRACESTOP _IO(0x12,117)
+#define BLKTRACETEARDOWN _IO(0x12,118)
 
 #define BMAP_IOCTL 1		/* obsolete - kept for compatibility */
 #define FIBMAP	   _IO(0x00,1)	/* bmap access */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 62e6314382f0..e60a91d5b369 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -706,6 +706,7 @@ struct task_struct {
 	prio_array_t *array;
 
 	unsigned short ioprio;
+	unsigned int btrace_seq;
 
 	unsigned long sleep_avg;
 	unsigned long long timestamp, last_ran;
diff --git a/kernel/fork.c b/kernel/fork.c
index c79ae0b19a49..c21bae8c93b9 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -181,6 +181,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
 	/* One for us, one for whoever does the "release_task()" (usually parent) */
 	atomic_set(&tsk->usage,2);
 	atomic_set(&tsk->fs_excl, 0);
+	tsk->btrace_seq = 0;
 	return tsk;
 }
 
diff --git a/mm/highmem.c b/mm/highmem.c
index ce2e7e8bbfa7..d0ea1eec6a9a 100644
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -26,6 +26,7 @@
 #include <linux/init.h>
 #include <linux/hash.h>
 #include <linux/highmem.h>
+#include <linux/blktrace_api.h>
 #include <asm/tlbflush.h>
 
 static mempool_t *page_pool, *isa_page_pool;
@@ -483,6 +484,8 @@ void blk_queue_bounce(request_queue_t *q, struct bio **bio_orig)
 		pool = isa_page_pool;
 	}
 
+	blk_add_trace_bio(q, *bio_orig, BLK_TA_BOUNCE);
+
 	/*
 	 * slow path
 	 */
-- 
cgit v1.2.3


From f6ef943813ac3085ece7252ea101d663581219f6 Mon Sep 17 00:00:00 2001
From: Bart Samwel <bart@samwel.tk>
Date: Fri, 24 Mar 2006 03:15:48 -0800
Subject: [PATCH] Represent dirty_*_centisecs as jiffies internally

Make that the internal values for:

/proc/sys/vm/dirty_writeback_centisecs
/proc/sys/vm/dirty_expire_centisecs

are stored as jiffies instead of centiseconds.  Let the sysctl interface do
the conversions with full precision using clock_t_to_jiffies, instead of
doing overflow-sensitive on-the-fly conversions every time the values are
used.

Cons: apparent precision loss if HZ is not a multiple of 100, because of
conversion back and forth.  This is a common problem for all sysctl values
that use proc_dointvec_userhz_jiffies.  (There is only one other in-tree
use, in net/core/neighbour.c.)

Signed-off-by: Bart Samwel <bart@samwel.tk>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/writeback.h |  4 ++--
 kernel/sysctl.c           | 10 +++++-----
 mm/page-writeback.c       | 24 ++++++++++++------------
 3 files changed, 19 insertions(+), 19 deletions(-)

(limited to 'mm')

diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index beaef5c7a0ea..609565961494 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -88,8 +88,8 @@ void throttle_vm_writeout(void);
 /* These are exported to sysctl. */
 extern int dirty_background_ratio;
 extern int vm_dirty_ratio;
-extern int dirty_writeback_centisecs;
-extern int dirty_expire_centisecs;
+extern int dirty_writeback_interval;
+extern int dirty_expire_interval;
 extern int block_dump;
 extern int laptop_mode;
 
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 32b48e8ee36e..817ba25517eb 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -742,18 +742,18 @@ static ctl_table vm_table[] = {
 	{
 		.ctl_name	= VM_DIRTY_WB_CS,
 		.procname	= "dirty_writeback_centisecs",
-		.data		= &dirty_writeback_centisecs,
-		.maxlen		= sizeof(dirty_writeback_centisecs),
+		.data		= &dirty_writeback_interval,
+		.maxlen		= sizeof(dirty_writeback_interval),
 		.mode		= 0644,
 		.proc_handler	= &dirty_writeback_centisecs_handler,
 	},
 	{
 		.ctl_name	= VM_DIRTY_EXPIRE_CS,
 		.procname	= "dirty_expire_centisecs",
-		.data		= &dirty_expire_centisecs,
-		.maxlen		= sizeof(dirty_expire_centisecs),
+		.data		= &dirty_expire_interval,
+		.maxlen		= sizeof(dirty_expire_interval),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec,
+		.proc_handler	= &proc_dointvec_userhz_jiffies,
 	},
 	{
 		.ctl_name	= VM_NR_PDFLUSH_THREADS,
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 945559fb63d2..e79107991d20 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -75,12 +75,12 @@ int vm_dirty_ratio = 40;
  * The interval between `kupdate'-style writebacks, in centiseconds
  * (hundredths of a second)
  */
-int dirty_writeback_centisecs = 5 * 100;
+int dirty_writeback_interval = 5 * HZ;
 
 /*
  * The longest number of centiseconds for which data is allowed to remain dirty
  */
-int dirty_expire_centisecs = 30 * 100;
+int dirty_expire_interval = 30 * HZ;
 
 /*
  * Flag that makes the machine dump writes/reads and block dirtyings.
@@ -380,8 +380,8 @@ static DEFINE_TIMER(laptop_mode_wb_timer, laptop_timer_fn, 0, 0);
  * just walks the superblock inode list, writing back any inodes which are
  * older than a specific point in time.
  *
- * Try to run once per dirty_writeback_centisecs.  But if a writeback event
- * takes longer than a dirty_writeback_centisecs interval, then leave a
+ * Try to run once per dirty_writeback_interval.  But if a writeback event
+ * takes longer than a dirty_writeback_interval interval, then leave a
  * one-second gap.
  *
  * older_than_this takes precedence over nr_to_write.  So we'll only write back
@@ -406,9 +406,9 @@ static void wb_kupdate(unsigned long arg)
 	sync_supers();
 
 	get_writeback_state(&wbs);
-	oldest_jif = jiffies - (dirty_expire_centisecs * HZ) / 100;
+	oldest_jif = jiffies - dirty_expire_interval;
 	start_jif = jiffies;
-	next_jif = start_jif + (dirty_writeback_centisecs * HZ) / 100;
+	next_jif = start_jif + dirty_writeback_interval;
 	nr_to_write = wbs.nr_dirty + wbs.nr_unstable +
 			(inodes_stat.nr_inodes - inodes_stat.nr_unused);
 	while (nr_to_write > 0) {
@@ -425,7 +425,7 @@ static void wb_kupdate(unsigned long arg)
 	}
 	if (time_before(next_jif, jiffies + HZ))
 		next_jif = jiffies + HZ;
-	if (dirty_writeback_centisecs)
+	if (dirty_writeback_interval)
 		mod_timer(&wb_timer, next_jif);
 }
 
@@ -435,11 +435,11 @@ static void wb_kupdate(unsigned long arg)
 int dirty_writeback_centisecs_handler(ctl_table *table, int write,
 		struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
 {
-	proc_dointvec(table, write, file, buffer, length, ppos);
-	if (dirty_writeback_centisecs) {
+	proc_dointvec_userhz_jiffies(table, write, file, buffer, length, ppos);
+	if (dirty_writeback_interval) {
 		mod_timer(&wb_timer,
-			jiffies + (dirty_writeback_centisecs * HZ) / 100);
-	} else {
+			jiffies + dirty_writeback_interval);
+		} else {
 		del_timer(&wb_timer);
 	}
 	return 0;
@@ -544,7 +544,7 @@ void __init page_writeback_init(void)
 		if (vm_dirty_ratio <= 0)
 			vm_dirty_ratio = 1;
 	}
-	mod_timer(&wb_timer, jiffies + (dirty_writeback_centisecs * HZ) / 100);
+	mod_timer(&wb_timer, jiffies + dirty_writeback_interval);
 	set_ratelimit();
 	register_cpu_notifier(&ratelimit_nb);
 }
-- 
cgit v1.2.3


From ed5b43f15a8e86e3ae939b98bc161ee973ecedf2 Mon Sep 17 00:00:00 2001
From: Bart Samwel <bart@samwel.tk>
Date: Fri, 24 Mar 2006 03:15:49 -0800
Subject: [PATCH] Represent laptop_mode as jiffies internally

Make that the internal value for /proc/sys/vm/laptop_mode is stored as
jiffies instead of seconds.  Let the sysctl interface do the conversions,
instead of doing on-the-fly conversions every time the value is used.

Add a description of the fact that laptop_mode doubles as a flag and a
timeout to the comment above the laptop_mode variable.

Signed-off-by: Bart Samwel <bart@samwel.tk>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/sysctl.c     | 5 ++---
 mm/page-writeback.c | 5 +++--
 2 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'mm')

diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 817ba25517eb..d13426680d10 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -848,9 +848,8 @@ static ctl_table vm_table[] = {
 		.data		= &laptop_mode,
 		.maxlen		= sizeof(laptop_mode),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec,
-		.strategy	= &sysctl_intvec,
-		.extra1		= &zero,
+		.proc_handler	= &proc_dointvec_jiffies,
+		.strategy	= &sysctl_jiffies,
 	},
 	{
 		.ctl_name	= VM_BLOCK_DUMP,
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index e79107991d20..c1052ee79f01 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -88,7 +88,8 @@ int dirty_expire_interval = 30 * HZ;
 int block_dump;
 
 /*
- * Flag that puts the machine in "laptop mode".
+ * Flag that puts the machine in "laptop mode". Doubles as a timeout in jiffies:
+ * a full sync is triggered after this time elapses without any disk activity.
  */
 int laptop_mode;
 
@@ -468,7 +469,7 @@ static void laptop_timer_fn(unsigned long unused)
  */
 void laptop_io_completion(void)
 {
-	mod_timer(&laptop_mode_wb_timer, jiffies + laptop_mode * HZ);
+	mod_timer(&laptop_mode_wb_timer, jiffies + laptop_mode);
 }
 
 /*
-- 
cgit v1.2.3


From 0b1303fcf23678ee1785841fb0c770a35cd0833c Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@engr.sgi.com>
Date: Fri, 24 Mar 2006 03:15:59 -0800
Subject: [PATCH] cpusets: only wakeup kswapd for zones in the current cpuset

If we get under some memory pressure in a cpuset (we only scan zones that
are in the cpuset for memory) then kswapd is woken up for all zones.  This
patch only wakes up kswapd in zones that are part of the current cpuset.

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Acked-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/page_alloc.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'mm')

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index b7f14a4799a5..a5c3f8bd98ae 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -943,7 +943,8 @@ restart:
 		goto got_pg;
 
 	do {
-		wakeup_kswapd(*z, order);
+		if (cpuset_zone_allowed(*z, gfp_mask))
+			wakeup_kswapd(*z, order);
 	} while (*(++z));
 
 	/*
-- 
cgit v1.2.3


From 44110fe385af23ca5eee8a6ad4ff55d50339097a Mon Sep 17 00:00:00 2001
From: Paul Jackson <pj@sgi.com>
Date: Fri, 24 Mar 2006 03:16:04 -0800
Subject: [PATCH] cpuset memory spread page cache implementation and hooks

Change the page cache allocation calls to support cpuset memory spreading.

See the previous patch, cpuset_mem_spread, for an explanation of cpuset memory
spreading.

On systems without cpusets configured in the kernel, this is no change.

On systems with cpusets configured in the kernel, but the "memory_spread"
cpuset option not enabled for the current tasks cpuset, this adds a call to a
cpuset routine and failed bit test of the processor state flag PF_SPREAD_PAGE.

On tasks in cpusets with "memory_spread" enabled, this adds a call to a cpuset
routine that computes which of the tasks mems_allowed nodes should be
preferred for this allocation.

If memory spreading applies to a particular allocation, then any other NUMA
mempolicy does not apply.

Signed-off-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/pagemap.h |  5 +++++
 mm/filemap.c            | 23 +++++++++++++++++++++++
 2 files changed, 28 insertions(+)

(limited to 'mm')

diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index ee700c6eb442..839f0b3c23aa 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -51,6 +51,10 @@ static inline void mapping_set_gfp_mask(struct address_space *m, gfp_t mask)
 #define page_cache_release(page)	put_page(page)
 void release_pages(struct page **pages, int nr, int cold);
 
+#ifdef CONFIG_NUMA
+extern struct page *page_cache_alloc(struct address_space *x);
+extern struct page *page_cache_alloc_cold(struct address_space *x);
+#else
 static inline struct page *page_cache_alloc(struct address_space *x)
 {
 	return alloc_pages(mapping_gfp_mask(x), 0);
@@ -60,6 +64,7 @@ static inline struct page *page_cache_alloc_cold(struct address_space *x)
 {
 	return alloc_pages(mapping_gfp_mask(x)|__GFP_COLD, 0);
 }
+#endif
 
 typedef int filler_t(void *, struct page *);
 
diff --git a/mm/filemap.c b/mm/filemap.c
index e8f58f7dd7a5..d4ff48ec269e 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -29,6 +29,7 @@
 #include <linux/blkdev.h>
 #include <linux/security.h>
 #include <linux/syscalls.h>
+#include <linux/cpuset.h>
 #include "filemap.h"
 #include "internal.h"
 
@@ -427,6 +428,28 @@ int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
 	return ret;
 }
 
+#ifdef CONFIG_NUMA
+struct page *page_cache_alloc(struct address_space *x)
+{
+	if (cpuset_do_page_mem_spread()) {
+		int n = cpuset_mem_spread_node();
+		return alloc_pages_node(n, mapping_gfp_mask(x), 0);
+	}
+	return alloc_pages(mapping_gfp_mask(x), 0);
+}
+EXPORT_SYMBOL(page_cache_alloc);
+
+struct page *page_cache_alloc_cold(struct address_space *x)
+{
+	if (cpuset_do_page_mem_spread()) {
+		int n = cpuset_mem_spread_node();
+		return alloc_pages_node(n, mapping_gfp_mask(x)|__GFP_COLD, 0);
+	}
+	return alloc_pages(mapping_gfp_mask(x)|__GFP_COLD, 0);
+}
+EXPORT_SYMBOL(page_cache_alloc_cold);
+#endif
+
 /*
  * In order to wait for pages to become available there must be
  * waitqueues associated with pages. By using a hash table of
-- 
cgit v1.2.3


From 101a50019ae5e370d73984ee05d56dd3b08f330a Mon Sep 17 00:00:00 2001
From: Paul Jackson <pj@sgi.com>
Date: Fri, 24 Mar 2006 03:16:07 -0800
Subject: [PATCH] cpuset memory spread slab cache implementation

Provide the slab cache infrastructure to support cpuset memory spreading.

See the previous patches, cpuset_mem_spread, for an explanation of cpuset
memory spreading.

This patch provides a slab cache SLAB_MEM_SPREAD flag.  If set in the
kmem_cache_create() call defining a slab cache, then any task marked with the
process state flag PF_MEMSPREAD will spread memory page allocations for that
cache over all the allowed nodes, instead of preferring the local (faulting)
node.

On systems not configured with CONFIG_NUMA, this results in no change to the
page allocation code path for slab caches.

On systems with cpusets configured in the kernel, but the "memory_spread"
cpuset option not enabled for the current tasks cpuset, this adds a call to a
cpuset routine and failed bit test of the processor state flag PF_SPREAD_SLAB.

For tasks so marked, a second inline test is done for the slab cache flag
SLAB_MEM_SPREAD, and if that is set and if the allocation is not
in_interrupt(), this adds a call to to a cpuset routine that computes which of
the tasks mems_allowed nodes should be preferred for this allocation.

==> This patch adds another hook into the performance critical
    code path to allocating objects from the slab cache, in the
    ____cache_alloc() chunk, below.  The next patch optimizes this
    hook, reducing the impact of the combined mempolicy plus memory
    spreading hooks on this critical code path to a single check
    against the tasks task_struct flags word.

This patch provides the generic slab flags and logic needed to apply memory
spreading to a particular slab.

A subsequent patch will mark a few specific slab caches for this placement
policy.

Signed-off-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/slab.h |  1 +
 mm/slab.c            | 13 +++++++++++--
 2 files changed, 12 insertions(+), 2 deletions(-)

(limited to 'mm')

diff --git a/include/linux/slab.h b/include/linux/slab.h
index 2b28c849d75a..e2ee5b268797 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -46,6 +46,7 @@ typedef struct kmem_cache kmem_cache_t;
 						   what is reclaimable later*/
 #define SLAB_PANIC		0x00040000UL	/* panic if kmem_cache_create() fails */
 #define SLAB_DESTROY_BY_RCU	0x00080000UL	/* defer freeing pages to RCU */
+#define SLAB_MEM_SPREAD		0x00100000UL	/* Spread some memory over cpuset */
 
 /* flags passed to a constructor func */
 #define	SLAB_CTOR_CONSTRUCTOR	0x001UL		/* if not set, then deconstructor */
diff --git a/mm/slab.c b/mm/slab.c
index 1c8f5ee230d5..de516658d3d8 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -94,6 +94,7 @@
 #include	<linux/interrupt.h>
 #include	<linux/init.h>
 #include	<linux/compiler.h>
+#include	<linux/cpuset.h>
 #include	<linux/seq_file.h>
 #include	<linux/notifier.h>
 #include	<linux/kallsyms.h>
@@ -173,12 +174,12 @@
 			 SLAB_CACHE_DMA | \
 			 SLAB_MUST_HWCACHE_ALIGN | SLAB_STORE_USER | \
 			 SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \
-			 SLAB_DESTROY_BY_RCU)
+			 SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD)
 #else
 # define CREATE_MASK	(SLAB_HWCACHE_ALIGN | \
 			 SLAB_CACHE_DMA | SLAB_MUST_HWCACHE_ALIGN | \
 			 SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \
-			 SLAB_DESTROY_BY_RCU)
+			 SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD)
 #endif
 
 /*
@@ -2810,6 +2811,14 @@ static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags)
 	if (unlikely(current->mempolicy && !in_interrupt())) {
 		int nid = slab_node(current->mempolicy);
 
+		if (nid != numa_node_id())
+			return __cache_alloc_node(cachep, flags, nid);
+	}
+	if (unlikely(cpuset_do_slab_mem_spread() &&
+					(cachep->flags & SLAB_MEM_SPREAD) &&
+					!in_interrupt())) {
+		int nid = cpuset_mem_spread_node();
+
 		if (nid != numa_node_id())
 			return __cache_alloc_node(cachep, flags, nid);
 	}
-- 
cgit v1.2.3


From c61afb181c649754ea221f104e268cbacfc993e3 Mon Sep 17 00:00:00 2001
From: Paul Jackson <pj@sgi.com>
Date: Fri, 24 Mar 2006 03:16:08 -0800
Subject: [PATCH] cpuset memory spread slab cache optimizations

The hooks in the slab cache allocator code path for support of NUMA
mempolicies and cpuset memory spreading are in an important code path.  Many
systems will use neither feature.

This patch optimizes those hooks down to a single check of some bits in the
current tasks task_struct flags.  For non NUMA systems, this hook and related
code is already ifdef'd out.

The optimization is done by using another task flag, set if the task is using
a non-default NUMA mempolicy.  Taking this flag bit along with the
PF_SPREAD_PAGE and PF_SPREAD_SLAB flag bits added earlier in this 'cpuset
memory spreading' patch set, one can check for the combination of any of these
special case memory placement mechanisms with a single test of the current
tasks task_struct flags.

This patch also tightens up the code, to save a few bytes of kernel text
space, and moves some of it out of line.  Due to the nested inlines called
from multiple places, we were ending up with three copies of this code, which
once we get off the main code path (for local node allocation) seems a bit
wasteful of instruction memory.

Signed-off-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/mempolicy.h |  5 +++++
 include/linux/sched.h     |  1 +
 kernel/fork.c             |  1 +
 mm/mempolicy.c            | 32 ++++++++++++++++++++++++++++++++
 mm/slab.c                 | 41 ++++++++++++++++++++++++++++-------------
 5 files changed, 67 insertions(+), 13 deletions(-)

(limited to 'mm')

diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
index bbd2221923c3..6a7621b2b12b 100644
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -147,6 +147,7 @@ extern void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *new);
 extern void mpol_rebind_task(struct task_struct *tsk,
 					const nodemask_t *new);
 extern void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new);
+extern void mpol_fix_fork_child_flag(struct task_struct *p);
 #define set_cpuset_being_rebound(x) (cpuset_being_rebound = (x))
 
 #ifdef CONFIG_CPUSET
@@ -248,6 +249,10 @@ static inline void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
 {
 }
 
+static inline void mpol_fix_fork_child_flag(struct task_struct *p)
+{
+}
+
 #define set_cpuset_being_rebound(x) do {} while (0)
 
 static inline struct zonelist *huge_zonelist(struct vm_area_struct *vma,
diff --git a/include/linux/sched.h b/include/linux/sched.h
index b0e37cfa09f5..2cda439ece43 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -932,6 +932,7 @@ static inline void put_task_struct(struct task_struct *t)
 #define PF_SWAPWRITE	0x01000000	/* Allowed to write to swap */
 #define PF_SPREAD_PAGE	0x04000000	/* Spread page cache over cpuset */
 #define PF_SPREAD_SLAB	0x08000000	/* Spread some slab caches over cpuset */
+#define PF_MEMPOLICY	0x10000000	/* Non-default NUMA mempolicy */
 
 /*
  * Only the _current_ task can read/write to tsk->flags, but other
diff --git a/kernel/fork.c b/kernel/fork.c
index c21bae8c93b9..a02063903aaa 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1021,6 +1021,7 @@ static task_t *copy_process(unsigned long clone_flags,
  		p->mempolicy = NULL;
  		goto bad_fork_cleanup_cpuset;
  	}
+	mpol_fix_fork_child_flag(p);
 #endif
 
 #ifdef CONFIG_DEBUG_MUTEXES
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index e93cc740c22b..4f71cfd29c6f 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -422,6 +422,37 @@ static int contextualize_policy(int mode, nodemask_t *nodes)
 	return mpol_check_policy(mode, nodes);
 }
 
+
+/*
+ * Update task->flags PF_MEMPOLICY bit: set iff non-default
+ * mempolicy.  Allows more rapid checking of this (combined perhaps
+ * with other PF_* flag bits) on memory allocation hot code paths.
+ *
+ * If called from outside this file, the task 'p' should -only- be
+ * a newly forked child not yet visible on the task list, because
+ * manipulating the task flags of a visible task is not safe.
+ *
+ * The above limitation is why this routine has the funny name
+ * mpol_fix_fork_child_flag().
+ *
+ * It is also safe to call this with a task pointer of current,
+ * which the static wrapper mpol_set_task_struct_flag() does,
+ * for use within this file.
+ */
+
+void mpol_fix_fork_child_flag(struct task_struct *p)
+{
+	if (p->mempolicy)
+		p->flags |= PF_MEMPOLICY;
+	else
+		p->flags &= ~PF_MEMPOLICY;
+}
+
+static void mpol_set_task_struct_flag(void)
+{
+	mpol_fix_fork_child_flag(current);
+}
+
 /* Set the process memory policy */
 long do_set_mempolicy(int mode, nodemask_t *nodes)
 {
@@ -434,6 +465,7 @@ long do_set_mempolicy(int mode, nodemask_t *nodes)
 		return PTR_ERR(new);
 	mpol_free(current->mempolicy);
 	current->mempolicy = new;
+	mpol_set_task_struct_flag();
 	if (new && new->policy == MPOL_INTERLEAVE)
 		current->il_next = first_node(new->v.nodes);
 	return 0;
diff --git a/mm/slab.c b/mm/slab.c
index de516658d3d8..f80b52388a12 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -899,6 +899,7 @@ static struct array_cache *alloc_arraycache(int node, int entries,
 
 #ifdef CONFIG_NUMA
 static void *__cache_alloc_node(struct kmem_cache *, gfp_t, int);
+static void *alternate_node_alloc(struct kmem_cache *, gfp_t);
 
 static struct array_cache **alloc_alien_cache(int node, int limit)
 {
@@ -2808,19 +2809,11 @@ static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags)
 	struct array_cache *ac;
 
 #ifdef CONFIG_NUMA
-	if (unlikely(current->mempolicy && !in_interrupt())) {
-		int nid = slab_node(current->mempolicy);
-
-		if (nid != numa_node_id())
-			return __cache_alloc_node(cachep, flags, nid);
-	}
-	if (unlikely(cpuset_do_slab_mem_spread() &&
-					(cachep->flags & SLAB_MEM_SPREAD) &&
-					!in_interrupt())) {
-		int nid = cpuset_mem_spread_node();
-
-		if (nid != numa_node_id())
-			return __cache_alloc_node(cachep, flags, nid);
+	if (unlikely(current->flags & (PF_SPREAD_PAGE | PF_SPREAD_SLAB |
+							PF_MEMPOLICY))) {
+		objp = alternate_node_alloc(cachep, flags);
+		if (objp != NULL)
+			return objp;
 	}
 #endif
 
@@ -2855,6 +2848,28 @@ static __always_inline void *__cache_alloc(struct kmem_cache *cachep,
 }
 
 #ifdef CONFIG_NUMA
+/*
+ * Try allocating on another node if PF_SPREAD_PAGE|PF_SPREAD_SLAB|PF_MEMPOLICY.
+ *
+ * If we are in_interrupt, then process context, including cpusets and
+ * mempolicy, may not apply and should not be used for allocation policy.
+ */
+static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags)
+{
+	int nid_alloc, nid_here;
+
+	if (in_interrupt())
+		return NULL;
+	nid_alloc = nid_here = numa_node_id();
+	if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD))
+		nid_alloc = cpuset_mem_spread_node();
+	else if (current->mempolicy)
+		nid_alloc = slab_node(current->mempolicy);
+	if (nid_alloc != nid_here)
+		return __cache_alloc_node(cachep, flags, nid_alloc);
+	return NULL;
+}
+
 /*
  * A interface to enable slab creation on nodeid
  */
-- 
cgit v1.2.3


From b2455396be35383c4eebc6745cc718b1dd9e23df Mon Sep 17 00:00:00 2001
From: Paul Jackson <pj@sgi.com>
Date: Fri, 24 Mar 2006 03:16:12 -0800
Subject: [PATCH] cpuset: memory_spread_slab drop useless PF_SPREAD_PAGE check

The hook in the slab cache allocation path to handle cpuset memory
spreading for tasks in cpusets with 'memory_spread_slab' enabled has a
modest performance bug.  The hook calls into the memory spreading handler
alternate_node_alloc() if either of 'memory_spread_slab' or
'memory_spread_page' is enabled, even though the handler does nothing
(albeit harmlessly) for the page case

Fix - drop PF_SPREAD_PAGE from the set of flag bits that are used to
trigger a call to alternate_node_alloc().

The page case is handled by separate hooks -- see the calls conditioned on
cpuset_do_page_mem_spread() in mm/filemap.c

Signed-off-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/slab.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'mm')

diff --git a/mm/slab.c b/mm/slab.c
index f80b52388a12..26138c9f8f00 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -2809,8 +2809,7 @@ static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags)
 	struct array_cache *ac;
 
 #ifdef CONFIG_NUMA
-	if (unlikely(current->flags & (PF_SPREAD_PAGE | PF_SPREAD_SLAB |
-							PF_MEMPOLICY))) {
+	if (unlikely(current->flags & (PF_SPREAD_SLAB | PF_MEMPOLICY))) {
 		objp = alternate_node_alloc(cachep, flags);
 		if (objp != NULL)
 			return objp;
@@ -2849,7 +2848,7 @@ static __always_inline void *__cache_alloc(struct kmem_cache *cachep,
 
 #ifdef CONFIG_NUMA
 /*
- * Try allocating on another node if PF_SPREAD_PAGE|PF_SPREAD_SLAB|PF_MEMPOLICY.
+ * Try allocating on another node if PF_SPREAD_SLAB|PF_MEMPOLICY.
  *
  * If we are in_interrupt, then process context, including cpusets and
  * mempolicy, may not apply and should not be used for allocation policy.
-- 
cgit v1.2.3


From 469eb4d03878b676418f853011ebfb54ccf83a5e Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Fri, 24 Mar 2006 03:17:45 -0800
Subject: [PATCH] filemap_fdatawrite_range() api: clarify -end parameter

I had trouble understanding working out whether filemap_fdatawrite_range()'s
`end' parameter describes the last-byte-to-be-written or the last-plus-one.
Clarify that in comments.

Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/filemap.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

(limited to 'mm')

diff --git a/mm/filemap.c b/mm/filemap.c
index d4ff48ec269e..c1b1708cc95d 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -175,7 +175,7 @@ static int sync_page(void *word)
  * dirty pages that lie within the byte offsets <start, end>
  * @mapping:	address space structure to write
  * @start:	offset in bytes where the range starts
- * @end:	offset in bytes where the range ends
+ * @end:	offset in bytes where the range ends (inclusive)
  * @sync_mode:	enable synchronous operation
  *
  * If sync_mode is WB_SYNC_ALL then this is a "data integrity" operation, as
@@ -368,6 +368,12 @@ int filemap_write_and_wait(struct address_space *mapping)
 }
 EXPORT_SYMBOL(filemap_write_and_wait);
 
+/*
+ * Write out and wait upon file offsets lstart->lend, inclusive.
+ *
+ * Note that `lend' is inclusive (describes the last byte to be written) so
+ * that this function can be used to write to the very end-of-file (end = -1).
+ */
 int filemap_write_and_wait_range(struct address_space *mapping,
 				 loff_t lstart, loff_t lend)
 {
-- 
cgit v1.2.3


From ebcf28e1c7a295f3321249dd235ad2e45938fdd9 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Fri, 24 Mar 2006 03:18:04 -0800
Subject: [PATCH] fadvise(): write commands

Add two new linux-specific fadvise extensions():

LINUX_FADV_ASYNC_WRITE: start async writeout of any dirty pages between file
offsets `offset' and `offset+len'.  Any pages which are currently under
writeout are skipped, whether or not they are dirty.

LINUX_FADV_WRITE_WAIT: wait upon writeout of any dirty pages between file
offsets `offset' and `offset+len'.

By combining these two operations the application may do several things:

LINUX_FADV_ASYNC_WRITE: push some or all of the dirty pages at the disk.

LINUX_FADV_WRITE_WAIT, LINUX_FADV_ASYNC_WRITE: push all of the currently dirty
pages at the disk.

LINUX_FADV_WRITE_WAIT, LINUX_FADV_ASYNC_WRITE, LINUX_FADV_WRITE_WAIT: push all
of the currently dirty pages at the disk, wait until they have been written.

It should be noted that none of these operations write out the file's
metadata.  So unless the application is strictly performing overwrites of
already-instantiated disk blocks, there are no guarantees here that the data
will be available after a crash.

To complete this suite of operations I guess we should have a "sync file
metadata only" operation.  This gives applications access to all the building
blocks needed for all sorts of sync operations.  But sync-metadata doesn't fit
well with the fadvise() interface.  Probably it should be a new syscall:
sys_fmetadatasync().

The patch also diddles with the meaning of `endbyte' in sys_fadvise64_64().
It is made to represent that last affected byte in the file (ie: it is
inclusive).  Generally, all these byterange and pagerange functions are
inclusive so we can easily represent EOF with -1.

As Ulrich notes, these two functions are somewhat abusive of the fadvise()
concept, which appears to be "set the future policy for this fd".

But these commands are a perfect fit with the fadvise() impementation, and
several of the existing fadvise() commands are synchronous and don't affect
future policy either.   I think we can live with the slight incongruity.

Cc: Michael Kerrisk <mtk-manpages@gmx.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/fadvise.h |  6 ++++++
 include/linux/fs.h      |  5 +++++
 mm/fadvise.c            | 46 +++++++++++++++++++++++++++++++++++++++++-----
 mm/filemap.c            | 10 +++++-----
 4 files changed, 57 insertions(+), 10 deletions(-)

(limited to 'mm')

diff --git a/include/linux/fadvise.h b/include/linux/fadvise.h
index e8e747139b9a..b2913bba35d8 100644
--- a/include/linux/fadvise.h
+++ b/include/linux/fadvise.h
@@ -18,4 +18,10 @@
 #define POSIX_FADV_NOREUSE	5 /* Data will be accessed once.  */
 #endif
 
+/*
+ * Linux-specific fadvise() extensions:
+ */
+#define LINUX_FADV_ASYNC_WRITE	32	/* Start writeout on range */
+#define LINUX_FADV_WRITE_WAIT	33	/* Wait upon writeout to range */
+
 #endif	/* FADVISE_H_INCLUDED */
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 65e6df247ea5..0ad70c1e5e55 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1473,6 +1473,11 @@ extern int filemap_fdatawait(struct address_space *);
 extern int filemap_write_and_wait(struct address_space *mapping);
 extern int filemap_write_and_wait_range(struct address_space *mapping,
 				        loff_t lstart, loff_t lend);
+extern int wait_on_page_writeback_range(struct address_space *mapping,
+				pgoff_t start, pgoff_t end);
+extern int __filemap_fdatawrite_range(struct address_space *mapping,
+				loff_t start, loff_t end, int sync_mode);
+
 extern void sync_supers(void);
 extern void sync_filesystems(int wait);
 extern void emergency_sync(void);
diff --git a/mm/fadvise.c b/mm/fadvise.c
index d257c89e7704..907c39257ca0 100644
--- a/mm/fadvise.c
+++ b/mm/fadvise.c
@@ -15,6 +15,7 @@
 #include <linux/backing-dev.h>
 #include <linux/pagevec.h>
 #include <linux/fadvise.h>
+#include <linux/writeback.h>
 #include <linux/syscalls.h>
 
 #include <asm/unistd.h>
@@ -22,13 +23,36 @@
 /*
  * POSIX_FADV_WILLNEED could set PG_Referenced, and POSIX_FADV_NOREUSE could
  * deactivate the pages and clear PG_Referenced.
+ *
+ * LINUX_FADV_ASYNC_WRITE: start async writeout of any dirty pages between file
+ * offsets `offset' and `offset+len' inclusive.  Any pages which are currently
+ * under writeout are skipped, whether or not they are dirty.
+ *
+ * LINUX_FADV_WRITE_WAIT: wait upon writeout of any dirty pages between file
+ * offsets `offset' and `offset+len'.
+ *
+ * By combining these two operations the application may do several things:
+ *
+ * LINUX_FADV_ASYNC_WRITE: push some or all of the dirty pages at the disk.
+ *
+ * LINUX_FADV_WRITE_WAIT, LINUX_FADV_ASYNC_WRITE: push all of the currently
+ * dirty pages at the disk.
+ *
+ * LINUX_FADV_WRITE_WAIT, LINUX_FADV_ASYNC_WRITE, LINUX_FADV_WRITE_WAIT: push
+ * all of the currently dirty pages at the disk, wait until they have been
+ * written.
+ *
+ * It should be noted that none of these operations write out the file's
+ * metadata.  So unless the application is strictly performing overwrites of
+ * already-instantiated disk blocks, there are no guarantees here that the data
+ * will be available after a crash.
  */
 asmlinkage long sys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice)
 {
 	struct file *file = fget(fd);
 	struct address_space *mapping;
 	struct backing_dev_info *bdi;
-	loff_t endbyte;
+	loff_t endbyte;			/* inclusive */
 	pgoff_t start_index;
 	pgoff_t end_index;
 	unsigned long nrpages;
@@ -56,6 +80,8 @@ asmlinkage long sys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice)
 	endbyte = offset + len;
 	if (!len || endbyte < len)
 		endbyte = -1;
+	else
+		endbyte--;		/* inclusive */
 
 	bdi = mapping->backing_dev_info;
 
@@ -78,7 +104,7 @@ asmlinkage long sys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice)
 
 		/* First and last PARTIAL page! */
 		start_index = offset >> PAGE_CACHE_SHIFT;
-		end_index = (endbyte-1) >> PAGE_CACHE_SHIFT;
+		end_index = endbyte >> PAGE_CACHE_SHIFT;
 
 		/* Careful about overflow on the "+1" */
 		nrpages = end_index - start_index + 1;
@@ -96,11 +122,21 @@ asmlinkage long sys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice)
 			filemap_flush(mapping);
 
 		/* First and last FULL page! */
-		start_index = (offset + (PAGE_CACHE_SIZE-1)) >> PAGE_CACHE_SHIFT;
+		start_index = (offset+(PAGE_CACHE_SIZE-1)) >> PAGE_CACHE_SHIFT;
 		end_index = (endbyte >> PAGE_CACHE_SHIFT);
 
-		if (end_index > start_index)
-			invalidate_mapping_pages(mapping, start_index, end_index-1);
+		if (end_index >= start_index)
+			invalidate_mapping_pages(mapping, start_index,
+						end_index);
+		break;
+	case LINUX_FADV_ASYNC_WRITE:
+		ret = __filemap_fdatawrite_range(mapping, offset, endbyte,
+						WB_SYNC_NONE);
+		break;
+	case LINUX_FADV_WRITE_WAIT:
+		ret = wait_on_page_writeback_range(mapping,
+					offset >> PAGE_CACHE_SHIFT,
+					endbyte >> PAGE_CACHE_SHIFT);
 		break;
 	default:
 		ret = -EINVAL;
diff --git a/mm/filemap.c b/mm/filemap.c
index c1b1708cc95d..3ef20739e725 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -183,8 +183,8 @@ static int sync_page(void *word)
  * these two operations is that if a dirty page/buffer is encountered, it must
  * be waited upon, and not just skipped over.
  */
-static int __filemap_fdatawrite_range(struct address_space *mapping,
-	loff_t start, loff_t end, int sync_mode)
+int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
+				loff_t end, int sync_mode)
 {
 	int ret;
 	struct writeback_control wbc = {
@@ -213,8 +213,8 @@ int filemap_fdatawrite(struct address_space *mapping)
 }
 EXPORT_SYMBOL(filemap_fdatawrite);
 
-static int filemap_fdatawrite_range(struct address_space *mapping,
-	loff_t start, loff_t end)
+static int filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
+				loff_t end)
 {
 	return __filemap_fdatawrite_range(mapping, start, end, WB_SYNC_ALL);
 }
@@ -233,7 +233,7 @@ EXPORT_SYMBOL(filemap_flush);
  * Wait for writeback to complete against pages indexed by start->end
  * inclusive
  */
-static int wait_on_page_writeback_range(struct address_space *mapping,
+int wait_on_page_writeback_range(struct address_space *mapping,
 				pgoff_t start, pgoff_t end)
 {
 	struct pagevec pvec;
-- 
cgit v1.2.3


From fa5a734e406b53761fcc5ee22366006f71112c2d Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Fri, 24 Mar 2006 03:18:10 -0800
Subject: [PATCH] balance_dirty_pages_ratelimited: take nr_pages arg

Modify balance_dirty_pages_ratelimited() so that it can take a
number-of-pages-which-I-just-dirtied argument.  For msync().

Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/writeback.h | 10 +++++++++-
 mm/page-writeback.c       | 24 +++++++++++++++---------
 2 files changed, 24 insertions(+), 10 deletions(-)

(limited to 'mm')

diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index 609565961494..56f92fcbe94a 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -99,7 +99,15 @@ int dirty_writeback_centisecs_handler(struct ctl_table *, int, struct file *,
 				      void __user *, size_t *, loff_t *);
 
 void page_writeback_init(void);
-void balance_dirty_pages_ratelimited(struct address_space *mapping);
+void balance_dirty_pages_ratelimited_nr(struct address_space *mapping,
+					unsigned long nr_pages_dirtied);
+
+static inline void
+balance_dirty_pages_ratelimited(struct address_space *mapping)
+{
+	balance_dirty_pages_ratelimited_nr(mapping, 1);
+}
+
 int pdflush_operation(void (*fn)(unsigned long), unsigned long arg0);
 int do_writepages(struct address_space *mapping, struct writeback_control *wbc);
 int sync_page_range(struct inode *inode, struct address_space *mapping,
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index c1052ee79f01..c67ddc464721 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -256,8 +256,9 @@ static void balance_dirty_pages(struct address_space *mapping)
 }
 
 /**
- * balance_dirty_pages_ratelimited - balance dirty memory state
+ * balance_dirty_pages_ratelimited_nr - balance dirty memory state
  * @mapping: address_space which was dirtied
+ * @nr_pages: number of pages which the caller has just dirtied
  *
  * Processes which are dirtying memory should call in here once for each page
  * which was newly dirtied.  The function will periodically check the system's
@@ -268,10 +269,12 @@ static void balance_dirty_pages(struct address_space *mapping)
  * limit we decrease the ratelimiting by a lot, to prevent individual processes
  * from overshooting the limit by (ratelimit_pages) each.
  */
-void balance_dirty_pages_ratelimited(struct address_space *mapping)
+void balance_dirty_pages_ratelimited_nr(struct address_space *mapping,
+					unsigned long nr_pages_dirtied)
 {
-	static DEFINE_PER_CPU(int, ratelimits) = 0;
-	long ratelimit;
+	static DEFINE_PER_CPU(unsigned long, ratelimits) = 0;
+	unsigned long ratelimit;
+	unsigned long *p;
 
 	ratelimit = ratelimit_pages;
 	if (dirty_exceeded)
@@ -281,15 +284,18 @@ void balance_dirty_pages_ratelimited(struct address_space *mapping)
 	 * Check the rate limiting. Also, we do not want to throttle real-time
 	 * tasks in balance_dirty_pages(). Period.
 	 */
-	if (get_cpu_var(ratelimits)++ >= ratelimit) {
-		__get_cpu_var(ratelimits) = 0;
-		put_cpu_var(ratelimits);
+	preempt_disable();
+	p =  &__get_cpu_var(ratelimits);
+	*p += nr_pages_dirtied;
+	if (unlikely(*p >= ratelimit)) {
+		*p = 0;
+		preempt_enable();
 		balance_dirty_pages(mapping);
 		return;
 	}
-	put_cpu_var(ratelimits);
+	preempt_enable();
 }
-EXPORT_SYMBOL(balance_dirty_pages_ratelimited);
+EXPORT_SYMBOL(balance_dirty_pages_ratelimited_nr);
 
 void throttle_vm_writeout(void)
 {
-- 
cgit v1.2.3


From 4741c9fd36b3bcadd37238321c469049da94a4b9 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Fri, 24 Mar 2006 03:18:11 -0800
Subject: [PATCH] set_page_dirty() return value fixes

We need set_page_dirty() to return true if it actually transitioned the page
from a clean to dirty state.  This wasn't right in a couple of places.  Do a
kernel-wide audit, fix things up.

This leaves open the possibility of returning a negative errno from
set_page_dirty() sometime in the future.  But we don't do that at present.

Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/cris/arch-v32/drivers/cryptocop.c |  2 +-
 drivers/block/rd.c                     |  3 ++-
 fs/buffer.c                            |  2 +-
 include/linux/fs.h                     |  2 +-
 mm/page-writeback.c                    | 11 ++++++-----
 5 files changed, 11 insertions(+), 9 deletions(-)

(limited to 'mm')

diff --git a/arch/cris/arch-v32/drivers/cryptocop.c b/arch/cris/arch-v32/drivers/cryptocop.c
index 501fa52d8d3a..c59ee28a35f4 100644
--- a/arch/cris/arch-v32/drivers/cryptocop.c
+++ b/arch/cris/arch-v32/drivers/cryptocop.c
@@ -2944,7 +2944,7 @@ static int cryptocop_ioctl_process(struct inode *inode, struct file *filp, unsig
 		int spdl_err;
 		/* Mark output pages dirty. */
 		spdl_err = set_page_dirty_lock(outpages[i]);
-		DEBUG(if (spdl_err)printk("cryptocop_ioctl_process: set_page_dirty_lock returned %d\n", spdl_err));
+		DEBUG(if (spdl_err < 0)printk("cryptocop_ioctl_process: set_page_dirty_lock returned %d\n", spdl_err));
 	}
 	for (i = 0; i < nooutpages; i++){
 		put_page(outpages[i]);
diff --git a/drivers/block/rd.c b/drivers/block/rd.c
index 1c54f46d3f70..940bfd7951e5 100644
--- a/drivers/block/rd.c
+++ b/drivers/block/rd.c
@@ -186,7 +186,8 @@ static int ramdisk_writepages(struct address_space *mapping,
  */
 static int ramdisk_set_page_dirty(struct page *page)
 {
-	SetPageDirty(page);
+	if (!TestSetPageDirty(page))
+		return 1;
 	return 0;
 }
 
diff --git a/fs/buffer.c b/fs/buffer.c
index 11ca6eb46a33..24262ea8cc50 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -865,8 +865,8 @@ int __set_page_dirty_buffers(struct page *page)
 		}
 		write_unlock_irq(&mapping->tree_lock);
 		__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
+		return 1;
 	}
-	
 	return 0;
 }
 EXPORT_SYMBOL(__set_page_dirty_buffers);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 0ad70c1e5e55..092cfaee0cd2 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -350,7 +350,7 @@ struct address_space_operations {
 	/* Write back some dirty pages from this mapping. */
 	int (*writepages)(struct address_space *, struct writeback_control *);
 
-	/* Set a page dirty */
+	/* Set a page dirty.  Return true if this dirtied it */
 	int (*set_page_dirty)(struct page *page);
 
 	int (*readpages)(struct file *filp, struct address_space *mapping,
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index c67ddc464721..893d7677579e 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -628,8 +628,6 @@ EXPORT_SYMBOL(write_one_page);
  */
 int __set_page_dirty_nobuffers(struct page *page)
 {
-	int ret = 0;
-
 	if (!TestSetPageDirty(page)) {
 		struct address_space *mapping = page_mapping(page);
 		struct address_space *mapping2;
@@ -651,8 +649,9 @@ int __set_page_dirty_nobuffers(struct page *page)
 							I_DIRTY_PAGES);
 			}
 		}
+		return 1;
 	}
-	return ret;
+	return 0;
 }
 EXPORT_SYMBOL(__set_page_dirty_nobuffers);
 
@@ -682,8 +681,10 @@ int fastcall set_page_dirty(struct page *page)
 			return (*spd)(page);
 		return __set_page_dirty_buffers(page);
 	}
-	if (!PageDirty(page))
-		SetPageDirty(page);
+	if (!PageDirty(page)) {
+		if (!TestSetPageDirty(page))
+			return 1;
+	}
 	return 0;
 }
 EXPORT_SYMBOL(set_page_dirty);
-- 
cgit v1.2.3


From 9c50823eebf7c256b92b4e0f02b5fb30e97788c2 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Fri, 24 Mar 2006 03:18:12 -0800
Subject: [PATCH] msync(): perform dirty page levelling

It seems sensible to perform dirty page throttling in msync: as the application
dirties pages we can kick off pdflush early, or even force the msync() caller
to perform writeout, or even throttle the msync() caller.

The main effect of this is to start disk writeback earlier if we've just
discovered that a large amount of pagecache has been dirtied.  (Otherwise it
wouldn't happen for up to five seconds, next time pdflush wakes up).

It also will cause the page-dirtying process to get panalised for dirtying
those pages rather than whacking someone else with the problem.

We should do this for munmap() and possibly even exit(), too.

We drop the mmap_sem while performing the dirty page balancing.  It doesn't
seem right to hold mmap_sem for that long.

Note that this patch only affects MS_ASYNC.  MS_SYNC will be syncing all the
dirty pages anyway.

We note that msync(MS_SYNC) does a full-file-sync inside mmap_sem, and always
has.  We can fix that up...

The patch also tightens up the mmap_sem coverage in sys_msync(): no point in
taking it while we perform the incoming arg checking.

Cc: Hugh Dickins <hugh@veritas.com>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/msync.c | 93 +++++++++++++++++++++++++++++++++++++++++---------------------
 1 file changed, 61 insertions(+), 32 deletions(-)

(limited to 'mm')

diff --git a/mm/msync.c b/mm/msync.c
index 3563a56e1a51..8a66f5d5d4f0 100644
--- a/mm/msync.c
+++ b/mm/msync.c
@@ -12,17 +12,20 @@
 #include <linux/mm.h>
 #include <linux/mman.h>
 #include <linux/hugetlb.h>
+#include <linux/writeback.h>
+#include <linux/file.h>
 #include <linux/syscalls.h>
 
 #include <asm/pgtable.h>
 #include <asm/tlbflush.h>
 
-static void msync_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
+static unsigned long msync_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 				unsigned long addr, unsigned long end)
 {
 	pte_t *pte;
 	spinlock_t *ptl;
 	int progress = 0;
+	unsigned long ret = 0;
 
 again:
 	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
@@ -43,58 +46,64 @@ again:
 		if (!page)
 			continue;
 		if (ptep_clear_flush_dirty(vma, addr, pte) ||
-		    page_test_and_clear_dirty(page))
-			set_page_dirty(page);
+				page_test_and_clear_dirty(page))
+			ret += set_page_dirty(page);
 		progress += 3;
 	} while (pte++, addr += PAGE_SIZE, addr != end);
 	pte_unmap_unlock(pte - 1, ptl);
 	cond_resched();
 	if (addr != end)
 		goto again;
+	return ret;
 }
 
-static inline void msync_pmd_range(struct vm_area_struct *vma, pud_t *pud,
-				unsigned long addr, unsigned long end)
+static inline unsigned long msync_pmd_range(struct vm_area_struct *vma,
+			pud_t *pud, unsigned long addr, unsigned long end)
 {
 	pmd_t *pmd;
 	unsigned long next;
+	unsigned long ret = 0;
 
 	pmd = pmd_offset(pud, addr);
 	do {
 		next = pmd_addr_end(addr, end);
 		if (pmd_none_or_clear_bad(pmd))
 			continue;
-		msync_pte_range(vma, pmd, addr, next);
+		ret += msync_pte_range(vma, pmd, addr, next);
 	} while (pmd++, addr = next, addr != end);
+	return ret;
 }
 
-static inline void msync_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
-				unsigned long addr, unsigned long end)
+static inline unsigned long msync_pud_range(struct vm_area_struct *vma,
+			pgd_t *pgd, unsigned long addr, unsigned long end)
 {
 	pud_t *pud;
 	unsigned long next;
+	unsigned long ret = 0;
 
 	pud = pud_offset(pgd, addr);
 	do {
 		next = pud_addr_end(addr, end);
 		if (pud_none_or_clear_bad(pud))
 			continue;
-		msync_pmd_range(vma, pud, addr, next);
+		ret += msync_pmd_range(vma, pud, addr, next);
 	} while (pud++, addr = next, addr != end);
+	return ret;
 }
 
-static void msync_page_range(struct vm_area_struct *vma,
+static unsigned long msync_page_range(struct vm_area_struct *vma,
 				unsigned long addr, unsigned long end)
 {
 	pgd_t *pgd;
 	unsigned long next;
+	unsigned long ret = 0;
 
 	/* For hugepages we can't go walking the page table normally,
 	 * but that's ok, hugetlbfs is memory based, so we don't need
 	 * to do anything more on an msync().
 	 */
 	if (vma->vm_flags & VM_HUGETLB)
-		return;
+		return 0;
 
 	BUG_ON(addr >= end);
 	pgd = pgd_offset(vma->vm_mm, addr);
@@ -103,8 +112,9 @@ static void msync_page_range(struct vm_area_struct *vma,
 		next = pgd_addr_end(addr, end);
 		if (pgd_none_or_clear_bad(pgd))
 			continue;
-		msync_pud_range(vma, pgd, addr, next);
+		ret += msync_pud_range(vma, pgd, addr, next);
 	} while (pgd++, addr = next, addr != end);
+	return ret;
 }
 
 /*
@@ -118,8 +128,9 @@ static void msync_page_range(struct vm_area_struct *vma,
  * So my _not_ starting I/O in MS_ASYNC we provide complete flexibility to
  * applications.
  */
-static int msync_interval(struct vm_area_struct *vma,
-			unsigned long addr, unsigned long end, int flags)
+static int msync_interval(struct vm_area_struct *vma, unsigned long addr,
+			unsigned long end, int flags,
+			unsigned long *nr_pages_dirtied)
 {
 	int ret = 0;
 	struct file *file = vma->vm_file;
@@ -128,7 +139,7 @@ static int msync_interval(struct vm_area_struct *vma,
 		return -EBUSY;
 
 	if (file && (vma->vm_flags & VM_SHARED)) {
-		msync_page_range(vma, addr, end);
+		*nr_pages_dirtied = msync_page_range(vma, addr, end);
 
 		if (flags & MS_SYNC) {
 			struct address_space *mapping = file->f_mapping;
@@ -157,11 +168,8 @@ asmlinkage long sys_msync(unsigned long start, size_t len, int flags)
 	unsigned long end;
 	struct vm_area_struct *vma;
 	int unmapped_error, error = -EINVAL;
+	int done = 0;
 
-	if (flags & MS_SYNC)
-		current->flags |= PF_SYNCWRITE;
-
-	down_read(&current->mm->mmap_sem);
 	if (flags & ~(MS_ASYNC | MS_INVALIDATE | MS_SYNC))
 		goto out;
 	if (start & ~PAGE_MASK)
@@ -180,13 +188,19 @@ asmlinkage long sys_msync(unsigned long start, size_t len, int flags)
 	 * If the interval [start,end) covers some unmapped address ranges,
 	 * just ignore them, but return -ENOMEM at the end.
 	 */
+	down_read(&current->mm->mmap_sem);
+	if (flags & MS_SYNC)
+		current->flags |= PF_SYNCWRITE;
 	vma = find_vma(current->mm, start);
 	unmapped_error = 0;
-	for (;;) {
+	do {
+		unsigned long nr_pages_dirtied = 0;
+		struct file *file;
+
 		/* Still start < end. */
 		error = -ENOMEM;
 		if (!vma)
-			goto out;
+			goto out_unlock;
 		/* Here start < vma->vm_end. */
 		if (start < vma->vm_start) {
 			unmapped_error = -ENOMEM;
@@ -195,22 +209,37 @@ asmlinkage long sys_msync(unsigned long start, size_t len, int flags)
 		/* Here vma->vm_start <= start < vma->vm_end. */
 		if (end <= vma->vm_end) {
 			if (start < end) {
-				error = msync_interval(vma, start, end, flags);
+				error = msync_interval(vma, start, end, flags,
+							&nr_pages_dirtied);
 				if (error)
-					goto out;
+					goto out_unlock;
 			}
 			error = unmapped_error;
-			goto out;
+			done = 1;
+		} else {
+			/* Here vma->vm_start <= start < vma->vm_end < end. */
+			error = msync_interval(vma, start, vma->vm_end, flags,
+						&nr_pages_dirtied);
+			if (error)
+				goto out_unlock;
 		}
-		/* Here vma->vm_start <= start < vma->vm_end < end. */
-		error = msync_interval(vma, start, vma->vm_end, flags);
-		if (error)
-			goto out;
+		file = vma->vm_file;
 		start = vma->vm_end;
-		vma = vma->vm_next;
-	}
-out:
-	up_read(&current->mm->mmap_sem);
+		if ((flags & MS_ASYNC) && file && nr_pages_dirtied) {
+			get_file(file);
+			up_read(&current->mm->mmap_sem);
+			balance_dirty_pages_ratelimited_nr(file->f_mapping,
+							nr_pages_dirtied);
+			fput(file);
+			down_read(&current->mm->mmap_sem);
+			vma = find_vma(current->mm, start);
+		} else {
+			vma = vma->vm_next;
+		}
+	} while (!done);
+out_unlock:
 	current->flags &= ~PF_SYNCWRITE;
+	up_read(&current->mm->mmap_sem);
+out:
 	return error;
 }
-- 
cgit v1.2.3


From 707c21c848deeb0200ba3f07e4ba90e6dc419c2f Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Fri, 24 Mar 2006 03:18:13 -0800
Subject: [PATCH] msync(MS_SYNC): don't hold mmap_sem while syncing

It seems bad to hold mmap_sem while performing synchronous disk I/O.  Alter
the msync(MS_SYNC) code so that the lock is released while we sync the file.

Cc: Hugh Dickins <hugh@veritas.com>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/msync.c | 49 ++++++++++++++++++++++++++-----------------------
 1 file changed, 26 insertions(+), 23 deletions(-)

(limited to 'mm')

diff --git a/mm/msync.c b/mm/msync.c
index 8a66f5d5d4f0..ee9bd6759833 100644
--- a/mm/msync.c
+++ b/mm/msync.c
@@ -132,35 +132,14 @@ static int msync_interval(struct vm_area_struct *vma, unsigned long addr,
 			unsigned long end, int flags,
 			unsigned long *nr_pages_dirtied)
 {
-	int ret = 0;
 	struct file *file = vma->vm_file;
 
 	if ((flags & MS_INVALIDATE) && (vma->vm_flags & VM_LOCKED))
 		return -EBUSY;
 
-	if (file && (vma->vm_flags & VM_SHARED)) {
+	if (file && (vma->vm_flags & VM_SHARED))
 		*nr_pages_dirtied = msync_page_range(vma, addr, end);
-
-		if (flags & MS_SYNC) {
-			struct address_space *mapping = file->f_mapping;
-			int err;
-
-			ret = filemap_fdatawrite(mapping);
-			if (file->f_op && file->f_op->fsync) {
-				/*
-				 * We don't take i_mutex here because mmap_sem
-				 * is already held.
-				 */
-				err = file->f_op->fsync(file,file->f_dentry,1);
-				if (err && !ret)
-					ret = err;
-			}
-			err = filemap_fdatawait(mapping);
-			if (!ret)
-				ret = err;
-		}
-	}
-	return ret;
+	return 0;
 }
 
 asmlinkage long sys_msync(unsigned long start, size_t len, int flags)
@@ -233,6 +212,30 @@ asmlinkage long sys_msync(unsigned long start, size_t len, int flags)
 			fput(file);
 			down_read(&current->mm->mmap_sem);
 			vma = find_vma(current->mm, start);
+		} else if ((flags & MS_SYNC) && file &&
+				(vma->vm_flags & VM_SHARED)) {
+			struct address_space *mapping;
+			int err;
+
+			get_file(file);
+			up_read(&current->mm->mmap_sem);
+			mapping = file->f_mapping;
+			error = filemap_fdatawrite(mapping);
+			if (file->f_op && file->f_op->fsync) {
+				mutex_lock(&mapping->host->i_mutex);
+				err = file->f_op->fsync(file,file->f_dentry,1);
+				mutex_unlock(&mapping->host->i_mutex);
+				if (err && !error)
+					error = err;
+			}
+			err = filemap_fdatawait(mapping);
+			if (err && !error)
+				error = err;
+			fput(file);
+			down_read(&current->mm->mmap_sem);
+			if (error)
+				goto out_unlock;
+			vma = find_vma(current->mm, start);
 		} else {
 			vma = vma->vm_next;
 		}
-- 
cgit v1.2.3


From 676758bdb7bfca8413a85203921746f446e237be Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Fri, 24 Mar 2006 03:18:14 -0800
Subject: [PATCH] msync: fix return value

msync() does a strange thing.  Essentially:

	vma = find_vma();
	for ( ; ; ) {
		if (!vma)
			return -ENOMEM;
		...
		vma = vma->vm_next;
	}

so an msync() request which starts within or before a valid VMA and which ends
within or beyond the final VMA will incorrectly return -ENOMEM.

Fix.

Cc: Hugh Dickins <hugh@veritas.com>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/msync.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

(limited to 'mm')

diff --git a/mm/msync.c b/mm/msync.c
index ee9bd6759833..d6a50f3f28b6 100644
--- a/mm/msync.c
+++ b/mm/msync.c
@@ -146,7 +146,8 @@ asmlinkage long sys_msync(unsigned long start, size_t len, int flags)
 {
 	unsigned long end;
 	struct vm_area_struct *vma;
-	int unmapped_error, error = -EINVAL;
+	int unmapped_error = 0;
+	int error = -EINVAL;
 	int done = 0;
 
 	if (flags & ~(MS_ASYNC | MS_INVALIDATE | MS_SYNC))
@@ -171,15 +172,14 @@ asmlinkage long sys_msync(unsigned long start, size_t len, int flags)
 	if (flags & MS_SYNC)
 		current->flags |= PF_SYNCWRITE;
 	vma = find_vma(current->mm, start);
-	unmapped_error = 0;
+	if (!vma) {
+		error = -ENOMEM;
+		goto out_unlock;
+	}
 	do {
 		unsigned long nr_pages_dirtied = 0;
 		struct file *file;
 
-		/* Still start < end. */
-		error = -ENOMEM;
-		if (!vma)
-			goto out_unlock;
 		/* Here start < vma->vm_end. */
 		if (start < vma->vm_start) {
 			unmapped_error = -ENOMEM;
@@ -239,7 +239,7 @@ asmlinkage long sys_msync(unsigned long start, size_t len, int flags)
 		} else {
 			vma = vma->vm_next;
 		}
-	} while (!done);
+	} while (vma && !done);
 out_unlock:
 	current->flags &= ~PF_SYNCWRITE;
 	up_read(&current->mm->mmap_sem);
-- 
cgit v1.2.3


From 8f2e9f157a94f444dad974b088b853ac40785b02 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Fri, 24 Mar 2006 03:18:15 -0800
Subject: [PATCH] msync(): use do_fsync()

No need to duplicate all that code.

Cc: Hugh Dickins <hugh@veritas.com>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/msync.c | 17 ++---------------
 1 file changed, 2 insertions(+), 15 deletions(-)

(limited to 'mm')

diff --git a/mm/msync.c b/mm/msync.c
index d6a50f3f28b6..2672b8dc3d89 100644
--- a/mm/msync.c
+++ b/mm/msync.c
@@ -9,6 +9,7 @@
  */
 #include <linux/slab.h>
 #include <linux/pagemap.h>
+#include <linux/fs.h>
 #include <linux/mm.h>
 #include <linux/mman.h>
 #include <linux/hugetlb.h>
@@ -214,23 +215,9 @@ asmlinkage long sys_msync(unsigned long start, size_t len, int flags)
 			vma = find_vma(current->mm, start);
 		} else if ((flags & MS_SYNC) && file &&
 				(vma->vm_flags & VM_SHARED)) {
-			struct address_space *mapping;
-			int err;
-
 			get_file(file);
 			up_read(&current->mm->mmap_sem);
-			mapping = file->f_mapping;
-			error = filemap_fdatawrite(mapping);
-			if (file->f_op && file->f_op->fsync) {
-				mutex_lock(&mapping->host->i_mutex);
-				err = file->f_op->fsync(file,file->f_dentry,1);
-				mutex_unlock(&mapping->host->i_mutex);
-				if (err && !error)
-					error = err;
-			}
-			err = filemap_fdatawait(mapping);
-			if (err && !error)
-				error = err;
+			error = do_fsync(file, 0);
 			fput(file);
 			down_read(&current->mm->mmap_sem);
 			if (error)
-- 
cgit v1.2.3


From 96840aa00a031069a136ec4c55d0bdd09ac6d3a7 Mon Sep 17 00:00:00 2001
From: Davi Arnaut <davi.arnaut@gmail.com>
Date: Fri, 24 Mar 2006 03:18:42 -0800
Subject: [PATCH] strndup_user()

This patch series creates a strndup_user() function to easy copying C strings
from userspace.  Also we avoid common pitfalls like userspace modifying the
final \0 after the strlen_user().

Signed-off-by: Davi Arnaut <davi.arnaut@gmail.com>
Cc: David Howells <dhowells@redhat.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/string.h |  2 ++
 mm/util.c              | 37 +++++++++++++++++++++++++++++++++++++
 2 files changed, 39 insertions(+)

(limited to 'mm')

diff --git a/include/linux/string.h b/include/linux/string.h
index 369be3264a55..dee221429ad0 100644
--- a/include/linux/string.h
+++ b/include/linux/string.h
@@ -18,6 +18,8 @@ extern char * strsep(char **,const char *);
 extern __kernel_size_t strspn(const char *,const char *);
 extern __kernel_size_t strcspn(const char *,const char *);
 
+extern char *strndup_user(const char __user *, long);
+
 /*
  * Include machine specific inline routines
  */
diff --git a/mm/util.c b/mm/util.c
index 5f4bb59da63c..49e29f751b50 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -1,6 +1,8 @@
 #include <linux/slab.h>
 #include <linux/string.h>
 #include <linux/module.h>
+#include <linux/err.h>
+#include <asm/uaccess.h>
 
 /**
  * kzalloc - allocate memory. The memory is set to zero.
@@ -37,3 +39,38 @@ char *kstrdup(const char *s, gfp_t gfp)
 	return buf;
 }
 EXPORT_SYMBOL(kstrdup);
+
+/*
+ * strndup_user - duplicate an existing string from user space
+ *
+ * @s: The string to duplicate
+ * @n: Maximum number of bytes to copy, including the trailing NUL.
+ */
+char *strndup_user(const char __user *s, long n)
+{
+	char *p;
+	long length;
+
+	length = strnlen_user(s, n);
+
+	if (!length)
+		return ERR_PTR(-EFAULT);
+
+	if (length > n)
+		return ERR_PTR(-EINVAL);
+
+	p = kmalloc(length, GFP_KERNEL);
+
+	if (!p)
+		return ERR_PTR(-ENOMEM);
+
+	if (copy_from_user(p, s, length)) {
+		kfree(p);
+		return ERR_PTR(-EFAULT);
+	}
+
+	p[length - 1] = '\0';
+
+	return p;
+}
+EXPORT_SYMBOL(strndup_user);
-- 
cgit v1.2.3


From 16538c40776b8be6b0f23966e08fdc7b8fff823f Mon Sep 17 00:00:00 2001
From: Amos Waterland <apw@us.ibm.com>
Date: Fri, 24 Mar 2006 18:30:53 +0100
Subject: The comment describing how MS_ASYNC works in msync.c is confusing
 because of a typo.  This patch just changes "my" to "by", which I believe was
 the original intent.

Signed-off-by: Adrian Bunk <bunk@stusta.de>
---
 mm/msync.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'mm')

diff --git a/mm/msync.c b/mm/msync.c
index 2672b8dc3d89..bc6c95376366 100644
--- a/mm/msync.c
+++ b/mm/msync.c
@@ -126,7 +126,7 @@ static unsigned long msync_page_range(struct vm_area_struct *vma,
  * write out the dirty pages and wait on the writeout and check the result.
  * Or the application may run fadvise(FADV_DONTNEED) against the fd to start
  * async writeout immediately.
- * So my _not_ starting I/O in MS_ASYNC we provide complete flexibility to
+ * So by _not_ starting I/O in MS_ASYNC we provide complete flexibility to
  * applications.
  */
 static int msync_interval(struct vm_area_struct *vma, unsigned long addr,
-- 
cgit v1.2.3


From 871751e25d956ad24f129ca972b7851feaa61d53 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sat, 25 Mar 2006 03:06:39 -0800
Subject: [PATCH] slab: implement /proc/slab_allocators

Implement /proc/slab_allocators.   It produces output like:

idr_layer_cache: 80 idr_pre_get+0x33/0x4e
buffer_head: 2555 alloc_buffer_head+0x20/0x75
mm_struct: 9 mm_alloc+0x1e/0x42
mm_struct: 20 dup_mm+0x36/0x370
vm_area_struct: 384 dup_mm+0x18f/0x370
vm_area_struct: 151 do_mmap_pgoff+0x2e0/0x7c3
vm_area_struct: 1 split_vma+0x5a/0x10e
vm_area_struct: 11 do_brk+0x206/0x2e2
vm_area_struct: 2 copy_vma+0xda/0x142
vm_area_struct: 9 setup_arg_pages+0x99/0x214
fs_cache: 8 copy_fs_struct+0x21/0x133
fs_cache: 29 copy_process+0xf38/0x10e3
files_cache: 30 alloc_files+0x1b/0xcf
signal_cache: 81 copy_process+0xbaa/0x10e3
sighand_cache: 77 copy_process+0xe65/0x10e3
sighand_cache: 1 de_thread+0x4d/0x5f8
anon_vma: 241 anon_vma_prepare+0xd9/0xf3
size-2048: 1 add_sect_attrs+0x5f/0x145
size-2048: 2 journal_init_revoke+0x99/0x302
size-2048: 2 journal_init_revoke+0x137/0x302
size-2048: 2 journal_init_inode+0xf9/0x1c4

Cc: Manfred Spraul <manfred@colorfullife.com>
Cc: Alexander Nyberg <alexn@telia.com>
Cc: Pekka Enberg <penberg@cs.helsinki.fi>
Cc: Christoph Lameter <clameter@engr.sgi.com>
Cc: Ravikiran Thirumalai <kiran@scalex86.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
DESC
slab-leaks3-locking-fix
EDESC
From: Andrew Morton <akpm@osdl.org>

Update for slab-remove-cachep-spinlock.patch

Cc: Al Viro <viro@ftp.linux.org.uk>
Cc: Manfred Spraul <manfred@colorfullife.com>
Cc: Alexander Nyberg <alexn@telia.com>
Cc: Pekka Enberg <penberg@cs.helsinki.fi>
Cc: Christoph Lameter <clameter@engr.sgi.com>
Cc: Ravikiran Thirumalai <kiran@scalex86.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 fs/proc/proc_misc.c  |  37 +++++++++++
 include/linux/slab.h |   6 +-
 lib/Kconfig.debug    |   4 ++
 mm/slab.c            | 180 +++++++++++++++++++++++++++++++++++++++++++++++++--
 mm/util.c            |   4 +-
 net/core/skbuff.c    |   2 +-
 6 files changed, 222 insertions(+), 11 deletions(-)

(limited to 'mm')

diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c
index 826c131994c3..1e9ea37d457e 100644
--- a/fs/proc/proc_misc.c
+++ b/fs/proc/proc_misc.c
@@ -485,6 +485,40 @@ static struct file_operations proc_slabinfo_operations = {
 	.llseek		= seq_lseek,
 	.release	= seq_release,
 };
+
+#ifdef CONFIG_DEBUG_SLAB_LEAK
+extern struct seq_operations slabstats_op;
+static int slabstats_open(struct inode *inode, struct file *file)
+{
+	unsigned long *n = kzalloc(PAGE_SIZE, GFP_KERNEL);
+	int ret = -ENOMEM;
+	if (n) {
+		ret = seq_open(file, &slabstats_op);
+		if (!ret) {
+			struct seq_file *m = file->private_data;
+			*n = PAGE_SIZE / (2 * sizeof(unsigned long));
+			m->private = n;
+			n = NULL;
+		}
+		kfree(n);
+	}
+	return ret;
+}
+
+static int slabstats_release(struct inode *inode, struct file *file)
+{
+	struct seq_file *m = file->private_data;
+	kfree(m->private);
+	return seq_release(inode, file);
+}
+
+static struct file_operations proc_slabstats_operations = {
+	.open		= slabstats_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= slabstats_release,
+};
+#endif
 #endif
 
 static int show_stat(struct seq_file *p, void *v)
@@ -744,6 +778,9 @@ void __init proc_misc_init(void)
 	create_seq_entry("interrupts", 0, &proc_interrupts_operations);
 #ifdef CONFIG_SLAB
 	create_seq_entry("slabinfo",S_IWUSR|S_IRUGO,&proc_slabinfo_operations);
+#ifdef CONFIG_DEBUG_SLAB_LEAK
+	create_seq_entry("slab_allocators", 0 ,&proc_slabstats_operations);
+#endif
 #endif
 	create_seq_entry("buddyinfo",S_IRUGO, &fragmentation_file_operations);
 	create_seq_entry("vmstat",S_IRUGO, &proc_vmstat_file_operations);
diff --git a/include/linux/slab.h b/include/linux/slab.h
index e2ee5b268797..f88e08a5802c 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -77,11 +77,12 @@ struct cache_sizes {
 };
 extern struct cache_sizes malloc_sizes[];
 
-#ifndef CONFIG_DEBUG_SLAB
 extern void *__kmalloc(size_t, gfp_t);
+#ifndef CONFIG_DEBUG_SLAB
+#define ____kmalloc(size, flags) __kmalloc(size, flags)
 #else
 extern void *__kmalloc_track_caller(size_t, gfp_t, void*);
-#define __kmalloc(size, flags) \
+#define ____kmalloc(size, flags) \
     __kmalloc_track_caller(size, flags, __builtin_return_address(0))
 #endif
 
@@ -173,6 +174,7 @@ static inline void *kcalloc(size_t n, size_t size, gfp_t flags)
 #define kmem_ptr_validate(a, b) (0)
 #define kmem_cache_alloc_node(c, f, n) kmem_cache_alloc(c, f)
 #define kmalloc_node(s, f, n) kmalloc(s, f)
+#define ____kmalloc kmalloc
 
 #endif /* CONFIG_SLOB */
 
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index f2618e1c2b93..1fe3f897145f 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -85,6 +85,10 @@ config DEBUG_SLAB
 	  allocation as well as poisoning memory on free to catch use of freed
 	  memory. This can make kmalloc/kfree-intensive workloads much slower.
 
+config DEBUG_SLAB_LEAK
+	bool "Memory leak debugging"
+	depends on DEBUG_SLAB
+
 config DEBUG_PREEMPT
 	bool "Debug preemptible kernel"
 	depends on DEBUG_KERNEL && PREEMPT
diff --git a/mm/slab.c b/mm/slab.c
index 26138c9f8f00..a5047161084e 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -204,7 +204,8 @@
 typedef unsigned int kmem_bufctl_t;
 #define BUFCTL_END	(((kmem_bufctl_t)(~0U))-0)
 #define BUFCTL_FREE	(((kmem_bufctl_t)(~0U))-1)
-#define	SLAB_LIMIT	(((kmem_bufctl_t)(~0U))-2)
+#define	BUFCTL_ACTIVE	(((kmem_bufctl_t)(~0U))-2)
+#define	SLAB_LIMIT	(((kmem_bufctl_t)(~0U))-3)
 
 /* Max number of objs-per-slab for caches which use off-slab slabs.
  * Needed to avoid a possible looping condition in cache_grow().
@@ -2399,7 +2400,7 @@ static void slab_put_obj(struct kmem_cache *cachep, struct slab *slabp,
 	/* Verify that the slab belongs to the intended node */
 	WARN_ON(slabp->nodeid != nodeid);
 
-	if (slab_bufctl(slabp)[objnr] != BUFCTL_FREE) {
+	if (slab_bufctl(slabp)[objnr] + 1 <= SLAB_LIMIT + 1) {
 		printk(KERN_ERR "slab: double free detected in cache "
 				"'%s', objp %p\n", cachep->name, objp);
 		BUG();
@@ -2605,6 +2606,9 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp,
 		 */
 		cachep->dtor(objp + obj_offset(cachep), cachep, 0);
 	}
+#ifdef CONFIG_DEBUG_SLAB_LEAK
+	slab_bufctl(slabp)[objnr] = BUFCTL_FREE;
+#endif
 	if (cachep->flags & SLAB_POISON) {
 #ifdef CONFIG_DEBUG_PAGEALLOC
 		if ((cachep->buffer_size % PAGE_SIZE)==0 && OFF_SLAB(cachep)) {
@@ -2788,6 +2792,16 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep,
 		*dbg_redzone1(cachep, objp) = RED_ACTIVE;
 		*dbg_redzone2(cachep, objp) = RED_ACTIVE;
 	}
+#ifdef CONFIG_DEBUG_SLAB_LEAK
+	{
+		struct slab *slabp;
+		unsigned objnr;
+
+		slabp = page_get_slab(virt_to_page(objp));
+		objnr = (unsigned)(objp - slabp->s_mem) / cachep->buffer_size;
+		slab_bufctl(slabp)[objnr] = BUFCTL_ACTIVE;
+	}
+#endif
 	objp += obj_offset(cachep);
 	if (cachep->ctor && cachep->flags & SLAB_POISON) {
 		unsigned long ctor_flags = SLAB_CTOR_CONSTRUCTOR;
@@ -3220,22 +3234,23 @@ static __always_inline void *__do_kmalloc(size_t size, gfp_t flags,
 	return __cache_alloc(cachep, flags, caller);
 }
 
-#ifndef CONFIG_DEBUG_SLAB
 
 void *__kmalloc(size_t size, gfp_t flags)
 {
+#ifndef CONFIG_DEBUG_SLAB
 	return __do_kmalloc(size, flags, NULL);
+#else
+	return __do_kmalloc(size, flags, __builtin_return_address(0));
+#endif
 }
 EXPORT_SYMBOL(__kmalloc);
 
-#else
-
+#ifdef CONFIG_DEBUG_SLAB
 void *__kmalloc_track_caller(size_t size, gfp_t flags, void *caller)
 {
 	return __do_kmalloc(size, flags, caller);
 }
 EXPORT_SYMBOL(__kmalloc_track_caller);
-
 #endif
 
 #ifdef CONFIG_SMP
@@ -3899,6 +3914,159 @@ ssize_t slabinfo_write(struct file *file, const char __user * buffer,
 		res = count;
 	return res;
 }
+
+#ifdef CONFIG_DEBUG_SLAB_LEAK
+
+static void *leaks_start(struct seq_file *m, loff_t *pos)
+{
+	loff_t n = *pos;
+	struct list_head *p;
+
+	mutex_lock(&cache_chain_mutex);
+	p = cache_chain.next;
+	while (n--) {
+		p = p->next;
+		if (p == &cache_chain)
+			return NULL;
+	}
+	return list_entry(p, struct kmem_cache, next);
+}
+
+static inline int add_caller(unsigned long *n, unsigned long v)
+{
+	unsigned long *p;
+	int l;
+	if (!v)
+		return 1;
+	l = n[1];
+	p = n + 2;
+	while (l) {
+		int i = l/2;
+		unsigned long *q = p + 2 * i;
+		if (*q == v) {
+			q[1]++;
+			return 1;
+		}
+		if (*q > v) {
+			l = i;
+		} else {
+			p = q + 2;
+			l -= i + 1;
+		}
+	}
+	if (++n[1] == n[0])
+		return 0;
+	memmove(p + 2, p, n[1] * 2 * sizeof(unsigned long) - ((void *)p - (void *)n));
+	p[0] = v;
+	p[1] = 1;
+	return 1;
+}
+
+static void handle_slab(unsigned long *n, struct kmem_cache *c, struct slab *s)
+{
+	void *p;
+	int i;
+	if (n[0] == n[1])
+		return;
+	for (i = 0, p = s->s_mem; i < c->num; i++, p += c->buffer_size) {
+		if (slab_bufctl(s)[i] != BUFCTL_ACTIVE)
+			continue;
+		if (!add_caller(n, (unsigned long)*dbg_userword(c, p)))
+			return;
+	}
+}
+
+static void show_symbol(struct seq_file *m, unsigned long address)
+{
+#ifdef CONFIG_KALLSYMS
+	char *modname;
+	const char *name;
+	unsigned long offset, size;
+	char namebuf[KSYM_NAME_LEN+1];
+
+	name = kallsyms_lookup(address, &size, &offset, &modname, namebuf);
+
+	if (name) {
+		seq_printf(m, "%s+%#lx/%#lx", name, offset, size);
+		if (modname)
+			seq_printf(m, " [%s]", modname);
+		return;
+	}
+#endif
+	seq_printf(m, "%p", (void *)address);
+}
+
+static int leaks_show(struct seq_file *m, void *p)
+{
+	struct kmem_cache *cachep = p;
+	struct list_head *q;
+	struct slab *slabp;
+	struct kmem_list3 *l3;
+	const char *name;
+	unsigned long *n = m->private;
+	int node;
+	int i;
+
+	if (!(cachep->flags & SLAB_STORE_USER))
+		return 0;
+	if (!(cachep->flags & SLAB_RED_ZONE))
+		return 0;
+
+	/* OK, we can do it */
+
+	n[1] = 0;
+
+	for_each_online_node(node) {
+		l3 = cachep->nodelists[node];
+		if (!l3)
+			continue;
+
+		check_irq_on();
+		spin_lock_irq(&l3->list_lock);
+
+		list_for_each(q, &l3->slabs_full) {
+			slabp = list_entry(q, struct slab, list);
+			handle_slab(n, cachep, slabp);
+		}
+		list_for_each(q, &l3->slabs_partial) {
+			slabp = list_entry(q, struct slab, list);
+			handle_slab(n, cachep, slabp);
+		}
+		spin_unlock_irq(&l3->list_lock);
+	}
+	name = cachep->name;
+	if (n[0] == n[1]) {
+		/* Increase the buffer size */
+		mutex_unlock(&cache_chain_mutex);
+		m->private = kzalloc(n[0] * 4 * sizeof(unsigned long), GFP_KERNEL);
+		if (!m->private) {
+			/* Too bad, we are really out */
+			m->private = n;
+			mutex_lock(&cache_chain_mutex);
+			return -ENOMEM;
+		}
+		*(unsigned long *)m->private = n[0] * 2;
+		kfree(n);
+		mutex_lock(&cache_chain_mutex);
+		/* Now make sure this entry will be retried */
+		m->count = m->size;
+		return 0;
+	}
+	for (i = 0; i < n[1]; i++) {
+		seq_printf(m, "%s: %lu ", name, n[2*i+3]);
+		show_symbol(m, n[2*i+2]);
+		seq_putc(m, '\n');
+	}
+	return 0;
+}
+
+struct seq_operations slabstats_op = {
+	.start = leaks_start,
+	.next = s_next,
+	.stop = s_stop,
+	.show = leaks_show,
+};
+#endif
 #endif
 
 /**
diff --git a/mm/util.c b/mm/util.c
index 49e29f751b50..b68d3d7d0359 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -11,7 +11,7 @@
  */
 void *kzalloc(size_t size, gfp_t flags)
 {
-	void *ret = kmalloc(size, flags);
+	void *ret = ____kmalloc(size, flags);
 	if (ret)
 		memset(ret, 0, size);
 	return ret;
@@ -33,7 +33,7 @@ char *kstrdup(const char *s, gfp_t gfp)
 		return NULL;
 
 	len = strlen(s) + 1;
-	buf = kmalloc(len, gfp);
+	buf = ____kmalloc(len, gfp);
 	if (buf)
 		memcpy(buf, s, len);
 	return buf;
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index c9f878454531..09464fa8d72f 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -149,7 +149,7 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
 
 	/* Get the DATA. Size must match skb_add_mtu(). */
 	size = SKB_DATA_ALIGN(size);
-	data = kmalloc(size + sizeof(struct skb_shared_info), gfp_mask);
+	data = ____kmalloc(size + sizeof(struct skb_shared_info), gfp_mask);
 	if (!data)
 		goto nodata;
 
-- 
cgit v1.2.3


From a8c0f9a41f88da703ade33f9c1626a55c786e8bb Mon Sep 17 00:00:00 2001
From: Pekka Enberg <penberg@cs.helsinki.fi>
Date: Sat, 25 Mar 2006 03:06:42 -0800
Subject: [PATCH] slab: introduce kmem_cache_zalloc allocator

Introduce a memory-zeroing variant of kmem_cache_alloc.  The allocator
already exits in XFS and there are potential users for it so this patch
makes the allocator available for the general public.

Signed-off-by: Pekka Enberg <penberg@cs.helsinki.fi>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/slab.h |  2 ++
 mm/slab.c            | 17 +++++++++++++++++
 mm/slob.c            | 10 ++++++++++
 3 files changed, 29 insertions(+)

(limited to 'mm')

diff --git a/include/linux/slab.h b/include/linux/slab.h
index f88e08a5802c..1216b09e07b1 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -64,6 +64,7 @@ extern kmem_cache_t *kmem_cache_create(const char *, size_t, size_t, unsigned lo
 extern int kmem_cache_destroy(kmem_cache_t *);
 extern int kmem_cache_shrink(kmem_cache_t *);
 extern void *kmem_cache_alloc(kmem_cache_t *, gfp_t);
+extern void *kmem_cache_zalloc(struct kmem_cache *, gfp_t);
 extern void kmem_cache_free(kmem_cache_t *, void *);
 extern unsigned int kmem_cache_size(kmem_cache_t *);
 extern const char *kmem_cache_name(kmem_cache_t *);
@@ -156,6 +157,7 @@ struct kmem_cache *kmem_cache_create(const char *c, size_t, size_t,
 	void (*)(void *, struct kmem_cache *, unsigned long));
 int kmem_cache_destroy(struct kmem_cache *c);
 void *kmem_cache_alloc(struct kmem_cache *c, gfp_t flags);
+void *kmem_cache_zalloc(struct kmem_cache *, gfp_t);
 void kmem_cache_free(struct kmem_cache *c, void *b);
 const char *kmem_cache_name(struct kmem_cache *);
 void *kmalloc(size_t size, gfp_t flags);
diff --git a/mm/slab.c b/mm/slab.c
index a5047161084e..6f5aeebd4306 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -3107,6 +3107,23 @@ void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
 }
 EXPORT_SYMBOL(kmem_cache_alloc);
 
+/**
+ * kmem_cache_alloc - Allocate an object. The memory is set to zero.
+ * @cache: The cache to allocate from.
+ * @flags: See kmalloc().
+ *
+ * Allocate an object from this cache and set the allocated memory to zero.
+ * The flags are only relevant if the cache has no available objects.
+ */
+void *kmem_cache_zalloc(struct kmem_cache *cache, gfp_t flags)
+{
+	void *ret = __cache_alloc(cache, flags, __builtin_return_address(0));
+	if (ret)
+		memset(ret, 0, obj_size(cache));
+	return ret;
+}
+EXPORT_SYMBOL(kmem_cache_zalloc);
+
 /**
  * kmem_ptr_validate - check if an untrusted pointer might
  *	be a slab entry.
diff --git a/mm/slob.c b/mm/slob.c
index a1f42bdc0245..9bcc7e2cabfd 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -294,6 +294,16 @@ void *kmem_cache_alloc(struct kmem_cache *c, gfp_t flags)
 }
 EXPORT_SYMBOL(kmem_cache_alloc);
 
+void *kmem_cache_zalloc(struct kmem_cache *c, gfp_t flags)
+{
+	void *ret = kmem_cache_alloc(c, flags);
+	if (ret)
+		memset(ret, 0, c->size);
+
+	return ret;
+}
+EXPORT_SYMBOL(kmem_cache_zalloc);
+
 void kmem_cache_free(struct kmem_cache *c, void *b)
 {
 	if (c->dtor)
-- 
cgit v1.2.3


From 40c07ae8daa659b8feb149c84731629386873c16 Mon Sep 17 00:00:00 2001
From: Pekka Enberg <penberg@cs.helsinki.fi>
Date: Sat, 25 Mar 2006 03:06:43 -0800
Subject: [PATCH] slab: optimize constant-size kzalloc calls

As suggested by Eric Dumazet, optimize kzalloc() calls that pass a
compile-time constant size.  Please note that the patch increases kernel
text slightly (~200 bytes for defconfig on x86).

Signed-off-by: Pekka Enberg <penberg@cs.helsinki.fi>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/slab.h | 30 +++++++++++++++++++++++++++---
 mm/util.c            |  6 +++---
 2 files changed, 30 insertions(+), 6 deletions(-)

(limited to 'mm')

diff --git a/include/linux/slab.h b/include/linux/slab.h
index 1216b09e07b1..15e1d9736b1b 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -110,7 +110,30 @@ found:
 	return __kmalloc(size, flags);
 }
 
-extern void *kzalloc(size_t, gfp_t);
+extern void *__kzalloc(size_t, gfp_t);
+
+static inline void *kzalloc(size_t size, gfp_t flags)
+{
+	if (__builtin_constant_p(size)) {
+		int i = 0;
+#define CACHE(x) \
+		if (size <= x) \
+			goto found; \
+		else \
+			i++;
+#include "kmalloc_sizes.h"
+#undef CACHE
+		{
+			extern void __you_cannot_kzalloc_that_much(void);
+			__you_cannot_kzalloc_that_much();
+		}
+found:
+		return kmem_cache_zalloc((flags & GFP_DMA) ?
+			malloc_sizes[i].cs_dmacachep :
+			malloc_sizes[i].cs_cachep, flags);
+	}
+	return __kzalloc(size, flags);
+}
 
 /**
  * kcalloc - allocate memory for an array. The memory is set to zero.
@@ -161,14 +184,14 @@ void *kmem_cache_zalloc(struct kmem_cache *, gfp_t);
 void kmem_cache_free(struct kmem_cache *c, void *b);
 const char *kmem_cache_name(struct kmem_cache *);
 void *kmalloc(size_t size, gfp_t flags);
-void *kzalloc(size_t size, gfp_t flags);
+void *__kzalloc(size_t size, gfp_t flags);
 void kfree(const void *m);
 unsigned int ksize(const void *m);
 unsigned int kmem_cache_size(struct kmem_cache *c);
 
 static inline void *kcalloc(size_t n, size_t size, gfp_t flags)
 {
-	return kzalloc(n * size, flags);
+	return __kzalloc(n * size, flags);
 }
 
 #define kmem_cache_shrink(d) (0)
@@ -176,6 +199,7 @@ static inline void *kcalloc(size_t n, size_t size, gfp_t flags)
 #define kmem_ptr_validate(a, b) (0)
 #define kmem_cache_alloc_node(c, f, n) kmem_cache_alloc(c, f)
 #define kmalloc_node(s, f, n) kmalloc(s, f)
+#define kzalloc(s, f) __kzalloc(s, f)
 #define ____kmalloc kmalloc
 
 #endif /* CONFIG_SLOB */
diff --git a/mm/util.c b/mm/util.c
index b68d3d7d0359..7368479220b3 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -5,18 +5,18 @@
 #include <asm/uaccess.h>
 
 /**
- * kzalloc - allocate memory. The memory is set to zero.
+ * __kzalloc - allocate memory. The memory is set to zero.
  * @size: how many bytes of memory are required.
  * @flags: the type of memory to allocate.
  */
-void *kzalloc(size_t size, gfp_t flags)
+void *__kzalloc(size_t size, gfp_t flags)
 {
 	void *ret = ____kmalloc(size, flags);
 	if (ret)
 		memset(ret, 0, size);
 	return ret;
 }
-EXPORT_SYMBOL(kzalloc);
+EXPORT_SYMBOL(__kzalloc);
 
 /*
  * kstrdup - allocate space for and copy an existing string
-- 
cgit v1.2.3


From c5e3b83e97be4e09961c0af101644643e5d03d17 Mon Sep 17 00:00:00 2001
From: Pekka Enberg <penberg@cs.helsinki.fi>
Date: Sat, 25 Mar 2006 03:06:43 -0800
Subject: [PATCH] mm: use kmem_cache_zalloc

Convert mm/ to use the new kmem_cache_zalloc allocator.

Signed-off-by: Pekka Enberg <penberg@cs.helsinki.fi>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/mmap.c | 6 ++----
 mm/slab.c | 3 +--
 2 files changed, 3 insertions(+), 6 deletions(-)

(limited to 'mm')

diff --git a/mm/mmap.c b/mm/mmap.c
index 0eb9894db6de..4f5b5709136a 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1040,12 +1040,11 @@ munmap_back:
 	 * specific mapper. the address has already been validated, but
 	 * not unmapped, but the maps are removed from the list.
 	 */
-	vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
+	vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
 	if (!vma) {
 		error = -ENOMEM;
 		goto unacct_error;
 	}
-	memset(vma, 0, sizeof(*vma));
 
 	vma->vm_mm = mm;
 	vma->vm_start = addr;
@@ -1896,12 +1895,11 @@ unsigned long do_brk(unsigned long addr, unsigned long len)
 	/*
 	 * create a vma struct for an anonymous mapping
 	 */
-	vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
+	vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
 	if (!vma) {
 		vm_unacct_memory(len >> PAGE_SHIFT);
 		return -ENOMEM;
 	}
-	memset(vma, 0, sizeof(*vma));
 
 	vma->vm_mm = mm;
 	vma->vm_start = addr;
diff --git a/mm/slab.c b/mm/slab.c
index 6f5aeebd4306..6a3760e0353c 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -1990,10 +1990,9 @@ kmem_cache_create (const char *name, size_t size, size_t align,
 	align = ralign;
 
 	/* Get cache's description obj. */
-	cachep = kmem_cache_alloc(&cache_cache, SLAB_KERNEL);
+	cachep = kmem_cache_zalloc(&cache_cache, SLAB_KERNEL);
 	if (!cachep)
 		goto oops;
-	memset(cachep, 0, sizeof(struct kmem_cache));
 
 #if DEBUG
 	cachep->obj_size = size;
-- 
cgit v1.2.3


From 3ded175a4b7a4548f3358dcf5f3ad65f63cdb4ed Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@sgi.com>
Date: Sat, 25 Mar 2006 03:06:44 -0800
Subject: [PATCH] slab: add transfer_objects() function

slabr_objects() can be used to transfer objects between various object
caches of the slab allocator.  It is currently only used during
__cache_alloc() to retrieve elements from the shared array.  We will be
using it soon to transfer elements from the alien caches to the remote
shared array.

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Cc: Pekka Enberg <penberg@cs.helsinki.fi>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/slab.c | 42 ++++++++++++++++++++++++++++--------------
 1 file changed, 28 insertions(+), 14 deletions(-)

(limited to 'mm')

diff --git a/mm/slab.c b/mm/slab.c
index 6a3760e0353c..dee857a8680b 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -898,6 +898,30 @@ static struct array_cache *alloc_arraycache(int node, int entries,
 	return nc;
 }
 
+/*
+ * Transfer objects in one arraycache to another.
+ * Locking must be handled by the caller.
+ *
+ * Return the number of entries transferred.
+ */
+static int transfer_objects(struct array_cache *to,
+		struct array_cache *from, unsigned int max)
+{
+	/* Figure out how many entries to transfer */
+	int nr = min(min(from->avail, max), to->limit - to->avail);
+
+	if (!nr)
+		return 0;
+
+	memcpy(to->entry + to->avail, from->entry + from->avail -nr,
+			sizeof(void *) *nr);
+
+	from->avail -= nr;
+	to->avail += nr;
+	to->touched = 1;
+	return nr;
+}
+
 #ifdef CONFIG_NUMA
 static void *__cache_alloc_node(struct kmem_cache *, gfp_t, int);
 static void *alternate_node_alloc(struct kmem_cache *, gfp_t);
@@ -2680,20 +2704,10 @@ retry:
 	BUG_ON(ac->avail > 0 || !l3);
 	spin_lock(&l3->list_lock);
 
-	if (l3->shared) {
-		struct array_cache *shared_array = l3->shared;
-		if (shared_array->avail) {
-			if (batchcount > shared_array->avail)
-				batchcount = shared_array->avail;
-			shared_array->avail -= batchcount;
-			ac->avail = batchcount;
-			memcpy(ac->entry,
-			       &(shared_array->entry[shared_array->avail]),
-			       sizeof(void *) * batchcount);
-			shared_array->touched = 1;
-			goto alloc_done;
-		}
-	}
+	/* See if we can refill from the shared array */
+	if (l3->shared && transfer_objects(ac, l3->shared, batchcount))
+		goto alloc_done;
+
 	while (batchcount > 0) {
 		struct list_head *entry;
 		struct slab *slabp;
-- 
cgit v1.2.3


From e00946fe2351307eb3eda7a3343530f6d2d1af2e Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@sgi.com>
Date: Sat, 25 Mar 2006 03:06:45 -0800
Subject: [PATCH] slab: Bypass free lists for __drain_alien_cache()

__drain_alien_cache() currently drains objects by freeing them to the
(remote) freelists of the original node.  However, each node also has a
shared list containing objects to be used on any processor of that node.
We can avoid a number of remote node accesses by copying the pointers to
the free objects directly into the remote shared array.

And while we are at it: Skip alien draining if the alien cache spinlock is
already taken.

Kiran reported that this is a performance benefit.

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Cc: Pekka Enberg <penberg@cs.helsinki.fi>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/slab.c | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

(limited to 'mm')

diff --git a/mm/slab.c b/mm/slab.c
index dee857a8680b..351aa6c587f7 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -971,6 +971,13 @@ static void __drain_alien_cache(struct kmem_cache *cachep,
 
 	if (ac->avail) {
 		spin_lock(&rl3->list_lock);
+		/*
+		 * Stuff objects into the remote nodes shared array first.
+		 * That way we could avoid the overhead of putting the objects
+		 * into the free lists and getting them back later.
+		 */
+		transfer_objects(rl3->shared, ac, ac->limit);
+
 		free_block(cachep, ac->entry, ac->avail, node);
 		ac->avail = 0;
 		spin_unlock(&rl3->list_lock);
@@ -986,8 +993,8 @@ static void reap_alien(struct kmem_cache *cachep, struct kmem_list3 *l3)
 
 	if (l3->alien) {
 		struct array_cache *ac = l3->alien[node];
-		if (ac && ac->avail) {
-			spin_lock_irq(&ac->lock);
+
+		if (ac && ac->avail && spin_trylock_irq(&ac->lock)) {
 			__drain_alien_cache(cachep, ac, node);
 			spin_unlock_irq(&ac->lock);
 		}
-- 
cgit v1.2.3


From cafeb02e098ecd58fb0bd797b2c9fbba3edf54f8 Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@sgi.com>
Date: Sat, 25 Mar 2006 03:06:46 -0800
Subject: [PATCH] alloc_kmemlist: Some cleanup in preparation for a real memory
 leak fix

Inspired by Jesper Juhl's patch from today

1. Get rid of err
	We do not set it to anything else but zero.

2. Drop the CONFIG_NUMA stuff.
	There are definitions for alloc_alien_cache and free_alien_cache()
	that do the right thing for the non NUMA case.

3. Better naming of variables.

4. Remove redundant cachep->nodelists[node] expressions.

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Signed-off-by: Jesper Juhl <jesper.juhl@gmail.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/slab.c | 34 +++++++++++++++++-----------------
 1 file changed, 17 insertions(+), 17 deletions(-)

(limited to 'mm')

diff --git a/mm/slab.c b/mm/slab.c
index 351aa6c587f7..ef9f60fe37d6 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -3424,37 +3424,38 @@ static int alloc_kmemlist(struct kmem_cache *cachep)
 {
 	int node;
 	struct kmem_list3 *l3;
-	int err = 0;
+	struct array_cache *new_shared;
+	struct array_cache **new_alien;
 
 	for_each_online_node(node) {
-		struct array_cache *nc = NULL, *new;
-		struct array_cache **new_alien = NULL;
-#ifdef CONFIG_NUMA
+
 		new_alien = alloc_alien_cache(node, cachep->limit);
 		if (!new_alien)
 			goto fail;
-#endif
-		new = alloc_arraycache(node, cachep->shared*cachep->batchcount,
+
+		new_shared = alloc_arraycache(node, cachep->shared*cachep->batchcount,
 					0xbaadf00d);
-		if (!new)
+		if (!new_shared)
 			goto fail;
+
 		l3 = cachep->nodelists[node];
 		if (l3) {
+			struct array_cache *shared = l3->shared;
+
 			spin_lock_irq(&l3->list_lock);
 
-			nc = cachep->nodelists[node]->shared;
-			if (nc)
-				free_block(cachep, nc->entry, nc->avail, node);
+			if (shared)
+				free_block(cachep, shared->entry, shared->avail, node);
 
-			l3->shared = new;
-			if (!cachep->nodelists[node]->alien) {
+			l3->shared = new_shared;
+			if (!l3->alien) {
 				l3->alien = new_alien;
 				new_alien = NULL;
 			}
 			l3->free_limit = (1 + nr_cpus_node(node)) *
 					cachep->batchcount + cachep->num;
 			spin_unlock_irq(&l3->list_lock);
-			kfree(nc);
+			kfree(shared);
 			free_alien_cache(new_alien);
 			continue;
 		}
@@ -3465,16 +3466,15 @@ static int alloc_kmemlist(struct kmem_cache *cachep)
 		kmem_list3_init(l3);
 		l3->next_reap = jiffies + REAPTIMEOUT_LIST3 +
 				((unsigned long)cachep) % REAPTIMEOUT_LIST3;
-		l3->shared = new;
+		l3->shared = new_shared;
 		l3->alien = new_alien;
 		l3->free_limit = (1 + nr_cpus_node(node)) *
 					cachep->batchcount + cachep->num;
 		cachep->nodelists[node] = l3;
 	}
-	return err;
+	return 0;
 fail:
-	err = -ENOMEM;
-	return err;
+	return -ENOMEM;
 }
 
 struct ccupdate_struct {
-- 
cgit v1.2.3


From 0718dc2a82c865ca75975acabaf984057f9fd488 Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@sgi.com>
Date: Sat, 25 Mar 2006 03:06:47 -0800
Subject: [PATCH] slab: fix memory leak in alloc_kmemlist

We have had this memory leak for a while now.  The situation is complicated
by the use of alloc_kmemlist() as a function to resize various caches by
do_tune_cpucache().

What we do here is first of all make sure that we deallocate properly in
the loop over all the nodes.

If we are just resizing caches then we can simply return with -ENOMEM if an
allocation fails.

If the cache is new then we need to rollback and remove all earlier
allocations.

We detect that a cache is new by checking if the link to the global cache
chain has been setup.  This is a bit hackish ....

(also fix up too overlong lines that I added in the last patch...)

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Cc: Jesper Juhl <jesper.juhl@gmail.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/slab.c | 33 ++++++++++++++++++++++++++++-----
 1 file changed, 28 insertions(+), 5 deletions(-)

(limited to 'mm')

diff --git a/mm/slab.c b/mm/slab.c
index ef9f60fe37d6..681837499d7d 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -3418,7 +3418,7 @@ const char *kmem_cache_name(struct kmem_cache *cachep)
 EXPORT_SYMBOL_GPL(kmem_cache_name);
 
 /*
- * This initializes kmem_list3 for all nodes.
+ * This initializes kmem_list3 or resizes varioius caches for all nodes.
  */
 static int alloc_kmemlist(struct kmem_cache *cachep)
 {
@@ -3433,10 +3433,13 @@ static int alloc_kmemlist(struct kmem_cache *cachep)
 		if (!new_alien)
 			goto fail;
 
-		new_shared = alloc_arraycache(node, cachep->shared*cachep->batchcount,
+		new_shared = alloc_arraycache(node,
+				cachep->shared*cachep->batchcount,
 					0xbaadf00d);
-		if (!new_shared)
+		if (!new_shared) {
+			free_alien_cache(new_alien);
 			goto fail;
+		}
 
 		l3 = cachep->nodelists[node];
 		if (l3) {
@@ -3445,7 +3448,8 @@ static int alloc_kmemlist(struct kmem_cache *cachep)
 			spin_lock_irq(&l3->list_lock);
 
 			if (shared)
-				free_block(cachep, shared->entry, shared->avail, node);
+				free_block(cachep, shared->entry,
+						shared->avail, node);
 
 			l3->shared = new_shared;
 			if (!l3->alien) {
@@ -3460,8 +3464,11 @@ static int alloc_kmemlist(struct kmem_cache *cachep)
 			continue;
 		}
 		l3 = kmalloc_node(sizeof(struct kmem_list3), GFP_KERNEL, node);
-		if (!l3)
+		if (!l3) {
+			free_alien_cache(new_alien);
+			kfree(new_shared);
 			goto fail;
+		}
 
 		kmem_list3_init(l3);
 		l3->next_reap = jiffies + REAPTIMEOUT_LIST3 +
@@ -3473,7 +3480,23 @@ static int alloc_kmemlist(struct kmem_cache *cachep)
 		cachep->nodelists[node] = l3;
 	}
 	return 0;
+
 fail:
+	if (!cachep->next.next) {
+		/* Cache is not active yet. Roll back what we did */
+		node--;
+		while (node >= 0) {
+			if (cachep->nodelists[node]) {
+				l3 = cachep->nodelists[node];
+
+				kfree(l3->shared);
+				free_alien_cache(l3->alien);
+				kfree(l3);
+				cachep->nodelists[node] = NULL;
+			}
+			node--;
+		}
+	}
 	return -ENOMEM;
 }
 
-- 
cgit v1.2.3


From d784124cfe9377c1a24d8efba31401f81c7c11f9 Mon Sep 17 00:00:00 2001
From: Christoph Lameter <clameter@sgi.com>
Date: Sat, 25 Mar 2006 03:06:48 -0800
Subject: [PATCH] mm: make page migration dependent on swap and NUMA

The page migration code could function without NUMA but we currently have
no users for the non-NUMA case.

Signed-off-by: Christoph Lameter <clameter@sgi.com>
Cc: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/Kconfig | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'mm')

diff --git a/mm/Kconfig b/mm/Kconfig
index bd80460360db..332f5c29b53a 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -138,8 +138,8 @@ config SPLIT_PTLOCK_CPUS
 #
 config MIGRATION
 	bool "Page migration"
-	def_bool y if NUMA || SPARSEMEM || DISCONTIGMEM
-	depends on SWAP
+	def_bool y if NUMA
+	depends on SWAP && NUMA
 	help
 	  Allows the migration of the physical location of pages of processes
 	  while the virtual addresses are not changed. This is useful for
-- 
cgit v1.2.3


From f5335c0f1bcba16907972b66b905f62402433e23 Mon Sep 17 00:00:00 2001
From: Anton Blanchard <anton@samba.org>
Date: Sat, 25 Mar 2006 03:06:49 -0800
Subject: [PATCH] quieten zone_pcp_init

In zone_pcp_init we print out all zones even if they are empty:

On node 0 totalpages: 245760
  DMA zone: 245760 pages, LIFO batch:31
  DMA32 zone: 0 pages, LIFO batch:0
  Normal zone: 0 pages, LIFO batch:0
  HighMem zone: 0 pages, LIFO batch:0

To conserve dmesg space why not print only the non zero zones.

Signed-off-by: Anton Blanchard <anton@samba.org>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/page_alloc.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'mm')

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index a5c3f8bd98ae..637f57ff5b5a 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2029,8 +2029,9 @@ static __meminit void zone_pcp_init(struct zone *zone)
 		setup_pageset(zone_pcp(zone,cpu), batch);
 #endif
 	}
-	printk(KERN_DEBUG "  %s zone: %lu pages, LIFO batch:%lu\n",
-		zone->name, zone->present_pages, batch);
+	if (zone->present_pages)
+		printk(KERN_DEBUG "  %s zone: %lu pages, LIFO batch:%lu\n",
+			zone->name, zone->present_pages, batch);
 }
 
 static __meminit void init_currently_empty_zone(struct zone *zone,
-- 
cgit v1.2.3


From 05eeae208d08a05a6980cf2ff61f02843c0955fd Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@osdl.org>
Date: Sat, 25 Mar 2006 03:07:48 -0800
Subject: [PATCH] find_task_by_pid() needs tasklist_lock

A couple of places are forgetting to take it.

The kswapd case is probably unimportant.  keventd_create_kthread() was racy.

The whole thing is a bit flakey: you start a kernel thread, get its pid from
kernel_thread() then look up its task_struct.

a) It assumes that pid recycling takes a "long" time.

b) We get a task_struct but no reference was taken on it.  The owner of the
   kswapd and kthread task_struct*'s must assume that the new thread won't
   exit unexpectedly.  Because if it does, they're left holding dead memory
   and any attempt to control or stop that task will crash.

Cc: Christoph Hellwig <hch@lst.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 kernel/kthread.c | 2 ++
 mm/vmscan.c      | 2 ++
 2 files changed, 4 insertions(+)

(limited to 'mm')

diff --git a/kernel/kthread.c b/kernel/kthread.c
index 6a5373868a98..c5f3c6613b6d 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -115,7 +115,9 @@ static void keventd_create_kthread(void *_create)
 		create->result = ERR_PTR(pid);
 	} else {
 		wait_for_completion(&create->started);
+		read_lock(&tasklist_lock);
 		create->result = find_task_by_pid(pid);
+		read_unlock(&tasklist_lock);
 	}
 	complete(&create->done);
 }
diff --git a/mm/vmscan.c b/mm/vmscan.c
index fd572bbdc9f5..78865c849f8f 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1356,7 +1356,9 @@ static int __init kswapd_init(void)
 
 		pid = kernel_thread(kswapd, pgdat, CLONE_KERNEL);
 		BUG_ON(pid < 0);
+		read_lock(&tasklist_lock);
 		pgdat->kswapd = find_task_by_pid(pid);
+		read_unlock(&tasklist_lock);
 	}
 	total_memory = nr_free_pagecache_pages();
 	hotcpu_notifier(cpu_callback, 0);
-- 
cgit v1.2.3


From 6e692ed37a507e18d8afe8e5faebd8c4722c5f12 Mon Sep 17 00:00:00 2001
From: John Hawkes <hawkes@sgi.com>
Date: Sat, 25 Mar 2006 03:08:02 -0800
Subject: [PATCH] fix alloc_large_system_hash() roundup

The "rounded up to nearest power of 2 in size" algorithm in
alloc_large_system_hash is not correct.  As coded, it takes an otherwise
acceptable power-of-2 value and doubles it.  For example, we see the error
if we boot with thash_entries=2097152 which produces a hash table with
4194304 entries.

Signed-off-by: John Hawkes <hawkes@sgi.com>
Cc: Roland Dreier <rdreier@cisco.com>
Cc: "Chen, Kenneth W" <kenneth.w.chen@intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/page_alloc.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'mm')

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 637f57ff5b5a..338a02bb004d 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2702,8 +2702,7 @@ void *__init alloc_large_system_hash(const char *tablename,
 		else
 			numentries <<= (PAGE_SHIFT - scale);
 	}
-	/* rounded up to nearest power of 2 in size */
-	numentries = 1UL << (long_log2(numentries) + 1);
+	numentries = roundup_pow_of_two(numentries);
 
 	/* limit allocation size to 1/16 total memory by default */
 	if (max == 0) {
-- 
cgit v1.2.3


From 315ab19a6d12d6af7b6957090822f3057ab7e80f Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@suse.de>
Date: Sat, 25 Mar 2006 16:20:22 +0100
Subject: [PATCH] mm: restore vm_normal_page check

Hugh is rightly concerned that the CONFIG_DEBUG_VM coverage has gone too
far in vm_normal_page, considering that we expect production kernels to be
shipped with the option turned off, and that the code has been under some
large changes recently.

Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 mm/memory.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'mm')

diff --git a/mm/memory.c b/mm/memory.c
index 80c3fb370f91..e347e106ca3a 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -395,12 +395,16 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, pte_
 			return NULL;
 	}
 
-#ifdef CONFIG_DEBUG_VM
+	/*
+	 * Add some anal sanity checks for now. Eventually,
+	 * we should just do "return pfn_to_page(pfn)", but
+	 * in the meantime we check that we get a valid pfn,
+	 * and that the resulting page looks ok.
+	 */
 	if (unlikely(!pfn_valid(pfn))) {
 		print_bad_pte(vma, pte, addr);
 		return NULL;
 	}
-#endif
 
 	/*
 	 * NOTE! We still have PageReserved() pages in the page 
-- 
cgit v1.2.3


From 267b48014a5c0c2ae90b04dad5d95ceb903365a6 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@suse.de>
Date: Sat, 25 Mar 2006 16:31:10 +0100
Subject: [PATCH] x86_64: Try to allocate node memmap near the end of node

This fixes problems with very large nodes (over 128GB) filling up all of
the first 4GB with their mem_map and not leaving enough space for the
swiotlb.

Signed-off-by: Andi Kleen <ak@suse.de>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/x86_64/mm/numa.c   | 12 +++++++++++-
 include/linux/bootmem.h |  3 +++
 mm/bootmem.c            |  2 +-
 3 files changed, 15 insertions(+), 2 deletions(-)

(limited to 'mm')

diff --git a/arch/x86_64/mm/numa.c b/arch/x86_64/mm/numa.c
index e4b62753a19a..07471a3eb190 100644
--- a/arch/x86_64/mm/numa.c
+++ b/arch/x86_64/mm/numa.c
@@ -149,7 +149,7 @@ void __init setup_node_bootmem(int nodeid, unsigned long start, unsigned long en
 /* Initialize final allocator for a zone */
 void __init setup_node_zones(int nodeid)
 { 
-	unsigned long start_pfn, end_pfn; 
+	unsigned long start_pfn, end_pfn, memmapsize, limit;
 	unsigned long zones[MAX_NR_ZONES];
 	unsigned long holes[MAX_NR_ZONES];
 
@@ -159,6 +159,16 @@ void __init setup_node_zones(int nodeid)
 	Dprintk(KERN_INFO "Setting up node %d %lx-%lx\n",
 		nodeid, start_pfn, end_pfn);
 
+	/* Try to allocate mem_map at end to not fill up precious <4GB
+	   memory. */
+	memmapsize = sizeof(struct page) * (end_pfn-start_pfn);
+	limit = end_pfn << PAGE_SHIFT;
+	NODE_DATA(nodeid)->node_mem_map = 
+		__alloc_bootmem_core(NODE_DATA(nodeid)->bdata, 
+				memmapsize, SMP_CACHE_BYTES, 
+				round_down(limit - memmapsize, PAGE_SIZE), 
+				limit);
+
 	size_zones(zones, holes, start_pfn, end_pfn);
 	free_area_init_node(nodeid, NODE_DATA(nodeid), zones,
 			    start_pfn, holes);
diff --git a/include/linux/bootmem.h b/include/linux/bootmem.h
index 993da8cc9706..7155452fb4a8 100644
--- a/include/linux/bootmem.h
+++ b/include/linux/bootmem.h
@@ -51,6 +51,9 @@ extern void * __init __alloc_bootmem_low_node(pg_data_t *pgdat,
 					      unsigned long size,
 					      unsigned long align,
 					      unsigned long goal);
+extern void * __init __alloc_bootmem_core(struct bootmem_data *bdata,
+		unsigned long size, unsigned long align, unsigned long goal,
+		unsigned long limit);
 #ifndef CONFIG_HAVE_ARCH_BOOTMEM_NODE
 extern void __init reserve_bootmem (unsigned long addr, unsigned long size);
 #define alloc_bootmem(x) \
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 35c32290f717..b55bd39fc5dd 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -152,7 +152,7 @@ static void __init free_bootmem_core(bootmem_data_t *bdata, unsigned long addr,
  *
  * NOTE:  This function is _not_ reentrant.
  */
-static void * __init
+void * __init
 __alloc_bootmem_core(struct bootmem_data *bdata, unsigned long size,
 	      unsigned long align, unsigned long goal, unsigned long limit)
 {
-- 
cgit v1.2.3


From 5bcb28b139cffc736177ceb775d1c8b5c5a411e2 Mon Sep 17 00:00:00 2001
From: Eric Sesterhenn <snakebyte@gmx.de>
Date: Sun, 26 Mar 2006 18:30:52 +0200
Subject: BUG_ON() Conversion in mm/memory.c

this changes if() BUG(); constructs to BUG_ON() which is
cleaner, contains unlikely() and can better optimized away.

Signed-off-by: Eric Sesterhenn <snakebyte@gmx.de>
Signed-off-by: Adrian Bunk <bunk@stusta.de>
---
 mm/memory.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

(limited to 'mm')

diff --git a/mm/memory.c b/mm/memory.c
index e347e106ca3a..d90ff9d04957 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2352,10 +2352,8 @@ int make_pages_present(unsigned long addr, unsigned long end)
 	if (!vma)
 		return -1;
 	write = (vma->vm_flags & VM_WRITE) != 0;
-	if (addr >= end)
-		BUG();
-	if (end > vma->vm_end)
-		BUG();
+	BUG_ON(addr >= end);
+	BUG_ON(end > vma->vm_end);
 	len = (end+PAGE_SIZE-1)/PAGE_SIZE-addr/PAGE_SIZE;
 	ret = get_user_pages(current, current->mm, addr,
 			len, write, 0, NULL, NULL);
-- 
cgit v1.2.3


From f02e1fafb534459522a8c46bc46b32820684623e Mon Sep 17 00:00:00 2001
From: Eric Sesterhenn <snakebyte@gmx.de>
Date: Sun, 26 Mar 2006 18:31:56 +0200
Subject: BUG_ON() Conversion in mm/mempool.c

this changes if() BUG(); constructs to BUG_ON() which is
cleaner, contains unlikely() and can better optimized away.

Signed-off-by: Eric Sesterhenn <snakebyte@gmx.de>
Signed-off-by: Adrian Bunk <bunk@stusta.de>
---
 mm/mempool.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'mm')

diff --git a/mm/mempool.c b/mm/mempool.c
index f71893ed3543..9ef13dd68ab7 100644
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -183,8 +183,8 @@ EXPORT_SYMBOL(mempool_resize);
  */
 void mempool_destroy(mempool_t *pool)
 {
-	if (pool->curr_nr != pool->min_nr)
-		BUG();		/* There were outstanding elements */
+	/* Check for outstanding elements */
+	BUG_ON(pool->curr_nr != pool->min_nr);
 	free_pool(pool);
 }
 EXPORT_SYMBOL(mempool_destroy);
-- 
cgit v1.2.3


From 03beb07664d768db97bf454ae5c9581cd4737bb4 Mon Sep 17 00:00:00 2001
From: James Bottomley <James.Bottomley@SteelEye.com>
Date: Sun, 26 Mar 2006 01:36:57 -0800
Subject: [PATCH] Add API for flushing Anon pages

Currently, get_user_pages() returns fully coherent pages to the kernel for
anything other than anonymous pages.  This is a problem for things like
fuse and the SCSI generic ioctl SG_IO which can potentially wish to do DMA
to anonymous pages passed in by users.

The fix is to add a new memory management API: flush_anon_page() which
is used in get_user_pages() to make anonymous pages coherent.

Signed-off-by: James Bottomley <James.Bottomley@SteelEye.com>
Cc: Russell King <rmk@arm.linux.org.uk>
Cc: "David S. Miller" <davem@davemloft.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 Documentation/cachetlb.txt | 9 +++++++++
 include/linux/highmem.h    | 6 ++++++
 mm/memory.c                | 2 ++
 3 files changed, 17 insertions(+)

(limited to 'mm')

diff --git a/Documentation/cachetlb.txt b/Documentation/cachetlb.txt
index 4ae418889b88..1f312a9893d9 100644
--- a/Documentation/cachetlb.txt
+++ b/Documentation/cachetlb.txt
@@ -362,6 +362,15 @@ maps this page at its virtual address.
 	likely that you will need to flush the instruction cache
 	for copy_to_user_page().
 
+  void flush_anon_page(struct page *page, unsigned long vmaddr)
+  	When the kernel needs to access the contents of an anonymous
+	page, it calls this function (currently only
+	get_user_pages()).  Note: flush_dcache_page() deliberately
+	doesn't work for an anonymous page.  The default
+	implementation is a nop (and should remain so for all coherent
+	architectures).  For incoherent architectures, it should flush
+	the cache of the page at vmaddr in the current user process.
+
   void flush_icache_range(unsigned long start, unsigned long end)
   	When the kernel stores into addresses that it will execute
 	out of (eg when loading modules), this function is called.
diff --git a/include/linux/highmem.h b/include/linux/highmem.h
index 6bece9280eb7..7bd2593dbef9 100644
--- a/include/linux/highmem.h
+++ b/include/linux/highmem.h
@@ -7,6 +7,12 @@
 
 #include <asm/cacheflush.h>
 
+#ifndef ARCH_HAS_FLUSH_ANON_PAGE
+static inline void flush_anon_page(struct page *page, unsigned long vmaddr)
+{
+}
+#endif
+
 #ifdef CONFIG_HIGHMEM
 
 #include <asm/highmem.h>
diff --git a/mm/memory.c b/mm/memory.c
index e347e106ca3a..67686048f094 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1071,6 +1071,8 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 			}
 			if (pages) {
 				pages[i] = page;
+
+				flush_anon_page(page, start);
 				flush_dcache_page(page);
 			}
 			if (vmas)
-- 
cgit v1.2.3


From 6e0678f394c7bd21bfa5d252b071a09e10e7a749 Mon Sep 17 00:00:00 2001
From: Matthew Dobson <colpatch@us.ibm.com>
Date: Sun, 26 Mar 2006 01:37:44 -0800
Subject: [PATCH] mempool: add page allocator

This will be used by the next patch in the series to replace duplicate
mempool-backed page allocators in 2 places in the kernel.  It is also likely
that there will be more users in the future.

Signed-off-by: Matthew Dobson <colpatch@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/mempool.h | 12 ++++++++++++
 mm/mempool.c            | 18 ++++++++++++++++++
 2 files changed, 30 insertions(+)

(limited to 'mm')

diff --git a/include/linux/mempool.h b/include/linux/mempool.h
index f2427d7394b0..9787572e0ae7 100644
--- a/include/linux/mempool.h
+++ b/include/linux/mempool.h
@@ -38,4 +38,16 @@ extern void mempool_free(void *element, mempool_t *pool);
 void *mempool_alloc_slab(gfp_t gfp_mask, void *pool_data);
 void mempool_free_slab(void *element, void *pool_data);
 
+/*
+ * A mempool_alloc_t and mempool_free_t for a simple page allocator that
+ * allocates pages of the order specified by pool_data
+ */
+void *mempool_alloc_pages(gfp_t gfp_mask, void *pool_data);
+void mempool_free_pages(void *element, void *pool_data);
+static inline mempool_t *mempool_create_page_pool(int min_nr, int order)
+{
+	return mempool_create(min_nr, mempool_alloc_pages, mempool_free_pages,
+			      (void *)(long)order);
+}
+
 #endif /* _LINUX_MEMPOOL_H */
diff --git a/mm/mempool.c b/mm/mempool.c
index f71893ed3543..45c0112ca7b2 100644
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -289,3 +289,21 @@ void mempool_free_slab(void *element, void *pool_data)
 	kmem_cache_free(mem, element);
 }
 EXPORT_SYMBOL(mempool_free_slab);
+
+/*
+ * A simple mempool-backed page allocator that allocates pages
+ * of the order specified by pool_data.
+ */
+void *mempool_alloc_pages(gfp_t gfp_mask, void *pool_data)
+{
+	int order = (int)(long)pool_data;
+	return alloc_pages(gfp_mask, order);
+}
+EXPORT_SYMBOL(mempool_alloc_pages);
+
+void mempool_free_pages(void *element, void *pool_data)
+{
+	int order = (int)(long)pool_data;
+	__free_pages(element, order);
+}
+EXPORT_SYMBOL(mempool_free_pages);
-- 
cgit v1.2.3


From a19b27ce3847c3a5d4ea6b6c91b6f7154759af23 Mon Sep 17 00:00:00 2001
From: Matthew Dobson <colpatch@us.ibm.com>
Date: Sun, 26 Mar 2006 01:37:45 -0800
Subject: [PATCH] mempool: use common mempool page allocator

Convert two mempool users that currently use their own mempool-backed page
allocators to use the generic mempool page allocator.

Also included are 2 trivial whitespace fixes.

Signed-off-by: Matthew Dobson <colpatch@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 drivers/md/dm-crypt.c | 17 +----------------
 mm/highmem.c          | 23 +++++++----------------
 2 files changed, 8 insertions(+), 32 deletions(-)

(limited to 'mm')

diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index e7a650f9ca07..d88b8eda3903 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -93,20 +93,6 @@ struct crypt_config {
 
 static kmem_cache_t *_crypt_io_pool;
 
-/*
- * Mempool alloc and free functions for the page
- */
-static void *mempool_alloc_page(gfp_t gfp_mask, void *data)
-{
-	return alloc_page(gfp_mask);
-}
-
-static void mempool_free_page(void *page, void *data)
-{
-	__free_page(page);
-}
-
-
 /*
  * Different IV generation algorithms:
  *
@@ -637,8 +623,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 		goto bad3;
 	}
 
-	cc->page_pool = mempool_create(MIN_POOL_PAGES, mempool_alloc_page,
-				       mempool_free_page, NULL);
+	cc->page_pool = mempool_create_page_pool(MIN_POOL_PAGES, 0);
 	if (!cc->page_pool) {
 		ti->error = PFX "Cannot allocate page mempool";
 		goto bad4;
diff --git a/mm/highmem.c b/mm/highmem.c
index d0ea1eec6a9a..55885f64af40 100644
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -31,14 +31,9 @@
 
 static mempool_t *page_pool, *isa_page_pool;
 
-static void *page_pool_alloc_isa(gfp_t gfp_mask, void *data)
+static void *mempool_alloc_pages_isa(gfp_t gfp_mask, void *data)
 {
-	return alloc_page(gfp_mask | GFP_DMA);
-}
-
-static void page_pool_free(void *page, void *data)
-{
-	__free_page(page);
+	return mempool_alloc_pages(gfp_mask | GFP_DMA, data);
 }
 
 /*
@@ -51,11 +46,6 @@ static void page_pool_free(void *page, void *data)
  */
 #ifdef CONFIG_HIGHMEM
 
-static void *page_pool_alloc(gfp_t gfp_mask, void *data)
-{
-	return alloc_page(gfp_mask);
-}
-
 static int pkmap_count[LAST_PKMAP];
 static unsigned int last_pkmap_nr;
 static  __cacheline_aligned_in_smp DEFINE_SPINLOCK(kmap_lock);
@@ -229,7 +219,7 @@ static __init int init_emergency_pool(void)
 	if (!i.totalhigh)
 		return 0;
 
-	page_pool = mempool_create(POOL_SIZE, page_pool_alloc, page_pool_free, NULL);
+	page_pool = mempool_create_page_pool(POOL_SIZE, 0);
 	if (!page_pool)
 		BUG();
 	printk("highmem bounce pool size: %d pages\n", POOL_SIZE);
@@ -272,7 +262,8 @@ int init_emergency_isa_pool(void)
 	if (isa_page_pool)
 		return 0;
 
-	isa_page_pool = mempool_create(ISA_POOL_SIZE, page_pool_alloc_isa, page_pool_free, NULL);
+	isa_page_pool = mempool_create(ISA_POOL_SIZE, mempool_alloc_pages_isa,
+				       mempool_free_pages, (void *) 0);
 	if (!isa_page_pool)
 		BUG();
 
@@ -337,7 +328,7 @@ static void bounce_end_io(struct bio *bio, mempool_t *pool, int err)
 	bio_put(bio);
 }
 
-static int bounce_end_io_write(struct bio *bio, unsigned int bytes_done,int err)
+static int bounce_end_io_write(struct bio *bio, unsigned int bytes_done, int err)
 {
 	if (bio->bi_size)
 		return 1;
@@ -384,7 +375,7 @@ static int bounce_end_io_read_isa(struct bio *bio, unsigned int bytes_done, int
 }
 
 static void __blk_queue_bounce(request_queue_t *q, struct bio **bio_orig,
-			mempool_t *pool)
+			       mempool_t *pool)
 {
 	struct page *page;
 	struct bio *bio = NULL;
-- 
cgit v1.2.3


From 53184082b070dfb077218828fdf839826102ed96 Mon Sep 17 00:00:00 2001
From: Matthew Dobson <colpatch@us.ibm.com>
Date: Sun, 26 Mar 2006 01:37:46 -0800
Subject: [PATCH] mempool: add kmalloc allocator

Add another allocator to the common mempool code: a kmalloc/kfree allocator

This will be used by the next patch in the series to replace duplicate
mempool-backed kmalloc allocators in several places in the kernel.  It is also
very likely that there will be more users in the future.

Signed-off-by: Matthew Dobson <colpatch@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/mempool.h | 12 ++++++++++++
 mm/mempool.c            | 17 +++++++++++++++++
 2 files changed, 29 insertions(+)

(limited to 'mm')

diff --git a/include/linux/mempool.h b/include/linux/mempool.h
index 9787572e0ae7..d23dbf076b57 100644
--- a/include/linux/mempool.h
+++ b/include/linux/mempool.h
@@ -38,6 +38,18 @@ extern void mempool_free(void *element, mempool_t *pool);
 void *mempool_alloc_slab(gfp_t gfp_mask, void *pool_data);
 void mempool_free_slab(void *element, void *pool_data);
 
+/*
+ * A mempool_alloc_t and mempool_free_t to kmalloc the amount of memory
+ * specified by pool_data
+ */
+void *mempool_kmalloc(gfp_t gfp_mask, void *pool_data);
+void mempool_kfree(void *element, void *pool_data);
+static inline mempool_t *mempool_create_kmalloc_pool(int min_nr, size_t size)
+{
+	return mempool_create(min_nr, mempool_kmalloc, mempool_kfree,
+			      (void *) size);
+}
+
 /*
  * A mempool_alloc_t and mempool_free_t for a simple page allocator that
  * allocates pages of the order specified by pool_data
diff --git a/mm/mempool.c b/mm/mempool.c
index 45c0112ca7b2..a1397c6a4864 100644
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -290,6 +290,23 @@ void mempool_free_slab(void *element, void *pool_data)
 }
 EXPORT_SYMBOL(mempool_free_slab);
 
+/*
+ * A commonly used alloc and free fn that kmalloc/kfrees the amount of memory
+ * specfied by pool_data
+ */
+void *mempool_kmalloc(gfp_t gfp_mask, void *pool_data)
+{
+	size_t size = (size_t) pool_data;
+	return kmalloc(size, gfp_mask);
+}
+EXPORT_SYMBOL(mempool_kmalloc);
+
+void mempool_kfree(void *element, void *pool_data)
+{
+	kfree(element);
+}
+EXPORT_SYMBOL(mempool_kfree);
+
 /*
  * A simple mempool-backed page allocator that allocates pages
  * of the order specified by pool_data.
-- 
cgit v1.2.3


From f183323d3822dee4d7b3147a59b6e8987fe201e0 Mon Sep 17 00:00:00 2001
From: Matthew Dobson <colpatch@us.ibm.com>
Date: Sun, 26 Mar 2006 01:37:48 -0800
Subject: [PATCH] mempool: add kzalloc allocator

Add another allocator to the common mempool code: a kzalloc/kfree allocator

This will be used by the next patch in the series to replace a mempool-backed
kzalloc allocator.  It is also very likely that there will be more users in
the future.

Signed-off-by: Matthew Dobson <colpatch@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/mempool.h | 10 ++++++++--
 mm/mempool.c            |  9 ++++++++-
 2 files changed, 16 insertions(+), 3 deletions(-)

(limited to 'mm')

diff --git a/include/linux/mempool.h b/include/linux/mempool.h
index d23dbf076b57..41570ce353eb 100644
--- a/include/linux/mempool.h
+++ b/include/linux/mempool.h
@@ -39,16 +39,22 @@ void *mempool_alloc_slab(gfp_t gfp_mask, void *pool_data);
 void mempool_free_slab(void *element, void *pool_data);
 
 /*
- * A mempool_alloc_t and mempool_free_t to kmalloc the amount of memory
- * specified by pool_data
+ * 2 mempool_alloc_t's and a mempool_free_t to kmalloc/kzalloc and kfree
+ * the amount of memory specified by pool_data
  */
 void *mempool_kmalloc(gfp_t gfp_mask, void *pool_data);
+void *mempool_kzalloc(gfp_t gfp_mask, void *pool_data);
 void mempool_kfree(void *element, void *pool_data);
 static inline mempool_t *mempool_create_kmalloc_pool(int min_nr, size_t size)
 {
 	return mempool_create(min_nr, mempool_kmalloc, mempool_kfree,
 			      (void *) size);
 }
+static inline mempool_t *mempool_create_kzalloc_pool(int min_nr, size_t size)
+{
+	return mempool_create(min_nr, mempool_kzalloc, mempool_kfree,
+			      (void *) size);
+}
 
 /*
  * A mempool_alloc_t and mempool_free_t for a simple page allocator that
diff --git a/mm/mempool.c b/mm/mempool.c
index a1397c6a4864..7bf064e6a345 100644
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -296,11 +296,18 @@ EXPORT_SYMBOL(mempool_free_slab);
  */
 void *mempool_kmalloc(gfp_t gfp_mask, void *pool_data)
 {
-	size_t size = (size_t) pool_data;
+	size_t size = (size_t)(long)pool_data;
 	return kmalloc(size, gfp_mask);
 }
 EXPORT_SYMBOL(mempool_kmalloc);
 
+void *mempool_kzalloc(gfp_t gfp_mask, void *pool_data)
+{
+	size_t size = (size_t) pool_data;
+	return kzalloc(size, gfp_mask);
+}
+EXPORT_SYMBOL(mempool_kzalloc);
+
 void mempool_kfree(void *element, void *pool_data)
 {
 	kfree(element);
-- 
cgit v1.2.3


From a117e66ed45ac0569c039ea60bd7a9a61e031858 Mon Sep 17 00:00:00 2001
From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Date: Mon, 27 Mar 2006 01:15:25 -0800
Subject: [PATCH] unify pfn_to_page: generic functions

There are 3 memory models, FLATMEM, DISCONTIGMEM, SPARSEMEM.
Each arch has its own page_to_pfn(), pfn_to_page() for each models.
But most of them can use the same arithmetic.

This patch adds asm-generic/memory_model.h, which includes generic
page_to_pfn(), pfn_to_page() definitions for each memory model.

When CONFIG_OUT_OF_LINE_PFN_TO_PAGE=y, out-of-line functions are
used instead of macro. This is enabled by some archs and  reduces
text size.

Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Hugh Dickins <hugh@veritas.com>
Cc: Andi Kleen <ak@muc.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Richard Henderson <rth@twiddle.net>
Cc: Ivan Kokshaysky <ink@jurassic.park.msu.ru>
Cc: Russell King <rmk@arm.linux.org.uk>
Cc: Ian Molton <spyro@f2s.com>
Cc: Mikael Starvik <starvik@axis.com>
Cc: David Howells <dhowells@redhat.com>
Cc: Yoshinori Sato <ysato@users.sourceforge.jp>
Cc: Hirokazu Takata <takata.hirokazu@renesas.com>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: Kyle McMartin <kyle@mcmartin.ca>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Paul Mundt <lethal@linux-sh.org>
Cc: Kazumoto Kojima <kkojima@rr.iij4u.or.jp>
Cc: Richard Curnow <rc@rc0.org.uk>
Cc: William Lee Irwin III <wli@holomorphy.com>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Jeff Dike <jdike@addtoit.com>
Cc: Paolo 'Blaisorblade' Giarrusso <blaisorblade@yahoo.it>
Cc: Miles Bader <uclinux-v850@lsi.nec.co.jp>
Cc: Chris Zankel <chris@zankel.net>
Cc: "Luck, Tony" <tony.luck@intel.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/asm-generic/memory_model.h | 77 ++++++++++++++++++++++++++++++++++++++
 include/asm-sparc64/page.h         |  2 +
 include/linux/mmzone.h             | 11 ------
 mm/page_alloc.c                    | 42 +++++++++++++++++++++
 4 files changed, 121 insertions(+), 11 deletions(-)
 create mode 100644 include/asm-generic/memory_model.h

(limited to 'mm')

diff --git a/include/asm-generic/memory_model.h b/include/asm-generic/memory_model.h
new file mode 100644
index 000000000000..a7bb4978e808
--- /dev/null
+++ b/include/asm-generic/memory_model.h
@@ -0,0 +1,77 @@
+#ifndef __ASM_MEMORY_MODEL_H
+#define __ASM_MEMORY_MODEL_H
+
+#ifdef __KERNEL__
+#ifndef __ASSEMBLY__
+
+#if defined(CONFIG_FLATMEM)
+
+#ifndef ARCH_PFN_OFFSET
+#define ARCH_PFN_OFFSET		(0UL)
+#endif
+
+#elif defined(CONFIG_DISCONTIGMEM)
+
+#ifndef arch_pfn_to_nid
+#define arch_pfn_to_nid(pfn)	pfn_to_nid(pfn)
+#endif
+
+#ifndef arch_local_page_offset
+#define arch_local_page_offset(pfn, nid)	\
+	((pfn) - NODE_DATA(nid)->node_start_pfn)
+#endif
+
+#endif /* CONFIG_DISCONTIGMEM */
+
+#ifdef CONFIG_OUT_OF_LINE_PFN_TO_PAGE
+struct page;
+/* this is useful when inlined pfn_to_page is too big */
+extern struct page *pfn_to_page(unsigned long pfn);
+extern unsigned long page_to_pfn(struct page *page);
+#else
+/*
+ * supports 3 memory models.
+ */
+#if defined(CONFIG_FLATMEM)
+
+#define pfn_to_page(pfn)	(mem_map + ((pfn) - ARCH_PFN_OFFSET))
+#define page_to_pfn(page)	((unsigned long)((page) - mem_map) + \
+				 ARCH_PFN_OFFSET)
+#elif defined(CONFIG_DISCONTIGMEM)
+
+#define pfn_to_page(pfn)			\
+({	unsigned long __pfn = (pfn);		\
+	unsigned long __nid = arch_pfn_to_nid(pfn);  \
+	NODE_DATA(__nid)->node_mem_map + arch_local_page_offset(__pfn, __nid);\
+})
+
+#define page_to_pfn(pg)			\
+({	struct page *__pg = (pg);		\
+	struct zone *__zone = page_zone(__pg);	\
+	(unsigned long)(__pg - __zone->zone_mem_map) +	\
+	 __zone->zone_start_pfn;			\
+})
+
+#elif defined(CONFIG_SPARSEMEM)
+/*
+ * Note: section's mem_map is encorded to reflect its start_pfn.
+ * section[i].section_mem_map == mem_map's address - start_pfn;
+ */
+#define page_to_pfn(pg)					\
+({	struct page *__pg = (pg);				\
+	int __sec = page_to_section(__pg);			\
+	__pg - __section_mem_map_addr(__nr_to_section(__sec));	\
+})
+
+#define pfn_to_page(pfn)				\
+({	unsigned long __pfn = (pfn);			\
+	struct mem_section *__sec = __pfn_to_section(__pfn);	\
+	__section_mem_map_addr(__sec) + __pfn;		\
+})
+#endif /* CONFIG_FLATMEM/DISCONTIGMEM/SPARSEMEM */
+#endif /* CONFIG_OUT_OF_LINE_PFN_TO_PAGE */
+
+#endif /* __ASSEMBLY__ */
+#endif /* __KERNEL__ */
+
+#endif
diff --git a/include/asm-sparc64/page.h b/include/asm-sparc64/page.h
index 66fe4ac59fd6..aabb21906724 100644
--- a/include/asm-sparc64/page.h
+++ b/include/asm-sparc64/page.h
@@ -111,6 +111,8 @@ typedef unsigned long pgprot_t;
 				 (_AC(0x0000000070000000,UL)) : \
 				 (_AC(0xfffff80000000000,UL) + (1UL << 32UL)))
 
+#include <asm-generic/memory_model.h>
+
 #endif /* !(__ASSEMBLY__) */
 
 /* to align the pointer to the (next) page boundary */
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index ebfc238cc243..0c1c0c0cce65 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -602,17 +602,6 @@ static inline struct mem_section *__pfn_to_section(unsigned long pfn)
 	return __nr_to_section(pfn_to_section_nr(pfn));
 }
 
-#define pfn_to_page(pfn) 						\
-({ 									\
-	unsigned long __pfn = (pfn);					\
-	__section_mem_map_addr(__pfn_to_section(__pfn)) + __pfn;	\
-})
-#define page_to_pfn(page)						\
-({									\
-	page - __section_mem_map_addr(__nr_to_section(			\
-		page_to_section(page)));				\
-})
-
 static inline int pfn_valid(unsigned long pfn)
 {
 	if (pfn_to_section_nr(pfn) >= NR_MEM_SECTIONS)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 338a02bb004d..349b328763b7 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2745,3 +2745,45 @@ void *__init alloc_large_system_hash(const char *tablename,
 
 	return table;
 }
+
+#ifdef CONFIG_OUT_OF_LINE_PFN_TO_PAGE
+/*
+ * pfn <-> page translation. out-of-line version.
+ * (see asm-generic/memory_model.h)
+ */
+#if defined(CONFIG_FLATMEM)
+struct page *pfn_to_page(unsigned long pfn)
+{
+	return mem_map + (pfn - ARCH_PFN_OFFSET);
+}
+unsigned long page_to_pfn(struct page *page)
+{
+	return (page - mem_map) + ARCH_PFN_OFFSET;
+}
+#elif defined(CONFIG_DISCONTIGMEM)
+struct page *pfn_to_page(unsigned long pfn)
+{
+	int nid = arch_pfn_to_nid(pfn);
+	return NODE_DATA(nid)->node_mem_map + arch_local_page_offset(pfn,nid);
+}
+unsigned long page_to_pfn(struct page *page)
+{
+	struct zone *zone = page_zone(page);
+	return (page - zone->zone_mem_map) + zone->zone_start_pfn;
+
+}
+#elif defined(CONFIG_SPARSEMEM)
+struct page *pfn_to_page(unsigned long pfn)
+{
+	return __section_mem_map_addr(__pfn_to_section(pfn)) + pfn;
+}
+
+unsigned long page_to_pfn(struct page *page)
+{
+	long section_id = page_to_section(page);
+	return page - __section_mem_map_addr(__nr_to_section(section_id));
+}
+#endif /* CONFIG_FLATMEM/DISCONTIGMME/SPARSEMEM */
+EXPORT_SYMBOL(pfn_to_page);
+EXPORT_SYMBOL(page_to_pfn);
+#endif /* CONFIG_OUT_OF_LINE_PFN_TO_PAGE */
-- 
cgit v1.2.3


From a0140c1d85637ee5f4ea7c78f066e3611a6a79dc Mon Sep 17 00:00:00 2001
From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Date: Mon, 27 Mar 2006 01:15:55 -0800
Subject: [PATCH] remove zone_mem_map

This patch removes zone_mem_map.

pfn_to_page uses pgdat, page_to_pfn uses zone.  page_to_pfn can use pgdat
instead of zone, which is only one user of zone_mem_map.  By modifing it,
we can remove zone_mem_map.

Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Dave Hansen <haveblue@us.ibm.com>
Cc: Christoph Lameter <christoph@lameter.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/asm-alpha/mmzone.h         |  3 +--
 include/asm-generic/memory_model.h | 10 +++++-----
 include/linux/mmzone.h             |  1 -
 mm/page_alloc.c                    |  6 ++----
 4 files changed, 8 insertions(+), 12 deletions(-)

(limited to 'mm')

diff --git a/include/asm-alpha/mmzone.h b/include/asm-alpha/mmzone.h
index c9004398f273..192d80c875b0 100644
--- a/include/asm-alpha/mmzone.h
+++ b/include/asm-alpha/mmzone.h
@@ -83,8 +83,7 @@ PLAT_NODE_DATA_LOCALNR(unsigned long p, int n)
 	pte_t pte;                                                           \
 	unsigned long pfn;                                                   \
 									     \
-	pfn = ((unsigned long)((page)-page_zone(page)->zone_mem_map)) << 32; \
-	pfn += page_zone(page)->zone_start_pfn << 32;			     \
+	pfn = page_to_pfn(page) << 32; \
 	pte_val(pte) = pfn | pgprot_val(pgprot);			     \
 									     \
 	pte;								     \
diff --git a/include/asm-generic/memory_model.h b/include/asm-generic/memory_model.h
index a7bb4978e808..0cfb086dd373 100644
--- a/include/asm-generic/memory_model.h
+++ b/include/asm-generic/memory_model.h
@@ -45,11 +45,11 @@ extern unsigned long page_to_pfn(struct page *page);
 	NODE_DATA(__nid)->node_mem_map + arch_local_page_offset(__pfn, __nid);\
 })
 
-#define page_to_pfn(pg)			\
-({	struct page *__pg = (pg);		\
-	struct zone *__zone = page_zone(__pg);	\
-	(unsigned long)(__pg - __zone->zone_mem_map) +	\
-	 __zone->zone_start_pfn;			\
+#define page_to_pfn(pg)							\
+({	struct page *__pg = (pg);					\
+	struct pglist_data *__pgdat = NODE_DATA(page_to_nid(__pg));	\
+	(unsigned long)(__pg - __pgdat->node_mem_map) +			\
+	 __pgdat->node_start_pfn;					\
 })
 
 #elif defined(CONFIG_SPARSEMEM)
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 0c1c0c0cce65..ace31c515a8c 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -225,7 +225,6 @@ struct zone {
 	 * Discontig memory support fields.
 	 */
 	struct pglist_data	*zone_pgdat;
-	struct page		*zone_mem_map;
 	/* zone_start_pfn == zone_start_paddr >> PAGE_SHIFT */
 	unsigned long		zone_start_pfn;
 
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 349b328763b7..8dc8f2735d22 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2042,7 +2042,6 @@ static __meminit void init_currently_empty_zone(struct zone *zone,
 	zone_wait_table_init(zone, size);
 	pgdat->nr_zones = zone_idx(zone) + 1;
 
-	zone->zone_mem_map = pfn_to_page(zone_start_pfn);
 	zone->zone_start_pfn = zone_start_pfn;
 
 	memmap_init(size, pgdat->node_id, zone_idx(zone), zone_start_pfn);
@@ -2768,9 +2767,8 @@ struct page *pfn_to_page(unsigned long pfn)
 }
 unsigned long page_to_pfn(struct page *page)
 {
-	struct zone *zone = page_zone(page);
-	return (page - zone->zone_mem_map) + zone->zone_start_pfn;
-
+	struct pglist_data *pgdat = NODE_DATA(page_to_nid(page));
+	return (page - pgdat->node_mem_map) + pgdat->node_start_pfn;
 }
 #elif defined(CONFIG_SPARSEMEM)
 struct page *pfn_to_page(unsigned long pfn)
-- 
cgit v1.2.3


From 679bc9fbb508a0aac9539b2de747eb5849feb428 Mon Sep 17 00:00:00 2001
From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Date: Mon, 27 Mar 2006 01:15:58 -0800
Subject: [PATCH] for_each_online_pgdat: for_each_bootmem

Add a list_head to bootmem_data_t and make bootmems use it.  bootmem list is
sorted by node_boot_start.

Only nodes against which init_bootmem() is called are linked to the list.
(i386 allocates bootmem only from one node(0) not from all online nodes.)

A summary:
 1. for_each_online_pgdat() traverses all *online* nodes.
 2. alloc_bootmem() allocates memory only from initialized-for-bootmem nodes.

Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/bootmem.h |  1 +
 mm/bootmem.c            | 39 +++++++++++++++++++++++++++++----------
 2 files changed, 30 insertions(+), 10 deletions(-)

(limited to 'mm')

diff --git a/include/linux/bootmem.h b/include/linux/bootmem.h
index 7155452fb4a8..de3eb8d8ae26 100644
--- a/include/linux/bootmem.h
+++ b/include/linux/bootmem.h
@@ -38,6 +38,7 @@ typedef struct bootmem_data {
 	unsigned long last_pos;
 	unsigned long last_success;	/* Previous allocation point.  To speed
 					 * up searching */
+	struct list_head list;
 } bootmem_data_t;
 
 extern unsigned long __init bootmem_bootmap_pages (unsigned long);
diff --git a/mm/bootmem.c b/mm/bootmem.c
index b55bd39fc5dd..d3e3bd2ffcea 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -33,6 +33,7 @@ EXPORT_SYMBOL(max_pfn);		/* This is exported so
 				 * dma_get_required_mask(), which uses
 				 * it, can be an inline function */
 
+static LIST_HEAD(bdata_list);
 #ifdef CONFIG_CRASH_DUMP
 /*
  * If we have booted due to a crash, max_pfn will be a very low value. We need
@@ -52,6 +53,27 @@ unsigned long __init bootmem_bootmap_pages (unsigned long pages)
 
 	return mapsize;
 }
+/*
+ * link bdata in order
+ */
+static void link_bootmem(bootmem_data_t *bdata)
+{
+	bootmem_data_t *ent;
+	if (list_empty(&bdata_list)) {
+		list_add(&bdata->list, &bdata_list);
+		return;
+	}
+	/* insert in order */
+	list_for_each_entry(ent, &bdata_list, list) {
+		if (bdata->node_boot_start < ent->node_boot_start) {
+			list_add_tail(&bdata->list, &ent->list);
+			return;
+		}
+	}
+	list_add_tail(&bdata->list, &bdata_list);
+	return;
+}
+
 
 /*
  * Called once to set up the allocator itself.
@@ -62,13 +84,11 @@ static unsigned long __init init_bootmem_core (pg_data_t *pgdat,
 	bootmem_data_t *bdata = pgdat->bdata;
 	unsigned long mapsize = ((end - start)+7)/8;
 
-	pgdat->pgdat_next = pgdat_list;
-	pgdat_list = pgdat;
-
 	mapsize = ALIGN(mapsize, sizeof(long));
 	bdata->node_bootmem_map = phys_to_virt(mapstart << PAGE_SHIFT);
 	bdata->node_boot_start = (start << PAGE_SHIFT);
 	bdata->node_low_pfn = end;
+	link_bootmem(bdata);
 
 	/*
 	 * Initially all pages are reserved - setup_arch() has to
@@ -383,12 +403,11 @@ unsigned long __init free_all_bootmem (void)
 
 void * __init __alloc_bootmem(unsigned long size, unsigned long align, unsigned long goal)
 {
-	pg_data_t *pgdat = pgdat_list;
+	bootmem_data_t *bdata;
 	void *ptr;
 
-	for_each_pgdat(pgdat)
-		if ((ptr = __alloc_bootmem_core(pgdat->bdata, size,
-						 align, goal, 0)))
+	list_for_each_entry(bdata, &bdata_list, list)
+		if ((ptr = __alloc_bootmem_core(bdata, size, align, goal, 0)))
 			return(ptr);
 
 	/*
@@ -416,11 +435,11 @@ void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, unsigne
 
 void * __init __alloc_bootmem_low(unsigned long size, unsigned long align, unsigned long goal)
 {
-	pg_data_t *pgdat = pgdat_list;
+	bootmem_data_t *bdata;
 	void *ptr;
 
-	for_each_pgdat(pgdat)
-		if ((ptr = __alloc_bootmem_core(pgdat->bdata, size,
+	list_for_each_entry(bdata, &bdata_list, list)
+		if ((ptr = __alloc_bootmem_core(bdata, size,
 						 align, goal, LOW32LIMIT)))
 			return(ptr);
 
-- 
cgit v1.2.3


From ec936fc563715a9e2b2e363eb060655b49529325 Mon Sep 17 00:00:00 2001
From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Date: Mon, 27 Mar 2006 01:15:59 -0800
Subject: [PATCH] for_each_online_pgdat: renaming for_each_pgdat

Replace for_each_pgdat() with for_each_online_pgdat().

Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 arch/i386/mm/pgtable.c   | 2 +-
 arch/ia64/mm/discontig.c | 4 ++--
 arch/ia64/mm/init.c      | 2 +-
 arch/m32r/mm/init.c      | 2 +-
 arch/powerpc/mm/mem.c    | 4 ++--
 arch/x86_64/mm/init.c    | 2 +-
 fs/buffer.c              | 2 +-
 mm/page_alloc.c          | 6 +++---
 mm/vmscan.c              | 6 +++---
 9 files changed, 15 insertions(+), 15 deletions(-)

(limited to 'mm')

diff --git a/arch/i386/mm/pgtable.c b/arch/i386/mm/pgtable.c
index 9db3242103be..2889567e21a1 100644
--- a/arch/i386/mm/pgtable.c
+++ b/arch/i386/mm/pgtable.c
@@ -36,7 +36,7 @@ void show_mem(void)
 	printk(KERN_INFO "Mem-info:\n");
 	show_free_areas();
 	printk(KERN_INFO "Free swap:       %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10));
-	for_each_pgdat(pgdat) {
+	for_each_online_pgdat(pgdat) {
 		pgdat_resize_lock(pgdat, &flags);
 		for (i = 0; i < pgdat->node_spanned_pages; ++i) {
 			page = pgdat_page_nr(pgdat, i);
diff --git a/arch/ia64/mm/discontig.c b/arch/ia64/mm/discontig.c
index 2f5e44862e91..384f1d7dce96 100644
--- a/arch/ia64/mm/discontig.c
+++ b/arch/ia64/mm/discontig.c
@@ -386,7 +386,7 @@ static void __init pgdat_insert(pg_data_t *pgdat)
 {
 	pg_data_t *prev = NULL, *next;
 
-	for_each_pgdat(next)
+	for_each_online_pgdat(next)
 		if (pgdat->node_id < next->node_id)
 			break;
 		else
@@ -560,7 +560,7 @@ void show_mem(void)
 	printk("Mem-info:\n");
 	show_free_areas();
 	printk("Free swap:       %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10));
-	for_each_pgdat(pgdat) {
+	for_each_online_pgdat(pgdat) {
 		unsigned long present;
 		unsigned long flags;
 		int shared = 0, cached = 0, reserved = 0;
diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c
index ff4f31fcd330..2ef1151cde90 100644
--- a/arch/ia64/mm/init.c
+++ b/arch/ia64/mm/init.c
@@ -600,7 +600,7 @@ mem_init (void)
 	kclist_add(&kcore_vmem, (void *)VMALLOC_START, VMALLOC_END-VMALLOC_START);
 	kclist_add(&kcore_kernel, _stext, _end - _stext);
 
-	for_each_pgdat(pgdat)
+	for_each_online_pgdat(pgdat)
 		if (pgdat->bdata->node_bootmem_map)
 			totalram_pages += free_all_bootmem_node(pgdat);
 
diff --git a/arch/m32r/mm/init.c b/arch/m32r/mm/init.c
index c9e7dad860b7..2e0fe199ce38 100644
--- a/arch/m32r/mm/init.c
+++ b/arch/m32r/mm/init.c
@@ -47,7 +47,7 @@ void show_mem(void)
 	printk("Mem-info:\n");
 	show_free_areas();
 	printk("Free swap:       %6ldkB\n",nr_swap_pages<<(PAGE_SHIFT-10));
-	for_each_pgdat(pgdat) {
+	for_each_online_pgdat(pgdat) {
 		unsigned long flags;
 		pgdat_resize_lock(pgdat, &flags);
 		for (i = 0; i < pgdat->node_spanned_pages; ++i) {
diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
index badac10d700c..5e435a9c3431 100644
--- a/arch/powerpc/mm/mem.c
+++ b/arch/powerpc/mm/mem.c
@@ -195,7 +195,7 @@ void show_mem(void)
 	printk("Mem-info:\n");
 	show_free_areas();
 	printk("Free swap:       %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10));
-	for_each_pgdat(pgdat) {
+	for_each_online_pgdat(pgdat) {
 		unsigned long flags;
 		pgdat_resize_lock(pgdat, &flags);
 		for (i = 0; i < pgdat->node_spanned_pages; i++) {
@@ -351,7 +351,7 @@ void __init mem_init(void)
 	max_mapnr = max_pfn;
 	totalram_pages += free_all_bootmem();
 #endif
-	for_each_pgdat(pgdat) {
+	for_each_online_pgdat(pgdat) {
 		for (i = 0; i < pgdat->node_spanned_pages; i++) {
 			if (!pfn_valid(pgdat->node_start_pfn + i))
 				continue;
diff --git a/arch/x86_64/mm/init.c b/arch/x86_64/mm/init.c
index b04415625442..e5f7f1c34462 100644
--- a/arch/x86_64/mm/init.c
+++ b/arch/x86_64/mm/init.c
@@ -72,7 +72,7 @@ void show_mem(void)
 	show_free_areas();
 	printk(KERN_INFO "Free swap:       %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10));
 
-	for_each_pgdat(pgdat) {
+	for_each_online_pgdat(pgdat) {
                for (i = 0; i < pgdat->node_spanned_pages; ++i) {
 			page = pfn_to_page(pgdat->node_start_pfn + i);
 			total++;
diff --git a/fs/buffer.c b/fs/buffer.c
index d597758dd129..23f1f3a68077 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -493,7 +493,7 @@ static void free_more_memory(void)
 	wakeup_pdflush(1024);
 	yield();
 
-	for_each_pgdat(pgdat) {
+	for_each_online_pgdat(pgdat) {
 		zones = pgdat->node_zonelists[gfp_zone(GFP_NOFS)].zones;
 		if (*zones)
 			try_to_free_pages(zones, GFP_NOFS);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 8dc8f2735d22..ccc3713dd407 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1201,7 +1201,7 @@ unsigned int nr_free_highpages (void)
 	pg_data_t *pgdat;
 	unsigned int pages = 0;
 
-	for_each_pgdat(pgdat)
+	for_each_online_pgdat(pgdat)
 		pages += pgdat->node_zones[ZONE_HIGHMEM].free_pages;
 
 	return pages;
@@ -1343,7 +1343,7 @@ void get_zone_counts(unsigned long *active,
 	*active = 0;
 	*inactive = 0;
 	*free = 0;
-	for_each_pgdat(pgdat) {
+	for_each_online_pgdat(pgdat) {
 		unsigned long l, m, n;
 		__get_zone_counts(&l, &m, &n, pgdat);
 		*active += l;
@@ -2482,7 +2482,7 @@ static void setup_per_zone_lowmem_reserve(void)
 	struct pglist_data *pgdat;
 	int j, idx;
 
-	for_each_pgdat(pgdat) {
+	for_each_online_pgdat(pgdat) {
 		for (j = 0; j < MAX_NR_ZONES; j++) {
 			struct zone *zone = pgdat->node_zones + j;
 			unsigned long present_pages = zone->present_pages;
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 78865c849f8f..acdf001d6941 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1305,7 +1305,7 @@ unsigned long shrink_all_memory(unsigned long nr_pages)
 
 	current->reclaim_state = &reclaim_state;
 repeat:
-	for_each_pgdat(pgdat) {
+	for_each_online_pgdat(pgdat) {
 		unsigned long freed;
 
 		freed = balance_pgdat(pgdat, nr_to_free, 0);
@@ -1335,7 +1335,7 @@ static int __devinit cpu_callback(struct notifier_block *nfb,
 	cpumask_t mask;
 
 	if (action == CPU_ONLINE) {
-		for_each_pgdat(pgdat) {
+		for_each_online_pgdat(pgdat) {
 			mask = node_to_cpumask(pgdat->node_id);
 			if (any_online_cpu(mask) != NR_CPUS)
 				/* One of our CPUs online: restore mask */
@@ -1351,7 +1351,7 @@ static int __init kswapd_init(void)
 	pg_data_t *pgdat;
 
 	swap_setup();
-	for_each_pgdat(pgdat) {
+	for_each_online_pgdat(pgdat) {
 		pid_t pid;
 
 		pid = kernel_thread(kswapd, pgdat, CLONE_KERNEL);
-- 
cgit v1.2.3


From ae0f15fb91274e67d78836d38c99ec363df33073 Mon Sep 17 00:00:00 2001
From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Date: Mon, 27 Mar 2006 01:16:01 -0800
Subject: [PATCH] for_each_online_pgdat: remove pgdat_list

By using for_each_online_pgdat(), pgdat_list is not necessary now.  This patch
removes it.

Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/mmzone.h | 3 ---
 mm/page_alloc.c        | 8 ++++----
 2 files changed, 4 insertions(+), 7 deletions(-)

(limited to 'mm')

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 96eb08025092..0d12c3cf1f86 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -307,7 +307,6 @@ typedef struct pglist_data {
 	unsigned long node_spanned_pages; /* total size of physical page
 					     range, including holes */
 	int node_id;
-	struct pglist_data *pgdat_next;
 	wait_queue_head_t kswapd_wait;
 	struct task_struct *kswapd;
 	int kswapd_max_order;
@@ -324,8 +323,6 @@ typedef struct pglist_data {
 
 #include <linux/memory_hotplug.h>
 
-extern struct pglist_data *pgdat_list;
-
 void __get_zone_counts(unsigned long *active, unsigned long *inactive,
 			unsigned long *free, struct pglist_data *pgdat);
 void get_zone_counts(unsigned long *active, unsigned long *inactive,
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index ccc3713dd407..dc523a1f270d 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -49,7 +49,6 @@ nodemask_t node_online_map __read_mostly = { { [0] = 1UL } };
 EXPORT_SYMBOL(node_online_map);
 nodemask_t node_possible_map __read_mostly = NODE_MASK_ALL;
 EXPORT_SYMBOL(node_possible_map);
-struct pglist_data *pgdat_list __read_mostly;
 unsigned long totalram_pages __read_mostly;
 unsigned long totalhigh_pages __read_mostly;
 long nr_swap_pages;
@@ -2169,8 +2168,9 @@ static void *frag_start(struct seq_file *m, loff_t *pos)
 {
 	pg_data_t *pgdat;
 	loff_t node = *pos;
-
-	for (pgdat = pgdat_list; pgdat && node; pgdat = pgdat->pgdat_next)
+	for (pgdat = first_online_pgdat();
+	     pgdat && node;
+	     pgdat = next_online_pgdat(pgdat))
 		--node;
 
 	return pgdat;
@@ -2181,7 +2181,7 @@ static void *frag_next(struct seq_file *m, void *arg, loff_t *pos)
 	pg_data_t *pgdat = (pg_data_t *)arg;
 
 	(*pos)++;
-	return pgdat->pgdat_next;
+	return next_online_pgdat(pgdat);
 }
 
 static void frag_stop(struct seq_file *m, void *arg)
-- 
cgit v1.2.3


From 95144c788dc01b6a0ff2c9c2222e37ffdab358b8 Mon Sep 17 00:00:00 2001
From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Date: Mon, 27 Mar 2006 01:16:02 -0800
Subject: [PATCH] uninline zone helpers

Helper functions for for_each_online_pgdat/for_each_zone look too big to be
inlined.  Speed of these helper macro itself is not very important.  (inner
loops are tend to do more work than this)

This patch make helper function to be out-of-lined.

	inline		out-of-line
.text   005c0680        005bf6a0

005c0680 - 005bf6a0 = FE0 = 4Kbytes.

Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 include/linux/mmzone.h | 38 +++-----------------------------------
 mm/Makefile            |  2 +-
 mm/mmzone.c            | 50 ++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 54 insertions(+), 36 deletions(-)
 create mode 100644 mm/mmzone.c

(limited to 'mm')

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 0d12c3cf1f86..b5c21122c299 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -418,20 +418,9 @@ extern struct pglist_data contig_page_data;
 
 #endif /* !CONFIG_NEED_MULTIPLE_NODES */
 
-static inline struct pglist_data *first_online_pgdat(void)
-{
-	return NODE_DATA(first_online_node);
-}
-
-static inline struct pglist_data *next_online_pgdat(struct pglist_data *pgdat)
-{
-	int nid = next_online_node(pgdat->node_id);
-
-	if (nid == MAX_NUMNODES)
-		return NULL;
-	return NODE_DATA(nid);
-}
-
+extern struct pglist_data *first_online_pgdat(void);
+extern struct pglist_data *next_online_pgdat(struct pglist_data *pgdat);
+extern struct zone *next_zone(struct zone *zone);
 
 /**
  * for_each_pgdat - helper macro to iterate over all nodes
@@ -441,27 +430,6 @@ static inline struct pglist_data *next_online_pgdat(struct pglist_data *pgdat)
 	for (pgdat = first_online_pgdat();		\
 	     pgdat;					\
 	     pgdat = next_online_pgdat(pgdat))
-
-/*
- * next_zone - helper magic for for_each_zone()
- * Thanks to William Lee Irwin III for this piece of ingenuity.
- */
-static inline struct zone *next_zone(struct zone *zone)
-{
-	pg_data_t *pgdat = zone->zone_pgdat;
-
-	if (zone < pgdat->node_zones + MAX_NR_ZONES - 1)
-		zone++;
-	else {
-		pgdat = next_online_pgdat(pgdat);
-		if (pgdat)
-			zone = pgdat->node_zones;
-		else
-			zone = NULL;
-	}
-	return zone;
-}
-
 /**
  * for_each_zone - helper macro to iterate over all memory zones
  * @zone - pointer to struct zone variable
diff --git a/mm/Makefile b/mm/Makefile
index f10c753dce6d..0b8f73f2ed16 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -10,7 +10,7 @@ mmu-$(CONFIG_MMU)	:= fremap.o highmem.o madvise.o memory.o mincore.o \
 obj-y			:= bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \
 			   page_alloc.o page-writeback.o pdflush.o \
 			   readahead.o swap.o truncate.o vmscan.o \
-			   prio_tree.o util.o $(mmu-y)
+			   prio_tree.o util.o mmzone.o $(mmu-y)
 
 obj-$(CONFIG_SWAP)	+= page_io.o swap_state.o swapfile.o thrash.o
 obj-$(CONFIG_HUGETLBFS)	+= hugetlb.o
diff --git a/mm/mmzone.c b/mm/mmzone.c
new file mode 100644
index 000000000000..b022370e612e
--- /dev/null
+++ b/mm/mmzone.c
@@ -0,0 +1,50 @@
+/*
+ * linux/mm/mmzone.c
+ *
+ * management codes for pgdats and zones.
+ */
+
+
+#include <linux/config.h>
+#include <linux/stddef.h>
+#include <linux/mmzone.h>
+#include <linux/module.h>
+
+struct pglist_data *first_online_pgdat(void)
+{
+	return NODE_DATA(first_online_node);
+}
+
+EXPORT_SYMBOL(first_online_pgdat);
+
+struct pglist_data *next_online_pgdat(struct pglist_data *pgdat)
+{
+	int nid = next_online_node(pgdat->node_id);
+
+	if (nid == MAX_NUMNODES)
+		return NULL;
+	return NODE_DATA(nid);
+}
+EXPORT_SYMBOL(next_online_pgdat);
+
+
+/*
+ * next_zone - helper magic for for_each_zone()
+ */
+struct zone *next_zone(struct zone *zone)
+{
+	pg_data_t *pgdat = zone->zone_pgdat;
+
+	if (zone < pgdat->node_zones + MAX_NR_ZONES - 1)
+		zone++;
+	else {
+		pgdat = next_online_pgdat(pgdat);
+		if (pgdat)
+			zone = pgdat->node_zones;
+		else
+			zone = NULL;
+	}
+	return zone;
+}
+EXPORT_SYMBOL(next_zone);
+
-- 
cgit v1.2.3


From 0a945022778f100115d0cb6234eb28fc1b15ccaf Mon Sep 17 00:00:00 2001
From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Date: Tue, 28 Mar 2006 01:56:37 -0800
Subject: [PATCH] for_each_possible_cpu: fixes for generic part

replaces for_each_cpu with for_each_possible_cpu().

Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 block/ll_rw_blk.c            | 2 +-
 fs/file.c                    | 2 +-
 fs/proc/proc_misc.c          | 2 +-
 include/asm-generic/percpu.h | 2 +-
 include/linux/genhd.h        | 4 ++--
 include/linux/kernel_stat.h  | 2 +-
 init/main.c                  | 2 +-
 kernel/rcutorture.c          | 4 ++--
 kernel/sched.c               | 8 ++++----
 mm/slab.c                    | 4 ++--
 mm/swap.c                    | 2 +-
 11 files changed, 17 insertions(+), 17 deletions(-)

(limited to 'mm')

diff --git a/block/ll_rw_blk.c b/block/ll_rw_blk.c
index 82469db25100..5a19e2eb5711 100644
--- a/block/ll_rw_blk.c
+++ b/block/ll_rw_blk.c
@@ -3514,7 +3514,7 @@ int __init blk_dev_init(void)
 	iocontext_cachep = kmem_cache_create("blkdev_ioc",
 			sizeof(struct io_context), 0, SLAB_PANIC, NULL, NULL);
 
-	for_each_cpu(i)
+	for_each_possible_cpu(i)
 		INIT_LIST_HEAD(&per_cpu(blk_cpu_done, i));
 
 	open_softirq(BLOCK_SOFTIRQ, blk_done_softirq, NULL);
diff --git a/fs/file.c b/fs/file.c
index bbc743314730..55f4e7022563 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -373,6 +373,6 @@ static void __devinit fdtable_defer_list_init(int cpu)
 void __init files_defer_init(void)
 {
 	int i;
-	for_each_cpu(i)
+	for_each_possible_cpu(i)
 		fdtable_defer_list_init(i);
 }
diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c
index 1e9ea37d457e..1edce0c34bfd 100644
--- a/fs/proc/proc_misc.c
+++ b/fs/proc/proc_misc.c
@@ -534,7 +534,7 @@ static int show_stat(struct seq_file *p, void *v)
 	if (wall_to_monotonic.tv_nsec)
 		--jif;
 
-	for_each_cpu(i) {
+	for_each_possible_cpu(i) {
 		int j;
 
 		user = cputime64_add(user, kstat_cpu(i).cpustat.user);
diff --git a/include/asm-generic/percpu.h b/include/asm-generic/percpu.h
index 78cf45547e31..c0caf433a7d7 100644
--- a/include/asm-generic/percpu.h
+++ b/include/asm-generic/percpu.h
@@ -19,7 +19,7 @@ extern unsigned long __per_cpu_offset[NR_CPUS];
 #define percpu_modcopy(pcpudst, src, size)			\
 do {								\
 	unsigned int __i;					\
-	for_each_cpu(__i)					\
+	for_each_possible_cpu(__i)				\
 		memcpy((pcpudst)+__per_cpu_offset[__i],		\
 		       (src), (size));				\
 } while (0)
diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index 3c1b0294a742..10a27f29d692 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -152,14 +152,14 @@ struct disk_attribute {
 ({									\
 	typeof(gendiskp->dkstats->field) res = 0;			\
 	int i;								\
-	for_each_cpu(i)							\
+	for_each_possible_cpu(i)					\
 		res += per_cpu_ptr(gendiskp->dkstats, i)->field;	\
 	res;								\
 })
 
 static inline void disk_stat_set_all(struct gendisk *gendiskp, int value)	{
 	int i;
-	for_each_cpu(i)
+	for_each_possible_cpu(i)
 		memset(per_cpu_ptr(gendiskp->dkstats, i), value,
 				sizeof (struct disk_stats));
 }		
diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h
index a484572c302e..b46249082cca 100644
--- a/include/linux/kernel_stat.h
+++ b/include/linux/kernel_stat.h
@@ -46,7 +46,7 @@ static inline int kstat_irqs(int irq)
 {
 	int cpu, sum = 0;
 
-	for_each_cpu(cpu)
+	for_each_possible_cpu(cpu)
 		sum += kstat_cpu(cpu).irqs[irq];
 
 	return sum;
diff --git a/init/main.c b/init/main.c
index 64466ea1984c..4a2f0898dda1 100644
--- a/init/main.c
+++ b/init/main.c
@@ -341,7 +341,7 @@ static void __init setup_per_cpu_areas(void)
 #endif
 	ptr = alloc_bootmem(size * nr_possible_cpus);
 
-	for_each_cpu(i) {
+	for_each_possible_cpu(i) {
 		__per_cpu_offset[i] = ptr - __per_cpu_start;
 		memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start);
 		ptr += size;
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index b4b362b5baf5..8154e7589d12 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -301,7 +301,7 @@ rcu_torture_printk(char *page)
 	long pipesummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 };
 	long batchsummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 };
 
-	for_each_cpu(cpu) {
+	for_each_possible_cpu(cpu) {
 		for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) {
 			pipesummary[i] += per_cpu(rcu_torture_count, cpu)[i];
 			batchsummary[i] += per_cpu(rcu_torture_batch, cpu)[i];
@@ -535,7 +535,7 @@ rcu_torture_init(void)
 	atomic_set(&n_rcu_torture_error, 0);
 	for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++)
 		atomic_set(&rcu_torture_wcount[i], 0);
-	for_each_cpu(cpu) {
+	for_each_possible_cpu(cpu) {
 		for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) {
 			per_cpu(rcu_torture_count, cpu)[i] = 0;
 			per_cpu(rcu_torture_batch, cpu)[i] = 0;
diff --git a/kernel/sched.c b/kernel/sched.c
index 7854ee516b92..a9ecac398bb9 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1625,7 +1625,7 @@ unsigned long nr_uninterruptible(void)
 {
 	unsigned long i, sum = 0;
 
-	for_each_cpu(i)
+	for_each_possible_cpu(i)
 		sum += cpu_rq(i)->nr_uninterruptible;
 
 	/*
@@ -1642,7 +1642,7 @@ unsigned long long nr_context_switches(void)
 {
 	unsigned long long i, sum = 0;
 
-	for_each_cpu(i)
+	for_each_possible_cpu(i)
 		sum += cpu_rq(i)->nr_switches;
 
 	return sum;
@@ -1652,7 +1652,7 @@ unsigned long nr_iowait(void)
 {
 	unsigned long i, sum = 0;
 
-	for_each_cpu(i)
+	for_each_possible_cpu(i)
 		sum += atomic_read(&cpu_rq(i)->nr_iowait);
 
 	return sum;
@@ -6080,7 +6080,7 @@ void __init sched_init(void)
 	runqueue_t *rq;
 	int i, j, k;
 
-	for_each_cpu(i) {
+	for_each_possible_cpu(i) {
 		prio_array_t *array;
 
 		rq = cpu_rq(i);
diff --git a/mm/slab.c b/mm/slab.c
index 681837499d7d..4cbf8bb13557 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -3311,7 +3311,7 @@ void *__alloc_percpu(size_t size)
 	 * and we have no way of figuring out how to fix the array
 	 * that we have allocated then....
 	 */
-	for_each_cpu(i) {
+	for_each_possible_cpu(i) {
 		int node = cpu_to_node(i);
 
 		if (node_online(node))
@@ -3398,7 +3398,7 @@ void free_percpu(const void *objp)
 	/*
 	 * We allocate for all cpus so we cannot use for online cpu here.
 	 */
-	for_each_cpu(i)
+	for_each_possible_cpu(i)
 	    kfree(p->ptrs[i]);
 	kfree(p);
 }
diff --git a/mm/swap.c b/mm/swap.c
index 91b7e2026f69..88895c249bc9 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -512,7 +512,7 @@ long percpu_counter_sum(struct percpu_counter *fbc)
 
 	spin_lock(&fbc->lock);
 	ret = fbc->count;
-	for_each_cpu(cpu) {
+	for_each_possible_cpu(cpu) {
 		long *pcount = per_cpu_ptr(fbc->counters, cpu);
 		ret += *pcount;
 	}
-- 
cgit v1.2.3


From 7f927fcc2fd1575d01efb4b76665975007945690 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Tue, 28 Mar 2006 01:56:53 -0800
Subject: [PATCH] Typo fixes

Fix a lot of typos.  Eyeballed by jmc@ in OpenBSD.

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
---
 Documentation/m68k/README.buddha         | 2 +-
 Documentation/networking/ifenslave.c     | 2 +-
 arch/arm/lib/copy_template.S             | 2 +-
 drivers/char/mxser.h                     | 2 +-
 drivers/char/synclink.c                  | 2 +-
 drivers/edac/edac_mc.c                   | 2 +-
 drivers/net/irda/nsc-ircc.c              | 2 +-
 drivers/net/sis900.c                     | 2 +-
 drivers/net/tulip/de4x5.c                | 2 +-
 drivers/net/tulip/pnic2.c                | 2 +-
 drivers/net/typhoon.c                    | 4 ++--
 drivers/net/wireless/orinoco.c           | 2 +-
 drivers/net/wireless/prism54/isl_ioctl.c | 2 +-
 drivers/scsi/3w-9xxx.c                   | 2 +-
 drivers/serial/8250.c                    | 2 +-
 drivers/serial/serial_txx9.c             | 2 +-
 drivers/serial/sunsu.c                   | 2 +-
 drivers/usb/host/ohci-s3c2410.c          | 2 +-
 drivers/usb/net/zaurus.c                 | 2 +-
 fs/mbcache.c                             | 2 +-
 include/asm-parisc/pdc.h                 | 2 +-
 include/asm-sh/addrspace.h               | 2 +-
 include/asm-sparc64/floppy.h             | 2 +-
 include/linux/fb.h                       | 2 +-
 mm/mempolicy.c                           | 2 +-
 net/irda/af_irda.c                       | 6 +++---
 sound/pci/rme32.c                        | 8 ++++----
 sound/pci/rme96.c                        | 8 ++++----
 sound/pci/rme9652/hdspm.c                | 2 +-
 sound/usb/usx2y/usx2yhwdeppcm.c          | 2 +-
 30 files changed, 39 insertions(+), 39 deletions(-)

(limited to 'mm')

diff --git a/Documentation/m68k/README.buddha b/Documentation/m68k/README.buddha
index bf802ffc98ad..ef484a719bb9 100644
--- a/Documentation/m68k/README.buddha
+++ b/Documentation/m68k/README.buddha
@@ -29,7 +29,7 @@ address is written to $4a, then the whole Byte is written to
 $48, while it doesn't matter how often you're writing to $4a
 as  long as $48 is not touched.  After $48 has been written,
 the  whole card disappears from $e8 and is mapped to the new
-address just written.  Make shure $4a is written before $48,
+address just written.  Make sure $4a is written before $48,
 otherwise your chance is only 1:16 to find the board :-).
 
 The local memory-map is even active when mapped to $e8:
diff --git a/Documentation/networking/ifenslave.c b/Documentation/networking/ifenslave.c
index 545447ac503a..a12059886755 100644
--- a/Documentation/networking/ifenslave.c
+++ b/Documentation/networking/ifenslave.c
@@ -87,7 +87,7 @@
  *	   would fail and generate an error message in the system log.
  * 	 - For opt_c: slave should not be set to the master's setting
  *	   while it is running. It was already set during enslave. To
- *	   simplify things, it is now handeled separately.
+ *	   simplify things, it is now handled separately.
  *
  *    - 2003/12/01 - Shmulik Hen <shmulik.hen at intel dot com>
  *	 - Code cleanup and style changes
diff --git a/arch/arm/lib/copy_template.S b/arch/arm/lib/copy_template.S
index 838e435e4922..cab355c0c1f7 100644
--- a/arch/arm/lib/copy_template.S
+++ b/arch/arm/lib/copy_template.S
@@ -236,7 +236,7 @@
 
 
 /*
- * Abort preanble and completion macros.
+ * Abort preamble and completion macros.
  * If a fixup handler is required then those macros must surround it.
  * It is assumed that the fixup code will handle the private part of
  * the exit macro.
diff --git a/drivers/char/mxser.h b/drivers/char/mxser.h
index e7fd0b08e0b7..7e188a4d602a 100644
--- a/drivers/char/mxser.h
+++ b/drivers/char/mxser.h
@@ -118,7 +118,7 @@
 
 // enable CTS interrupt
 #define MOXA_MUST_IER_ECTSI		0x80
-// eanble RTS interrupt
+// enable RTS interrupt
 #define MOXA_MUST_IER_ERTSI		0x40
 // enable Xon/Xoff interrupt
 #define MOXA_MUST_IER_XINT		0x20
diff --git a/drivers/char/synclink.c b/drivers/char/synclink.c
index abb03e52fe12..fee2aca3f6a5 100644
--- a/drivers/char/synclink.c
+++ b/drivers/char/synclink.c
@@ -5996,7 +5996,7 @@ static void usc_set_async_mode( struct mgsl_struct *info )
 	 * <15..8>	?		RxFIFO IRQ Request Level
 	 *
 	 * Note: For async mode the receive FIFO level must be set
-	 * to 0 to aviod the situation where the FIFO contains fewer bytes
+	 * to 0 to avoid the situation where the FIFO contains fewer bytes
 	 * than the trigger level and no more data is expected.
 	 *
 	 * <7>		0		Exited Hunt IA (Interrupt Arm)
diff --git a/drivers/edac/edac_mc.c b/drivers/edac/edac_mc.c
index 905f58ba8e16..ea06e3a4dc35 100644
--- a/drivers/edac/edac_mc.c
+++ b/drivers/edac/edac_mc.c
@@ -2044,7 +2044,7 @@ static int __init edac_mc_init(void)
 	 */
 	clear_pci_parity_errors();
 
-	/* Create the MC sysfs entires */
+	/* Create the MC sysfs entries */
 	if (edac_sysfs_memctrl_setup()) {
 		edac_printk(KERN_ERR, EDAC_MC,
 			"Error initializing sysfs code\n");
diff --git a/drivers/net/irda/nsc-ircc.c b/drivers/net/irda/nsc-ircc.c
index 9aa074b44dd3..cc7ff8f00e42 100644
--- a/drivers/net/irda/nsc-ircc.c
+++ b/drivers/net/irda/nsc-ircc.c
@@ -812,7 +812,7 @@ static int nsc_ircc_init_39x(nsc_chip_t *chip, chipio_t *info)
 	int cfg_base = info->cfg_base;
 	int enabled;
 
-	/* User is shure about his config... accept it. */
+	/* User is sure about his config... accept it. */
 	IRDA_DEBUG(2, "%s(): nsc_ircc_init_39x (user settings): "
 		   "io=0x%04x, irq=%d, dma=%d\n", 
 		   __FUNCTION__, info->fir_base, info->irq, info->dma);
diff --git a/drivers/net/sis900.c b/drivers/net/sis900.c
index 8429ceb01389..b82191d2bee1 100644
--- a/drivers/net/sis900.c
+++ b/drivers/net/sis900.c
@@ -2283,7 +2283,7 @@ static void set_rx_mode(struct net_device *net_dev)
 	int i, table_entries;
 	u32 rx_mode;
 
-	/* 635 Hash Table entires = 256(2^16) */
+	/* 635 Hash Table entries = 256(2^16) */
 	if((sis_priv->chipset_rev >= SIS635A_900_REV) ||
 			(sis_priv->chipset_rev == SIS900B_900_REV))
 		table_entries = 16;
diff --git a/drivers/net/tulip/de4x5.c b/drivers/net/tulip/de4x5.c
index ee48bfd67349..d1a86a080a65 100644
--- a/drivers/net/tulip/de4x5.c
+++ b/drivers/net/tulip/de4x5.c
@@ -513,7 +513,7 @@ struct mii_phy {
     u_char  *rst;           /* Start of reset sequence in SROM           */
     u_int mc;               /* Media Capabilities                        */
     u_int ana;              /* NWay Advertisement                        */
-    u_int fdx;              /* Full DupleX capabilites for each media    */
+    u_int fdx;              /* Full DupleX capabilities for each media   */
     u_int ttm;              /* Transmit Threshold Mode for each media    */
     u_int mci;              /* 21142 MII Connector Interrupt info        */
 };
diff --git a/drivers/net/tulip/pnic2.c b/drivers/net/tulip/pnic2.c
index 55f4a9a631bc..ab985023fcca 100644
--- a/drivers/net/tulip/pnic2.c
+++ b/drivers/net/tulip/pnic2.c
@@ -199,7 +199,7 @@ void pnic2_lnk_change(struct net_device *dev, int csr5)
 	               /* negotiation ended successfully */
 
 	               /* get the link partners reply and mask out all but
-                        * bits 24-21 which show the partners capabilites
+                        * bits 24-21 which show the partners capabilities
                         * and match those to what we advertised
                         *
                         * then begin to interpret the results of the negotiation.
diff --git a/drivers/net/typhoon.c b/drivers/net/typhoon.c
index cde35dd87906..c1ce87a5f8d3 100644
--- a/drivers/net/typhoon.c
+++ b/drivers/net/typhoon.c
@@ -208,7 +208,7 @@ static const struct typhoon_card_info typhoon_card_info[] __devinitdata = {
 };
 
 /* Notes on the new subsystem numbering scheme:
- * bits 0-1 indicate crypto capabilites: (0) variable, (1) DES, or (2) 3DES
+ * bits 0-1 indicate crypto capabilities: (0) variable, (1) DES, or (2) 3DES
  * bit 4 indicates if this card has secured firmware (we don't support it)
  * bit 8 indicates if this is a (0) copper or (1) fiber card
  * bits 12-16 indicate card type: (0) client and (1) server
@@ -788,7 +788,7 @@ typhoon_start_tx(struct sk_buff *skb, struct net_device *dev)
 	/* we have two rings to choose from, but we only use txLo for now
 	 * If we start using the Hi ring as well, we'll need to update
 	 * typhoon_stop_runtime(), typhoon_interrupt(), typhoon_num_free_tx(),
-	 * and TXHI_ENTIRES to match, as well as update the TSO code below
+	 * and TXHI_ENTRIES to match, as well as update the TSO code below
 	 * to get the right DMA address
 	 */
 	txRing = &tp->txLoRing;
diff --git a/drivers/net/wireless/orinoco.c b/drivers/net/wireless/orinoco.c
index 6fd0bf736830..8dfdfbd5966c 100644
--- a/drivers/net/wireless/orinoco.c
+++ b/drivers/net/wireless/orinoco.c
@@ -3858,7 +3858,7 @@ static int orinoco_ioctl_setscan(struct net_device *dev,
 	unsigned long flags;
 
 	/* Note : you may have realised that, as this is a SET operation,
-	 * this is priviledged and therefore a normal user can't
+	 * this is privileged and therefore a normal user can't
 	 * perform scanning.
 	 * This is not an error, while the device perform scanning,
 	 * traffic doesn't flow, so it's a perfect DoS...
diff --git a/drivers/net/wireless/prism54/isl_ioctl.c b/drivers/net/wireless/prism54/isl_ioctl.c
index e5bb9f5ae429..989599ad33ef 100644
--- a/drivers/net/wireless/prism54/isl_ioctl.c
+++ b/drivers/net/wireless/prism54/isl_ioctl.c
@@ -747,7 +747,7 @@ prism54_get_essid(struct net_device *ndev, struct iw_request_info *info,
 
 	if (essid->length) {
 		dwrq->flags = 1;	/* set ESSID to ON for Wireless Extensions */
-		/* if it is to big, trunk it */
+		/* if it is too big, trunk it */
 		dwrq->length = min((u8)IW_ESSID_MAX_SIZE, essid->length);
 	} else {
 		dwrq->flags = 0;
diff --git a/drivers/scsi/3w-9xxx.c b/drivers/scsi/3w-9xxx.c
index 0ab26d01877b..0d2b447c50ed 100644
--- a/drivers/scsi/3w-9xxx.c
+++ b/drivers/scsi/3w-9xxx.c
@@ -1026,7 +1026,7 @@ static void twa_free_request_id(TW_Device_Extension *tw_dev, int request_id)
 	tw_dev->free_tail = (tw_dev->free_tail + 1) % TW_Q_LENGTH;
 } /* End twa_free_request_id() */
 
-/* This function will get parameter table entires from the firmware */
+/* This function will get parameter table entries from the firmware */
 static void *twa_get_param(TW_Device_Extension *tw_dev, int request_id, int table_id, int parameter_id, int parameter_size_bytes)
 {
 	TW_Command_Full *full_command_packet;
diff --git a/drivers/serial/8250.c b/drivers/serial/8250.c
index 5996d3cd0ed8..674b15c78f68 100644
--- a/drivers/serial/8250.c
+++ b/drivers/serial/8250.c
@@ -1528,7 +1528,7 @@ static int serial8250_startup(struct uart_port *port)
 
 	/*
 	 * Clear the FIFO buffers and disable them.
-	 * (they will be reeanbled in set_termios())
+	 * (they will be reenabled in set_termios())
 	 */
 	serial8250_clear_fifos(up);
 
diff --git a/drivers/serial/serial_txx9.c b/drivers/serial/serial_txx9.c
index b848b7d94412..3bdee64d1a99 100644
--- a/drivers/serial/serial_txx9.c
+++ b/drivers/serial/serial_txx9.c
@@ -483,7 +483,7 @@ static int serial_txx9_startup(struct uart_port *port)
 
 	/*
 	 * Clear the FIFO buffers and disable them.
-	 * (they will be reeanbled in set_termios())
+	 * (they will be reenabled in set_termios())
 	 */
 	sio_set(up, TXX9_SIFCR,
 		TXX9_SIFCR_TFRST | TXX9_SIFCR_RFRST | TXX9_SIFCR_FRSTE);
diff --git a/drivers/serial/sunsu.c b/drivers/serial/sunsu.c
index 9fe2283d91e5..1c4396c2962d 100644
--- a/drivers/serial/sunsu.c
+++ b/drivers/serial/sunsu.c
@@ -641,7 +641,7 @@ static int sunsu_startup(struct uart_port *port)
 
 	/*
 	 * Clear the FIFO buffers and disable them.
-	 * (they will be reeanbled in set_termios())
+	 * (they will be reenabled in set_termios())
 	 */
 	if (uart_config[up->port.type].flags & UART_CLEAR_FIFO) {
 		serial_outp(up, UART_FCR, UART_FCR_ENABLE_FIFO);
diff --git a/drivers/usb/host/ohci-s3c2410.c b/drivers/usb/host/ohci-s3c2410.c
index 372527a83593..682bf2215660 100644
--- a/drivers/usb/host/ohci-s3c2410.c
+++ b/drivers/usb/host/ohci-s3c2410.c
@@ -158,7 +158,7 @@ static int ohci_s3c2410_hub_control (
 		"s3c2410_hub_control(%p,0x%04x,0x%04x,0x%04x,%p,%04x)\n",
 		hcd, typeReq, wValue, wIndex, buf, wLength);
 
-	/* if we are only an humble host without any special capabilites
+	/* if we are only an humble host without any special capabilities
 	 * process the request straight away and exit */
 
 	if (info == NULL) {
diff --git a/drivers/usb/net/zaurus.c b/drivers/usb/net/zaurus.c
index 9c5ab251370c..f7ac9d6b9856 100644
--- a/drivers/usb/net/zaurus.c
+++ b/drivers/usb/net/zaurus.c
@@ -217,7 +217,7 @@ static int blan_mdlm_bind(struct usbnet *dev, struct usb_interface *intf)
 			 * with devices that use it and those that don't.
 			 */
 			if ((detail->bDetailData[1] & ~0x02) != 0x01) {
-				/* bmDataCapabilites == 0 would be fine too,
+				/* bmDataCapabilities == 0 would be fine too,
 				 * but framing is minidriver-coupled for now.
 				 */
 bad_detail:
diff --git a/fs/mbcache.c b/fs/mbcache.c
index 73e754fea2d8..e4fde1ab22cd 100644
--- a/fs/mbcache.c
+++ b/fs/mbcache.c
@@ -311,7 +311,7 @@ fail:
 /*
  * mb_cache_shrink()
  *
- * Removes all cache entires of a device from the cache. All cache entries
+ * Removes all cache entries of a device from the cache. All cache entries
  * currently in use cannot be freed, and thus remain in the cache. All others
  * are freed.
  *
diff --git a/include/asm-parisc/pdc.h b/include/asm-parisc/pdc.h
index 8e23e4c674f6..0a3face6c480 100644
--- a/include/asm-parisc/pdc.h
+++ b/include/asm-parisc/pdc.h
@@ -333,7 +333,7 @@ struct pdc_model {		/* for PDC_MODEL */
 	unsigned long curr_key;
 };
 
-/* Values for PDC_MODEL_CAPABILITES non-equivalent virtual aliasing support */
+/* Values for PDC_MODEL_CAPABILITIES non-equivalent virtual aliasing support */
 
 #define PDC_MODEL_IOPDIR_FDC            (1 << 2)        /* see sba_iommu.c */
 #define PDC_MODEL_NVA_MASK		(3 << 4)
diff --git a/include/asm-sh/addrspace.h b/include/asm-sh/addrspace.h
index dbb05d1a26d1..720afc11c2ca 100644
--- a/include/asm-sh/addrspace.h
+++ b/include/asm-sh/addrspace.h
@@ -13,7 +13,7 @@
 
 #include <asm/cpu/addrspace.h>
 
-/* Memory segments (32bit Priviledged mode addresses)  */
+/* Memory segments (32bit Privileged mode addresses)  */
 #define P0SEG		0x00000000
 #define P1SEG		0x80000000
 #define P2SEG		0xa0000000
diff --git a/include/asm-sparc64/floppy.h b/include/asm-sparc64/floppy.h
index 49d49a285943..6a95d5d0c576 100644
--- a/include/asm-sparc64/floppy.h
+++ b/include/asm-sparc64/floppy.h
@@ -738,7 +738,7 @@ static unsigned long __init sun_floppy_init(void)
 		if (!sun_floppy_types[0] && sun_floppy_types[1]) {
 			/*
 			 * Set the drive exchange bit in FCR on NS87303,
-			 * make shure other bits are sane before doing so.
+			 * make sure other bits are sane before doing so.
 			 */
 			ns87303_modify(config, FER, FER_EDM, 0);
 			ns87303_modify(config, ASC, ASC_DRV2_SEL, 0);
diff --git a/include/linux/fb.h b/include/linux/fb.h
index 2cb19e6503aa..d03fadfcafe3 100644
--- a/include/linux/fb.h
+++ b/include/linux/fb.h
@@ -734,7 +734,7 @@ struct fb_tile_ops {
 
 /* A driver may set this flag to indicate that it does want a set_par to be
  * called every time when fbcon_switch is executed. The advantage is that with
- * this flag set you can really be shure that set_par is always called before
+ * this flag set you can really be sure that set_par is always called before
  * any of the functions dependant on the correct hardware state or altering
  * that state, even if you are using some broken X releases. The disadvantage
  * is that it introduces unwanted delays to every console switch if set_par
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 4f71cfd29c6f..dec8249e972d 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -912,7 +912,7 @@ asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode,
 	/*
 	 * Check if this process has the right to modify the specified
 	 * process. The right exists if the process has administrative
-	 * capabilities, superuser priviledges or the same
+	 * capabilities, superuser privileges or the same
 	 * userid as the target process.
 	 */
 	if ((current->euid != task->suid) && (current->euid != task->uid) &&
diff --git a/net/irda/af_irda.c b/net/irda/af_irda.c
index 759445648667..627b11342233 100644
--- a/net/irda/af_irda.c
+++ b/net/irda/af_irda.c
@@ -1302,7 +1302,7 @@ static int irda_sendmsg(struct kiocb *iocb, struct socket *sock,
 	if (sk->sk_state != TCP_ESTABLISHED)
 		return -ENOTCONN;
 
-	/* Check that we don't send out to big frames */
+	/* Check that we don't send out too big frames */
 	if (len > self->max_data_size) {
 		IRDA_DEBUG(2, "%s(), Chopping frame from %zd to %d bytes!\n",
 			   __FUNCTION__, len, self->max_data_size);
@@ -1546,7 +1546,7 @@ static int irda_sendmsg_dgram(struct kiocb *iocb, struct socket *sock,
 	IRDA_ASSERT(self != NULL, return -1;);
 
 	/*
-	 * Check that we don't send out to big frames. This is an unreliable
+	 * Check that we don't send out too big frames. This is an unreliable
 	 * service, so we have no fragmentation and no coalescence
 	 */
 	if (len > self->max_data_size) {
@@ -1642,7 +1642,7 @@ static int irda_sendmsg_ultra(struct kiocb *iocb, struct socket *sock,
 	}
 
 	/*
-	 * Check that we don't send out to big frames. This is an unreliable
+	 * Check that we don't send out too big frames. This is an unreliable
 	 * service, so we have no fragmentation and no coalescence
 	 */
 	if (len > self->max_data_size) {
diff --git a/sound/pci/rme32.c b/sound/pci/rme32.c
index 0cbef5fe6c63..ab78544bf042 100644
--- a/sound/pci/rme32.c
+++ b/sound/pci/rme32.c
@@ -313,7 +313,7 @@ static int snd_rme32_capture_copy(struct snd_pcm_substream *substream, int chann
 }
 
 /*
- * SPDIF I/O capabilites (half-duplex mode)
+ * SPDIF I/O capabilities (half-duplex mode)
  */
 static struct snd_pcm_hardware snd_rme32_spdif_info = {
 	.info =		(SNDRV_PCM_INFO_MMAP_IOMEM |
@@ -339,7 +339,7 @@ static struct snd_pcm_hardware snd_rme32_spdif_info = {
 };
 
 /*
- * ADAT I/O capabilites (half-duplex mode)
+ * ADAT I/O capabilities (half-duplex mode)
  */
 static struct snd_pcm_hardware snd_rme32_adat_info =
 {
@@ -364,7 +364,7 @@ static struct snd_pcm_hardware snd_rme32_adat_info =
 };
 
 /*
- * SPDIF I/O capabilites (full-duplex mode)
+ * SPDIF I/O capabilities (full-duplex mode)
  */
 static struct snd_pcm_hardware snd_rme32_spdif_fd_info = {
 	.info =		(SNDRV_PCM_INFO_MMAP |
@@ -390,7 +390,7 @@ static struct snd_pcm_hardware snd_rme32_spdif_fd_info = {
 };
 
 /*
- * ADAT I/O capabilites (full-duplex mode)
+ * ADAT I/O capabilities (full-duplex mode)
  */
 static struct snd_pcm_hardware snd_rme32_adat_fd_info =
 {
diff --git a/sound/pci/rme96.c b/sound/pci/rme96.c
index 0e694b011dcc..6c2a9f4a7659 100644
--- a/sound/pci/rme96.c
+++ b/sound/pci/rme96.c
@@ -359,7 +359,7 @@ snd_rme96_capture_copy(struct snd_pcm_substream *substream,
 }
 
 /*
- * Digital output capabilites (S/PDIF)
+ * Digital output capabilities (S/PDIF)
  */
 static struct snd_pcm_hardware snd_rme96_playback_spdif_info =
 {
@@ -388,7 +388,7 @@ static struct snd_pcm_hardware snd_rme96_playback_spdif_info =
 };
 
 /*
- * Digital input capabilites (S/PDIF)
+ * Digital input capabilities (S/PDIF)
  */
 static struct snd_pcm_hardware snd_rme96_capture_spdif_info =
 {
@@ -417,7 +417,7 @@ static struct snd_pcm_hardware snd_rme96_capture_spdif_info =
 };
 
 /*
- * Digital output capabilites (ADAT)
+ * Digital output capabilities (ADAT)
  */
 static struct snd_pcm_hardware snd_rme96_playback_adat_info =
 {
@@ -442,7 +442,7 @@ static struct snd_pcm_hardware snd_rme96_playback_adat_info =
 };
 
 /*
- * Digital input capabilites (ADAT)
+ * Digital input capabilities (ADAT)
  */
 static struct snd_pcm_hardware snd_rme96_capture_adat_info =
 {
diff --git a/sound/pci/rme9652/hdspm.c b/sound/pci/rme9652/hdspm.c
index 980b9cd689dd..b5538efd146b 100644
--- a/sound/pci/rme9652/hdspm.c
+++ b/sound/pci/rme9652/hdspm.c
@@ -2256,7 +2256,7 @@ static int snd_hdspm_create_controls(struct snd_card *card, struct hdspm * hdspm
 	}
 
 	/* Channel playback mixer as default control 
-	   Note: the whole matrix would be 128*HDSPM_MIXER_CHANNELS Faders, thats to big for any alsamixer 
+	   Note: the whole matrix would be 128*HDSPM_MIXER_CHANNELS Faders, thats too big for any alsamixer
 	   they are accesible via special IOCTL on hwdep
 	   and the mixer 2dimensional mixer control */
 
diff --git a/sound/usb/usx2y/usx2yhwdeppcm.c b/sound/usb/usx2y/usx2yhwdeppcm.c
index 315855082fe1..fe67a92e2a1a 100644
--- a/sound/usb/usx2y/usx2yhwdeppcm.c
+++ b/sound/usb/usx2y/usx2yhwdeppcm.c
@@ -404,7 +404,7 @@ static void usX2Y_usbpcm_subs_startup(struct snd_usX2Y_substream *subs)
 	struct usX2Ydev * usX2Y = subs->usX2Y;
 	usX2Y->prepare_subs = subs;
 	subs->urb[0]->start_frame = -1;
-	smp_wmb();	// Make shure above modifications are seen by i_usX2Y_subs_startup()
+	smp_wmb();	// Make sure above modifications are seen by i_usX2Y_subs_startup()
 	usX2Y_urbs_set_complete(usX2Y, i_usX2Y_usbpcm_subs_startup);
 }
 
-- 
cgit v1.2.3